<h2>PROJECT: Twitter Topic Modelling and Sentiment Analysis</h2>

**Loading Packages**

In [None]:
import re,json, requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from gensim import corpora, models

import pyLDAvis
import pyLDAvis.gensim_models 

**Loading Data**

In [None]:
# The dataset is fetched straight from GitHub; `lines=True` makes pandas parse
# the file as one JSON object per line rather than a single JSON document.
url = 'https://raw.githubusercontent.com/katenjoki/Twitter-Data-Analysis/main/data/covid19.json'
df = pd.read_json(url,lines=True)

In [None]:
# Inspect the distinct values of the `lang` column to see which languages
# the dataset covers.
df['lang'].unique()

<h2>Exploratory Data Analysis</h2>

**Cleaning Data**

* Check for null values
* Convert all text to lowercase
* Remove URLs and punctuation marks

In [None]:
# Keep only the tweet body, under a name that marks it as the untouched text.
tweets = pd.DataFrame({'original_text': df['text']})
tweets.head()

In [None]:
# Check each column for null values (True means the column has at least one null)
tweets.isnull().any()

In [None]:
# Spot-check one raw tweet (row label 55) to see what kind of noise
# (case, links, punctuation) the cleaning steps have to handle.
print('Original text\n',tweets.at[55,'original_text'])

In [None]:
# Normalise case so that e.g. "Covid" and "covid" count as the same token.
tweets['clean_text'] = tweets['original_text'].str.lower()

# Strip URLs. `regex=True` must be passed explicitly: since pandas 2.0
# `Series.str.replace` treats the pattern as a literal string by default, which
# would silently leave every link in place. The raw string keeps the pattern
# unchanged while avoiding the invalid `\(` escape-sequence warning.
tweets['clean_text'] = tweets['clean_text'].str.replace(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    ' ',
    regex=True,
)

In [None]:
#function to drop punctuation marks

import string
def clean_text_column(tweet):
    """Return ``tweet`` with every ASCII punctuation mark replaced by a space.

    Parameters
    ----------
    tweet : str
        Text to clean.

    Returns
    -------
    str
        Same length as the input; each character found in
        ``string.punctuation`` is swapped for a single space.
    """
    # One translation table handles all punctuation marks in a single pass.
    to_space = str.maketrans({mark: " " for mark in string.punctuation})
    return tweet.translate(to_space)

# Run the punctuation cleaner over every tweet, element by element.
tweets['clean_text'] = tweets['clean_text'].map(clean_text_column)

print('Clean tweet: \n',tweets.at[33,'clean_text'])

In [None]:
# Preview the original and cleaned text side by side
tweets.head()

In [None]:
# Only the cleaned text is needed from here on.
tweets = tweets.drop(columns='original_text')

<h2>Data Visualization using WordCloud</h2>

In [None]:
from wordcloud import WordCloud, ImageColorGenerator,STOPWORDS
#STOPWORDS are words that don't contain enough significance
# Join every cleaned tweet into one string and build the cloud from it,
# passing the library's STOPWORDS set so those words are excluded.
all_text = ' '.join(tweets['clean_text'].values)
cloud = WordCloud(width=800, height=300, stopwords=STOPWORDS).generate(all_text)

plt.figure(figsize=(15, 10))
plt.imshow(cloud)
plt.axis('off')
plt.title('Most Frequent Words In Our Tweets', fontsize=12)
plt.show()

**Plot Top 10 words to appear in Tweets**

In [None]:
from collections import Counter

# Count every whitespace-separated token across all cleaned tweets.
word_counts = Counter(
    token
    for tokens in tweets['clean_text'].str.lower().str.split()
    for token in tokens
)

# One row per word, most frequent first.
results = (
    pd.DataFrame.from_dict(word_counts, orient='index', columns=['count'])
    .sort_values(by='count', ascending=False)
)
results.head(10)

In [None]:
#Visualising Top 10 words in tweets

# `DataFrame.plot` opens its own figure, so the size has to be passed to it
# directly; a preceding `plt.figure(figsize=...)` would only leave an empty
# figure behind and the bar chart would fall back to the default size.
ax = results[:10].plot.barh(colormap='Spectral', figsize=(12, 8))
ax.set_title('Top 10 words in tweets')
plt.show()

<h3>Topic Modelling </h3>

**Dictionary, Corpora, LDA**

In [None]:
# One string per tweet, and the same tweets split into whitespace tokens.
sentences = tweets['clean_text'].tolist()
words = [sentence.split() for sentence in sentences]

print('Sentence\n',sentences[:1],'\n')#shows the first sentence
print('The words split from the first sentence\n',words[:1])

In [None]:
#hide
# Build the gensim dictionary that maps each integer id to a word
word_id = corpora.Dictionary(words)
for token_id, token in word_id.items():
    print(token_id,"........",token)

# Bag-of-words corpus: each tweet becomes a list of (word id, count) pairs
tweet_list = [word_id.doc2bow(tokens) for tokens in words]

In [None]:
print(tweet_list[:1])

# Translate the ids back to words so each (id, count) pair becomes (word, count)
id_words = [
    [(word_id[token_id], occurrences) for token_id, occurrences in document]
    for document in tweet_list
]
print('\n First document: \n',sentences[:1])#print actual words

print('\n The actual words in the first document \n',id_words[:1])

**Interpretation of the above code**

* Each `(id, count)` pair gives the number of times a word occurs in a document: in the first document, the word with ID=0 appears once, the word with ID=11 appears three times, and so on.

<h3> Fitting the LDA model </h3>

In [None]:
# Fit a 3-topic LDA model on the bag-of-words corpus. `random_state` is fixed
# because LDA training is stochastic: without a seed the topics (and their
# numbering) change on every run, so any written interpretation of
# "topic 0" would no longer match the printed output.
lda = models.LdaModel(
    corpus=tweet_list,
    id2word=word_id,
    num_topics=3,
    alpha='auto',
    per_word_topics=True,
    random_state=42,
)

In [None]:
# Show the keyword/weight summary of each fitted topic
print(lda.print_topics())
# Apply the fitted model to the corpus; the result is stored in `doc_model`
# (not referenced again in the visible part of the notebook)
doc_model= lda[tweet_list]

***Interpretation of the LDA model results***

* Topic 0 is represented by 0.036*"covaxin" + 0.028*"vaccines" + 0.027*"capacity" + 0.024*"hospital" + 0.023*"dose2" + 0.021*"18" + 0.021*"age" + 0.020*"pin" + 0.020*"limit" + 0.020*"min"
* Meaning the top 10 keywords for topic 0 are covaxin, vaccines,capacity,hospital,dose2,18, age, pin,limit,min
*  Where the weight of covaxin in topic 0 is 0.036

***Visualising the Topics***

In [None]:
# Enable pyLDAvis output inside the notebook
pyLDAvis.enable_notebook()

# Prepare the visualisation from the fitted model, the bag-of-words corpus and
# the dictionary; the bare `visuals` expression displays it as the cell output
visuals = pyLDAvis.gensim_models.prepare(lda,tweet_list, word_id)
visuals

<h2>Sentiment Analysis</h2>

* Import TextBlob to determine polarity of the clean text
* Create new column score
* Visualise score column

In [None]:
from textblob import TextBlob

# Start from the cleaned text and attach TextBlob's polarity score to each tweet.
cleanTweet = tweets[['clean_text']].copy()

tweet_blob = [TextBlob(text) for text in cleanTweet['clean_text']]
cleanTweet['polarity'] = [blob.sentiment.polarity for blob in tweet_blob]
cleanTweet.head()

In [None]:
# (rows, columns) of the sentiment frame
cleanTweet.shape

In [None]:
def text_category(pol):
    """Map polarity values to sentiment labels.

    Parameters
    ----------
    pol : iterable of numbers
        Polarity scores, one per tweet.

    Returns
    -------
    list of str
        A new list with one label per input value: ``'positive'`` for values
        above zero, ``'neutral'`` for exactly zero and ``'negative'`` otherwise.
    """
    # The labels are collected in a local list. Accumulating into a
    # module-level list would make every later call return the labels of all
    # previous calls as well, so re-running the calling cell would fail with a
    # length mismatch.
    labels = []
    for p in pol:
        if p > 0:
            score = 'positive'
        elif p == 0:
            score = 'neutral'
        else:
            score = 'negative'
        labels.append(score)
    return labels

In [None]:
# Label every tweet as positive / neutral / negative based on its polarity
cleanTweet['score']=text_category(cleanTweet['polarity'])
cleanTweet.head()

In [None]:
#Piechart visualisation
# Take the labels from the same Series as the values: `value_counts()` is
# ordered by frequency while `unique()` is ordered by first appearance, so
# mixing the two can attach the wrong label to a slice.
score_counts = cleanTweet['score'].value_counts()
plt.pie(score_counts, labels=score_counts.index, autopct='%1.2f%%')
plt.title('Share of tweets per sentiment')
plt.show()

In [None]:
#Barchart Visualisation
# Categories and heights both come from one `value_counts()` result so that
# each bar is guaranteed to carry its own label (`unique()` and
# `value_counts()` do not share an ordering).
score_counts = cleanTweet['score'].value_counts()
sns.barplot(x=score_counts.index, y=score_counts.values, palette='Accent')
plt.xlabel('Score')
plt.ylabel('Count')
plt.show()

<h2>Building the Classification Model </h2>

In [None]:
# Keep only positive/negative tweets for the binary classifier.
# `reset_index` returns a new frame, so its result has to be assigned to take
# effect; `drop=True` discards the old row labels instead of adding them as a
# column. The result is also an independent frame rather than a filtered slice,
# so adding columns to it later does not trigger SettingWithCopyWarning.
cleanTweet = cleanTweet[cleanTweet['score'] != 'neutral'].reset_index(drop=True)
cleanTweet.head(20)

In [None]:
# Add an empty placeholder column for the numeric label. `assign` builds a new
# frame, which avoids item assignment on a frame that was produced by
# filtering (the source of SettingWithCopyWarning).
cleanTweet = cleanTweet.assign(scoremap="")
cleanTweet.head()

In [None]:
# Encode the label numerically: 1 for 'positive', 0 for everything else.
cleanTweet['scoremap'] = (cleanTweet['score'] == 'positive').astype('int64')
cleanTweet.head(20)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled tweets for the final test. The seed is fixed so
# that the split, and every score derived from it, is the same on each run.
tweet_train, tweet_test = train_test_split(cleanTweet, test_size=0.3, random_state=42)
tweet_train.tail()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [None]:
#Trigram(3,3)
# Learn the trigram vocabulary from the training tweets and turn them into a
# count matrix in one step.
trigram_vectorizer = CountVectorizer(ngram_range=(3, 3))
X_train_trigram = trigram_vectorizer.fit_transform(tweet_train['clean_text'].values)

In [None]:
# Fit the tf-idf weighting on the training counts and re-weight them in one step.
trigram_tf_idf_transformer = TfidfTransformer()
X_train_trigram_tf_idf = trigram_tf_idf_transformer.fit_transform(X_train_trigram)

<h2> Train SGDClassifier </h2>

In [None]:
# Target vector for training, taken from the numeric 'scoremap' label column
y_train = tweet_train['scoremap'].values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from scipy.sparse import csr_matrix

def SDG_scores(X: csr_matrix, y: np.array, title: str, random_state=42) -> tuple:
    """Fit an SGDClassifier on a stratified 75/25 split of ``(X, y)`` and report accuracy.

    Parameters
    ----------
    X : csr_matrix
        Feature matrix, one row per sample.
    y : np.array
        Class labels aligned with the rows of ``X``.
    title : str
        Heading printed above the scores.
    random_state : int or None, default 42
        Seed used for both the split and the classifier so that the reported
        scores are reproducible. Pass ``None`` for a different random split
        and model initialisation on every call.

    Returns
    -------
    tuple of (float, float)
        ``(train_score, valid_score)``: the values returned by
        ``SGDClassifier.score`` on the 75% training part and on the 25%
        validation part. They are returned as well as printed so that callers
        that assign the result receive the numbers.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.75, stratify=y, random_state=random_state
    )

    clf = SGDClassifier(random_state=random_state)
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)
    print(f'{title}\nTrain score: {round(train_score, 2)} ; Validation score: {round(valid_score, 2)}\n')
    return train_score, valid_score

In [None]:
# Score the classifier on raw trigram counts first, then on their tf-idf weighting.
for features, label in (
    (X_train_trigram, 'Trigram Counts'),
    (X_train_trigram_tf_idf, 'Trigram Tf-Idf'),
):
    SDG_scores(features, y_train, label)

* Validation score of 0.95 is pretty good
* Next we test the model using the test data

In [None]:
# Push the held-out tweets through the vectoriser and tf-idf transformer that
# were fitted on the training data (transform only, no re-fitting).
X_test_counts = trigram_vectorizer.transform(tweet_test['clean_text'].values)
X_test = trigram_tf_idf_transformer.transform(X_test_counts)
y_test = tweet_test['scoremap'].values

In [None]:
# Evaluate on the held-out test split: fit on the full tf-idf training matrix
# and score on test data the model has never seen. Passing the test set to
# SDG_scores instead would train a fresh classifier on 75% of the test tweets
# and validate it on the remaining 25%, which says nothing about how the model
# trained on the training data generalises.
test_clf = SGDClassifier(random_state=42)
test_clf.fit(X_train_trigram_tf_idf, y_train)
score = test_clf.score(X_test, y_test)
print(f'Test Scores\nTest accuracy: {round(score, 2)}\n')

* The test accuracy is 94% 