### Model the topics discussed in the tweets and their sentiments

In [1]:
# import libraries
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import STOPWORDS,WordCloud
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
from pprint import pprint
import string
import os
import re

### Topic Modelling

In [2]:
import pandas as pd
df = pd.read_csv('../data/model_data.csv')

In [3]:
# converting tweets to list of words For feature engineering
sentence_list = [tweet for tweet in df['clean_text']]
word_list = [sent.split() for sent in sentence_list]
word_list[:2]

[['africa',
  'is',
  'in',
  'the',
  'midst',
  'of',
  'a',
  'fullblown',
  'third',
  'wave',
  'of',
  'coronavirus',
  'the',
  'head',
  'of',
  'whoafro',
  'has',
  'warnedsscases',
  'have',
  'risen',
  'across',
  'the',
  'continent',
  'by',
  'more',
  'than',
  'and',
  'deaths',
  'have',
  'also',
  'risen',
  'by',
  'in',
  'the',
  'last',
  'weekssjriggers',
  'reports',
  'shttpstcocrdhqphfwm'],
 ['dr',
  'moeti',
  'is',
  'head',
  'of',
  'who',
  'in',
  'africa',
  'and',
  'one',
  'of',
  'the',
  'best',
  'public',
  'health',
  'experts',
  'and',
  'leaders',
  'i',
  'know',
  'hers',
  'is',
  'a',
  'desperate',
  'request',
  'for',
  'vaccines',
  'to',
  'africa',
  'we',
  'plead',
  'with',
  'germany',
  'and',
  'the',
  'uk',
  'to',
  'lift',
  'patent',
  'restrictions',
  'and',
  'urgently',
  'transfer',
  'technology',
  'to',
  'enable',
  'production',
  'in',
  'africa',
  'httpstcosogiroihoc']]

In [5]:
# create dictionary which contains Id and word 
word_to_id = corpora.Dictionary(word_list)
corpus = [word_to_id.doc2bow(tweet) for tweet in word_list]

In [7]:
# Build LDA model; it maps each teweets in our corpus to a set of topics which covers a good deal of the words in the tweet
lda_model = gensim.models.ldamodel.LdaModel(corpus,
                                           id2word=word_to_id,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [8]:
pprint(lda_model.show_topics(formatted=False))

[(0,
  [('the', 0.029466843),
   ('covid', 0.02805267),
   ('of', 0.023706144),
   ('vaccines', 0.022156278),
   ('in', 0.019397339),
   ('and', 0.01846069),
   ('to', 0.017154872),
   ('a', 0.013795926),
   ('people', 0.012778343),
   ('vaccinated', 0.012094214)]),
 (1,
  [('the', 0.045241017),
   ('to', 0.039863296),
   ('covid', 0.027612576),
   ('of', 0.024481608),
   ('vaccines', 0.022700375),
   ('are', 0.021122461),
   ('in', 0.015582494),
   ('amp', 0.015417857),
   ('is', 0.013467655),
   ('a', 0.012081895)]),
 (2,
  [('to', 0.05975064),
   ('you', 0.053836454),
   ('rt', 0.045495514),
   ('your', 0.04375733),
   ('vaccines', 0.04135175),
   ('israel', 0.04034832),
   ('just', 0.03610929),
   ('by', 0.03564749),
   ('expired', 0.033574816),
   ('another', 0.031833116)]),
 (3,
  [('the', 0.04838843),
   ('africa', 0.037850708),
   ('of', 0.037773605),
   ('in', 0.037295036),
   ('and', 0.031801805),
   ('india', 0.026686076),
   ('to', 0.026571812),
   ('is', 0.022459943),
   (

In [10]:
# Evaluating trained topic model using perplexity which measures how good the model is and cherence score

# perplexity
print('Perplexity: ', lda_model.log_perplexity(corpus))  
doc_lda = lda_model[corpus]

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=word_list, dictionary=word_to_id, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Ldamodel Coherence Score/Accuracy on Tweets: ', coherence_lda)

Perplexity:  -6.444959458779572
Ldamodel Coherence Score/Accuracy on Tweets:  0.5120588689368838


In [None]:
# visualize topic model using pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
# visualize the topics
pyLDAvis.enable_notebook()

LDAvis_prepared = gensimvis.prepare(lda_model, corpus, word_to_id)
LDAvis_prepared

### Sentiment Analysis

In [36]:
# more library importation
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
from sklearn.pipeline import make_pipeline
import numpy as np
from sklearn.linear_model import SGDClassifier
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [14]:
df.head()

Unnamed: 0,clean_text,polarity
0,africa is in the midst of a fullblown third wa...,0.166667
1,dr moeti is head of who in africa and one of t...,0.133333
2,thank you researchnote for creating this amazi...,0.316667
3,former pfizer vp and virologist dr michael yea...,0.086111
4,i think its important that we dont sell covax ...,0.28


In [15]:
def text_category(p):
    ''' A function  that takes a value p and returns, depending on the value of p, 
    a string 'positive', 'negative' or 'neutral' '''
    
    if p > 0:
        return 'positive'
    elif p < 0:
        return 'negative'
    else:
        return 'neutral'

In [16]:
# apply above function on polarity column
df['score'] = df['polarity'].apply(text_category)

In [17]:
# Remove rows from where  score is neutral 
df.drop(df.loc[df['score']=='neutral'].index, inplace=True)
df = df.reset_index(drop=True) # reset frame index

# create score map column
mapping = {'positive':1,'negative':0} # create mapping dictionary
df['scoremap']= df['score'].map(mapping)
df.head()

Unnamed: 0,clean_text,polarity,score,scoremap
0,africa is in the midst of a fullblown third wa...,0.166667,positive,1
1,dr moeti is head of who in africa and one of t...,0.133333,positive,1
2,thank you researchnote for creating this amazi...,0.316667,positive,1
3,former pfizer vp and virologist dr michael yea...,0.086111,positive,1
4,i think its important that we dont sell covax ...,0.28,positive,1


In [18]:
# create feature and target variables; X and y
X = df['clean_text']
y = df['scoremap']

In [27]:
# SGDClassifier
from sklearn.model_selection import train_test_split # import library
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1) # split data

# vectorize data with a trigram parameter
from sklearn.feature_extraction.text import CountVectorizer # import library
cv = CountVectorizer(ngram_range=(1, 3))
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [28]:
# build model
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
clf.fit(X_train_cv, y_train)
predictions = clf.predict(X_test_cv)

In [29]:
# evaluate model on test data
clf.score(X_test_cv, y_test)

0.9553666312433581

In [32]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
bigram_vectorizer.fit(X.values)

CountVectorizer(ngram_range=(1, 2))

In [38]:
X_bigram_vectorizer = bigram_vectorizer.transform(X.values)

In [39]:
def train_and_show_scores(X: csr_matrix, y: np.array, title: str) -> None:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, stratify=y
    )

    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print(f'{title}\nTrain score: {round(train_score, 2)} ; Test score: {round(test_score, 2)}\n')

In [40]:
train_and_show_scores(X_bigram_vectorizer, y.values, 'Bigram Counts')

Bigram Counts
Train score: 1.0 ; Test score: 0.97

