## Sentiment Analysis Experiment
Adapted from COLX565 Lab 1

### 1. Load Datasets

In [60]:
import pandas as pd

In [61]:
vaccine_df = pd.read_csv('../../data/vaccine_full.csv')
travel_df = pd.read_csv('../../data/travel_full.csv')

In [62]:
vaccine_df = vaccine_df.dropna()[['Comment', 'Tags']]
travel_df = travel_df.dropna()[['Comment', 'Tags']]

In [63]:
vaccine_df.head()

Unnamed: 0,Comment,Tags
0,I want to see how many of the current covid ca...,Data and tracking vaccines
2,I wanted to know the number of deaths.,Data and tracking vaccines
3,Are you hiding the deaths from vaccination?,Data and tracking vaccines
4,How come there are so few reported deaths in t...,Data and tracking vaccines
5,Each province should state total vaccinations ...,Data and tracking vaccines


In [64]:
travel_df.head()

Unnamed: 0,Comment,Tags
0,Quero viajar de Portugal para a Inglaterra,Restrictions or Requirements
2,Why if fully vaccinated would you need to quar...,Quarantine
4,I'm trying to figure out if I can quarantine o...,Quarantine
5,What if you are fully vacinated? \r\n####@####...,Restrictions or Requirements
6,where do I go to check in daily?\r\n,Other (Contact / Travel outside / ArriveCan / ...


### 2. Fetch Sentiment Lexicon

In [65]:
import gensim
from nltk.data import find
from nltk.corpus import sentence_polarity,pros_cons,opinion_lexicon
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
import numpy as np
from scipy.spatial.distance import cosine
from scipy.sparse import csr_matrix, lil_matrix
import urllib.request

In [66]:
## Fetch and create VADER lexicon
VADER_lexicon = {}

location = "https://raw.githubusercontent.com/cjhutto/vaderSentiment/master/vaderSentiment/vader_lexicon.txt"
f = urllib.request.urlopen(location)
for line in f:
    line = line.decode("utf-8")
    stuff = line.strip().split("\t")
    VADER_lexicon[stuff[0]] = float(stuff[1])

In [67]:
## Fetch and creat SO_CAL intensifier lexicon
SO_CAL_int = {}

location = "https://raw.githubusercontent.com/sfu-discourse-lab/SO-CAL/master/Resources/dictionaries/English/"
f = urllib.request.urlopen(location + "int_dictionary1.11.txt")
for line in f:
    line = line.decode("latin-1")
    if line.strip() and "_" not in line:
        word, intensity = line.strip().split()
        SO_CAL_int[word] = float(intensity) + 1

### 3. Calculating Sentiment

In [68]:
negatives = {"no","not"}

def calculate_SO(text,lexicon,ints=None, negs=None,verbose=False):
    '''calculate a semantic orientation for a text as the sum of the semantic orientation
    of the words of the text as provided by lexicon, modified by the effect of intensifiers
    if ints is True, and negators if negs is true. Returns the SO unless verbose is True,
    at which point it returns both the SO and a list of tuples correspond to each SO-bearing
    word and the SO calculated for that word'''
    total = 0
    if verbose:
        SO_word_list = []
    for i, word in enumerate(text.strip().split()):
        if word in lexicon:
            SO = lexicon[word]
            if i > 0:
                prev_word = text[i-1].lower()
                if ints and prev_word in ints:
                    SO *= ints[prev_word]
                elif negs and prev_word in negs:
                    if SO < 0:
                        SO += 4
                    elif SO > 0:
                        SO -= 4
                        
            if verbose:
                SO_word_list.append((word,SO))
                
            total += SO
            
        elif '??' in word or '!!' in word:
            SO = -2
            total += SO
            if verbose:
                SO_word_list.append((word,SO))
    
    total = round(total, 2)
    if verbose:
        return (total,SO_word_list)
    else:
        return total

In [69]:
def evaluate_corpus_sentiment(corpus_df, lexicon, ints=None, negs=None, verbose=False):
    '''gives the general sentiment of the given corpus, or when verbose is True, provides the best and worst comments in the corpus
    and their respective SO scores'''
    comments = corpus_df.Comment
    total_SO = 0
    highest_SO = 0
    lowest_SO = 0
    most_neg_comment = ""
    most_pos_comment = ""
    for comment in comments:
        SO = calculate_SO(comment, lexicon, ints=ints, negs=negs)
        
        if SO > highest_SO:
            highest_SO = SO
            most_pos_comment = comment
            
        if SO < lowest_SO:
            lowest_SO = SO
            most_neg_comment = comment
            
        total_SO+=SO
        
    if verbose:
        return f"Overal sentiment score: {round(total_SO/len(corpus_df), 4)}", f"Best comment: {most_pos_comment, calculate_SO(most_pos_comment, lexicon, ints=ints, negs=negs, verbose=True)}", f"Worst comment: {most_neg_comment, calculate_SO(most_neg_comment, lexicon, ints=ints, negs=negs, verbose=True)}"
    else:
        return f"Overal sentiment score: {round(total_SO/len(corpus_df), 4)}"
            

#### 3.a Calculate Vaccine Sentiment 

In [70]:
evaluate_corpus_sentiment(vaccine_df, VADER_lexicon, ints=SO_CAL_int, negs=negatives, verbose=True)

('Overal sentiment score: 0.0113',
 "Best comment: ('What physical distancing and other safety measures does Health Canada recommend for people who have COVID 19 vaccines? For example, is it safe to invite vaccinated friends and family into our home to share a meal?', (9.1, [('safety', 1.8), ('recommend', 1.5), ('safe', 1.9), ('invite', 0.6), ('friends', 2.1), ('share', 1.2)]))",
 "Worst comment: ('Question asked: what does covid-19 vaccination do? Does it mean you are orotected from becoming infected and from being contagious m, or does it mean you will suffer fewer effects if you become infected with reduced likelihood of death or hospitalisation. ', (-11.2, [('infected', -2.2), ('contagious', -1.4), ('suffer', -2.5), ('infected', -2.2), ('death', -2.9)]))")

#### 3.b Calculate Travel Sentiment

In [71]:
evaluate_corpus_sentiment(travel_df, VADER_lexicon, ints=SO_CAL_int, negs=negatives, verbose=True)

('Overal sentiment score: 0.0102',
 "Best comment: ('What about Canadians returning home who had left to take care of their parents for compassionate reasons. Taking care of non Canadians does not qualify as compassion ? Shouldn’t those Canadians be exempted as well as they did not travel for vacation?', (9.7, [('care', 2.2), ('compassionate', 2.2), ('care', 2.2), ('compassion', 2.0), ('well', 1.1)]))",
 "Worst comment: ('WHAT TYPE OF COVID TESTS ???? NOT ANTIGEN?????\\r\\nNOT ANTIGEN??? WHAT KIND OF COVID TESTS?????? VERY POORLY EXPLAINED!!!!!\\r\\n', (-10, [('????', -2), ('ANTIGEN?????', -2), ('ANTIGEN???', -2), ('TESTS??????', -2), ('EXPLAINED!!!!!', -2)]))")

### 4. Evaluating Sample Comments

In [72]:
sample_comments = ["I got my jab on March 29. Your literature says I need my 3rd shot within 3 weeks of my first as I was undergoing cancer treatment causing immunosuppression. I got my jab at St Mike's from which it was stated my 2nd shot is due July 16. Can you change this to April 19. HILDY Sinclair. 01/03/4",
                  "How reliable the shipment is ?? Spending on vaccine is not the solution. Why Canada can not make vaccine ?? Aren't we a develop country ??? No technology at all !! Aren't this TRUDO GOVT. realize that Canada is to vulnerable when is come to safety ?? We need strong leadership. This guy NO GOOD",
                  "Critical missing info: Fed Govt needs to make it mandatory that Ontario provide detailedinfo 1) Cumulative percent of adult population who have received at least one dose of a COVID-19 vaccine 2) Distribution by PH Unit geographic catchment 3) Naming of # distributed to each health care institution",
                  "When coming from Portugal and the itinerary is Porto -frankfurt FRANKFURT - Toronto TORONTO - EDMONTON Being Toronto the first point of entry in CANADA, however not the final destination.. where is the hotel quarantine done?  Cannot be done in Toronto as there is a connecting flight to Edmonton.",
                  "Pre-entry test requirements: You must show proof of your test results even if you recovered from COVID-19 and no longer test positive. limited number of exceptions Resolved COVID-19 infection - persons who have molecular test proof to show they had a positive COVID-19 test taken between 14-90 days"]

In [73]:
for comment in sample_comments:
    print(comment)
    print(calculate_SO(comment, VADER_lexicon, ints=SO_CAL_int, negs=negatives, verbose=True))

I got my jab on March 29. Your literature says I need my 3rd shot within 3 weeks of my first as I was undergoing cancer treatment causing immunosuppression. I got my jab at St Mike's from which it was stated my 2nd shot is due July 16. Can you change this to April 19. HILDY Sinclair. 01/03/4
(-3.4, [('cancer', -3.4)])
How reliable the shipment is ?? Spending on vaccine is not the solution. Why Canada can not make vaccine ?? Aren't we a develop country ??? No technology at all !! Aren't this TRUDO GOVT. realize that Canada is to vulnerable when is come to safety ?? We need strong leadership. This guy NO GOOD
(-6.8, [('??', -2), ('??', -2), ('???', -2), ('!!', -2), ('vulnerable', -0.9), ('safety', 1.8), ('??', -2), ('strong', 2.3)])
Critical missing info: Fed Govt needs to make it mandatory that Ontario provide detailedinfo 1) Cumulative percent of adult population who have received at least one dose of a COVID-19 vaccine 2) Distribution by PH Unit geographic catchment 3) Naming of # dis

### 5. Unsupervised Sentiment Analysis using K-Means
Adapted from the [work](https://towardsdatascience.com/unsupervised-sentiment-analysis-a38bf1906483) of Rafał Wójcik

In [74]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

from sklearn.cluster import KMeans

from time import time 
from collections import defaultdict

#### 5.a. Preprocessing Data
Clean the dataset

In [75]:
def text_to_word_list(text):
    ''' Pre process and convert texts to a list of words'''
    text = str(text)
    text = text.lower()

    # Clean the text
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)
    text = sub(r":", " : ", text)
    text = sub(r"\s{2,}", " ", text)

    text = text.split()

    return text

In [76]:
vaccine_sentiment_df = vaccine_df.copy()
vaccine_sentiment_df.Comment = vaccine_df.Comment.apply(lambda x: text_to_word_list(x))
vaccine_sentiment_df.Comment = vaccine_sentiment_df.Comment.str.join(' ')
vaccine_sentiment_df.head()

Unnamed: 0,Comment,Tags
0,i want to see how many of the current covid ca...,Data and tracking vaccines
2,i wanted to know the number of deaths,Data and tracking vaccines
3,are you hiding the deaths from vaccination ?,Data and tracking vaccines
4,how come there are so few reported deaths in t...,Data and tracking vaccines
5,each province should state total vaccinations ...,Data and tracking vaccines


#### 5.a. Separate English and French sentences using fasttext
Before clustering, it is important to only use 1 language. Otherwise, the model will cluster based on the languages instead of sentiment.

In [77]:
import fasttext
lid_model = fasttext.load_model("lid.176.bin") ## download "lid.176.bin" from fasttext website



In [78]:
def predict_en(comment):
    '''predicts if a comment is English (0) or non-English (1)'''
    prediction = lid_model.predict(comment)[0][0].split('_')[-1]
    
    if prediction == 'en':
        return 0
    else:
        return 1

In [79]:
vaccine_sentiment_df['Language'] = vaccine_sentiment_df.Comment
vaccine_sentiment_df.Language = vaccine_sentiment_df.Language.apply(lambda x: predict_en(x))
vaccine_sentiment_df.head()

Unnamed: 0,Comment,Tags,Language
0,i want to see how many of the current covid ca...,Data and tracking vaccines,0
2,i wanted to know the number of deaths,Data and tracking vaccines,0
3,are you hiding the deaths from vaccination ?,Data and tracking vaccines,0
4,how come there are so few reported deaths in t...,Data and tracking vaccines,0
5,each province should state total vaccinations ...,Data and tracking vaccines,0


In [80]:
en_vaccine = vaccine_sentiment_df[vaccine_sentiment_df["Language"]==0][['Comment', 'Tags']]

In [81]:
en_vaccine.Comment = en_vaccine.Comment.str.split()
en_vaccine.head()

Unnamed: 0,Comment,Tags
0,"[i, want, to, see, how, many, of, the, current...",Data and tracking vaccines
2,"[i, wanted, to, know, the, number, of, deaths]",Data and tracking vaccines
3,"[are, you, hiding, the, deaths, from, vaccinat...",Data and tracking vaccines
4,"[how, come, there, are, so, few, reported, dea...",Data and tracking vaccines
5,"[each, province, should, state, total, vaccina...",Data and tracking vaccines


In [93]:
## saving the cleaned data
vaccine_model = en_vaccine.copy()

## only keep comments with more than 1 word
vaccine_model['word_count'] = vaccine_model.Comment.apply(lambda x: len(set(x)))
vaccine_model = vaccine_model[vaccine_model.word_count > 1][['Comment', "Tags"]]
print(f"No. rows before: {len(en_vaccine)}")
print(f"No. rows after: {len(vaccine_model)}")

No. rows before: 17937
No. rows after: 17858


In [100]:
sent = [row for row in vaccine_model.Comment]
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[2]

['are', 'you_hiding', 'the', 'deaths_from', 'vaccination', '?']

In [101]:
file_export = vaccine_model.copy()
file_export['old_Comment'] = file_export.Comment
file_export.old_Comment = file_export.old_Comment.str.join(' ')
file_export.Comment = file_export.Comment.apply(lambda x: ' '.join(bigram[x]))

In [102]:
file_export.head()

Unnamed: 0,Comment,Tags,old_Comment
0,i want to see how_many of the current covid ca...,Data and tracking vaccines,i want to see how many of the current covid ca...
2,i wanted to know the number of deaths,Data and tracking vaccines,i wanted to know the number of deaths
3,are you_hiding the deaths_from vaccination ?,Data and tracking vaccines,are you hiding the deaths from vaccination ?
4,how come there are so few reported deaths in t...,Data and tracking vaccines,how come there are so few reported deaths in t...
5,each_province should state total vaccinations ...,Data and tracking vaccines,each province should state total vaccinations ...


In [103]:
file_export.to_csv('en_vaccine_sentiment_df', index=False)

In [104]:
%%time

w2v_model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)


w2v_model.build_vocab(sentences, progress_per=50000)

Wall time: 1.65 s


In [105]:
%%time
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

Wall time: 19.9 s


(1198648, 7629390)

In [106]:
w2v_model.init_sims(replace=True)

In [107]:
w2v_model.save("word2vec.model")

#### 5.b. K-Means Clustering for Sentiment Analysis

In [108]:
word_vectors = Word2Vec.load("word2vec.model").wv

In [109]:
model = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors.astype('double'))

In [110]:
word_vectors.similar_by_vector(model.cluster_centers_[1], topn=10, restrict_vocab=None)

[('frequency', 0.9999629259109497),
 ('yellow', 0.9999617338180542),
 ('shameful', 0.9999614357948303),
 ('reading_this', 0.9999614357948303),
 ('misleading', 0.9999610185623169),
 ('cancer_patients', 0.9999610185623169),
 ('iodine', 0.9999609589576721),
 ('office', 0.9999609589576721),
 ('station', 0.9999607801437378),
 ('never', 0.9999602437019348)]

In [111]:
positive_cluster_index = 1
positive_cluster_center = model.cluster_centers_[positive_cluster_index]
negative_cluster_center = model.cluster_centers_[1-positive_cluster_index]

In [112]:
words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])

In [113]:
words['cluster_value'] = [1 if i==positive_cluster_index else -1 for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [137]:
words[['words', 'sentiment_coeff']].sort_values(by='sentiment_coeff').head(20)

Unnamed: 0,words,sentiment_coeff
5029,yellow,-114.628417
3539,station,-114.440245
4284,1962,-114.267294
3365,cancer_patients,-114.04752
3196,office,-113.564575
4131,england,-113.355924
3852,79_yrs,-112.627754
3934,middle,-112.481609
5114,no_evidence,-112.150265
3715,assistance,-112.028285


In [138]:
words[['words', 'sentiment_coeff']].sort_values(by='sentiment_coeff').tail(20)

Unnamed: 0,words,sentiment_coeff
2587,limit,110.875543
726,there_any,110.912796
1644,drug_company,110.966909
2187,march_29th,110.975214
529,look,110.980348
5010,diagnosed,111.126386
5001,vitamin_d,111.144311
4421,deadline,111.15904
2113,message,111.171487
4911,taste,111.451421


In [115]:
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [116]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [117]:
final_vaccine_df = pd.read_csv('en_vaccine_sentiment_df')

In [118]:
sentiment_map = pd.read_csv('sentiment_dictionary.csv')
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [119]:
file_weighting = final_vaccine_df.copy()

In [120]:
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(file_weighting.Comment)
features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(file_weighting.Comment)



In [143]:
file_weighting

Unnamed: 0,Comment,Tags,old_Comment
0,i want to see how_many of the current covid ca...,Data and tracking vaccines,i want to see how many of the current covid ca...
1,i wanted to know the number of deaths,Data and tracking vaccines,i wanted to know the number of deaths
2,are you_hiding the deaths_from vaccination ?,Data and tracking vaccines,are you hiding the deaths from vaccination ?
3,how come there are so few reported deaths in t...,Data and tracking vaccines,how come there are so few reported deaths in t...
4,each_province should state total vaccinations ...,Data and tracking vaccines,each province should state total vaccinations ...
...,...,...,...
17853,where is the update for jan 28th ?,Vaccines - Other,where is the update for jan 28th ?
17854,poster to help educate lay coworkers who are v...,Vaccines - Other,poster to help educate lay coworkers who are v...
17855,not updated,Vaccines - Other,not updated
17856,are the vaccine bottles used again,Vaccines - Other,are the vaccine bottles used again


In [121]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score
    
    inspired  by function from: 
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
    
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    '''
    vector_coo = transformed_file[x.name].tocoo()
    vector_coo.col = features.iloc[vector_coo.col].values
    dict_from_coo = dict(zip(vector_coo.col, vector_coo.data))
    return dict_from_coo

def replace_tfidf_words(x, transformed_file, features):
    '''
    replacing each word with it's calculated tfidf dictionary with scores of each word
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    dictionary = create_tfidf_dictionary(x, transformed_file, features)   
    return list(map(lambda y:dictionary[f'{y}'], x.Comment.split()))

In [122]:
%%time
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)#this step takes around 3-4 minutes minutes to calculate

Wall time: 3.71 s


In [123]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    replacing each word with its associated sentiment score from sentiment dict
    '''
    try:
        out = sentiment_dict[word]
    except KeyError:
        out = 0
    return out

In [124]:
replaced_closeness_scores = file_weighting.Comment.apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

In [125]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.Comment, file_weighting.Tags]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'sentiment']
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')
replacement_df['sentiment'] = [1 if i==1 else 0 for i in replacement_df.sentiment]

In [126]:
replacement_df[['sentence', "sentiment", 'prediction']]

Unnamed: 0,sentence,sentiment,prediction
0,i want to see how_many of the current covid ca...,0,1
1,i wanted to know the number of deaths,0,1
2,are you_hiding the deaths_from vaccination ?,0,1
3,how come there are so few reported deaths in t...,0,1
4,each_province should state total vaccinations ...,0,1
...,...,...,...
17853,where is the update for jan 28th ?,0,1
17854,poster to help educate lay coworkers who are v...,0,0
17855,not updated,0,1
17856,are the vaccine bottles used again,0,1


In [128]:
replacement_df.prediction.value_counts()

0    10128
1     7730
Name: prediction, dtype: int64