In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [2]:
# NOTE(review): `final_file` is reassigned from 'cleaned_dataset5.csv' in the cell right below,
# so on a top-to-bottom run this load is overwritten before anything reads it.
final_file = pd.read_csv('cleaned_dataset4.csv')

In [24]:
# Comment dataset used by the rest of the notebook on a top-to-bottom run
# (this replaces the 'cleaned_dataset4.csv' load from the previous cell).
final_file = pd.read_csv('cleaned_dataset5.csv')

In [3]:
# Word -> sentiment coefficient lookup built from the `words` and `sentiment_coeff` columns.
# NOTE(review): the following cells reload `sentiment_map` / `sentiment_dict` from
# sentiment_dictionary5.csv and sentiment_dictionary6.csv, so this version is overwritten.
sentiment_map = pd.read_csv('sentiment_dictionary1.csv')
sentiment_dict = {
    word: coeff
    for word, coeff in zip(sentiment_map['words'].values, sentiment_map['sentiment_coeff'].values)
}

In [25]:
# Word -> sentiment coefficient lookup built from the `words` and `sentiment_coeff` columns.
# NOTE(review): the next cell reloads both names from sentiment_dictionary6.csv,
# so this version is overwritten on a top-to-bottom run.
sentiment_map = pd.read_csv('sentiment_dictionary5.csv')
sentiment_dict = {
    word: coeff
    for word, coeff in zip(sentiment_map['words'].values, sentiment_map['sentiment_coeff'].values)
}

In [37]:
# Word -> sentiment coefficient lookup built from the `words` and `sentiment_coeff` columns.
# This is the last of the three dictionary loads, i.e. the one the scoring cells below see
# when the notebook is run top to bottom.
sentiment_map = pd.read_csv('sentiment_dictionary6.csv')
sentiment_dict = {
    word: coeff
    for word, coeff in zip(sentiment_map['words'].values, sentiment_map['sentiment_coeff'].values)
}

In [38]:
# Getting tfidf scores of the words in every sentence, and replacing the words with their tfidf weights.
# The weighting cells below read from this copy (`file_weighting`) rather than from `final_file` itself.
file_weighting = final_file.copy()

In [39]:
# Preview the first rows: `text` carries the phrase-joined tokens (e.g. bad_news),
# `old_text` the same comment with the words still separated.
file_weighting.head()

Unnamed: 0,text,created_at,old_text
0,and the bad_news you won't know if you have it...,2021-09-04T08:15:55+0000,and the bad news you won't know if you have it...
1,if people are expressing_concerns then it is a...,2021-09-04T08:11:13+0000,if people are expressing concerns then it is a...
2,how_about our kids_under 12 ? they got no prot...,2021-09-04T08:00:19+0000,how about our kids under 12 ? they got no prot...
3,my past reading n understanding is that now ou...,2021-09-04T08:22:41+0000,my past reading n understanding is that now ou...
4,protect below_12 year_olds don't sign them up ...,2021-09-04T09:57:54+0000,protect below 12 year olds don't sign them up ...


In [40]:
# Whitespace tokenisation keeps the underscore-joined phrases (e.g. bad_news) as single tokens.
# `str.split` replaces the lambda (same result, and the fitted vectorizer stays picklable);
# `token_pattern=None` states explicitly that the default regex is unused, which silences the
# "token_pattern will not be used" warning. norm=None keeps the raw, un-normalised tf-idf weights.
tfidf = TfidfVectorizer(tokenizer=str.split, token_pattern=None, norm=None)
# Learn the vocabulary and weight every sentence in one pass (one row per sentence).
transformed = tfidf.fit_transform(file_weighting.text)
# `get_feature_names` was deprecated and later removed from scikit-learn; prefer the
# `_out` variant and only fall back to the old name on releases that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    features = pd.Series(tfidf.get_feature_names_out())
else:
    features = pd.Series(tfidf.get_feature_names())


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [41]:
#Replacing words in sentences with their tfidf scores
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for the input sentence x, where each word is assigned its tfidf score

    inspired  by function from this wonderful article: 
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of dataframe, containing sentences, and their indexes; x.name is used as the
        row position inside transformed_file, so the dataframe index has to be the
        0..n-1 position of each sentence in the transformed matrix
    transformed_file - all sentences transformed with TfidfVectorizer (sparse matrix)
    features - pandas Series with the names of all words in corpus used in TfidfVectorizer,
        in column order of transformed_file

    returns dict {word: tfidf score} holding only the words stored in that row of the matrix
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Look the words up from the column numbers instead of overwriting `vector_coo.col`:
    # that attribute is the matrix's integer index, and assigning strings to it is not
    # something SciPy guarantees to accept.
    words = features.iloc[vector_coo.col].values
    return dict(zip(words, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    turn the sentence stored in row x into the list of tfidf scores of its words,
    one score per whitespace-separated word, in sentence order

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    raises KeyError when a word of the sentence has no entry in the row's tfidf dictionary
    '''
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    return [scores_by_word[token] for token in x.text.split()]

In [42]:
%%time
# Row-wise pass over the whole frame: every sentence becomes the list of its words' tfidf weights.
# The saved output of this cell reports roughly 14 s wall time for this step.
replaced_tfidf_scores = file_weighting.apply(
    lambda row: replace_tfidf_words(row, transformed, features),
    axis=1,
)

CPU times: user 12.9 s, sys: 464 ms, total: 13.4 s
Wall time: 13.6 s


In [43]:
#Replacing words in sentences with their sentiment score
def replace_sentiment_words(word, sentiment_dict):
    '''
    look up the sentiment score of a single word

    word - the token to score
    sentiment_dict - mapping from word to its sentiment score

    returns the score stored for the word, or 0 when the word is not in sentiment_dict
    '''
    return sentiment_dict.get(word, 0)

In [44]:
# One list of sentiment coefficients per sentence, in the same word order as the
# whitespace split of the text (words missing from `sentiment_dict` score 0).
replaced_closeness_scores = file_weighting.text.apply(
    lambda sentence: [replace_sentiment_words(token, sentiment_dict) for token in sentence.split()]
)

In [45]:
# Merging both previous steps and getting the predictions.
# One row per sentence: its per-word sentiment coefficients, its per-word tfidf weights,
# the sentence text and its timestamp.
replacement_df = pd.DataFrame({
    'sentiment_coeff': replaced_closeness_scores,
    'tfidf_scores': replaced_tfidf_scores,
    'sentence': file_weighting.text,
    'created_at': file_weighting.created_at,
})
# Sentence score = dot product of the sentiment coefficients with the tfidf weights.
replacement_df['sentiment_rate'] = [
    np.array(coeffs) @ np.array(weights)
    for coeffs, weights in zip(replacement_df['sentiment_coeff'], replacement_df['tfidf_scores'])
]
# A strictly positive score is labelled 1, everything else (including exactly 0) is labelled 0.
replacement_df['prediction'] = replacement_df['sentiment_rate'].gt(0).astype('int8')

In [46]:
# Inspect the scored frame; the saved output shows only the first and last rows.
replacement_df

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,created_at,sentiment_rate,prediction
0,"[0, 0, -13.5625742119864, 0, 0, -3.67013075601...","[2.1432231398633523, 1.770209841682183, 8.4119...",and the bad_news you won't know if you have it...,2021-09-04T08:15:55+0000,-538.216660,0
1,"[0, -3.059424520648905, 0, 0, 0, 0, 0, 0, 0, 0...","[3.242783216632407, 3.3068756079387995, 2.5457...",if people are expressing_concerns then it is a...,2021-09-04T08:11:13+0000,-10.117136,0
2,"[0, 0, 0, 0, 0, 0, -16.107718432142182, 0, -7....","[5.9319367838433275, 3.6042986962500847, 9.047...",how_about our kids_under 12 ? they got no prot...,2021-09-04T08:00:19+0000,-972.153520,0
3,"[0, -11.324734853405168, -32.09323671862448, -...","[4.521911464252132, 6.354793634663361, 7.53157...",my past reading n understanding is that now ou...,2021-09-04T08:22:41+0000,-1345.810819,0
4,"[-5.244494913154813, 0, 0, 0, -9.3218929868911...","[5.869864355200949, 8.680193405423577, 9.45338...",protect below_12 year_olds don't sign them up ...,2021-09-04T09:57:54+0000,-213.095443,0
...,...,...,...,...,...,...
56285,"[-5.066872859082645, 0, 0, 0, 0, -7.0448758653...","[7.063092620243908, 7.373941751977224, 2.41248...",enforce 40 of the drinks offer at vending_mach...,2019-01-20T11:54:46+0000,-687.247724,0
56286,"[0, -11.9387828321299, 0, -14.247202155948887,...","[4.4584258122800335, 9.299232613829801, 1.7702...",1 skip the sweet sauce for popiah it is tasty ...,2019-01-20T11:34:10+0000,-4355.816618,0
56287,"[0, -6.221026331200389, 0, -11.152110846560342...","[4.021482452507456, 7.7041834388478, 1.7702098...",also educate the true source of health hazards...,2019-01-24T23:36:28+0000,-265.986915,0
56288,"[-7.639219386652212, 0, 0, 0, 0, -14.137342387...","[6.30350034027581, 3.896555231957522, 5.455182...",fine how much ? or jus warning shiok_shiok only,2019-01-01T13:58:27+0000,-645.866981,0


In [63]:
# Spot-check the sentence with index label 29; the same sentence is the first entry
# of the `pos` list displayed further down.
print(replacement_df['sentence'][29])

as of 3rd september 580 are warded_in the hospitals why no breakdown of the 3 groups for a clearer_picture ?


In [65]:
# Sentences predicted positive (prediction == 1), in dataframe order.
# A boolean mask replaces the Python loop over `df[col][i]`: it is a single vectorised
# selection and no longer assumes the index labels are exactly 0..n-1.
pos = replacement_df.loc[replacement_df['prediction'] == 1, 'sentence'].tolist()

In [66]:
# Show the sentences predicted positive.
# NOTE(review): this echoes the whole list; consider a slice such as pos[:10] when sharing the notebook.
pos

['as of 3rd september 580 are warded_in the hospitals why no breakdown of the 3 groups for a clearer_picture ?',
 'here are the seriously_ill numbers over the past_14 days up to yesterday seriously_ill numbers 21_aug 29 22_aug 29 23_aug 28 24_aug 27 25 aug_26 26_aug 24 27_aug 19 28_aug 22 29_aug 20_30 aug_24 31_aug 24 1 sep 27 2 sep 32 3 sep 25',
 "don't import more_dangerous one",
 'singaporeans become_white rats',
 "that's_why dine_in ban should never lifted",
 "what if the household_members don't_want to be on quarantine and would rather their family_member go to a community care facility ?",
 'hii i_am from_india ep_holder after fully vaccination any_chance to allow_entry in singapore ? thank_you',
 'i_am work_pass holder i get full_vaccination from my country any_chance to allow_entry in singapore ?',
 'uk had little over 26k cases on_monday august 31 slight_drop from 30k reported on august 30 both singapore and uk have high vaccination_rate compare these figures with singapore if

In [69]:
# Sentences predicted non-positive (prediction == 0), in dataframe order.
# A boolean mask replaces the Python loop over `df[col][i]`: it is a single vectorised
# selection and no longer assumes the index labels are exactly 0..n-1.
neg = replacement_df.loc[replacement_df['prediction'] == 0, 'sentence'].tolist()

In [70]:
# Show the sentences predicted non-positive.
# NOTE(review): this echoes the whole list; consider a slice such as neg[:10] when sharing the notebook.
neg

["and the bad_news you won't know if you have it unless you get tested people can get complacent if they feel well despite_having it also if you have a positive test_result you can't travel etc",
 'if people are expressing_concerns then it is an_indication to look_into it deeper',
 'how_about our kids_under 12 ? they got no protection but only the masks how to ensure their safety esp when they go school and daily life ? mild_symptoms not means won_t suffer from other lung and heart infection keep opening is very_risky to those unvaccinated kids adults should provide a safe environment for them to grow_up',
 'my past reading n understanding is that now our 98_9 infected individuals reported here as with no symptoms or are asymptomatic cases are actually not captured in their statistics elsewhere as covid19 infection if not wrong thanks for updates so we all continue to stay_safe !_!',
 "protect below_12 year_olds don't sign them up for preschools",
 "brilliant stat to be sharing big mes

In [47]:
# Number of sentences per predicted class (1 = sentiment_rate > 0, 0 = otherwise).
replacement_df.groupby('prediction').size()
# NOTE(review): the figures below do not match this cell's saved output (55045 / 1245);
# presumably left over from an earlier run on different inputs - confirm or remove.
# 0.  62681
# 1.  12711 

prediction
0    55045
1     1245
dtype: int64

In [48]:
from dateutil.parser import parse

# Calendar date of every comment, parsed from its `created_at` timestamp string ...
replacement_df['date'] = replacement_df['created_at'].map(lambda stamp: parse(stamp).date())
# ... and the month it falls in, as a monthly pandas Period (e.g. 2021-09) for the aggregation below.
replacement_df['monthyear'] = pd.to_datetime(replacement_df['date']).dt.to_period('M')

In [49]:
# Check the two new columns (`date`, `monthyear`) on the first rows.
replacement_df.head()

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,created_at,sentiment_rate,prediction,date,monthyear
0,"[0, 0, -13.5625742119864, 0, 0, -3.67013075601...","[2.1432231398633523, 1.770209841682183, 8.4119...",and the bad_news you won't know if you have it...,2021-09-04T08:15:55+0000,-538.21666,0,2021-09-04,2021-09
1,"[0, -3.059424520648905, 0, 0, 0, 0, 0, 0, 0, 0...","[3.242783216632407, 3.3068756079387995, 2.5457...",if people are expressing_concerns then it is a...,2021-09-04T08:11:13+0000,-10.117136,0,2021-09-04,2021-09
2,"[0, 0, 0, 0, 0, 0, -16.107718432142182, 0, -7....","[5.9319367838433275, 3.6042986962500847, 9.047...",how_about our kids_under 12 ? they got no prot...,2021-09-04T08:00:19+0000,-972.15352,0,2021-09-04,2021-09
3,"[0, -11.324734853405168, -32.09323671862448, -...","[4.521911464252132, 6.354793634663361, 7.53157...",my past reading n understanding is that now ou...,2021-09-04T08:22:41+0000,-1345.810819,0,2021-09-04,2021-09
4,"[-5.244494913154813, 0, 0, 0, -9.3218929868911...","[5.869864355200949, 8.680193405423577, 9.45338...",protect below_12 year_olds don't sign them up ...,2021-09-04T09:57:54+0000,-213.095443,0,2021-09-04,2021-09


In [57]:
# Number of comments per (month, predicted class) pair, as a flat table with a `counts` column.
sentiment_mth = (
    replacement_df
    .groupby(['monthyear', 'prediction'])
    .size()
    .rename('counts')
    .reset_index()
)
sentiment_mth

Unnamed: 0,monthyear,prediction,counts
0,2019-01,0,10
1,2019-02,0,10
2,2019-03,0,46
3,2019-04,0,10
4,2019-05,0,14
5,2019-06,0,10
6,2019-07,0,26
7,2019-08,0,8
8,2019-09,0,27
9,2019-09,1,1


In [58]:
# Turn the monthly Period values into plain strings such as '2019-01' before they are
# used as the x axis of the charts (presumably because plotly cannot take Period objects - confirm).
sentiment_mth['monthyear'] = sentiment_mth['monthyear'].astype(str)

In [52]:
import plotly.express as px

# Monthly comment counts, one bar per predicted class, side by side.
# `prediction` is cast to string for plotting only: Plotly Express treats a numeric colour
# column as a continuous scale and draws a single trace, in which case barmode='group'
# has nothing to group. A string column yields one discrete trace (and legend entry) per class.
fig = px.bar(sentiment_mth.assign(prediction=sentiment_mth['prediction'].astype(str)),
             x='monthyear', y='counts', color='prediction', barmode='group',
             text="counts")
fig.update_layout({"title": 'Facebook Comments under MOH Posts from Jan 2019 to Sep-04 2021',
                   "xaxis": {"title":"Months"},
                   "yaxis": {"title":"Sentiment Analysis"}})
# Let plotly decide whether each count label fits inside or outside its bar.
fig.update_traces(textposition='auto')

fig.show()

In [59]:
# Mirror the class-0 counts below the x axis for the diverging bar chart in the next cell.
# A single `.loc` assignment replaces the chained `sentiment_mth['counts'][i] = ...`, which
# triggers SettingWithCopyWarning and is not guaranteed to write into the frame.
# Using -abs() keeps the cell idempotent: re-running it does not flip the sign back.
class0_rows = sentiment_mth['prediction'] == 0
sentiment_mth.loc[class0_rows, 'counts'] = -sentiment_mth.loc[class0_rows, 'counts'].abs()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [60]:
# Diverging monthly chart: bars are stacked relative to zero, so rows with negative counts
# hang below the axis and rows with positive counts rise above it.
# `prediction` is cast to string for plotting only: Plotly Express treats a numeric colour
# column as a continuous scale and draws a single trace; a string column yields one
# discrete trace (and legend entry) per class.
fig = px.bar(sentiment_mth.assign(prediction=sentiment_mth['prediction'].astype(str)),
             x='monthyear', y='counts', color='prediction', barmode='relative',
             text="counts")
fig.update_layout({"title": 'Facebook Comments under MOH Posts from Jan 2019 to Sep-04 2021',
                   "xaxis": {"title":"Months"},
                   "yaxis": {"title":"Sentiment Analysis"}})
# Let plotly decide whether each count label fits inside or outside its bar.
fig.update_traces(textposition='auto')

fig.show()