In [1]:
import pandas as pd
import regex as re

In [2]:
import spacy

In [3]:
import nltk

In [4]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

In [5]:
from textblob import TextBlob

In [6]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [7]:
df=pd.read_csv('Task-1 tweets_1000.csv',sep='\t',header=None)

In [8]:
df.head()

Unnamed: 0,0
0,\xf0\x9f\x98\x91\xf0\x9f\x98\x91\xf0\x9f\x98\x...
1,Jasmine Strange shares a message of hope durin...
2,I gotta fight these allergies in public to mak...
3,https://t.co/57NBQ2XQsG On Easter please reme...
4,@lenibriscoe I have a cute one made from recyc...


In [9]:
df.shape

(971, 1)

# cleaning using regex function

In [10]:
l=[]
for i,idx in df.iterrows():
    x=re.sub(r"(http[s]?\://\S+)|([\[\(].*[\)\]])|([#@]\S+)|(\\x)|([0-9])|\n", "",idx[0])
    l.append(x)

In [11]:
df[0]=l

# removing misspelled / non words

In [12]:
words = set(nltk.corpus.words.words())
def remove_word(text,words):
    x=" ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())
    return x

In [13]:
df['edit']=df[0].apply(lambda x: remove_word(x,words))

In [14]:
df.head(20)

Unnamed: 0,0,edit
0,ffffffab yeall suck at social distancing.,suck at social .
1,Jasmine Strange shares a message of hope durin...,Jasmine Strange a message of hope during this ...
2,I gotta fight these allergies in public to mak...,I fight these in public to make sure people th...
3,On Easter please remember the poor and desol...,On Easter please remember the poor and desolate .
4,I have a cute one made from recycled sari sil...,I have a cute one made from sari silk my frien...
5,Told my Mom we should start to work from home ...,Told my we should start to work from home due ...
6,ffffffffffffbdffffffbdffffffbd,
7,So using your logic of not paying any at...,So your logic of not paying any attention to a...
8,We are all in deep doo doo ffdb,We are all in deep
9,"To be honest, everyone was scared of and Govt...","To be honest , everyone was of and had to some..."


# unsupervised data can be analysed using multiple methods that i implemented 

# Using TextBlob 

In [15]:
p=[]
for i,idx in df.iterrows():
    res=TextBlob(idx['edit'])
    res=res.sentiment.polarity
    if res<0:
        p.append('Negative')
    elif res>0:
        p.append('Positive')
    else:
        p.append('neutral')

In [16]:
df['TextBlob']=p

In [17]:
df.head(20)

Unnamed: 0,0,edit,TextBlob
0,ffffffab yeall suck at social distancing.,suck at social .,Positive
1,Jasmine Strange shares a message of hope durin...,Jasmine Strange a message of hope during this ...,Negative
2,I gotta fight these allergies in public to mak...,I fight these in public to make sure people th...,Positive
3,On Easter please remember the poor and desol...,On Easter please remember the poor and desolate .,Negative
4,I have a cute one made from recycled sari sil...,I have a cute one made from sari silk my frien...,Positive
5,Told my Mom we should start to work from home ...,Told my we should start to work from home due ...,Positive
6,ffffffffffffbdffffffbdffffffbd,,neutral
7,So using your logic of not paying any at...,So your logic of not paying any attention to a...,neutral
8,We are all in deep doo doo ffdb,We are all in deep,neutral
9,"To be honest, everyone was scared of and Govt...","To be honest , everyone was of and had to some...",Positive


# using VADER Sentiment Analyser

In [18]:
vader=SentimentIntensityAnalyzer()

In [19]:
print(vader.polarity_scores(df[0][0])['compound'])

-0.4404


In [20]:
v=[]
for i,idx in df.iterrows():
    x=vader.polarity_scores(idx['edit'])['compound']
    if x<0:
        v.append('Negative')
    elif x>0:
        v.append('Positive')
    else:
        v.append('neutral')

In [21]:
df['Vader']=v

In [22]:
df.head(20)

Unnamed: 0,0,edit,TextBlob,Vader
0,ffffffab yeall suck at social distancing.,suck at social .,Positive,Negative
1,Jasmine Strange shares a message of hope durin...,Jasmine Strange a message of hope during this ...,Negative,Positive
2,I gotta fight these allergies in public to mak...,I fight these in public to make sure people th...,Positive,Negative
3,On Easter please remember the poor and desol...,On Easter please remember the poor and desolate .,Negative,Negative
4,I have a cute one made from recycled sari sil...,I have a cute one made from sari silk my frien...,Positive,Positive
5,Told my Mom we should start to work from home ...,Told my we should start to work from home due ...,Positive,Positive
6,ffffffffffffbdffffffbdffffffbd,,neutral,neutral
7,So using your logic of not paying any at...,So your logic of not paying any attention to a...,neutral,Negative
8,We are all in deep doo doo ffdb,We are all in deep,neutral,neutral
9,"To be honest, everyone was scared of and Govt...","To be honest , everyone was of and had to some...",Positive,Negative


# checking for overlap of predicted sentiments of TextBlob and Vader

In [23]:
mis=[]
matched=0
mismatch=0
for i,idx in df.iterrows():
    if idx['TextBlob']==idx['Vader']:
        matched+=1
    else:
        mismatch+=1
        mis.append(idx['edit'])
        
print(matched,mismatch)

620 351


In [24]:
mis[:10]

['suck at social .',
 'Jasmine Strange a message of hope during this life of production by if you like it . me why if you',
 'I fight these in public to make sure people think I got corona',
 'So your logic of not paying any attention to and , Trump a and seven later infected ,+, and ,+ .',
 'To be honest , everyone was of and had to somehow convince the public to comply . Inflated death were used to keep us at home . Regardless , the end result is what . Unfortunately this to be , so they could blame Trump .',
 'There was a concerted effort by the media in briefing to and attack was there and is like a cancer in the room .',
 'Corona Virus : As taught me , in - well the Corona Virus : of ea',
 'Q : To deal with a can you make any good ? A : No , because every decision is a dilemma . There is a ton of outrage about bad choice , rather than a focus on reducing the chance of the next pandemic . , we lose .',
 'Damn ! & ; really Are responsible ...',
 'But since know every damn thing the 

# text preprocessing

In [25]:
import string

# removing punctuations,tokenizations and stemming


In [26]:
def remove_punc(text):
    puncfree="".join([i for i in text if i not in string.punctuation])
    return puncfree
    
df['clean']=df['edit'].apply(lambda x:remove_punc(x))

In [27]:
df.head()

Unnamed: 0,0,edit,TextBlob,Vader,clean
0,ffffffab yeall suck at social distancing.,suck at social .,Positive,Negative,suck at social
1,Jasmine Strange shares a message of hope durin...,Jasmine Strange a message of hope during this ...,Negative,Positive,Jasmine Strange a message of hope during this ...
2,I gotta fight these allergies in public to mak...,I fight these in public to make sure people th...,Positive,Negative,I fight these in public to make sure people th...
3,On Easter please remember the poor and desol...,On Easter please remember the poor and desolate .,Negative,Negative,On Easter please remember the poor and desolate
4,I have a cute one made from recycled sari sil...,I have a cute one made from sari silk my frien...,Positive,Positive,I have a cute one made from sari silk my frien...


In [28]:
df['clean']=df['clean'].apply(lambda x:x.lower())

In [29]:
df.head()

Unnamed: 0,0,edit,TextBlob,Vader,clean
0,ffffffab yeall suck at social distancing.,suck at social .,Positive,Negative,suck at social
1,Jasmine Strange shares a message of hope durin...,Jasmine Strange a message of hope during this ...,Negative,Positive,jasmine strange a message of hope during this ...
2,I gotta fight these allergies in public to mak...,I fight these in public to make sure people th...,Positive,Negative,i fight these in public to make sure people th...
3,On Easter please remember the poor and desol...,On Easter please remember the poor and desolate .,Negative,Negative,on easter please remember the poor and desolate
4,I have a cute one made from recycled sari sil...,I have a cute one made from sari silk my frien...,Positive,Positive,i have a cute one made from sari silk my frien...


In [30]:
def tokenization(text):
    tokens = re.split('W+',text)
    return tokens
df['tokens']=df['clean'].apply(lambda x: tokenization(x))

In [31]:
df.head()

Unnamed: 0,0,edit,TextBlob,Vader,clean,tokens
0,ffffffab yeall suck at social distancing.,suck at social .,Positive,Negative,suck at social,[suck at social ]
1,Jasmine Strange shares a message of hope durin...,Jasmine Strange a message of hope during this ...,Negative,Positive,jasmine strange a message of hope during this ...,[jasmine strange a message of hope during this...
2,I gotta fight these allergies in public to mak...,I fight these in public to make sure people th...,Positive,Negative,i fight these in public to make sure people th...,[i fight these in public to make sure people t...
3,On Easter please remember the poor and desol...,On Easter please remember the poor and desolate .,Negative,Negative,on easter please remember the poor and desolate,[on easter please remember the poor and desola...
4,I have a cute one made from recycled sari sil...,I have a cute one made from sari silk my frien...,Positive,Positive,i have a cute one made from sari silk my frien...,[i have a cute one made from sari silk my frie...


In [32]:
def make_list(text):
    l=list(text.strip().split())
    return l
df['tokens']=df['tokens'].apply(lambda x: make_list(x[0]))

In [33]:
df.head()

Unnamed: 0,0,edit,TextBlob,Vader,clean,tokens
0,ffffffab yeall suck at social distancing.,suck at social .,Positive,Negative,suck at social,"[suck, at, social]"
1,Jasmine Strange shares a message of hope durin...,Jasmine Strange a message of hope during this ...,Negative,Positive,jasmine strange a message of hope during this ...,"[jasmine, strange, a, message, of, hope, durin..."
2,I gotta fight these allergies in public to mak...,I fight these in public to make sure people th...,Positive,Negative,i fight these in public to make sure people th...,"[i, fight, these, in, public, to, make, sure, ..."
3,On Easter please remember the poor and desol...,On Easter please remember the poor and desolate .,Negative,Negative,on easter please remember the poor and desolate,"[on, easter, please, remember, the, poor, and,..."
4,I have a cute one made from recycled sari sil...,I have a cute one made from sari silk my frien...,Positive,Positive,i have a cute one made from sari silk my frien...,"[i, have, a, cute, one, made, from, sari, silk..."


In [34]:
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data
df['stem']= df['tokens'].apply(lambda x: stemming_on_text(x))

In [35]:
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data
df['lem'] = df['stem'].apply(lambda x: lemmatizer_on_text(x))

# another approach for unsupervised sentiment analysis 

# Non negative matrix factorization

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = tfidf.fit_transform(df['clean'])

In [37]:
from sklearn.decomposition import NMF
nmf_model = NMF(n_components=3,random_state=42)
nmf_model.fit(dtm)

In [38]:
for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR  #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-30:]])
    print('\n')

THE TOP 15 WORDS FOR  #0
['gone', 'needs', 'looking', 'touch', 'yes', 'yeah', 'china', 'today', 'mad', 'house', 'life', 'fight', 'ea', 'tell', 'came', 'clearly', 'reason', 'thing', 'said', 'mouth', 'entire', 'time', 'stop', 'just', 'catch', 'beer', 'think', 'got', 'virus', 'corona']


THE TOP 15 WORDS FOR  #1
['times', 'business', 'testing', 'thought', 'come', 'die', 'hope', 'test', 'county', 'new', 'bitch', 'know', 'positive', 'wearing', 'tho', 'available', 'miss', 'tested', 'death', 'march', 'la', 'closed', 'true', 'getting', 'el', 'did', 'just', 'days', 'real', 'covid']


THE TOP 15 WORDS FOR  #2
['comes', 'thank', 'little', 'weekend', 'big', 'trying', 'going', 'feel', 'ass', 'safe', 'know', 'good', 'want', 'new', 'thing', 'hope', 'work', 'stop', 'china', 'need', 'think', 'right', 'time', 'day', 'stay', 'home', 'trump', 'just', 'people', 'like']






In [39]:
sentiment = nmf_model.transform(dtm)
df['targets'] = sentiment.argmax(axis=1)

In [40]:
dic={0:'neutral',1:'Negative',2:'Positive'}
df['nmf']=df['targets'].map(dic)

In [41]:
df.head()

Unnamed: 0,0,edit,TextBlob,Vader,clean,tokens,stem,lem,targets,nmf
0,ffffffab yeall suck at social distancing.,suck at social .,Positive,Negative,suck at social,"[suck, at, social]","[suck, at, social]","[suck, at, social]",1,Negative
1,Jasmine Strange shares a message of hope durin...,Jasmine Strange a message of hope during this ...,Negative,Positive,jasmine strange a message of hope during this ...,"[jasmine, strange, a, message, of, hope, durin...","[jasmine, strange, a, message, of, hope, durin...","[jasmine, strange, a, message, of, hope, durin...",2,Positive
2,I gotta fight these allergies in public to mak...,I fight these in public to make sure people th...,Positive,Negative,i fight these in public to make sure people th...,"[i, fight, these, in, public, to, make, sure, ...","[i, fight, these, in, public, to, make, sure, ...","[i, fight, these, in, public, to, make, sure, ...",0,neutral
3,On Easter please remember the poor and desol...,On Easter please remember the poor and desolate .,Negative,Negative,on easter please remember the poor and desolate,"[on, easter, please, remember, the, poor, and,...","[on, easter, please, remember, the, poor, and,...","[on, easter, please, remember, the, poor, and,...",2,Positive
4,I have a cute one made from recycled sari sil...,I have a cute one made from sari silk my frien...,Positive,Positive,i have a cute one made from sari silk my frien...,"[i, have, a, cute, one, made, from, sari, silk...","[i, have, a, cute, one, made, from, sari, silk...","[i, have, a, cute, one, made, from, sari, silk...",0,neutral


# overlap of all three algorithms with similar prediction

In [64]:
match=0
misma=0
g=[]
ind=[]
for i,idx in df.iterrows():
    if idx['TextBlob']==idx['Vader']==idx['nmf']:
        match+=1
        g.append(idx[0])
        ind.append(i)
    else:
        misma+=1
print(match,misma)

328 643


In [43]:
df.head()

Unnamed: 0,0,edit,TextBlob,Vader,clean,tokens,stem,lem,targets,nmf
0,ffffffab yeall suck at social distancing.,suck at social .,Positive,Negative,suck at social,"[suck, at, social]","[suck, at, social]","[suck, at, social]",1,Negative
1,Jasmine Strange shares a message of hope durin...,Jasmine Strange a message of hope during this ...,Negative,Positive,jasmine strange a message of hope during this ...,"[jasmine, strange, a, message, of, hope, durin...","[jasmine, strange, a, message, of, hope, durin...","[jasmine, strange, a, message, of, hope, durin...",2,Positive
2,I gotta fight these allergies in public to mak...,I fight these in public to make sure people th...,Positive,Negative,i fight these in public to make sure people th...,"[i, fight, these, in, public, to, make, sure, ...","[i, fight, these, in, public, to, make, sure, ...","[i, fight, these, in, public, to, make, sure, ...",0,neutral
3,On Easter please remember the poor and desol...,On Easter please remember the poor and desolate .,Negative,Negative,on easter please remember the poor and desolate,"[on, easter, please, remember, the, poor, and,...","[on, easter, please, remember, the, poor, and,...","[on, easter, please, remember, the, poor, and,...",2,Positive
4,I have a cute one made from recycled sari sil...,I have a cute one made from sari silk my frien...,Positive,Positive,i have a cute one made from sari silk my frien...,"[i, have, a, cute, one, made, from, sari, silk...","[i, have, a, cute, one, made, from, sari, silk...","[i, have, a, cute, one, made, from, sari, silk...",0,neutral


In [44]:

g[:10]

['Told my Mom we should start to work from home due to Corona. No one else works in our office but is but ya know...working from home sounds nice.',
 'ffffffffffffbdffffffbdffffffbd',
 'We are all in deep doo doo ffdb',
 'QUE ALGUIEN EXPLIQUE ffaffceded',
 'NEW RULE.   If you havenet attempted to make an appointment for a  swab test YOU DO NOT GET TO COMPLAIN ABOUT TESTING.   If youeve made an attempt and been declined, you can complain all you want.',
 ' ',
 'ecefbfPLEA FROM PASSENGERS: PLEASE do the right thing ffffffbb  and EXTEND TRAVEL FUND expiration dates for the MILLIONS OF DOLLARS in travelerse unused funds that are expiring in the next few weeks and summer months ffbffbffb  ',
 '',
 ' Is breaking my heart on  - I can feel her pain, exhaustion &amp; yet she stays resilient fighting    Sending light and love to her, her husband, daughter and her entire family and state of   edaefbfedaefbfedaefbfedaefbfedaefbfedaefbfedaefbfedaefbfedaefbfedaefbfedaefbfedaefbfedaefbf ',
 'The Coro

# The overlapped predictions i took it as train data with target variable, remaining as test data to implement supervised machine learning alogirhtms 

In [45]:
df_train=df[df.index.isin(ind)]

In [46]:
df_train.head()

Unnamed: 0,0,edit,TextBlob,Vader,clean,tokens,stem,lem,targets,nmf
5,Told my Mom we should start to work from home ...,Told my we should start to work from home due ...,Positive,Positive,told my we should start to work from home due ...,"[told, my, we, should, start, to, work, from, ...","[told, my, we, should, start, to, work, from, ...","[told, my, we, should, start, to, work, from, ...",2,Positive
6,ffffffffffffbdffffffbdffffffbd,,neutral,neutral,,[],[],[],0,neutral
8,We are all in deep doo doo ffdb,We are all in deep,neutral,neutral,we are all in deep,"[we, are, all, in, deep]","[we, are, all, in, deep]","[we, are, all, in, deep]",0,neutral
11,QUE ALGUIEN EXPLIQUE ffaffceded,,neutral,neutral,,[],[],[],0,neutral
14,NEW RULE. If you havenet attempted to make a...,NEW RULE . If you havenet to make an appointme...,Positive,Positive,new rule if you havenet to make an appointmen...,"[new, rule, if, you, havenet, to, make, an, ap...","[new, rule, if, you, havenet, to, make, an, ap...","[new, rule, if, you, havenet, to, make, an, ap...",2,Positive


In [47]:
df_test=df[~df.index.isin(ind)]

In [48]:
df_test.head()

Unnamed: 0,0,edit,TextBlob,Vader,clean,tokens,stem,lem,targets,nmf
0,ffffffab yeall suck at social distancing.,suck at social .,Positive,Negative,suck at social,"[suck, at, social]","[suck, at, social]","[suck, at, social]",1,Negative
1,Jasmine Strange shares a message of hope durin...,Jasmine Strange a message of hope during this ...,Negative,Positive,jasmine strange a message of hope during this ...,"[jasmine, strange, a, message, of, hope, durin...","[jasmine, strange, a, message, of, hope, durin...","[jasmine, strange, a, message, of, hope, durin...",2,Positive
2,I gotta fight these allergies in public to mak...,I fight these in public to make sure people th...,Positive,Negative,i fight these in public to make sure people th...,"[i, fight, these, in, public, to, make, sure, ...","[i, fight, these, in, public, to, make, sure, ...","[i, fight, these, in, public, to, make, sure, ...",0,neutral
3,On Easter please remember the poor and desol...,On Easter please remember the poor and desolate .,Negative,Negative,on easter please remember the poor and desolate,"[on, easter, please, remember, the, poor, and,...","[on, easter, please, remember, the, poor, and,...","[on, easter, please, remember, the, poor, and,...",2,Positive
4,I have a cute one made from recycled sari sil...,I have a cute one made from sari silk my frien...,Positive,Positive,i have a cute one made from sari silk my frien...,"[i, have, a, cute, one, made, from, sari, silk...","[i, have, a, cute, one, made, from, sari, silk...","[i, have, a, cute, one, made, from, sari, silk...",0,neutral


In [60]:
df_test['nmf'].value_counts()

Positive    360
Negative    164
neutral     119
Name: nmf, dtype: int64

In [62]:
df_test['TextBlob'].value_counts()

neutral     287
Positive    211
Negative    145
Name: TextBlob, dtype: int64

In [63]:
df_test['Vader'].value_counts()

Negative    249
Positive    200
neutral     194
Name: Vader, dtype: int64

In [49]:
df.head()

Unnamed: 0,0,edit,TextBlob,Vader,clean,tokens,stem,lem,targets,nmf
0,ffffffab yeall suck at social distancing.,suck at social .,Positive,Negative,suck at social,"[suck, at, social]","[suck, at, social]","[suck, at, social]",1,Negative
1,Jasmine Strange shares a message of hope durin...,Jasmine Strange a message of hope during this ...,Negative,Positive,jasmine strange a message of hope during this ...,"[jasmine, strange, a, message, of, hope, durin...","[jasmine, strange, a, message, of, hope, durin...","[jasmine, strange, a, message, of, hope, durin...",2,Positive
2,I gotta fight these allergies in public to mak...,I fight these in public to make sure people th...,Positive,Negative,i fight these in public to make sure people th...,"[i, fight, these, in, public, to, make, sure, ...","[i, fight, these, in, public, to, make, sure, ...","[i, fight, these, in, public, to, make, sure, ...",0,neutral
3,On Easter please remember the poor and desol...,On Easter please remember the poor and desolate .,Negative,Negative,on easter please remember the poor and desolate,"[on, easter, please, remember, the, poor, and,...","[on, easter, please, remember, the, poor, and,...","[on, easter, please, remember, the, poor, and,...",2,Positive
4,I have a cute one made from recycled sari sil...,I have a cute one made from sari silk my frien...,Positive,Positive,i have a cute one made from sari silk my frien...,"[i, have, a, cute, one, made, from, sari, silk...","[i, have, a, cute, one, made, from, sari, silk...","[i, have, a, cute, one, made, from, sari, silk...",0,neutral


# training models by using the train data 

In [51]:
X_train=df_train['lem'].apply(lambda x: ' '.join(x))
y_train=df_train['nmf']

In [52]:
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))

No. of feature_words:  3509




In [53]:
X_train = vectoriser.transform(X_train)

In [54]:
X_test=df_test['lem'].apply(lambda x: ' '.join(x))

In [55]:
X_test  = vectoriser.transform(X_test)

# implementing SVC and Logistic regression models

In [56]:
SVCmodel = LinearSVC()
SVCmodel.fit(X_train, y_train)
#model_Evaluate(SVCmodel)
y_pred2 = SVCmodel.predict(X_test)

In [57]:
pd.Series(y_pred2).value_counts()

Positive    436
neutral     191
Negative     16
dtype: int64

In [58]:
LRmodel = LogisticRegression(C = 2, max_iter = 1000, n_jobs=-1)
LRmodel.fit(X_train, y_train)
#model_Evaluate(LRmodel)
y_pred3 = LRmodel.predict(X_test)

In [59]:
pd.Series(y_pred3).value_counts()

Positive    439
neutral     202
Negative      2
dtype: int64