In [49]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [50]:
# Load the preprocessed, labelled tweet dataset (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer='Preprocessed_labelled_dataset.csv')

In [51]:
# Remove the "Unnamed: 0" column (presumably a row index saved by an earlier
# to_csv call -- confirm against the preprocessing notebook).
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")
df

Unnamed: 0,Tweet_text,Text_cleaned,Text_lemmatized,Text_lemmatized_str,sentiment_score,Label
0,Georgia may need to dip deep in its reserve fu...,"['georgia', 'may', 'need', 'dip', 'deep', 'res...","['georgia', 'may', 'need', 'dip', 'deep', 'res...",georgia may need dip deep reserve fund balance...,0,Neutral
1,Vatican: Migrants Are the ‘True Victims’ of th...,"['vatican', 'migrants', 'true', 'victims', 'co...","['vatican', 'migrant', 'true', 'victim', 'coro...",vatican migrant true victim coronavirus pandemic,0,Neutral
2,"Coronavirus outbreak: Trudeau praises ""positiv...","['coronavirus', 'outbreak', 'trudeau', 'praise...","['coronavirus', 'outbreak', 'trudeau', 'praise...",coronavirus outbreak trudeau praise positive c...,1,Positive
3,NOOOOOOO! No corona virus vaccine has EVER bee...,"['noo', 'corona', 'virus', 'vaccine', 'ever', ...","['noo', 'corona', 'virus', 'vaccine', 'ever', ...",noo corona virus vaccine ever safe kill need v...,-3,Negative
4,#HESA is meeting via videoconference today at ...,"['hesa', 'meeting', 'via', 'videoconference', ...","['hesa', 'meeting', 'via', 'videoconference', ...",hesa meeting via videoconference today pm et w...,-1,Negative
...,...,...,...,...,...,...
22526,My 1 year old is obsessed with @TheRealFunyuns...,"['year', 'old', 'obsessed', 'therealfunyuns', ...","['year', 'old', 'obsessed', 'therealfunyuns', ...",year old obsessed therealfunyuns fritolay get ...,2,Positive
22527,#Covid19 Ramaphosa says we must find those who...,"['covid', 'ramaphosa', 'says', 'must', 'find',...","['covid', 'ramaphosa', 'say', 'must', 'find', ...",covid ramaphosa say must find infected early p...,-2,Negative
22528,This is the article you should read today. #CO...,"['article', 'read', 'today', 'covid', 'homeles...","['article', 'read', 'today', 'covid', 'homeles...",article read today covid homelessness nyc,0,Neutral
22529,I thank @GovWike for extending the lockdown da...,"['thank', 'govwike', 'extending', 'lockdown', ...","['thank', 'govwike', 'extending', 'lockdown', ...",thank govwike extending lockdown date obioakpo...,1,Positive


In [52]:
# Label encoding of sentiment.
# LabelEncoder assigns integer codes to the sorted class names, so
# Negative -> 0, Neutral -> 1 and Positive -> 2.
le = LabelEncoder()
encoded_labels = le.fit_transform(df['Label'])
# Overwrite the string labels with their integer codes.
df['Label'] = encoded_labels
df

Unnamed: 0,Tweet_text,Text_cleaned,Text_lemmatized,Text_lemmatized_str,sentiment_score,Label
0,Georgia may need to dip deep in its reserve fu...,"['georgia', 'may', 'need', 'dip', 'deep', 'res...","['georgia', 'may', 'need', 'dip', 'deep', 'res...",georgia may need dip deep reserve fund balance...,0,1
1,Vatican: Migrants Are the ‘True Victims’ of th...,"['vatican', 'migrants', 'true', 'victims', 'co...","['vatican', 'migrant', 'true', 'victim', 'coro...",vatican migrant true victim coronavirus pandemic,0,1
2,"Coronavirus outbreak: Trudeau praises ""positiv...","['coronavirus', 'outbreak', 'trudeau', 'praise...","['coronavirus', 'outbreak', 'trudeau', 'praise...",coronavirus outbreak trudeau praise positive c...,1,2
3,NOOOOOOO! No corona virus vaccine has EVER bee...,"['noo', 'corona', 'virus', 'vaccine', 'ever', ...","['noo', 'corona', 'virus', 'vaccine', 'ever', ...",noo corona virus vaccine ever safe kill need v...,-3,0
4,#HESA is meeting via videoconference today at ...,"['hesa', 'meeting', 'via', 'videoconference', ...","['hesa', 'meeting', 'via', 'videoconference', ...",hesa meeting via videoconference today pm et w...,-1,0
...,...,...,...,...,...,...
22526,My 1 year old is obsessed with @TheRealFunyuns...,"['year', 'old', 'obsessed', 'therealfunyuns', ...","['year', 'old', 'obsessed', 'therealfunyuns', ...",year old obsessed therealfunyuns fritolay get ...,2,2
22527,#Covid19 Ramaphosa says we must find those who...,"['covid', 'ramaphosa', 'says', 'must', 'find',...","['covid', 'ramaphosa', 'say', 'must', 'find', ...",covid ramaphosa say must find infected early p...,-2,0
22528,This is the article you should read today. #CO...,"['article', 'read', 'today', 'covid', 'homeles...","['article', 'read', 'today', 'covid', 'homeles...",article read today covid homelessness nyc,0,1
22529,I thank @GovWike for extending the lockdown da...,"['thank', 'govwike', 'extending', 'lockdown', ...","['thank', 'govwike', 'extending', 'lockdown', ...",thank govwike extending lockdown date obioakpo...,1,2


In [53]:
# Train-test split: 70% train / 30% test, seeded so the split is repeatable.
# The features are the 'Text_lemmatized' column, which the displayed frame
# shows as a stringified token list.
# NOTE(review): 'Text_lemmatized_str' looks like the plain-text version of the
# same tokens -- confirm which column is intended as model input.
x_train, x_test, y_train, y_test = train_test_split(
    df['Text_lemmatized'],
    df['Label'],
    test_size=0.3,
    random_state=42,
)


In [54]:
#Model: Multinomial Naive Bayes
# Pipeline: token counts -> TF-IDF weighting -> multinomial NB classifier.
nb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb.fit(x_train, y_train)

# Predict sentiment classes for the held-out tweets.
y_pred = nb.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order; keep both calls
# consistent. classification_report is imported in the notebook's import cell.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy 0.6430473372781065
              precision    recall  f1-score   support

           0       0.61      0.89      0.72      2556
           1       0.60      0.52      0.56      2336
           2       0.86      0.45      0.59      1868

    accuracy                           0.64      6760
   macro avg       0.69      0.62      0.63      6760
weighted avg       0.67      0.64      0.63      6760



In [55]:
#Model: Linear SVM
# Pipeline: token counts -> TF-IDF weighting -> linear classifier trained by
# SGD. Hinge loss with an L2 penalty corresponds to a linear SVM.
# max_iter=10 with tol=None runs a fixed 10 epochs with no early stopping.
sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', random_state=42,
                          max_iter=10, tol=None)),
])
sgd.fit(x_train, y_train)

# Predict sentiment classes for the held-out tweets.
y_pred = sgd.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order; keep both calls
# consistent.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy 0.7770710059171597
              precision    recall  f1-score   support

           0       0.82      0.85      0.83      2556
           1       0.72      0.68      0.70      2336
           2       0.78      0.81      0.80      1868

    accuracy                           0.78      6760
   macro avg       0.77      0.78      0.78      6760
weighted avg       0.78      0.78      0.78      6760



In [19]:
#Model: Logistic Regression
# Pipeline: token counts -> TF-IDF weighting -> logistic regression.
# C is the inverse regularization strength, so C=1e5 means very weak
# regularization; max_iter is raised to 1500 to give the solver room to
# converge.
logreg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5, max_iter=1500)),
])
logreg.fit(x_train, y_train)

# Predict sentiment classes for the held-out tweets.
y_pred = logreg.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order; keep both calls
# consistent.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy 0.7789940828402366
              precision    recall  f1-score   support

           0       0.84      0.83      0.83      2556
           1       0.69      0.72      0.70      2336
           2       0.81      0.79      0.80      1868

    accuracy                           0.78      6760
   macro avg       0.78      0.78      0.78      6760
weighted avg       0.78      0.78      0.78      6760



In [56]:
#Model: Decision Tree
# Pipeline: token counts -> TF-IDF weighting -> decision tree split on entropy.
# random_state is fixed so repeated runs build the same tree; the split and
# the SGD model in this notebook are seeded with 42 as well.
# NOTE: metrics recorded from earlier unseeded runs may differ slightly.
DT = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', DecisionTreeClassifier(criterion="entropy", random_state=42)),
])

# Train the decision tree classifier.
DT.fit(x_train, y_train)

# Predict sentiment classes for the held-out tweets.
y_pred = DT.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order; keep both calls
# consistent.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy 0.6326923076923077
              precision    recall  f1-score   support

           0       0.69      0.63      0.66      2556
           1       0.57      0.64      0.60      2336
           2       0.64      0.62      0.63      1868

    accuracy                           0.63      6760
   macro avg       0.64      0.63      0.63      6760
weighted avg       0.64      0.63      0.63      6760



In [25]:
#Model: Random Forest
# Pipeline: token counts -> TF-IDF weighting -> random forest with default
# hyperparameters.
# random_state is fixed so repeated runs grow the same forest; the split and
# the SGD model in this notebook are seeded with 42 as well.
# NOTE: metrics recorded from earlier unseeded runs may differ slightly.
Rf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=42)),
])

Rf.fit(x_train, y_train)

# Predict sentiment classes for the held-out tweets.
y_pred = Rf.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order; keep both calls
# consistent.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy 0.7423076923076923
              precision    recall  f1-score   support

           0       0.79      0.78      0.78      2556
           1       0.67      0.73      0.70      2336
           2       0.79      0.70      0.75      1868

    accuracy                           0.74      6760
   macro avg       0.75      0.74      0.74      6760
weighted avg       0.75      0.74      0.74      6760

