# Sentiment Analysis with Naive Bayes

* Jupyter notebook for sentiment analysis of tweets with several variants of the Naive Bayes classifier
* multinomial, complement, gaussian, bernoulli

## Import required libraries and packages

In [1]:
from google.colab import drive
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pickle

## Naive Bayes classifiers on labeled dataset

* train dataset: 80% of manually labeled data
* test dataset: 20% of manually labeled data

### Read dataset

In [2]:
# Mount Google Drive at /content/drive (Colab only) so the CSV files read
# below from /content/drive/MyDrive/... are reachable.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Training split: location of the labeled CSV on the mounted Drive.
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/nlp project/Sentiment Analysis/final datasets/biden_tweets_labeled_train.csv'

# Load the file into a DataFrame and preview it.
train_df = pd.read_csv(train_csv_path)
train_df

Unnamed: 0.1,Unnamed: 0,raw_text,clean_text,subjectivity,polarity
0,1320,@Sundae_Gurl Biden's first move should be to p...,sundae gurl biden first move pardon hunter let...,0,2
1,782,@cobiscool @LilNasX Do you think joe Biden dri...,cobiscool lilnasx think joe biden drink caprisun,0,2
2,1128,"@ProudGayPatriot Don't worry, in the sense tha...",proudgaypatriot worry sense not getting electi...,1,1
3,199,@kclasgens @realDonaldTrump Not alone he didn'...,kclasgens realdonaldtrump not alone right even...,1,1
4,704,How Biden’s Climate Plans Will Shake Up Global...,biden climate plan shake global energy market,0,2
...,...,...,...,...,...
1407,1130,"@CNBC @CNBCPro If Biden comes to power, the go...",cnbc cnbcpro biden come power gold price return,1,1
1408,1294,"@KDFildesMBA We have a real President, Biden a...",kdfildesmba real president biden vice harris e...,1,1
1409,860,"What I care about at least as much, maybe more...",care least much maybe whether biden trump pres...,0,2
1410,1459,Joe Biden still doesn’t realize what the voter...,joe biden still realize voter democrat lost ac...,1,0


In [4]:
# List the column labels of the training frame as loaded from the CSV.
train_df.columns

Index(['Unnamed: 0', 'raw_text', 'clean_text', 'subjectivity', 'polarity'], dtype='object')

In [5]:
# Remove the 'Unnamed: 0' column (presumably a row index saved with the CSV;
# it is not used as a feature anywhere below).
# errors='ignore' keeps this cell idempotent: because the result is assigned
# back to train_df, re-running the cell used to raise KeyError once the
# column was already gone.
train_df = train_df.drop(columns='Unnamed: 0', errors='ignore')

train_df

Unnamed: 0,raw_text,clean_text,subjectivity,polarity
0,@Sundae_Gurl Biden's first move should be to p...,sundae gurl biden first move pardon hunter let...,0,2
1,@cobiscool @LilNasX Do you think joe Biden dri...,cobiscool lilnasx think joe biden drink caprisun,0,2
2,"@ProudGayPatriot Don't worry, in the sense tha...",proudgaypatriot worry sense not getting electi...,1,1
3,@kclasgens @realDonaldTrump Not alone he didn'...,kclasgens realdonaldtrump not alone right even...,1,1
4,How Biden’s Climate Plans Will Shake Up Global...,biden climate plan shake global energy market,0,2
...,...,...,...,...
1407,"@CNBC @CNBCPro If Biden comes to power, the go...",cnbc cnbcpro biden come power gold price return,1,1
1408,"@KDFildesMBA We have a real President, Biden a...",kdfildesmba real president biden vice harris e...,1,1
1409,"What I care about at least as much, maybe more...",care least much maybe whether biden trump pres...,0,2
1410,Joe Biden still doesn’t realize what the voter...,joe biden still realize voter democrat lost ac...,1,0


In [6]:
# Test split: location of the labeled CSV on the mounted Drive.
test_csv_path = '/content/drive/MyDrive/Colab Notebooks/nlp project/Sentiment Analysis/final datasets/biden_tweets_labeled_test.csv'

# Load the file into a DataFrame and preview it.
test_df = pd.read_csv(test_csv_path)
test_df

Unnamed: 0.1,Unnamed: 0,raw_text,clean_text,subjectivity,polarity
0,0,Sidney Powell amends court filing that said Ge...,sidney powell amends court filing said georgia...,1,1
1,1,@LisaMirandoCNN @wolfblitzer What's your thoug...,lisamirandocnn wolfblitzer thought cnn jake ta...,0,2
2,2,Breaking #FoxNews Alert : Joe Biden just revea...,breaking foxnews alert joe biden revealed goin...,0,2
3,3,"@RWPUSA Richard, have you heard there could be...",rwpusa richard heard could several republican ...,1,0
4,4,@LeafCavalier @AFJustinKG1 Is that a real-life...,leafcavalier afjustinkg real life question not...,1,0
...,...,...,...,...,...
349,349,@realDonaldTrump Thank you Biden!!!,realdonaldtrump thank biden,1,1
350,350,Kellyanne Conway acknowledges Biden as apparen...,kellyanne conway acknowledges biden apparent w...,0,2
351,351,@JeffTutorials @realDonaldTrump Ah Jeff. I ag...,jefftutorials realdonaldtrump ah jeff agree bi...,1,0
352,352,CNN Exclusive: Biden says he will ask American...,cnn exclusive biden say ask american wear mask...,0,2


In [7]:
# List the column labels of the test frame as loaded from the CSV.
test_df.columns

Index(['Unnamed: 0', 'raw_text', 'clean_text', 'subjectivity', 'polarity'], dtype='object')

In [8]:
# Remove the 'Unnamed: 0' column (presumably a row index saved with the CSV;
# it is not used as a feature anywhere below).
# errors='ignore' keeps this cell idempotent: because the result is assigned
# back to test_df, re-running the cell used to raise KeyError once the
# column was already gone.
test_df = test_df.drop(columns='Unnamed: 0', errors='ignore')

test_df

Unnamed: 0,raw_text,clean_text,subjectivity,polarity
0,Sidney Powell amends court filing that said Ge...,sidney powell amends court filing said georgia...,1,1
1,@LisaMirandoCNN @wolfblitzer What's your thoug...,lisamirandocnn wolfblitzer thought cnn jake ta...,0,2
2,Breaking #FoxNews Alert : Joe Biden just revea...,breaking foxnews alert joe biden revealed goin...,0,2
3,"@RWPUSA Richard, have you heard there could be...",rwpusa richard heard could several republican ...,1,0
4,@LeafCavalier @AFJustinKG1 Is that a real-life...,leafcavalier afjustinkg real life question not...,1,0
...,...,...,...,...
349,@realDonaldTrump Thank you Biden!!!,realdonaldtrump thank biden,1,1
350,Kellyanne Conway acknowledges Biden as apparen...,kellyanne conway acknowledges biden apparent w...,0,2
351,@JeffTutorials @realDonaldTrump Ah Jeff. I ag...,jefftutorials realdonaldtrump ah jeff agree bi...,1,0
352,CNN Exclusive: Biden says he will ask American...,cnn exclusive biden say ask american wear mask...,0,2


In [9]:
# Show (rows, columns) for the train and test frames side by side.
train_df.shape, test_df.shape

((1412, 4), (354, 4))

### Vectorize data

In [10]:
# TF-IDF feature extractor configuration:
#   min_df=1          keep terms that occur in at least one document
#   max_df=0.8        ignore terms that occur in more than 80% of documents
#   sublinear_tf=True use 1 + log(tf) instead of the raw term frequency
#   use_idf=True      apply inverse-document-frequency reweighting
tfidf_options = {
    'min_df': 1,
    'max_df': 0.8,
    'sublinear_tf': True,
    'use_idf': True,
}
vectorizer = TfidfVectorizer(**tfidf_options)

In [11]:
# The vocabulary and IDF weights are learned from the training text only;
# the test text is then projected with that same fitted vectorizer.
train_texts = train_df['clean_text']
test_texts = test_df['clean_text']

train_vectors = vectorizer.fit_transform(train_texts)
test_vectors = vectorizer.transform(test_texts)

### Multinomial Naive Bayes model

In [12]:
# Multinomial naive Bayes with default hyper-parameters
# (scikit-learn estimators have no separate compile step).
MNB = MultinomialNB()

# Timing uses time.perf_counter(): a monotonic, high-resolution clock meant
# for measuring short intervals. time.time() is wall-clock time, which can
# be adjusted while the cell runs and may be too coarse for millisecond fits.

# fit model on the TF-IDF training vectors and polarity labels
mnb_t0 = time.perf_counter()
MNB.fit(train_vectors, train_df['polarity'])
mnb_t1 = time.perf_counter()

# predict polarity for the test vectors
mnb_prediction_linear = MNB.predict(test_vectors)
mnb_t2 = time.perf_counter()

# elapsed seconds for training and for prediction
mnb_time_linear_train = mnb_t1 - mnb_t0
mnb_time_linear_predict = mnb_t2 - mnb_t1

In [13]:
# Show the polarity labels MNB predicted for the test vectors.
# NOTE(review): the `_linear` suffix does not correspond to anything in this
# notebook (no linear model is used) — looks like a naming leftover; confirm.
mnb_prediction_linear

array([0, 0, 2, 0, 0, 0, 1, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 1,
       2, 0, 1, 1, 0, 2, 2, 0, 1, 0, 0, 0, 0, 1, 2, 0, 1, 0, 2, 2, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 2, 2, 0, 0, 0, 1,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 1, 2, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 1, 2, 0, 0, 0, 2, 1, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 2, 0, 1,
       2, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 0, 1, 0, 0, 0, 2, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 0, 0,
       1, 2, 0, 2, 1, 0, 0, 0, 2, 0, 0, 1, 1, 2, 1,

#### Evaluate model

In [14]:
# Report fit and predict durations (seconds) for the multinomial model.
mnb_timings = (mnb_time_linear_train, mnb_time_linear_predict)
print("Training time: %fs; Prediction time: %fs" % mnb_timings)

Training time: 0.003932s; Prediction time: 0.000671s


In [15]:
# Share of test tweets whose predicted polarity equals the labeled polarity.
mnb_acc_score = accuracy_score(
    y_true=test_df['polarity'],
    y_pred=mnb_prediction_linear,
)
print("mnb accuracy: ", str(mnb_acc_score))

mnb accuracy:  0.7288135593220338


In [16]:
# Per-class precision / recall / f1 / support as a nested dict.
mnb_report_dict = classification_report(
    y_true=test_df['polarity'],
    y_pred=mnb_prediction_linear,
    output_dict=True,
)

# The dict is keyed by the stringified polarity label: 0, 1, 2.
for heading, class_key in (('negative: ', '0'), ('positive: ', '1'), ('neutral: ', '2')):
    print(heading, mnb_report_dict[class_key])

negative:  {'precision': 0.6666666666666666, 'recall': 0.9580838323353293, 'f1-score': 0.7862407862407863, 'support': 167}
positive:  {'precision': 0.7931034482758621, 'recall': 0.4791666666666667, 'f1-score': 0.5974025974025974, 'support': 96}
neutral:  {'precision': 0.9285714285714286, 'recall': 0.5714285714285714, 'f1-score': 0.7074829931972789, 'support': 91}


In [17]:
# Same metrics rendered as a plain-text table.
mnb_report = classification_report(
    y_true=test_df['polarity'],
    y_pred=mnb_prediction_linear,
)
print(mnb_report)

              precision    recall  f1-score   support

           0       0.67      0.96      0.79       167
           1       0.79      0.48      0.60        96
           2       0.93      0.57      0.71        91

    accuracy                           0.73       354
   macro avg       0.80      0.67      0.70       354
weighted avg       0.77      0.73      0.71       354



### Complement Naive Bayes model

In [18]:
# Complement naive Bayes with default hyper-parameters
# (scikit-learn estimators have no separate compile step).
CNB = ComplementNB()

# Timing uses time.perf_counter(): a monotonic, high-resolution clock meant
# for measuring short intervals. time.time() is wall-clock time, which can
# be adjusted while the cell runs and may be too coarse for millisecond fits.

# fit model on the TF-IDF training vectors and polarity labels
cnb_t0 = time.perf_counter()
CNB.fit(train_vectors, train_df['polarity'])
cnb_t1 = time.perf_counter()

# predict polarity for the test vectors
cnb_prediction_linear = CNB.predict(test_vectors)
cnb_t2 = time.perf_counter()

# elapsed seconds for training and for prediction
cnb_time_linear_train = cnb_t1 - cnb_t0
cnb_time_linear_predict = cnb_t2 - cnb_t1

In [19]:
# Show the polarity labels CNB predicted for the test vectors.
# NOTE(review): the `_linear` suffix does not correspond to anything in this
# notebook (no linear model is used) — looks like a naming leftover; confirm.
cnb_prediction_linear

array([0, 2, 2, 0, 0, 2, 1, 0, 2, 2, 0, 2, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 2, 2, 0, 2, 0, 0, 1, 0, 1, 1,
       2, 1, 2, 0, 0, 1, 2, 2, 1, 1, 0, 0, 2, 2, 1, 0, 0, 0, 2, 1, 0, 1,
       2, 2, 1, 1, 0, 2, 2, 0, 1, 0, 0, 0, 1, 1, 2, 1, 1, 0, 2, 2, 1, 1,
       0, 0, 0, 1, 0, 2, 0, 1, 0, 0, 1, 2, 1, 0, 1, 1, 2, 2, 0, 0, 2, 1,
       2, 2, 1, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 1, 2, 2, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 1, 2, 0, 1, 0, 2, 1, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 2, 0, 2, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 1, 1, 2, 1, 1,
       2, 1, 0, 1, 1, 1, 1, 1, 0, 2, 0, 1, 1, 0, 1, 1, 2, 0, 0, 0, 1, 0,
       1, 2, 1, 0, 1, 0, 0, 0, 2, 2, 0, 1, 1, 2, 0, 0, 0, 0, 0, 2, 2, 1,
       0, 2, 1, 0, 1, 0, 2, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 2,
       1, 1, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 2, 0, 2, 0, 0,
       2, 2, 1, 2, 1, 1, 1, 1, 2, 0, 0, 1, 1, 2, 1,

#### Evaluate model

In [20]:
# Report fit and predict durations (seconds) for the complement model.
cnb_timings = (cnb_time_linear_train, cnb_time_linear_predict)
print("Training time: %fs; Prediction time: %fs" % cnb_timings)

Training time: 0.005341s; Prediction time: 0.000580s


In [21]:
# Share of test tweets whose predicted polarity equals the labeled polarity.
cnb_acc_score = accuracy_score(
    y_true=test_df['polarity'],
    y_pred=cnb_prediction_linear,
)
print("accuracy: ", str(cnb_acc_score))

accuracy:  0.8305084745762712


In [22]:
# Per-class precision / recall / f1 / support as a nested dict.
cnb_report_dict = classification_report(
    y_true=test_df['polarity'],
    y_pred=cnb_prediction_linear,
    output_dict=True,
)

# The dict is keyed by the stringified polarity label: 0, 1, 2.
for heading, class_key in (('negative: ', '0'), ('positive: ', '1'), ('neutral: ', '2')):
    print(heading, cnb_report_dict[class_key])

negative:  {'precision': 0.8411764705882353, 'recall': 0.8562874251497006, 'f1-score': 0.8486646884272998, 'support': 167}
positive:  {'precision': 0.7938144329896907, 'recall': 0.8020833333333334, 'f1-score': 0.7979274611398963, 'support': 96}
neutral:  {'precision': 0.8505747126436781, 'recall': 0.8131868131868132, 'f1-score': 0.8314606741573034, 'support': 91}


In [23]:
# Same metrics rendered as a plain-text table.
cnb_report = classification_report(
    y_true=test_df['polarity'],
    y_pred=cnb_prediction_linear,
)
print(cnb_report)

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       167
           1       0.79      0.80      0.80        96
           2       0.85      0.81      0.83        91

    accuracy                           0.83       354
   macro avg       0.83      0.82      0.83       354
weighted avg       0.83      0.83      0.83       354



### Gaussian Naive Bayes model

In [24]:
# Gaussian naive Bayes with default hyper-parameters
# (scikit-learn estimators have no separate compile step).
GNB = GaussianNB()

# GaussianNB needs dense input, so the sparse TF-IDF matrices are converted
# with .toarray(), which yields a plain ndarray. The previous .todense()
# yields np.matrix, which current scikit-learn input validation rejects.
# As before, the conversion happens inside the timed region, so the reported
# times include densification.
#
# Timing uses time.perf_counter(): a monotonic, high-resolution clock meant
# for measuring short intervals, unlike wall-clock time.time().

# fit model on the dense TF-IDF training vectors and polarity labels
gnb_t0 = time.perf_counter()
GNB.fit(train_vectors.toarray(), train_df['polarity'])
gnb_t1 = time.perf_counter()

# predict polarity for the dense test vectors
gnb_prediction_linear = GNB.predict(test_vectors.toarray())
gnb_t2 = time.perf_counter()

# elapsed seconds for training and for prediction
gnb_time_linear_train = gnb_t1 - gnb_t0
gnb_time_linear_predict = gnb_t2 - gnb_t1



In [25]:
# Show the polarity labels GNB predicted for the test vectors.
# NOTE(review): the `_linear` suffix does not correspond to anything in this
# notebook (no linear model is used) — looks like a naming leftover; confirm.
gnb_prediction_linear

array([0, 2, 2, 0, 0, 2, 2, 0, 2, 2, 0, 2, 1, 2, 0, 0, 0, 2, 1, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 1, 2, 0, 0, 1, 0, 0, 2,
       1, 0, 2, 0, 0, 1, 2, 1, 2, 2, 0, 1, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2,
       2, 2, 2, 1, 0, 2, 2, 0, 1, 1, 1, 0, 1, 1, 2, 2, 0, 0, 2, 2, 1, 2,
       2, 0, 2, 1, 0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 0, 1, 2, 2, 2, 0, 2, 1,
       0, 2, 2, 2, 0, 1, 0, 1, 2, 2, 2, 0, 0, 1, 0, 2, 2, 0, 0, 0, 0, 2,
       1, 0, 1, 0, 1, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 1, 0, 1, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 2, 2, 2, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0, 2, 0,
       2, 2, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 2, 1, 2, 1, 1, 2, 2, 1,
       2, 1, 0, 0, 1, 1, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 0, 0, 0, 1, 0,
       2, 0, 1, 0, 1, 0, 1, 0, 2, 2, 1, 1, 1, 2, 0, 2, 2, 0, 0, 2, 2, 1,
       1, 0, 1, 2, 0, 2, 2, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 2, 2, 0, 0, 1, 2, 0, 1, 2, 0, 2, 0, 1,
       0, 2, 0, 2, 1, 1, 1, 2, 2, 0, 0, 1, 1, 2, 0,

#### Evaluate model

In [26]:
# Report fit and predict durations (seconds) for the Gaussian model.
gnb_timings = (gnb_time_linear_train, gnb_time_linear_predict)
print("Training time: %fs; Prediction time: %fs" % gnb_timings)

Training time: 0.179024s; Prediction time: 0.045822s


In [27]:
# Share of test tweets whose predicted polarity equals the labeled polarity.
gnb_acc_score = accuracy_score(
    y_true=test_df['polarity'],
    y_pred=gnb_prediction_linear,
)
print("accuracy: ", str(gnb_acc_score))

accuracy:  0.6271186440677966


In [28]:
# Per-class precision / recall / f1 / support as a nested dict.
gnb_report_dict = classification_report(
    y_true=test_df['polarity'],
    y_pred=gnb_prediction_linear,
    output_dict=True,
)

# The dict is keyed by the stringified polarity label: 0, 1, 2.
for heading, class_key in (('negative: ', '0'), ('positive: ', '1'), ('neutral: ', '2')):
    print(heading, gnb_report_dict[class_key])

negative:  {'precision': 0.7189542483660131, 'recall': 0.6586826347305389, 'f1-score': 0.6875, 'support': 167}
positive:  {'precision': 0.5238095238095238, 'recall': 0.4583333333333333, 'f1-score': 0.4888888888888889, 'support': 96}
neutral:  {'precision': 0.5811965811965812, 'recall': 0.7472527472527473, 'f1-score': 0.6538461538461539, 'support': 91}


In [29]:
# Same metrics rendered as a plain-text table.
gnb_report = classification_report(
    y_true=test_df['polarity'],
    y_pred=gnb_prediction_linear,
)
print(gnb_report)

              precision    recall  f1-score   support

           0       0.72      0.66      0.69       167
           1       0.52      0.46      0.49        96
           2       0.58      0.75      0.65        91

    accuracy                           0.63       354
   macro avg       0.61      0.62      0.61       354
weighted avg       0.63      0.63      0.62       354



### Bernoulli Naive Bayes model

In [30]:
# Bernoulli naive Bayes with default hyper-parameters
# (scikit-learn estimators have no separate compile step).
BNB = BernoulliNB()

# Timing uses time.perf_counter(): a monotonic, high-resolution clock meant
# for measuring short intervals. time.time() is wall-clock time, which can
# be adjusted while the cell runs and may be too coarse for millisecond fits.

# fit model on the TF-IDF training vectors and polarity labels
bnb_t0 = time.perf_counter()
BNB.fit(train_vectors, train_df['polarity'])
bnb_t1 = time.perf_counter()

# predict polarity for the test vectors
bnb_prediction_linear = BNB.predict(test_vectors)
bnb_t2 = time.perf_counter()

# elapsed seconds for training and for prediction
bnb_time_linear_train = bnb_t1 - bnb_t0
bnb_time_linear_predict = bnb_t2 - bnb_t1

In [31]:
# Show the polarity labels BNB predicted for the test vectors.
# NOTE(review): the `_linear` suffix does not correspond to anything in this
# notebook (no linear model is used) — looks like a naming leftover; confirm.
bnb_prediction_linear

array([0, 2, 0, 0, 0, 0, 1, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 2, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 2, 0, 0, 0,
       2, 2, 1, 1, 0, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 2, 2, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 2, 0, 0, 0, 0, 1,
       0, 0, 1, 2, 1, 0, 0, 0, 0, 0, 2, 0, 0, 2, 1, 2, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 1, 0, 0, 0, 0, 2, 1, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 2, 0, 1,
       2, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 2, 1,
       0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 0, 0,
       1, 2, 1, 2, 1, 0, 0, 0, 2, 0, 0, 1, 1, 2, 1,

#### Evaluate model

In [32]:
# Report fit and predict durations (seconds) for the Bernoulli model.
bnb_timings = (bnb_time_linear_train, bnb_time_linear_predict)
print("Training time: %fs; Prediction time: %fs" % bnb_timings)

Training time: 0.003946s; Prediction time: 0.002102s


In [33]:
# Share of test tweets whose predicted polarity equals the labeled polarity.
bnb_acc_score = accuracy_score(
    y_true=test_df['polarity'],
    y_pred=bnb_prediction_linear,
)
print("accuracy: ", str(bnb_acc_score))

accuracy:  0.7090395480225988


In [34]:
# Per-class precision / recall / f1 / support as a nested dict.
bnb_report_dict = classification_report(
    y_true=test_df['polarity'],
    y_pred=bnb_prediction_linear,
    output_dict=True,
)

# The dict is keyed by the stringified polarity label: 0, 1, 2.
for heading, class_key in (('negative: ', '0'), ('positive: ', '1'), ('neutral: ', '2')):
    print(heading, bnb_report_dict[class_key])

negative:  {'precision': 0.654320987654321, 'recall': 0.9520958083832335, 'f1-score': 0.775609756097561, 'support': 167}
positive:  {'precision': 0.7419354838709677, 'recall': 0.4791666666666667, 'f1-score': 0.5822784810126582, 'support': 96}
neutral:  {'precision': 0.9387755102040817, 'recall': 0.5054945054945055, 'f1-score': 0.6571428571428571, 'support': 91}


In [35]:
# Same metrics rendered as a plain-text table.
bnb_report = classification_report(
    y_true=test_df['polarity'],
    y_pred=bnb_prediction_linear,
)
print(bnb_report)

              precision    recall  f1-score   support

           0       0.65      0.95      0.78       167
           1       0.74      0.48      0.58        96
           2       0.94      0.51      0.66        91

    accuracy                           0.71       354
   macro avg       0.78      0.65      0.67       354
weighted avg       0.75      0.71      0.69       354



### Choosing best model

In [36]:
# Print the test accuracy of the four classifiers one after another
# so they can be compared directly.
accuracy_lines = (
    ("MNB accuracy: ", mnb_acc_score),
    ("CNB accuracy: ", cnb_acc_score),
    ("GNB accuracy: ", gnb_acc_score),
    ("BNB accuracy: ", bnb_acc_score),
)
for heading, model_accuracy in accuracy_lines:
    print(heading, str(model_accuracy))

MNB accuracy:  0.7288135593220338
CNB accuracy:  0.8305084745762712
GNB accuracy:  0.6271186440677966
BNB accuracy:  0.7090395480225988
