# Sentiment Analysis using Naive Bayes

In [121]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kaushambigujral/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaushambigujral/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaushambigujral/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [122]:
train_df = pd.read_csv ('Corona_NLP_train.csv', encoding= 'latin-1')
train_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [123]:
train_df = train_df[['OriginalTweet', 'Sentiment']]
train_df.size

82314

In [124]:
test_df = pd.read_csv('Corona_NLP_test.csv', encoding = 'latin-1')
test_df = test_df[['OriginalTweet', 'Sentiment']]
test_df.size

7596

In [125]:
test_df

Unnamed: 0,OriginalTweet,Sentiment
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,When I couldn't find hand sanitizer at Fred Me...,Positive
2,Find out how you can protect yourself and love...,Extremely Positive
3,#Panic buying hits #NewYork City as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...
3793,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,Did you panic buy a lot of non-perishable item...,Negative
3795,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,Gov need to do somethings instead of biar je r...,Extremely Negative


In [135]:
test_df['OriginalTweet'][:3]


0    TRENDING: New Yorkers encounter empty supermar...
1    When I couldn't find hand sanitizer at Fred Me...
2    Find out how you can protect yourself and love...
Name: OriginalTweet, dtype: object

## Exploratory Analysis

Check for null values in the dataset

In [127]:
for column in train_df.columns:
    print(train_df[column].isnull().value_counts())
print()  
for column in test_df.columns:
    print(test_df[column].isnull().value_counts())

False    41157
Name: OriginalTweet, dtype: int64
False    41157
Name: Sentiment, dtype: int64

False    3798
Name: OriginalTweet, dtype: int64
False    3798
Name: Sentiment, dtype: int64


Check for duplicate values in the dataset

In [128]:
print(train_df.duplicated().value_counts())
print()
print(test_df.duplicated().value_counts())

False    41157
dtype: int64

False    3798
dtype: int64


## Preprocessing

- Removing links(https) and mentions(@)
- Creating tokens
- Stopword removal
- Lemmatizing tokens
- Un-tokenizing

In [129]:
def preprocess(text_df):
    # converting to lowercase
    text_df = text_df.apply(lambda x: x.lower())
    print('Converted to lower case')
    #Removing links and mentions
    text_df = text_df.apply(lambda x: re.sub(r"((www.[^s]+)|(http\S+)|(@\S+))", '', x))
    print('Removed links and mentions')
    #Tokenization
    text_df = text_df.apply(lambda x: nltk.tokenize.RegexpTokenizer(r"\w+").tokenize(x))
    print('Tokenized')
    #Stopword removal
    text_df = text_df.apply(lambda x: [i for i in x if i not in nltk.corpus.stopwords.words('english')])
    print('Removed stop words')
    #lemmatization
    lm = nltk.WordNetLemmatizer()
    print('Lemmatized')
    text_df = text_df.apply(lambda x: [lm.lemmatize(i) for i in x])
    # Un- Tokenizing
    text_df = text_df.apply(lambda x: ' '.join(x))
    print('Untokenized')
    return text_df

In [130]:
train_tweets = preprocess(train_df['OriginalTweet'])

Converted to lower case
Removed links and mentions
Tokenized
Removed stop words
Lemmatized
Untokenized


In [132]:
train_tweets

0                                                         
1        advice talk neighbour family exchange phone nu...
2        coronavirus australia woolworth give elderly d...
3        food stock one empty please panic enough food ...
4        ready go supermarket covid19 outbreak paranoid...
                               ...                        
41152    airline pilot offering stock supermarket shelf...
41153    response complaint provided citing covid 19 re...
41154    know itâ getting tough rationing toilet paper ...
41155    wrong smell hand sanitizer starting turn coron...
41156    well new used rift going 700 00 amazon rn alth...
Name: OriginalTweet, Length: 41157, dtype: object

In [134]:
train_df['PreprocessedTweet'] = train_tweets
train_df.head()

Unnamed: 0,OriginalTweet,Sentiment,PreprocessedTweet
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,
1,advice Talk to your neighbours family to excha...,Positive,advice talk neighbour family exchange phone nu...
2,Coronavirus Australia: Woolworths to give elde...,Positive,coronavirus australia woolworth give elderly d...
3,My food stock is not the only one which is emp...,Positive,food stock one empty please panic enough food ...
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,ready go supermarket covid19 outbreak paranoid...


## Model Training

In [136]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
tweet_vector = vectorizer.fit_transform(train_df['PreprocessedTweet'])
tweet_vector.shape

(41157, 43471)

In [137]:
#split the data into 80% training and 20% testing
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(tweet_vector, train_df.Sentiment, test_size=0.20,random_state=0)

In [138]:
# create and train the Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(xtrain, ytrain)

In [139]:
print(classifier.predict(xtest))
print(ytest.values)

['Extremely Positive' 'Extremely Negative' 'Extremely Negative' ...
 'Positive' 'Neutral' 'Positive']
['Neutral' 'Negative' 'Positive' ... 'Neutral' 'Neutral' 'Positive']


In [140]:
# Evaluating the model on the training data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(xtrain)
print(classification_report(ytrain, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytrain, pred))
print("Accuracy: \n", accuracy_score(ytrain, pred))

                    precision    recall  f1-score   support

Extremely Negative       0.89      0.63      0.74      4387
Extremely Positive       0.85      0.68      0.76      5293
          Negative       0.68      0.78      0.73      7931
           Neutral       0.91      0.57      0.70      6187
          Positive       0.63      0.87      0.73      9127

          accuracy                           0.73     32925
         macro avg       0.79      0.71      0.73     32925
      weighted avg       0.76      0.73      0.73     32925


Confusion Matrix: 
 [[2765   29 1143   51  399]
 [  22 3625  229   27 1390]
 [ 213  137 6200  150 1231]
 [  50  109  888 3510 1630]
 [  73  340  670  130 7914]]
Accuracy: 
 0.7293545937737281


In [141]:
# Evaluating the model on the testing data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(xtest)
print(classification_report(ytest, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytest, pred))
print("Accuracy: \n", accuracy_score(ytest, pred))

                    precision    recall  f1-score   support

Extremely Negative       0.60      0.35      0.44      1094
Extremely Positive       0.56      0.39      0.46      1331
          Negative       0.43      0.53      0.47      1986
           Neutral       0.67      0.37      0.48      1526
          Positive       0.41      0.61      0.49      2295

          accuracy                           0.48      8232
         macro avg       0.54      0.45      0.47      8232
      weighted avg       0.51      0.48      0.47      8232


Confusion Matrix: 
 [[ 385    6  540   24  139]
 [   6  522   83   23  697]
 [ 167   63 1046   93  617]
 [  33   45  326  570  552]
 [  50  290  428  137 1390]]
Accuracy: 
 0.47534013605442177


## Prediction

In [145]:
#test_df['PreprocessedTweet'] = preprocess(test_df['OriginalTweet'])
test_tweet_vector = vectorizer.transform(test_df['OriginalTweet'])
test_tweet_vector.shape

(3798, 43471)

In [146]:
pred = classifier.predict(test_tweet_vector)
print(pred)
print(test_df.Sentiment)

['Extremely Negative' 'Positive' 'Extremely Positive' ... 'Neutral'
 'Extremely Negative' 'Extremely Positive']
0       Extremely Negative
1                 Positive
2       Extremely Positive
3                 Negative
4                  Neutral
               ...        
3793              Positive
3794              Negative
3795               Neutral
3796    Extremely Negative
3797    Extremely Positive
Name: Sentiment, Length: 3798, dtype: object


In [147]:
print(classification_report(test_df.Sentiment, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(test_df.Sentiment, pred))
print("Accuracy: \n", accuracy_score(test_df.Sentiment, pred))

                    precision    recall  f1-score   support

Extremely Negative       0.53      0.35      0.42       592
Extremely Positive       0.56      0.42      0.48       599
          Negative       0.39      0.55      0.45      1041
           Neutral       0.58      0.23      0.33       619
          Positive       0.38      0.49      0.43       947

          accuracy                           0.43      3798
         macro avg       0.49      0.41      0.42      3798
      weighted avg       0.47      0.43      0.43      3798


Confusion Matrix: 
 [[210  10 306  13  53]
 [ 16 249  83   8 243]
 [128  42 570  53 248]
 [ 15  20 212 142 230]
 [ 30 120 302  29 466]]
Accuracy: 
 0.4310163243812533
