# Sentiment Analysis using Naive Bayes

In [27]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kaushambigujral/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaushambigujral/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaushambigujral/nltk_data...


True

In [38]:
train_df = pd.read_csv ('Corona_NLP_train.csv', encoding= 'latin-1')
train_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [42]:
train_df = train_df[['OriginalTweet', 'Sentiment']]
train_df.size

82314

In [48]:
test_df = pd.read_csv('Corona_NLP_test.csv', encoding = 'latin-1')
test_df = test_df[['OriginalTweet', 'Sentiment']]
test_df.size

7596

In [50]:
test_df

Unnamed: 0,OriginalTweet,Sentiment
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,When I couldn't find hand sanitizer at Fred Me...,Positive
2,Find out how you can protect yourself and love...,Extremely Positive
3,#Panic buying hits #NewYork City as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...
3793,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,Did you panic buy a lot of non-perishable item...,Negative
3795,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,Gov need to do somethings instead of biar je r...,Extremely Negative


## Exploratory Analysis

Check for null values in the dataset

In [51]:
for column in train_df.columns:
    print(train_df[column].isnull().value_counts())
print()  
for column in test_df.columns:
    print(test_df[column].isnull().value_counts())

False    41157
Name: OriginalTweet, dtype: int64
False    41157
Name: Sentiment, dtype: int64

False    3798
Name: OriginalTweet, dtype: int64
False    3798
Name: Sentiment, dtype: int64


Check for duplicate values in the dataset

In [52]:
print(train_df.duplicated().value_counts())
print()
print(test_df.duplicated().value_counts())

False    41157
dtype: int64

False    3798
dtype: int64


In [31]:
corpus = df.iloc[:, 4:5]
for c in corpus['OriginalTweet'][:3]:
    print(c)
    print()

TRENDING: New Yorkers encounter empty supermarket shelves (pictured, Wegmans in Brooklyn), sold-out online grocers (FoodKick, MaxDelivery) as #coronavirus-fearing shoppers stock up https://t.co/Gr76pcrLWh https://t.co/ivMKMsqdT1

When I couldn't find hand sanitizer at Fred Meyer, I turned to #Amazon. But $114.97 for a 2 pack of Purell??!!Check out how  #coronavirus concerns are driving up prices. https://t.co/ygbipBflMY

Find out how you can protect yourself and loved ones from #coronavirus. ?



## Preprocessing

- Removing links(https) and mentions(@)
- Creating tokens
- Stopword removal
- Lemmatizing tokens
- Un-tokenizing

In [32]:
def preprocess(text_df):
    # converting to lowercase
    text_df = text_df.apply(lambda x: x.lower())
    #Removing links and mentions
    text_df = text_df.apply(lambda x: re.sub(r"((www.[^s]+)|(http\S+)|(@\S+))", '', x))
    #Tokenization
    text_df = text_df.apply(lambda x: nltk.tokenize.RegexpTokenizer(r"\w+").tokenize(x))
    #Stopword removal
    text_df = text_df.apply(lambda x: [i for i in x if i not in nltk.corpus.stopwords.words('english')])
    #lemmatization
    lm = nltk.WordNetLemmatizer()
    text_df = text_df.apply(lambda x: [lm.lemmatize(i) for i in x])
    print(text_df)
    # Un- Tokenizing
    text_df = text_df.apply(lambda x: ' '.join(x))
    return text_df

In [34]:
preprocessed = preprocess(corpus['OriginalTweet'])
preprocessed.head()

0       [trending, new, yorkers, encounter, empty, sup...
1       [find, hand, sanitizer, fred, meyer, turned, a...
2                [find, protect, loved, one, coronavirus]
3       [panic, buying, hit, newyork, city, anxious, s...
4       [toiletpaper, dunnypaper, coronavirus, coronav...
                              ...                        
3793    [meanwhile, supermarket, israel, people, dance...
3794    [panic, buy, lot, non, perishable, item, echo,...
3795    [asst, prof, economics, talking, recent, resea...
3796    [gov, need, somethings, instead, biar, je, rak...
3797    [member, committed, safety, employee, end, use...
Name: OriginalTweet, Length: 3798, dtype: object


0    trending new yorkers encounter empty supermark...
1    find hand sanitizer fred meyer turned amazon 1...
2                   find protect loved one coronavirus
3    panic buying hit newyork city anxious shopper ...
4    toiletpaper dunnypaper coronavirus coronavirus...
Name: OriginalTweet, dtype: object

## Model Training

In [57]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
tweet_vector = vectorizer.fit_transform(train_df['OriginalTweet'])
tweet_vector.shape

(41157, 80424)

In [58]:
#split the data into 80% training and 20% testing
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(tweet_vector, train_df.Sentiment, test_size=0.20,random_state=0)

In [59]:
# create and train the Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(xtrain, ytrain)

In [60]:
print(classifier.predict(xtest))
print(ytest.values)

['Extremely Positive' 'Positive' 'Negative' ... 'Neutral' 'Neutral'
 'Positive']
['Neutral' 'Negative' 'Positive' ... 'Neutral' 'Neutral' 'Positive']


In [61]:
# Evaluating the model on the training data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(xtrain)
print(classification_report(ytrain, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytrain, pred))
print("Accuracy: \n", accuracy_score(ytrain, pred))

                    precision    recall  f1-score   support

Extremely Negative       0.96      0.54      0.69      4387
Extremely Positive       0.92      0.64      0.76      5293
          Negative       0.70      0.83      0.76      7931
           Neutral       0.94      0.55      0.70      6187
          Positive       0.62      0.93      0.75      9127

          accuracy                           0.74     32925
         macro avg       0.83      0.70      0.73     32925
      weighted avg       0.79      0.74      0.73     32925


Confusion Matrix: 
 [[2380   17 1428   39  523]
 [   6 3407  207   20 1653]
 [  50   70 6589   82 1140]
 [  24   70  853 3410 1830]
 [  24  145  402   73 8483]]
Accuracy: 
 0.7370994684889901


In [62]:
# Evaluating the model on the testing data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(xtest)
print(classification_report(ytest, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytest, pred))
print("Accuracy: \n", accuracy_score(ytest, pred))

                    precision    recall  f1-score   support

Extremely Negative       0.67      0.23      0.34      1094
Extremely Positive       0.61      0.30      0.40      1331
          Negative       0.42      0.52      0.46      1986
           Neutral       0.68      0.33      0.44      1526
          Positive       0.40      0.69      0.50      2295

          accuracy                           0.46      8232
         macro avg       0.55      0.41      0.43      8232
      weighted avg       0.52      0.46      0.45      8232


Confusion Matrix: 
 [[ 252    7  655   13  167]
 [   3  402   85   15  826]
 [  83   35 1042   85  741]
 [  15   37  328  499  647]
 [  25  177  391  124 1578]]
Accuracy: 
 0.4583333333333333


## Prediction

In [63]:
test_tweet_vector = vectorizer.transform(test_df['OriginalTweet'])
test_tweet_vector.shape

(3798, 80424)

In [66]:
pred = classifier.predict(test_tweet_vector)
print(pred)
print(test_df.Sentiment)

['Neutral' 'Positive' 'Extremely Positive' ... 'Neutral'
 'Extremely Negative' 'Positive']
0       Extremely Negative
1                 Positive
2       Extremely Positive
3                 Negative
4                  Neutral
               ...        
3793              Positive
3794              Negative
3795               Neutral
3796    Extremely Negative
3797    Extremely Positive
Name: Sentiment, Length: 3798, dtype: object


In [67]:
print(classification_report(test_df.Sentiment, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(test_df.Sentiment, pred))
print("Accuracy: \n", accuracy_score(test_df.Sentiment, pred))

                    precision    recall  f1-score   support

Extremely Negative       0.72      0.13      0.22       592
Extremely Positive       0.69      0.18      0.29       599
          Negative       0.42      0.54      0.47      1041
           Neutral       0.76      0.16      0.27       619
          Positive       0.35      0.76      0.48       947

          accuracy                           0.41      3798
         macro avg       0.59      0.36      0.35      3798
      weighted avg       0.55      0.41      0.37      3798


Confusion Matrix: 
 [[ 78   1 402   2 109]
 [  1 110  39   3 446]
 [ 25   8 563  17 428]
 [  2  10 161 102 344]
 [  2  30 187  11 717]]
Accuracy: 
 0.4133754607688257
