# Sentiment Analysis using NLP on Coronavirus Tweets Dataset

## Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

import nltk 
import string
import re
import unicodedata

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_recall_fscore_support

import pickle
import winsound

In [2]:
def alertme(times, diff):
    frequency = 2500  # Set Frequency To 2500 Hertz
    duration = 500  # Set Duration To 1000 ms == 1 second
    for i in range(times*diff):
        if i % diff == 0:
            winsound.Beep(frequency, duration)

## Loading datasets

In [3]:
df_train = pd.read_csv('./Data/Corona_NLP_train.csv', encoding='latin1').drop(columns=['UserName', 'ScreenName', 'Location', 'TweetAt'])
df_test = pd.read_csv('./Data/Corona_NLP_test.csv', encoding='latin1').drop(columns=['UserName', 'ScreenName', 'Location', 'TweetAt'])

In [4]:
df_train.isnull().sum()

text         0
Sentiment    0
dtype: int64

In [5]:
df_test.isnull().sum()

text         0
Sentiment    0
dtype: int64

### Cleaning the data

In [6]:
stopword = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
            
def strip_accents(text):
    try:
        text = unicode(text, 'utf-8')
    except NameError: # unicode is a default on python 3 
        pass

    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")
    return str(text)

def clean_up_sentence(text):
    
    # Shift to lowercase
    text = text.lower()
    
    # Removing mentions, hashtags and urls
    for i in range(len(text)):
        if text[i] == '#' or text[i] == '@':
            j = 0
            maxj = len(text)-i
            while(j <maxj and text[i+j] != ' '):
                if i+j < len(text):
                    text = text[0:i+j] + '.' + text[i+j+1:]
                    j += 1
        elif text[i] == 'h' and i < len(text)-4:
            if text[i:i+4] == 'http':
                j = 0
                maxj = len(text)-i
                while(j <maxj and text[i+j] != ' '):
                    if i+j < len(text):
                        text = text[0:i+j] + '#' + text[i+j+1:]
                        j += 1
    
    # Removing Punctuations and numbers
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    
    # Removing unwanted whitespace and removing accents
    text = strip_accents(" ".join(text.split()))
    
    # Tokenisation
    text = re.split('\W+', text)
    if '' in text:
        text.remove('')
       
    # Removing stop words
    text = [word for word in text if word not in stopword]
    
    # Lemmatization
    text = [wn.lemmatize(word) for word in text]

    # Remove Stopwords
    text = [word for word in text if word not in stopword]
    
    return text

## Splitting the Data

In [7]:
x_train = df_train['text']
x_test = df_test['text']
y_train = df_train['Sentiment']
y_test = df_test['Sentiment']

In [8]:
for i in range(len(y_train)):
    if y_train[i] == 'Extremely Negative':
        y_train[i] = 'Negative'
    elif y_train[i] == 'Extremely Positive':
        y_train[i] = 'Positive'
        
for i in range(len(y_test)):
    if y_test[i] == 'Extremely Negative':
        y_test[i] = 'Negative'
    elif y_test[i] == 'Extremely Positive':
        y_test[i] = 'Positive'

In [9]:
x_train.shape

(41157,)

In [10]:
y_train.shape

(41157,)

## Creating a pipeline to train the model

In [11]:
models = [
        MultinomialNB(),
        BernoulliNB(),
        SGDClassifier(max_iter=10000, n_jobs = 6),
        LogisticRegression(max_iter=10000, n_jobs = 6),
    ]
    
model_names = [
        '\nMultinomial Naive Bayes\n',
        '\nBernoulli Naive Bayes\n',
        '\nStochastic Gradient Descent\n',
        '\nLogistic Regression\n',
    ]

prd = []

for i in range(len(models)):
    pl = Pipeline([
            ('CV',CountVectorizer(analyzer=clean_up_sentence)),
            ('tfidf',TfidfTransformer()),
            ('classifier',models[i])
        ])
        
    print(model_names[i])
        
    pl.fit(x_train,y_train)
    pred = pl.predict(x_test)
    prd.append(pred)
        
    print(confusion_matrix(pred,y_test), end='\n\n')
    print(classification_report(pred,y_test))
    print(accuracy_score(pred,y_test))
    print()

alertme(5,2)


Multinomial Naive Bayes

[[1058  141  161]
 [   1    4    0]
 [ 574  474 1385]]

              precision    recall  f1-score   support

    Negative       0.65      0.78      0.71      1360
     Neutral       0.01      0.80      0.01         5
    Positive       0.90      0.57      0.70      2433

    accuracy                           0.64      3798
   macro avg       0.52      0.72      0.47      3798
weighted avg       0.81      0.64      0.70      3798

0.644286466561348


Bernoulli Naive Bayes

[[1217  141  281]
 [  78  307   49]
 [ 338  171 1216]]

              precision    recall  f1-score   support

    Negative       0.75      0.74      0.74      1639
     Neutral       0.50      0.71      0.58       434
    Positive       0.79      0.70      0.74      1725

    accuracy                           0.72      3798
   macro avg       0.68      0.72      0.69      3798
weighted avg       0.74      0.72      0.73      3798

0.7214323328067404


Stochastic Gradient Descent

[[1300 

**The best model for this dataset is Logistic Regression**

In [12]:
finalPipe = Pipeline([
            ('CV',CountVectorizer(analyzer=clean_up_sentence)),
            ('tfidf',TfidfTransformer()),
            ('classifier',LogisticRegression(max_iter=10000))
        ])
        
finalPipe.fit(x_train,y_train)
prd = finalPipe.predict(x_test)
        
print(confusion_matrix(prd,y_test), end='\n\n')
print(classification_report(prd,y_test))
print(accuracy_score(prd,y_test))
print()

[[1288  128  174]
 [  90  391   49]
 [ 255  100 1323]]

              precision    recall  f1-score   support

    Negative       0.79      0.81      0.80      1590
     Neutral       0.63      0.74      0.68       530
    Positive       0.86      0.79      0.82      1678

    accuracy                           0.79      3798
   macro avg       0.76      0.78      0.77      3798
weighted avg       0.80      0.79      0.79      3798

0.7904160084254871



In [13]:
filename = 'Major_Project_Twitter_Corona.sav'
pickle.dump(finalPipe, open(filename, 'wb'))

Above, we have created the Final Pipeline which should be saved. It contains the model with the highest accuracy.