# Sentiment Analysis using NLP on Indian Tweets Dataset

## Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

import nltk 
import string
import re
import unicodedata

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import pickle
import winsound

In [2]:
def alertme(times, diff):
    frequency = 2500  # Set Frequency To 2500 Hertz
    duration = 500  # Set Duration To 1000 ms == 1 second
    for i in range(times*diff):
        if i % diff == 0:
            winsound.Beep(frequency, duration)

## Loading datasets

In [3]:
df = pd.read_csv('./Data/Twitter_data.csv', encoding='latin1', names=['text', 'Sentiment'])
df

Unnamed: 0,text,Sentiment
0,when modi promised âminimum government maxim...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162974,why these 456 crores paid neerav modi not reco...,-1.0
162975,dear rss terrorist payal gawar what about modi...,-1.0
162976,did you cover her interaction forum where she ...,0.0
162977,there big project came into india modi dream p...,0.0


In [4]:
df.isnull().sum()

text         3
Sentiment    7
dtype: int64

In [5]:
df = df.fillna(-2)
df.isnull().sum()

text         0
Sentiment    0
dtype: int64

In [6]:
text = []
Sentiment = []

for i in range(len(df['text'])):
    if df['text'][i] != -2 and df['Sentiment'][i] != -2:
        text.append(df['text'][i])
        Sentiment.append(df['Sentiment'][i])

df2 = pd.DataFrame([])
df2['text'] = text
df2['Sentiment'] = Sentiment
df2

Unnamed: 0,text,Sentiment
0,when modi promised âminimum government maxim...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162964,why these 456 crores paid neerav modi not reco...,-1.0
162965,dear rss terrorist payal gawar what about modi...,-1.0
162966,did you cover her interaction forum where she ...,0.0
162967,there big project came into india modi dream p...,0.0


In [7]:
df = df2
df['Sentiment'].unique()

array([-1.,  0.,  1.])

In [8]:
y = []

for i in range(len(df['Sentiment'])):
    if df['Sentiment'][i] == -1:
        y.append('Negative')
    elif df['Sentiment'][i] == 0:
        y.append('Neutral')
    elif df['Sentiment'][i] == 1:
        y.append('Positive')
        
df['Sentiment'] = y
df

Unnamed: 0,text,Sentiment
0,when modi promised âminimum government maxim...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive
...,...,...
162964,why these 456 crores paid neerav modi not reco...,Negative
162965,dear rss terrorist payal gawar what about modi...,Negative
162966,did you cover her interaction forum where she ...,Neutral
162967,there big project came into india modi dream p...,Neutral


### Cleaning the data

In [9]:
stopword = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
            
def strip_accents(text):
    try:
        text = unicode(text, 'utf-8')
    except NameError: # unicode is a default on python 3 
        pass

    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")
    return str(text)

def clean_up_sentence(text):
    
    # Shift to lowercase
    text = text.lower()
    
    # Removing mentions, hashtags and urls
    for i in range(len(text)):
        if text[i] == '#' or text[i] == '@':
            j = 0
            maxj = len(text)-i
            while(j <maxj and text[i+j] != ' '):
                if i+j < len(text):
                    text = text[0:i+j] + '.' + text[i+j+1:]
                    j += 1
        elif text[i] == 'h' and i < len(text)-4:
            if text[i:i+4] == 'http':
                j = 0
                maxj = len(text)-i
                while(j <maxj and text[i+j] != ' '):
                    if i+j < len(text):
                        text = text[0:i+j] + '#' + text[i+j+1:]
                        j += 1
    
    # Removing Punctuations and numbers
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    
    # Removing unwanted whitespace and removing accents
    text = strip_accents(" ".join(text.split()))
    
    # Tokenisation
    text = re.split('\W+', text)
    if '' in text:
        text.remove('')
       
    # Removing stop words
    text = [word for word in text if word not in stopword]
    
    # Lemmatization
    text = [wn.lemmatize(word) for word in text]

    # Remove Stopwords
    text = [word for word in text if word not in stopword]
    
    return text

## Splitting the Data

In [10]:
x = df['text']
y = df['Sentiment']

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, stratify = y)

## Creating a pipeline to train the model

In [12]:
models = [
        MultinomialNB(),
        BernoulliNB(),
        SGDClassifier(max_iter=10000, n_jobs = 6),
        LogisticRegression(max_iter=10000, n_jobs = 6),
    ]
    
model_names = [
        '\nMultinomial Naive Bayes\n',
        '\nBernoulli Naive Bayes\n',
        '\nStochastic Gradient Descent\n',
        '\nLogistic Regression\n',
    ]
    
for i in range(len(models)):
    pl = Pipeline([
            ('CV',CountVectorizer(analyzer=clean_up_sentence)),
            ('tfidf',TfidfTransformer()),
            ('classifier',models[i])
        ])
        
    print(model_names[i])
        
    pl.fit(x_train,y_train)
    prd = pl.predict(x_test)
        
    print(confusion_matrix(prd,y_test), end='\n\n')
    print(classification_report(prd,y_test))
    print(accuracy_score(prd,y_test))
    print()

alertme(5,2)


Multinomial Naive Bayes

[[ 1209    46    36]
 [  407  5548   386]
 [ 9037 10969 21253]]

              precision    recall  f1-score   support

    Negative       0.11      0.94      0.20      1291
     Neutral       0.33      0.87      0.48      6341
    Positive       0.98      0.52      0.68     41259

    accuracy                           0.57     48891
   macro avg       0.48      0.78      0.45     48891
weighted avg       0.87      0.57      0.64     48891

0.5729070790124972


Bernoulli Naive Bayes

[[ 4663   596  1200]
 [ 1781 13734  1951]
 [ 4209  2233 18524]]

              precision    recall  f1-score   support

    Negative       0.44      0.72      0.54      6459
     Neutral       0.83      0.79      0.81     17466
    Positive       0.85      0.74      0.79     24966

    accuracy                           0.76     48891
   macro avg       0.71      0.75      0.72     48891
weighted avg       0.79      0.76      0.77     48891

0.7551696631281831


Stochastic Gradie

**The best model for this dataset is Logistic Regression**

In [13]:
finalPipe = Pipeline([
            ('CV',CountVectorizer(analyzer=clean_up_sentence)),
            ('tfidf',TfidfTransformer()),
            ('classifier', LogisticRegression(max_iter=10000))
        ])
        
finalPipe.fit(x_train,y_train)
prd = finalPipe.predict(x_test)
        
print(confusion_matrix(prd,y_test), end='\n\n')
print(classification_report(prd,y_test))
print(accuracy_score(prd,y_test))
print()

[[ 7910   283   879]
 [ 1314 15824  1618]
 [ 1429   456 19178]]

              precision    recall  f1-score   support

    Negative       0.74      0.87      0.80      9072
     Neutral       0.96      0.84      0.90     18756
    Positive       0.88      0.91      0.90     21063

    accuracy                           0.88     48891
   macro avg       0.86      0.88      0.87     48891
weighted avg       0.89      0.88      0.88     48891

0.8777075535374609



In [14]:
filename = 'Major_Project_Twitter_India.sav'
pickle.dump(finalPipe, open(filename, 'wb'))

Above, we have created the Final Pipeline which should be saved. It contains the model with the highest accuracy.