# Sentiment Analysis using Voting System

## Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd
import random

import nltk 
import string
import re
import unicodedata

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import pickle

## Loading datasets

In [2]:
df = pd.read_csv('All_data_combined.csv', encoding='latin1')
df

Unnamed: 0.1,Unnamed: 0,text,Sentiment
0,0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,1,advice Talk to your neighbours family to excha...,Positive
2,2,Coronavirus Australia: Woolworths to give elde...,Positive
3,3,My food stock is not the only one which is emp...,Positive
4,4,"Me, ready to go at supermarket during the #COV...",Negative
...,...,...,...
1845068,1845068,jesus,Neutral
1845069,1845069,kya bhai pure saal chutiya banaya modi aur jab...,Positive
1845070,1845070,downvote karna tha par upvote hogaya,Neutral
1845071,1845071,haha nice,Positive


### Cleaning the data

In [3]:
stopword = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
            
def strip_accents(text):
    try:
        text = unicode(text, 'utf-8')
    except NameError: # unicode is a default on python 3 
        pass

    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")
    return str(text)

def clean_up_sentence(text):
    
    # Shift to lowercase
    text = text.lower()
    
    # Removing mentions, hashtags and urls
    for i in range(len(text)):
        if text[i] == '#' or text[i] == '@':
            j = 0
            maxj = len(text)-i
            while(j <maxj and text[i+j] != ' '):
                if i+j < len(text):
                    text = text[0:i+j] + '.' + text[i+j+1:]
                    j += 1
        elif text[i] == 'h' and i < len(text)-4:
            if text[i:i+4] == 'http':
                j = 0
                maxj = len(text)-i
                while(j <maxj and text[i+j] != ' '):
                    if i+j < len(text):
                        text = text[0:i+j] + '#' + text[i+j+1:]
                        j += 1
    
    # Removing Punctuations and numbers
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    
    # Removing unwanted whitespace and removing accents
    text = strip_accents(" ".join(text.split()))
    
    # Tokenisation
    text = re.split('\W+', text)
    if '' in text:
        text.remove('')
       
    # Removing stop words
    text = [word for word in text if word not in stopword]
    
    # Lemmatization
    text = [wn.lemmatize(word) for word in text]

    # Remove Stopwords
    text = [word for word in text if word not in stopword]
    
    return text

## Splitting data to select test values

In [4]:
x = df['text']
y = df['Sentiment']

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, stratify = y)

## Loading previously saved models

In [6]:
reddit = pickle.load(open('Major_Project_Reddit.sav', 'rb'))
corona = pickle.load(open('Major_Project_Twitter_Corona.sav', 'rb'))
india = pickle.load(open('Major_Project_Twitter_India.sav', 'rb'))
mill = pickle.load(open('Major_Project_Twitter_1.6M.sav', 'rb'))
comb = pickle.load(open('Major_Project_Twitter_Combined.sav', 'rb'))

# Creating list of all models
models = [
        reddit,
        corona,
        india,
        mill,
        comb
    ]
    
# Providing names of all the models
model_names = [
        '\nTwitter Reddit\n',
        '\nTwitter Corona\n',
        '\nTwitter India\n',
        '\nTwitter 1.6 M Tweets\n',
        '\nAll data Combined\n'
    ]

# Empty list to store the predictions of all models
prd = []

# Main predictor loop
for i in range(len(models)):
    pl = models[i]
        
    print(model_names[i])
    
    pred = pl.predict(x_test)
    prd.append(pred)
        
    print(confusion_matrix(pred,y_test), end='\n\n')
    print(classification_report(pred,y_test))
    print(accuracy_score(pred,y_test))
    print()


Twitter Reddit

[[19529   179  5104]
 [39956  6886 35365]
 [26597   594 50298]]

              precision    recall  f1-score   support

    Negative       0.23      0.79      0.35     24812
     Neutral       0.90      0.08      0.15     82207
    Positive       0.55      0.65      0.60     77489

    accuracy                           0.42    184508
   macro avg       0.56      0.51      0.37    184508
weighted avg       0.66      0.42      0.37    184508

0.41577058989312116


Twitter Corona

[[35205  1685 12995]
 [15862  3297 18261]
 [35015  2677 59511]]

              precision    recall  f1-score   support

    Negative       0.41      0.71      0.52     49885
     Neutral       0.43      0.09      0.15     37420
    Positive       0.66      0.61      0.63     97203

    accuracy                           0.53    184508
   macro avg       0.50      0.47      0.43    184508
weighted avg       0.54      0.53      0.50    184508

0.5312127387430355


Twitter India

[[25647   205  75

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    Negative       0.77      0.76      0.77     87236
     Neutral       0.00      0.00      0.00         0
    Positive       0.80      0.75      0.77     97272

    accuracy                           0.75    184508
   macro avg       0.52      0.50      0.51    184508
weighted avg       0.79      0.75      0.77    184508

0.7542762373447222


All data Combined

[[66796   758 15542]
 [  638  5012   724]
 [18648  1889 74501]]

              precision    recall  f1-score   support

    Negative       0.78      0.80      0.79     83096
     Neutral       0.65      0.79      0.71      6374
    Positive       0.82      0.78      0.80     95038

    accuracy                           0.79    184508
   macro avg       0.75      0.79      0.77    184508
weighted avg       0.79      0.79      0.79    184508

0.7929683265766254



## Voting of all the different models

**Let us see how the predictions list looks**

In [7]:
prd

[array(['Positive', 'Neutral', 'Positive', ..., 'Neutral', 'Neutral',
        'Positive'], dtype=object),
 array(['Positive', 'Neutral', 'Negative', ..., 'Positive', 'Positive',
        'Positive'], dtype=object),
 array(['Positive', 'Neutral', 'Negative', ..., 'Positive', 'Positive',
        'Positive'], dtype=object),
 array(['Positive', 'Positive', 'Negative', ..., 'Negative', 'Positive',
        'Positive'], dtype=object),
 array(['Positive', 'Positive', 'Negative', ..., 'Positive', 'Neutral',
        'Positive'], dtype=object)]

We can see that all predictions of different models have been stored separately

**Let's initialise values which will be useful in voting**

In [8]:
def countX(lst, x):
    count = 0
    for ele in lst:
        if (ele == x):
            count = count + 1
    return count

In [9]:
responses = list(y_test.unique())
responses

['Positive', 'Negative', 'Neutral']

In [10]:
resNums = []
for i in responses:
    resNums.append(countX(y_test, i))
resNums

[90767, 86082, 7659]

**Let us create a function to get the f-score of a particular class**

In [11]:
def getFscore(allpredictions, allactuals, response):
    
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    prec = 0
    rec = 0
    
    for i in range(len(allpredictions)):
        if allpredictions[i] == allactuals[i] and allactuals[i] == response:
            tp += 1
            
        elif allpredictions[i] == allactuals[i] and allactuals[i] != response:
            tn += 1
            
        elif allpredictions[i] == response and response != allactuals[i]:
            fp += 1
            
        else:
            fn += 1
    
    if tp != 0:
        prec = tp/(tp+fp)
        rec = tp/(tp+fn)
    
    if prec == 0 and rec == 0:
        f1 = 0
    else:
        f1 = 2*prec*rec/(prec+rec)
            
    return(f1)

We will store the received f-scores of all models, for all classes separately.

In [12]:
fscore = {}
for i in range(len(model_names)):
    fscore[model_names[i]] = {}
    for j in range(len(responses)):
        fscore[model_names[i]][responses[j]] = getFscore(prd[i], np.array(y_test), responses[j])
fscore

{'\nTwitter Reddit\n': {'Positive': 0.4827271811162671,
  'Negative': 0.2659666469190279,
  'Neutral': 0.11328732303996975},
 '\nTwitter Corona\n': {'Positive': 0.579134572809062,
  'Negative': 0.4487428698894236,
  'Neutral': 0.07083543705486148},
 '\nTwitter India\n': {'Positive': 0.5227219671620885,
  'Negative': 0.3432644047380044,
  'Neutral': 0.12461420441367992},
 '\nTwitter 1.6 M Tweets\n': {'Positive': 0.7622099609784753,
  'Negative': 0.7457948326904099,
  'Neutral': 0},
 '\nAll data Combined\n': {'Positive': 0.7959466028493437,
  'Negative': 0.7776426006018942,
  'Neutral': 0.2078676150384671}}

**We can now go ahead with the voting-based predictions**

In [13]:
prdf = []

for i in range(len(prd[0])):
    votes = [0 for res in responses]
    
    for j in range(len(prd)):
        for response in range(len(responses)):
            if responses[response] == prd[j][i]:
                votes[response] += fscore[model_names[j]][responses[response]]/resNums[response]**(3/5)
    max_index = 0
    max_list = []
    for vote in range(len(votes)):
        if votes[vote] > votes[max_index]:
            max_index = vote
            max_list = []
            max_list = [max_index,]
        
        elif votes[vote] == votes[max_index]:
            max_list.append(vote)
    
    prdf.append(responses[random.choice(max_list)])

In [14]:
print(confusion_matrix(prdf,y_test))

[[56378   743 11625]
 [ 1301  5027  1223]
 [28403  1889 77919]]


In [15]:
print(classification_report(prdf,y_test))

              precision    recall  f1-score   support

    Negative       0.65      0.82      0.73     68746
     Neutral       0.66      0.67      0.66      7551
    Positive       0.86      0.72      0.78    108211

    accuracy                           0.76    184508
   macro avg       0.72      0.74      0.72    184508
weighted avg       0.77      0.76      0.76    184508



With our given data, the accuracy is from 70-80%. The accuracy is not too good but it isn't too bad either.