# Sentiment Analysis using NLP on 1.6 M Tweets Dataset

## Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

import nltk 
import string
import re
import unicodedata

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import pickle
import winsound

In [2]:
def alertme(times, diff):
    frequency = 2500  # Set Frequency To 2500 Hertz
    duration = 500  # Set Duration To 1000 ms == 1 second
    for i in range(times*diff):
        if i % diff == 0:
            winsound.Beep(frequency, duration)

We need this function because model training can take extremely long

## Loading datasets

In [3]:
df = pd.read_csv('./Data/1.6 Mill Tweet data.csv', encoding='latin1', names=['Sentiment', 'id', 'date', 'flag', 'user', 'text'])
df.drop(columns = ['id', 'date', 'flag', 'user'], inplace=True)
df

Unnamed: 0,Sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [4]:
df['Sentiment'].unique()

array([0, 4], dtype=int64)

0 represents negative and 4 represents positive. The data is split in a highly polar manner.

In [5]:
df.isnull().sum()

Sentiment    0
text         0
dtype: int64

**Let us replace positive and negative with their respective values**

In [6]:
y = []
for i in range(len(df['Sentiment'])):
    if df['Sentiment'][i] == 0:
        y.append('Negative')
    elif df['Sentiment'][i] == 4:
        y.append('Positive')
df['Sentiment'] = y
df

Unnamed: 0,Sentiment,text
0,Negative,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,Negative,is upset that he can't update his Facebook by ...
2,Negative,@Kenichan I dived many times for the ball. Man...
3,Negative,my whole body feels itchy and like its on fire
4,Negative,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,Positive,Just woke up. Having no school is the best fee...
1599996,Positive,TheWDB.com - Very cool to hear old Walt interv...
1599997,Positive,Are you ready for your MoJo Makeover? Ask me f...
1599998,Positive,Happy 38th Birthday to my boo of alll time!!! ...


### Cleaning the data

In [7]:
stopword = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
            
def strip_accents(text):
    try:
        text = unicode(text, 'utf-8')
    except NameError: # unicode is a default on python 3 
        pass

    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")
    return str(text)

def clean_up_sentence(text):
    
    # Shift to lowercase
    text = text.lower()
    
    # Removing mentions, hashtags and urls
    for i in range(len(text)):
        if text[i] == '#' or text[i] == '@':
            j = 0
            maxj = len(text)-i
            while(j <maxj and text[i+j] != ' '):
                if i+j < len(text):
                    text = text[0:i+j] + '.' + text[i+j+1:]
                    j += 1
        elif text[i] == 'h' and i < len(text)-4:
            if text[i:i+4] == 'http':
                j = 0
                maxj = len(text)-i
                while(j <maxj and text[i+j] != ' '):
                    if i+j < len(text):
                        text = text[0:i+j] + '#' + text[i+j+1:]
                        j += 1
    
    # Removing Punctuations and numbers
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    
    # Removing unwanted whitespace and removing accents
    text = strip_accents(" ".join(text.split()))
    
    # Tokenisation
    text = re.split('\W+', text)
    if '' in text:
        text.remove('')
       
    # Removing stop words
    text = [word for word in text if word not in stopword]
    
    # Lemmatization
    text = [wn.lemmatize(word) for word in text]

    # Remove Stopwords
    text = [word for word in text if word not in stopword]
    
    return text

## Splitting the Data

In [8]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['Sentiment'], test_size=0.3 )

In [9]:
x_train.shape

(1120000,)

In [10]:
y_train.shape

(1120000,)

## Creating a pipeline to train the model

In [11]:
models = [
        MultinomialNB(),
        BernoulliNB(),
        SGDClassifier(max_iter=10000, n_jobs = 6),
        LogisticRegression(max_iter=10000, n_jobs = 6),
    ]
    
model_names = [
        '\nMultinomial Naive Bayes\n',
        '\nBernoulli Naive Bayes\n',
        '\nStochastic Gradient Descent\n',
        '\nLogistic Regression\n',
    ]
    
for i in range(len(models)):
    pl = Pipeline([
            ('CV',CountVectorizer(analyzer=clean_up_sentence)),
            ('tfidf',TfidfTransformer()),
            ('classifier',models[i])
        ])
        
    print(model_names[i])
        
    pl.fit(x_train,y_train)
    prd = pl.predict(x_test)
        
    print(confusion_matrix(prd,y_test), end='\n\n')
    print(classification_report(prd,y_test))
    print(accuracy_score(prd,y_test))
    print()

alertme(5,2)


Multinomial Naive Bayes

[[187761  61427]
 [ 51766 179046]]

              precision    recall  f1-score   support

    Negative       0.78      0.75      0.77    249188
    Positive       0.74      0.78      0.76    230812

    accuracy                           0.76    480000
   macro avg       0.76      0.76      0.76    480000
weighted avg       0.76      0.76      0.76    480000

0.76418125


Bernoulli Naive Bayes

[[184055  53602]
 [ 55472 186871]]

              precision    recall  f1-score   support

    Negative       0.77      0.77      0.77    237657
    Positive       0.78      0.77      0.77    242343

    accuracy                           0.77    480000
   macro avg       0.77      0.77      0.77    480000
weighted avg       0.77      0.77      0.77    480000

0.7727625


Stochastic Gradient Descent

[[169004  43294]
 [ 70523 197179]]

              precision    recall  f1-score   support

    Negative       0.71      0.80      0.75    212298
    Positive       0.82   

**The best model for this dataset is Logistic Regression**

In [12]:
finalPipe = Pipeline([
            ('CV',CountVectorizer(analyzer=clean_up_sentence)),
            ('tfidf',TfidfTransformer()),
            ('classifier',LogisticRegression(max_iter=10000))
        ])
        
finalPipe.fit(x_train,y_train)
prd = finalPipe.predict(x_test)
        
print(confusion_matrix(prd,y_test), end='\n\n')
print(classification_report(prd,y_test))
print(accuracy_score(prd,y_test))
print()

[[183541  47743]
 [ 55986 192730]]

              precision    recall  f1-score   support

    Negative       0.77      0.79      0.78    231284
    Positive       0.80      0.77      0.79    248716

    accuracy                           0.78    480000
   macro avg       0.78      0.78      0.78    480000
weighted avg       0.78      0.78      0.78    480000

0.7838979166666666



In [13]:
filename = 'Major_Project_Twitter_1.6M.sav'
pickle.dump(finalPipe, open(filename, 'wb'))

Above, we have created the Final Pipeline which should be saved. It contains the model with the highest accuracy.