# Sentiment Analysis using NLP on Reddit Dataset

## Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

import nltk 
import string
import re
import unicodedata

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import pickle
import winsound

In [2]:
def alertme(times, diff):
    frequency = 2500  # Set Frequency To 2500 Hertz
    duration = 500  # Set Duration To 1000 ms == 1 second
    for i in range(times*diff):
        if i % diff == 0:
            winsound.Beep(frequency, duration)

We need this function because model training can take extremely long

## Loading datasets

In [3]:
df = pd.read_csv('./Data/Reddit_data.csv', encoding='latin1', names=['text', 'Sentiment'])
df

Unnamed: 0,text,Sentiment
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
37244,jesus,0
37245,kya bhai pure saal chutiya banaya modi aur jab...,1
37246,downvote karna tha par upvote hogaya,0
37247,haha nice,1


In [4]:
df.isnull().sum()

text         100
Sentiment      0
dtype: int64

In [5]:
df = df.fillna(-2)
df.isnull().sum()

text         0
Sentiment    0
dtype: int64

In [6]:
text = []
Sentiment = []

for i in range(len(df['text'])):
    if df['text'][i] != -2 and df['Sentiment'][i] != -2:
        text.append(df['text'][i])
        Sentiment.append(df['Sentiment'][i])

df2 = pd.DataFrame([])
df2['text'] = text
df2['Sentiment'] = Sentiment
df2

Unnamed: 0,text,Sentiment
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
37144,jesus,0
37145,kya bhai pure saal chutiya banaya modi aur jab...,1
37146,downvote karna tha par upvote hogaya,0
37147,haha nice,1


In [7]:
df = df2
df['Sentiment'].unique()

array([ 1, -1,  0], dtype=int64)

In [8]:
y = []

for i in range(len(df['Sentiment'])):
    if df['Sentiment'][i] == -1:
        y.append('Negative')
    elif df['Sentiment'][i] == 0:
        y.append('Neutral')
    elif df['Sentiment'][i] == 1:
        y.append('Positive')
        
df['Sentiment'] = y
df

Unnamed: 0,text,Sentiment
0,family mormon have never tried explain them t...,Positive
1,buddhism has very much lot compatible with chr...,Positive
2,seriously don say thing first all they won get...,Negative
3,what you have learned yours and only yours wha...,Neutral
4,for your own benefit you may want read living ...,Positive
...,...,...
37144,jesus,Neutral
37145,kya bhai pure saal chutiya banaya modi aur jab...,Positive
37146,downvote karna tha par upvote hogaya,Neutral
37147,haha nice,Positive


### Cleaning the data

In [9]:
stopword = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
            
def strip_accents(text):
    try:
        text = unicode(text, 'utf-8')
    except NameError: # unicode is a default on python 3 
        pass

    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")
    return str(text)

def clean_up_sentence(text):
    
    # Shift to lowercase
    text = text.lower()
    
    # Removing mentions, hashtags and urls
    for i in range(len(text)):
        if text[i] == '#' or text[i] == '@':
            j = 0
            maxj = len(text)-i
            while(j <maxj and text[i+j] != ' '):
                if i+j < len(text):
                    text = text[0:i+j] + '.' + text[i+j+1:]
                    j += 1
        elif text[i] == 'h' and i < len(text)-4:
            if text[i:i+4] == 'http':
                j = 0
                maxj = len(text)-i
                while(j <maxj and text[i+j] != ' '):
                    if i+j < len(text):
                        text = text[0:i+j] + '#' + text[i+j+1:]
                        j += 1
    
    # Removing Punctuations and numbers
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    
    # Removing unwanted whitespace and removing accents
    text = strip_accents(" ".join(text.split()))
    
    # Tokenisation
    text = re.split('\W+', text)
    if '' in text:
        text.remove('')
       
    # Removing stop words
    text = [word for word in text if word not in stopword]
    
    # Lemmatization
    text = [wn.lemmatize(word) for word in text]

    # Remove Stopwords
    text = [word for word in text if word not in stopword]
    
    return text

## Splitting the Data

In [10]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['Sentiment'], stratify=df['Sentiment'] )

## Creating a pipeline to train the model

In [11]:
models = [
        MultinomialNB(),
        BernoulliNB(),
        SGDClassifier(max_iter=10000, n_jobs = 6),
        LogisticRegression(max_iter=10000, n_jobs = 6),
    ]
    
model_names = [
        '\nMultinomial Naive Bayes\n',
        '\nBernoulli Naive Bayes\n',
        '\nStochastic Gradient Descent\n',
        '\nLogistic Regression\n',
    ]
    
for i in range(len(models)):
    pl = Pipeline([
            ('CV',CountVectorizer(analyzer=clean_up_sentence)),
            ('tfidf',TfidfTransformer()),
            ('classifier',models[i])
        ])
        
    print(model_names[i])
        
    pl.fit(x_train,y_train)
    prd = pl.predict(x_test)
        
    print(confusion_matrix(prd,y_test), end='\n\n')
    print(classification_report(prd,y_test))
    print(accuracy_score(prd,y_test))
    print()

alertme(5,2)


Multinomial Naive Bayes

[[ 210   16    5]
 [  71 1055   84]
 [1788 2190 3869]]

              precision    recall  f1-score   support

    Negative       0.10      0.91      0.18       231
     Neutral       0.32      0.87      0.47      1210
    Positive       0.98      0.49      0.66      7847

    accuracy                           0.55      9288
   macro avg       0.47      0.76      0.44      9288
weighted avg       0.87      0.55      0.62      9288

0.5527562446167097


Bernoulli Naive Bayes

[[ 397   33   91]
 [ 922 3061 1626]
 [ 750  167 2241]]

              precision    recall  f1-score   support

    Negative       0.19      0.76      0.31       521
     Neutral       0.94      0.55      0.69      5609
    Positive       0.57      0.71      0.63      3158

    accuracy                           0.61      9288
   macro avg       0.57      0.67      0.54      9288
weighted avg       0.77      0.61      0.65      9288

0.6135874246339362


Stochastic Gradient Descent

[[1138

**The best model for this dataset is Logistic Regression**

In [12]:
finalPipe = Pipeline([
            ('CV',CountVectorizer(analyzer=clean_up_sentence)),
            ('tfidf',TfidfTransformer()),
            ('classifier',LogisticRegression(max_iter=10000))
        ])
        
finalPipe.fit(x_train,y_train)
prd = finalPipe.predict(x_test)
        
print(confusion_matrix(prd,y_test), end='\n\n')
print(classification_report(prd,y_test))
print(accuracy_score(prd,y_test))
print()

[[1241   47  186]
 [ 381 3055  399]
 [ 447  159 3373]]

              precision    recall  f1-score   support

    Negative       0.60      0.84      0.70      1474
     Neutral       0.94      0.80      0.86      3835
    Positive       0.85      0.85      0.85      3979

    accuracy                           0.83      9288
   macro avg       0.80      0.83      0.80      9288
weighted avg       0.85      0.83      0.83      9288

0.8256890611541774



In [13]:
filename = 'Major_Project_Reddit.sav'
pickle.dump(finalPipe, open(filename, 'wb'))

Above, we have created the Final Pipeline which should be saved. It contains the model with the highest accuracy.