# Lexicon Based Approach: Vader Sentiment Analysis

In [18]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt

from wordcloud import WordCloud
from sklearn.metrics import f1_score

In [19]:
df_initial = pd.read_csv('Reviews.csv')
df_initial.head()


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [20]:
df = df_initial.drop(['Id', 'ProductId', 'UserId', 'ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Time'], axis = 1)
df.head()

Unnamed: 0,Score,Summary,Text
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,4,"""Delight"" says it all",This is a confection that has been around a fe...
3,2,Cough Medicine,If you are looking for the secret ingredient i...
4,5,Great taffy,Great taffy at a great price. There was a wid...


In [21]:
df.Text.head()

0    I have bought several of the Vitality canned d...
1    Product arrived labeled as Jumbo Salted Peanut...
2    This is a confection that has been around a fe...
3    If you are looking for the secret ingredient i...
4    Great taffy at a great price.  There was a wid...
Name: Text, dtype: object

In [22]:
! pip install textblob



In [23]:
import nltk
nltk.download('stopwords')
from textblob import TextBlob
from textblob import Word



[nltk_data] Downloading package stopwords to C:\Users\Jahnavi
[nltk_data]     Kolakaluri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
df['Text'] = df['Text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [25]:
#removing special characters
df['Text'] = df['Text'].str.replace('[^\w\s]',"")
df.Text.head(5)

  


0    i have bought several of the vitality canned d...
1    product arrived labeled as jumbo salted peanut...
2    this is a confection that has been around a fe...
3    if you are looking for the secret ingredient i...
4    great taffy at a great price there was a wide ...
Name: Text, dtype: object

In [26]:
#removal of stopwords
from nltk.corpus import stopwords

stop = stopwords.words('english')
df['Text'] = df['Text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df.Text.head()

0    bought several vitality canned dog food produc...
1    product arrived labeled jumbo salted peanutsth...
2    confection around centuries light pillowy citr...
3    looking secret ingredient robitussin believe f...
4    great taffy great price wide assortment yummy ...
Name: Text, dtype: object

In [27]:
#spelling correction

#df['Text'] = df['Text'].apply(lambda x: str(TextBlob(x).correct()))
#df.Text.head(5)

In [28]:
#lemmatisation
#import nltk
#.download('wordnet')

#df['Text']  = df['Text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
#df.Text.head(3)

[nltk_data] Downloading package wordnet to C:\Users\Jahnavi
[nltk_data]     Kolakaluri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    bought several vitality canned dog food produc...
1    product arrived labeled jumbo salted peanutsth...
2    confection around century light pillowy citrus...
Name: Text, dtype: object

In [29]:
! pip install vaderSentiment



In [30]:
from matplotlib import pyplot as plt
import seaborn as sns
import re
import os
import sys
import ast
plt.style.use('fivethirtyeight')

cp = sns.color_palette()
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()


In [31]:
emptyline=[]

for row in df['Text']:
    vs=analyzer.polarity_scores(row)
    emptyline.append(vs)
    
df_senti = pd.DataFrame(emptyline)
df_senti.head()

Unnamed: 0,neg,neu,pos,compound
0,0.0,0.503,0.497,0.9413
1,0.129,0.762,0.11,-0.1027
2,0.13,0.587,0.283,0.8532
3,0.0,0.854,0.146,0.4404
4,0.0,0.369,0.631,0.9468


In [32]:
df_full = pd.concat([df,df_senti], axis = 1)
df_full.head(3)

Unnamed: 0,Score,Summary,Text,neg,neu,pos,compound
0,5,Good Quality Dog Food,bought several vitality canned dog food produc...,0.0,0.503,0.497,0.9413
1,1,Not as Advertised,product arrived labeled jumbo salted peanutsth...,0.129,0.762,0.11,-0.1027
2,4,"""Delight"" says it all",confection around century light pillowy citrus...,0.13,0.587,0.283,0.8532


In [43]:
import numpy as np
df_full['Sentiment'] = np.where(df_full['compound'] >= 0, 'Positive', 'Negative')
df_full.head(10)

Unnamed: 0,Score,Summary,Text,neg,neu,pos,compound,Sentiment,Label
0,5,Good Quality Dog Food,bought several vitality canned dog food produc...,0.0,0.503,0.497,0.9413,Positive,Positive
1,1,Not as Advertised,product arrived labeled jumbo salted peanutsth...,0.129,0.762,0.11,-0.1027,Negative,Negative
2,4,"""Delight"" says it all",confection around century light pillowy citrus...,0.13,0.587,0.283,0.8532,Positive,Positive
3,2,Cough Medicine,looking secret ingredient robitussin believe f...,0.0,0.854,0.146,0.4404,Positive,Negative
4,5,Great taffy,great taffy great price wide assortment yummy ...,0.0,0.369,0.631,0.9468,Positive,Positive
5,4,Nice Taffy,got wild hair taffy ordered five pound bag taf...,0.045,0.652,0.303,0.9136,Positive,Positive
6,5,Great! Just as good as the expensive brands!,saltwater taffy great flavor soft chewy candy ...,0.0,0.548,0.452,0.9463,Positive,Positive
7,5,"Wonderful, tasty taffy",taffy good soft chewy flavor amazing would def...,0.0,0.287,0.713,0.9313,Positive,Positive
8,5,Yay Barley,right im mostly sprouting cat eat grass love r...,0.0,0.724,0.276,0.6369,Positive,Positive
9,5,Healthy Dog Food,healthy dog food good digestion also good smal...,0.0,0.585,0.415,0.8176,Positive,Positive


In [34]:
#result = df_full['Sentiment'].value_counts()
#result.plot(kind = 'bar', rot=0, colour=['green','red']);

In [35]:
#converting score to label sentiment for calucating evalutation metrics
        
import numpy as np
df_full['Label'] = np.where(df_full['Score'] >= 3, 'Positive', 'Negative')
df_full.head(5)
        

Unnamed: 0,Score,Summary,Text,neg,neu,pos,compound,Sentiment,Label
0,5,Good Quality Dog Food,bought several vitality canned dog food produc...,0.0,0.503,0.497,0.9413,Positive,Positive
1,1,Not as Advertised,product arrived labeled jumbo salted peanutsth...,0.129,0.762,0.11,-0.1027,Negative,Negative
2,4,"""Delight"" says it all",confection around century light pillowy citrus...,0.13,0.587,0.283,0.8532,Positive,Positive
3,2,Cough Medicine,looking secret ingredient robitussin believe f...,0.0,0.854,0.146,0.4404,Positive,Negative
4,5,Great taffy,great taffy great price wide assortment yummy ...,0.0,0.369,0.631,0.9468,Positive,Positive


In [36]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [40]:
df_full.Score.value_counts()

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Score, dtype: int64

In [41]:
df_full.Sentiment.value_counts()

Positive    524076
Negative     44378
Name: Sentiment, dtype: int64

In [42]:
df_full.Label.value_counts()

Positive    486417
Negative     82037
Name: Label, dtype: int64

In [37]:
print(confusion_matrix(df_full['Label'],df_full['Sentiment']))

[[ 24738  57299]
 [ 19640 466777]]


In [38]:
print(classification_report(df_full['Label'],df_full['Sentiment']))

              precision    recall  f1-score   support

    Negative       0.56      0.30      0.39     82037
    Positive       0.89      0.96      0.92    486417

    accuracy                           0.86    568454
   macro avg       0.72      0.63      0.66    568454
weighted avg       0.84      0.86      0.85    568454



In [39]:
print(accuracy_score(df_full['Label'],df_full['Sentiment']))

0.8646521970115436


# Machine Learning Approach: LSTM

In [94]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
import re


In [95]:
#printing dataset
df_initial = pd.read_csv('Reviews.csv')
df_initial.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [105]:
#encode sentiments
import numpy as np
from nltk.corpus import stopwords

def load_dataset():
    df_initial = pd.read_csv('Reviews.csv')
    df = df_initial.drop(['Id', 'ProductId', 'UserId', 'ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Time'], axis = 1)
    
    #removing special characters
    df['Text'] = df['Text'].str.replace('[^\w\s]',"")
    
    #removing stop words
    stop = stopwords.words('english')
    df['Text'] = df['Text'].apply(lambda x:[x for x in x.split() if x not in stop])
    
    # converting to lowercase
    df['Text'] = df['Text'].apply(lambda x: [x.lower() for x in x])
    
    
    #converting score to label sentiment for calucating evalutation metrics
    df['Label_Sentiment'] = np.where(df['Score'] >= 3, 1, 0)
    
    df.head()
    
    text_data = df['Text']
    sentiment_data = df['Label_Sentiment']
    
    
    sentiment_data = sentiment_data.replace('positive',1)
    sentiment_data = sentiment_data.replace('negative',0)
    
    return text_data, sentiment_data

text_data, sentiment_data = load_dataset()

print('Text')
print(text_data,'\n')
print('sentiment_data')
print(sentiment_data,'\n')

  # Remove the CWD from sys.path while we load stuff.


Text
0         [i, bought, several, vitality, canned, dog, fo...
1         [product, arrived, labeled, jumbo, salted, pea...
2         [this, confection, around, centuries, it, ligh...
3         [if, looking, secret, ingredient, robitussin, ...
4         [great, taffy, great, price, there, wide, asso...
                                ...                        
568449    [great, sesame, chickenthis, good, better, res...
568450    [im, disappointed, flavor, the, chocolate, not...
568451    [these, stars, small, give, 1015, one, trainin...
568452    [these, best, treats, training, rewarding, dog...
568453    [i, satisfied, product, advertised, i, use, ce...
Name: Text, Length: 568454, dtype: object 

sentiment_data
0         1
1         0
2         1
3         0
4         1
         ..
568449    1
568450    0
568451    1
568452    1
568453    1
Name: Label_Sentiment, Length: 568454, dtype: int32 



In [106]:
#30% testing
#splitting dataset
x_train, x_test, y_train, y_test = train_test_split(text_data, sentiment_data, test_size = 0.3)

print('Training set')
print(x_train,'\n')
print(x_test,'\n')
print('Testing set')
print(y_train,'\n')
print(y_test)

Training set
495667    [pb2, amazing, so, much, lower, calories, regu...
1322      [almost, identical, ghiradelis, great, chocola...
110584    [if, youre, via, fan, like, starbucks, italian...
105069    [this, product, really, bring, back, childhood...
119865    [the, first, brand, cedar, litter, i, tried, p...
                                ...                        
300833    [my, dogs, love, although, i, dont, think, eve...
131077    [bought, ran, bought, one, make, sparkling, wa...
511793    [this, flour, phenomenal, i, purchased, variet...
129478    [i, going, agree, dirty, socks, smell, i, actu...
44352     [this, product, fresh, shipped, quickly, it, h...
Name: Text, Length: 397917, dtype: object 

99603     [they, saying, dalmatians, shed, twice, year, ...
526280    [this, real, thing, unlike, web, page, amazon,...
351675    [i, chef, i, absolutely, love, char, crust, it...
368392    [my, grandson, beef, jerky, addict, we, dont, ...
404566    [i, used, granular, swerve, many,

In [107]:
def get_max_length():
    text_length = []
    for text in x_train:
        text_length.append(len(text))
    return int(np.ceil(np.mean(text_length)))

In [108]:
#neural networks accept only numeric data, hence using Tokeniser from 
#tensorflow.keras.preprocessing.text to encode reviews to integers

token = Tokenizer(lower=False)
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen = max_length, padding = 'post', truncating='post')
x_test = pad_sequences(x_test, maxlen = max_length, padding = 'post', truncating='post')


total_words = len(token.word_index) + 1

print('Encoded X Training\n', x_train, '\n')
print('Encoded X Testing\n', x_test, '\n')
print('Maximum text length',max_length)

Encoded X Training
 [[ 1979   385    77 ...     0     0     0]
 [  188  3364 70954 ...     0     0     0]
 [   44   275  1154 ...     0     0     0]
 ...
 [   10   558  4692 ...  2280    37   558]
 [    1   173   930 ...  2737     1   571]
 [   10    12   145 ...     0     0     0]] 

Encoded X Testing
 [[   26  1068 53461 ...     0     0     0]
 [   10   185   155 ...     0     0     0]
 [    1  2237     1 ...     0     0     0]
 ...
 [    1    15    12 ...     0     0     0]
 [   10    32  5314 ...     0     0     0]
 [    1    94     3 ...     0     0     0]] 

Maximum text length 48


In [159]:
#Embedding layer
#LSTM Layer
#Forget gate, input gate, cell state, output state
#dense state
#optimiser adam
#loss function binary crossentropy
import tensorflow as tf

EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.compile(optimizer = 'adam', loss = 'binary_crossentropy',metrics=[tf.keras.metrics.Precision(),tf.keras.metrics.Recall()])
print(model.summary())



Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 48, 32)            6369216   
_________________________________________________________________
lstm_14 (LSTM)               (None, 64)                24832     
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 65        
Total params: 6,394,113
Trainable params: 6,394,113
Non-trainable params: 0
_________________________________________________________________
None


In [160]:
model.fit(x_train, y_train, batch_size = 2000, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1dfefedab88>

In [161]:
#testing

y_pred = model.predict_classes(x_test, batch_size = 128)

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1
print('Correct prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))




Correct prediction: 157902
Wrong Prediction: 12635
Accuracy: 92.59105062244556


In [163]:

model.evaluate(x_test, y_test, verbose=1)
#F1 = 2 * (precision * recall) / (precision + recall)



[0.2104382961988449, 0.9488890767097473, 0.9654236435890198]

In [173]:
import tensorflow as tf    
from tensorflow import keras  

#tf.keras.metrics.confusion_matrix(y_test, y_pred)
matrix = tf.keras.metrics.confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))

AttributeError: module 'tensorflow.keras.metrics' has no attribute 'confusion_matrix'