### APPROACHES for solving Sentiment Analysis

1. Pretrained Model / Library to generate sentiments
2. Supervised learning model approach

In [1]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
import nltk
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [3]:
sentiment_analyzer = SentimentIntensityAnalyzer()

In [4]:
sentiment_analyzer.polarity_scores("i love this car so much but it is very highly priced")

{'neg': 0.0, 'neu': 0.794, 'pos': 0.206, 'compound': 0.3818}

In [5]:
sentiment_analyzer.polarity_scores("This car is very expensive but i love it very much")

{'neg': 0.0, 'neu': 0.592, 'pos': 0.408, 'compound': 0.8018}

### build a sentiment analysis model

- X - input features or sentences
- y - Target Variables or sentiments (use rating column to generate target variable)

In [6]:
import pandas as pd
import numpy as np

In [7]:
df = pd.read_csv("Restaurant_Reviews.tsv", sep='\t', quoting = 3)

In [8]:
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [9]:
df.isnull().sum()

Review    0
Liked     0
dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


### Cleaning the data

In [11]:
import nltk
import re

In [12]:
df['Review'] = df['Review'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]', ' ', x))

In [13]:
df

Unnamed: 0,Review,Liked
0,Wow Loved this place,1
1,Crust is not good,0
2,Not tasty and the texture was just nasty,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone,0
997,Overall I was not impressed and would not go b...,0
998,The whole experience was underwhelming and I ...,0


In [14]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [15]:
corpus = list(df['Review'].values)

In [16]:
def cleaning(sentence):
  doc = nlp(sentence)
  return ' '.join(token.lemma_ for token in doc)

In [17]:
clean_corpus = [cleaning(sentence) for sentence in corpus]

In [18]:
clean_corpus

['wow     love this place',
 'crust be not good',
 'not tasty and the texture be just nasty',
 'stop by during the late May bank holiday off Rick Steve recommendation and love it',
 'the selection on the menu be great and so be the price',
 'now I be get angry and I want my damn pho',
 'honeslty it didn t taste that fresh  ',
 'the potato be like rubber and you could tell they have be make up ahead of time be keep under a warm',
 'the fry be great too',
 'a great touch',
 'service be very prompt',
 'would not go back',
 'the cashier have no care what so ever on what I have to say it still end up be wayyy overprice',
 'I try the Cape Cod ravoli   chicken   with cranberry    mmmm',
 'I be disgusted because I be pretty sure that be human hair',
 'I be shocked because no sign indicate cash only',
 'highly recommend',
 'waitress be a little slow in service',
 'this place be not worth your time   let alone Vegas',
 'do not like at all',
 'the Burrittos Blah',
 'the food   amazing',
 'service

In [19]:
# vectorizing the corpus with numbers

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
vec = CountVectorizer(stop_words='english')
vec.fit(clean_corpus)
vec_data = vec.transform(clean_corpus)

In [22]:
vec_data.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [23]:
df_clean = pd.DataFrame(vec_data.toarray(), columns= vec.get_feature_names_out())

In [24]:
X = df_clean.values
y = df['Liked']

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state= 42)

In [26]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(750, 1546)
(250, 1546)
(750,)
(250,)


In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [29]:
y_pred = rf.predict(X_test)

In [30]:
pd.DataFrame({"actual":y_test, "predicted": y_pred, "Error": abs(y_test-y_pred)})

Unnamed: 0,actual,predicted,Error
521,1,0,1
737,1,1,0
740,1,1,0
660,1,1,0
411,1,1,0
...,...,...,...
109,1,1,0
430,0,0,0
77,1,1,0
84,0,0,0


In [31]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_test, y_pred)

0.732

In [32]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.80      0.75       128
           1       0.76      0.66      0.70       122

    accuracy                           0.73       250
   macro avg       0.74      0.73      0.73       250
weighted avg       0.74      0.73      0.73       250



In [33]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [34]:
pred = dt.predict(X_test)

In [35]:
pd.DataFrame({"actual":y_test, "predicted": pred, "Error": abs(y_test-pred)})

Unnamed: 0,actual,predicted,Error
521,1,0,1
737,1,1,0
740,1,1,0
660,1,1,0
411,1,1,0
...,...,...,...
109,1,0,1
430,0,0,0
77,1,1,0
84,0,0,0


In [36]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.65      0.73      0.69       128
           1       0.68      0.59      0.63       122

    accuracy                           0.66       250
   macro avg       0.67      0.66      0.66       250
weighted avg       0.67      0.66      0.66       250



In [37]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [38]:
lr_pred = lr.predict(X_test)

In [39]:
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.73      0.82      0.77       128
           1       0.78      0.68      0.73       122

    accuracy                           0.75       250
   macro avg       0.76      0.75      0.75       250
weighted avg       0.76      0.75      0.75       250



In [40]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)

In [41]:
nb_pred = nb.predict(X_test)

In [42]:
print(classification_report(y_test, nb_pred))

              precision    recall  f1-score   support

           0       0.79      0.49      0.61       128
           1       0.62      0.86      0.72       122

    accuracy                           0.67       250
   macro avg       0.70      0.68      0.66       250
weighted avg       0.70      0.67      0.66       250

