In [173]:
import pandas as pd
import numpy as np

In [174]:
# Load the tab-separated restaurant reviews and preview the first rows
df = pd.read_csv("data/Restaurant_Reviews.tsv", sep="\t")
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [175]:
# Look at one raw review (row label 998) with explicit label-based indexing
df.loc[998, "Review"]

"The whole experience was underwhelming, and I think we'll just go to Ninja Sushi next time."

In [176]:
# Pick the last review (row label 999) as the worked example for the cleaning steps
df.loc[999, "Review"]

"Then, as if I hadn't wasted enough of my life there, they poured salt in the wound by drawing out the time it took to bring the check."

In [177]:
# Importing the re library 
import re

In [178]:
# Keep letters only: "[^A-Za-z]" matches every character that is NOT an
# ASCII letter (punctuation, digits, apostrophes, ...) and each match is
# replaced by a single space.
raw_review = df["Review"][999]
example = re.sub("[^A-Za-z]", " ", raw_review)
example

'Then  as if I hadn t wasted enough of my life there  they poured salt in the wound by drawing out the time it took to bring the check '

In [179]:
# Making all characters of the selected text lower cases
example = example.lower()
example

'then  as if i hadn t wasted enough of my life there  they poured salt in the wound by drawing out the time it took to bring the check '

In [180]:
# Splitting the words in the sentence so that each word becomes an item in a list
example = example.split()
example

['then',
 'as',
 'if',
 'i',
 'hadn',
 't',
 'wasted',
 'enough',
 'of',
 'my',
 'life',
 'there',
 'they',
 'poured',
 'salt',
 'in',
 'the',
 'wound',
 'by',
 'drawing',
 'out',
 'the',
 'time',
 'it',
 'took',
 'to',
 'bring',
 'the',
 'check']

In [181]:
# Drop very short tokens (one or two letters), e.g. the stray "t" left behind
# when the apostrophe in "hadn't" was replaced by a space
example = [token for token in example if len(token) >= 3]
example

['then',
 'hadn',
 'wasted',
 'enough',
 'life',
 'there',
 'they',
 'poured',
 'salt',
 'the',
 'wound',
 'drawing',
 'out',
 'the',
 'time',
 'took',
 'bring',
 'the',
 'check']

In [182]:
# Importing the stop words
from nltk.corpus import stopwords

In [183]:
# printing the stopwords
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [184]:
# Checking the number of stopwords we have in total 
len(stopwords.words("english"))

179

There are 179 stopwords available for us to use

In [185]:
# Selecting only words not included in stop words.
# The stop-word list is loaded once and turned into a set: the original
# re-evaluated stopwords.words("english") for every token and searched the
# resulting list linearly. The filtered result is unchanged.
english_stopwords = set(stopwords.words("english"))
example = [word for word in example if word not in english_stopwords]
example

['wasted',
 'enough',
 'life',
 'poured',
 'salt',
 'wound',
 'drawing',
 'time',
 'took',
 'bring',
 'check']

In [186]:
# Importing PorterStemmer
from nltk.stem.porter import PorterStemmer

In [187]:
# Initializing PorterStemming
ps = PorterStemmer()

In [188]:
# Stemming each word in our list
example = [ps.stem(word) for word in example]
example

['wast',
 'enough',
 'life',
 'pour',
 'salt',
 'wound',
 'draw',
 'time',
 'took',
 'bring',
 'check']

In [189]:
" ".join(example)

'wast enough life pour salt wound draw time took bring check'

In [190]:
# creating a function that performs all the steps
def clean_text(data):
    """Turn one raw review into a space-separated string of stemmed tokens.

    The steps mirror the worked example in the cells above, in this order:

    1. replace every character that is not an ASCII letter with a space;
    2. lower-case the text and split it on whitespace;
    3. drop tokens of one or two letters;
    4. drop NLTK English stop words;
    5. Porter-stem what is left and join it with single spaces.

    Parameters
    ----------
    data : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review; an empty string if no token survives the filters.

    Notes
    -----
    Uses the notebook-level names ``re``, ``stopwords`` and ``ps``, so the
    import cells and the ``PorterStemmer`` cell must have been run first.

    The NLTK list contains negations such as "not" and "no", so they are
    removed too: "Crust is not good." is reduced to "crust good".
    """
    # Load the stop words once per call and keep them in a set. The original
    # called stopwords.words("english") again for every single token and
    # searched the returned list linearly.
    stop_words = set(stopwords.words("english"))

    tokens = re.sub("[^A-Za-z]", " ", data).lower().split()
    kept = [tok for tok in tokens if len(tok) > 2 and tok not in stop_words]
    return " ".join(ps.stem(tok) for tok in kept)

In [191]:
# Viewing the first five row of the original reviews 
df["Review"].head()

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: Review, dtype: object

In [192]:
# Clean every review and display the resulting Series.
# The raw text in df["Review"] is overwritten, so re-running this cell would
# clean the already-cleaned text a second time.
df["Review"] = df["Review"].map(clean_text)
df["Review"]

0                                         wow love place
1                                             crust good
2                                     tasti textur nasti
3      stop late may bank holiday rick steve recommen...
4                                select menu great price
                             ...                        
995                        think food flavor textur lack
996                               appetit instantli gone
997                            overal impress would back
998    whole experi underwhelm think ninja sushi next...
999    wast enough life pour salt wound draw time too...
Name: Review, Length: 1000, dtype: object

In [198]:
# Tokenizing our data to create a sparse matrix
from sklearn.feature_extraction.text import CountVectorizer
tokenizer = CountVectorizer(max_features = 1500)

In [199]:
# Matrix of features: learn the vocabulary and count words per review, then
# convert the sparse result into a dense NumPy array.
# NOTE(review): the vocabulary is fitted on every row, before the train/test
# split performed later in the notebook, so test reviews also shape the features.
review_counts = tokenizer.fit_transform(df["Review"])
X = review_counts.toarray()

In [204]:
# Target columns 
y = df["Liked"]

In [218]:
from sklearn.model_selection import train_test_split

In [219]:
# Splitting our data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    random_state=42)

In [220]:
# Importing and initializing the model
# NOTE(review): X holds integer word counts from CountVectorizer. GaussianNB
# models each feature as a continuous normal variable; scikit-learn documents
# MultinomialNB as the variant for word-count features. Worth comparing, but
# confirm before switching because the recorded metrics below would change.
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [221]:
# Fitting the model to our training set
model.fit(X_train,y_train)

# Making predictions on our test
predictions = model.predict(X_test)

In [222]:
# Evaluating our model 
from sklearn.metrics import classification_report,confusion_matrix

In [223]:
# Confusion matrix first (rows = true labels, columns = predicted labels, per
# scikit-learn's convention for confusion_matrix(y_true, y_pred)), then a
# separator line, then per-class precision / recall / F1 on the test set.
print(confusion_matrix(y_test, predictions))
print("========================================")
print(classification_report(y_test, predictions))

[[49 47]
 [18 86]]
              precision    recall  f1-score   support

           0       0.73      0.51      0.60        96
           1       0.65      0.83      0.73       104

    accuracy                           0.68       200
   macro avg       0.69      0.67      0.66       200
weighted avg       0.69      0.68      0.67       200

