In [443]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk

# Preprocessing

In [444]:
# Load the tab-separated restaurant review dataset into a DataFrame.
df = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')

In [445]:
df.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1
4,The selection on the menu was great and so were the prices.,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.,0
8,The fries were great too.,1
9,A great touch.,1


In [446]:
# Handle on the 'Review' column taken before any cleaning; it is not
# referenced again in the cells visible here.
# NOTE(review): if this is meant to preserve the raw text, confirm that the
# later column reassignments leave it untouched (an explicit .copy() would
# make that intent unambiguous).
rev = df['Review']

In [447]:
# Lower-case every review before punctuation removal and tokenisation.
df['Review'] = df['Review'].str.lower()

In [448]:
import re
def rem_punc(x):
    """Return ``x`` with every non-alphanumeric character turned into a space.

    Anything other than an ASCII letter or digit (punctuation, whitespace,
    apostrophes, non-ASCII characters) is replaced one-for-one by a single
    space, so the result has the same length as the input.

    Args:
        x: the text to clean.

    Returns:
        The cleaned string.
    """
    non_alnum = re.compile(r"[^a-zA-Z0-9]")
    return non_alnum.sub(" ", x)

In [449]:
# Run rem_punc over each review string and store the cleaned text back in the column.
df['Review'] = df['Review'].apply(rem_punc)

In [450]:
df['Review'].head(20)

0     wow    loved this place                                                                                        
1     crust is not good                                                                                              
2     not tasty and the texture was just nasty                                                                       
3     stopped by during the late may bank holiday off rick steve recommendation and loved it                         
4     the selection on the menu was great and so were the prices                                                     
5     now i am getting angry and i want my damn pho                                                                  
6     honeslty it didn t taste that fresh                                                                            
7     the potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer 
8     the fries were great too                          

In [451]:
# Fetch the Punkt tokenizer data, then split each cleaned review string into
# a list of word tokens.
nltk.download('punkt')
from nltk.tokenize import word_tokenize
df['Review'] = df['Review'].apply(word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CompuTek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [452]:
df['Review'].head(10)

0    [wow, loved, this, place]                                                                                                            
1    [crust, is, not, good]                                                                                                               
2    [not, tasty, and, the, texture, was, just, nasty]                                                                                    
3    [stopped, by, during, the, late, may, bank, holiday, off, rick, steve, recommendation, and, loved, it]                               
4    [the, selection, on, the, menu, was, great, and, so, were, the, prices]                                                              
5    [now, i, am, getting, angry, and, i, want, my, damn, pho]                                                                            
6    [honeslty, it, didn, t, taste, that, fresh]                                                                                          
7    [the, potatoes, were, 

In [453]:
import string

# Trim leading/trailing ASCII punctuation from every token. Because rem_punc
# has already replaced all non-alphanumeric characters with spaces, the tokens
# displayed after this cell are the same as before it.
df['Review'] = df['Review'].apply(
    lambda tokens: [tok.strip(string.punctuation) for tok in tokens]
)

In [454]:
df['Review'].head(10)

0    [wow, loved, this, place]                                                                                                            
1    [crust, is, not, good]                                                                                                               
2    [not, tasty, and, the, texture, was, just, nasty]                                                                                    
3    [stopped, by, during, the, late, may, bank, holiday, off, rick, steve, recommendation, and, loved, it]                               
4    [the, selection, on, the, menu, was, great, and, so, were, the, prices]                                                              
5    [now, i, am, getting, angry, and, i, want, my, damn, pho]                                                                            
6    [honeslty, it, didn, t, taste, that, fresh]                                                                                          
7    [the, potatoes, were, 

In [455]:
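# Disabled: removing the full English stop-word list would also drop negations
# such as "not". The following cells build a stop-word list that keeps them.
#from nltk.corpus import stopwords
#nltk.download('stopwords')
#df['Review'] = df['Review'].apply(lambda words: [w for w in words if w not in stopwords.words('english')] )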

In [457]:
# `stopwords` is imported (and its corpus fetched) in this cell so that it runs
# on a fresh kernel; previously the name was only bound by a later cell, which
# made Restart & Run All fail with NameError here.
from nltk.corpus import stopwords
nltk.download('stopwords')

# Negation markers that must survive stop-word removal because they flip the
# sentiment of a review.
neg_words = ['not', 'n\'t']
# English stop words minus the negation markers above.
stop_words = [w for w in stopwords.words('english') if w not in neg_words]

In [458]:
'not' in stop_words

False

In [459]:
from nltk.corpus import stopwords

# Keep only the tokens that are absent from the negation-preserving
# stop_words list built earlier.
df['Review'] = df['Review'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stop_words]
)

In [460]:
df['Review'].head(10)

0    [wow, loved, place]                                                    
1    [crust, not, good]                                                     
2    [not, tasty, texture, nasty]                                           
3    [stopped, late, may, bank, holiday, rick, steve, recommendation, loved]
4    [selection, menu, great, prices]                                       
5    [getting, angry, want, damn, pho]                                      
6    [honeslty, taste, fresh]                                               
7    [potatoes, like, rubber, could, tell, made, ahead, time, kept, warmer] 
8    [fries, great]                                                         
9    [great, touch]                                                         
Name: Review, dtype: object

In [461]:
from nltk.stem.porter import PorterStemmer
# Download kept from the original cell; the WordNet corpus is what the
# lemmatizer in the following step relies on.
nltk.download('wordnet')

# Build the stemmer once and reuse it for every token, instead of constructing
# a new PorterStemmer for each word of each review.
porter_stemmer = PorterStemmer()
df['Review'] = df['Review'].apply(
    lambda tokens: [porter_stemmer.stem(tok) for tok in tokens]
)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CompuTek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [462]:
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')

# Build the lemmatizer once and reuse it for every token, instead of
# constructing a new WordNetLemmatizer for each word of each review.
# NOTE(review): the tokens were already Porter-stemmed in the previous step, so
# the lemmatizer receives stems rather than dictionary words — confirm that
# running both steps is intended.
lemmatizer = WordNetLemmatizer()
df['Review'] = df['Review'].apply(
    lambda tokens: [lemmatizer.lemmatize(tok) for tok in tokens]
)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CompuTek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [463]:
df['Review'][:10]

0    [wow, love, place]                                                  
1    [crust, not, good]                                                  
2    [not, tasti, textur, nasti]                                         
3    [stop, late, may, bank, holiday, rick, steve, recommend, love]      
4    [select, menu, great, price]                                        
5    [get, angri, want, damn, pho]                                       
6    [honeslti, tast, fresh]                                             
7    [potato, like, rubber, could, tell, made, ahead, time, kept, warmer]
8    [fri, great]                                                        
9    [great, touch]                                                      
Name: Review, dtype: object

In [464]:
def replace(x):
    """Normalise the contracted negation ``n't`` to the word ``not``.

    A token that is exactly ``n't`` becomes ``not``; a token that ends in
    ``n't`` (e.g. ``didn't``) is split into its leading part followed by
    ``not``. All other tokens pass through unchanged.

    Note: in this notebook apostrophes are replaced by spaces before
    tokenisation, so tokens reaching this function may no longer contain
    ``n't`` at all.

    Args:
        x: list of string tokens for one review.

    Returns:
        A new list of tokens with the negation normalised. The input list is
        not modified.
    """
    suffix = "n't"
    normalised = []
    for token in x:
        if token.endswith(suffix):
            head = token[:-len(suffix)]
            # A bare "n't" has no leading part to keep.
            if head:
                normalised.append(head)
            normalised.append("not")
        else:
            normalised.append(token)
    # Return the rewritten tokens; the earlier version computed a substitution
    # but threw the result away and handed back its argument untouched.
    return normalised
            

In [465]:
# Pass each review's token list through the `replace` helper defined above.
df['Review'] = df['Review'].apply(replace)


In [466]:
# Collapse each token list back into one space-separated string, the input
# format expected by the vectorizer below.
df['Review'] = df['Review'].apply(" ".join)

In [467]:
df.head(10)

Unnamed: 0,Review,Liked
0,wow love place,1
1,crust not good,0
2,not tasti textur nasti,0
3,stop late may bank holiday rick steve recommend love,1
4,select menu great price,1
5,get angri want damn pho,0
6,honeslti tast fresh,0
7,potato like rubber could tell made ahead time kept warmer,0
8,fri great,1
9,great touch,1


# Feature Extraction and Model Training

In [477]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words term counts; min_df=2 discards terms that occur in fewer than
# two reviews.
cv = CountVectorizer(min_df = 2)
# Learn the vocabulary from the cleaned reviews and build the sparse
# document-term matrix in one pass.
X = cv.fit_transform(df['Review'])

In [478]:
X

<1000x701 sparse matrix of type '<class 'numpy.int64'>'
	with 4620 stored elements in Compressed Sparse Row format>

In [479]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default hyper-parameters.
clf = MultinomialNB()
# Fit on the count matrix with the 'Liked' column as the target.
# NOTE(review): the model is fitted on every row of X; no held-out split or
# accuracy measurement appears in the cells shown here.
clf.fit(X, df['Liked'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [499]:
# Hand-written sample to classify. It goes to the vectorizer as-is, so it has
# to already resemble the cleaned training text (lower-case, stemmed tokens).
test = ['love cake']
# Encode the sample with the vocabulary learned during fitting.
cv.transform(test)

<1x701 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [500]:
# Predict the 'Liked' label for the encoded sample.
clf.predict(cv.transform(test))

array([1], dtype=int64)