Natural Language Processing


In [2]:
%matplotlib inline
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# importing the data and looking at it 
df = pd.read_table("Restaurant_Reviews.tsv")
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
df.shape


(1000, 2)

In [5]:
df.columns

Index(['Review', 'Liked'], dtype='object')

## data cleaning
 -  We should clean all phrases that contain 
punctuation mark or other character for exemple @RT,.,!,? 
 -  Transform all letters to lower case
 -  Break the sentences into words
 - delete stopwords


In [6]:
# importing the necessary libraries for text cleaning
# regular expression
import re
# natural language toolkit
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khale\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords

print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:

from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(1000):
    review = re.sub("[^a-zA-Z]"," ",df.iloc[i,0])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words("english"))]
    review = " ".join(review)
    corpus.append(review)


In [9]:
corpus

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch',
 'servic prompt',
 'would go back',
 'cashier care ever say still end wayyy overpr',
 'tri cape cod ravoli chicken cranberri mmmm',
 'disgust pretti sure human hair',
 'shock sign indic cash',
 'highli recommend',
 'waitress littl slow servic',
 'place worth time let alon vega',
 'like',
 'burritto blah',
 'food amaz',
 'servic also cute',
 'could care less interior beauti',
 'perform',
 'right red velvet cake ohhh stuff good',
 'never brought salad ask',
 'hole wall great mexican street taco friendli staff',
 'took hour get food tabl restaur food luke warm sever run around like total overwhelm',
 'worst salmon sashimi',
 'also combo like burger fri beer decent deal',
 'like final blow',
 'found place acc

In [10]:
# encoding of textual data 

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()


In [11]:
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:,-1]

Unnamed: 0,absolut,absolutley,accid,accommod,accomod,accordingli,account,ach,acknowledg,across,...,year,yellow,yellowtail,yelper,yet,yucki,yukon,yum,yummi,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
Xdf = pd.DataFrame(data = X,columns=cv.get_feature_names())
ydf = pd.DataFrame( data = y.values,columns = ["Target"])
df1 = pd.concat([Xdf,ydf] , axis = 1)
df1.T



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
absolut,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
absolutley,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accid,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accommod,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accomod,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yukon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yum,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yummi,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zero,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Feature engineering

In [13]:
X = df1.iloc[:,0:-1]
y = df1.iloc[:,-1]

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size = 0.8 ,random_state=1)


In [54]:
# data modeling using random forest 

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

In [55]:
%timeit model.fit(X_train,y_train)

572 ms ± 34.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [56]:
y_pred = model.predict(X_test)

In [57]:
model.score(X_test,y_test)

0.755

In [18]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[91, 17],
       [32, 60]], dtype=int64)

In [73]:

[print(value) for value in model.feature_importances_]

0.00038115626816196253
0.00015253328637155632
0.00042542452101422356
4.5687383385060825e-05
2.4294608609436113e-06
0.0009771298236109058
7.342456769016752e-05
0.0
7.533709331323903e-05
0.00033743537034553204
0.00011249128573771239
9.779452463731229e-05
2.7084046996919933e-05
0.00014831613966684532
0.0007973086924610847
1.540317136582929e-05
0.00036659368729830347
5.50020901965198e-05
6.819684919201859e-06
6.275954553122489e-05
0.0002076259239514566
0.00013063709135058908
8.590967080283085e-05
0.00048810046128290287
0.0036877691812132766
0.00023644817206134095
0.004164855926631853
0.01756732977579955
0.001815062227314961
0.0004964462813241368
0.0004470569587484938
0.00020620722293598475
2.893613590135558e-05
0.0001046190476530623
0.0
0.0008077922844378793
0.0001112593659172032
0.0
0.0001388864836742164
0.0010591917009235834
0.0014343706126232088
0.00013218707147489233
0.0
6.27988877046408e-05
0.00026722838285509766
0.00010906207810273544
2.7808638943440886e-05
1.9742820863131205e-06
0.0

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [91]:
# Get numerical feature importances
importances = list(model.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(df1.columns, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair) ) for pair in feature_importances]

Variable: great                Importance: 0.04
Variable: amaz                 Importance: 0.02
Variable: delici               Importance: 0.02
Variable: good                 Importance: 0.02
Variable: love                 Importance: 0.02
Variable: awesom               Importance: 0.01
Variable: back                 Importance: 0.01
Variable: bad                  Importance: 0.01
Variable: definit              Importance: 0.01
Variable: disappoint           Importance: 0.01
Variable: excel                Importance: 0.01
Variable: fantast              Importance: 0.01
Variable: food                 Importance: 0.01
Variable: friendli             Importance: 0.01
Variable: go                   Importance: 0.01
Variable: happi                Importance: 0.01
Variable: impress              Importance: 0.01
Variable: minut                Importance: 0.01
Variable: never                Importance: 0.01
Variable: nice                 Importance: 0.01
Variable: one                  Importanc

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

-  Tuning the model using GridsearchCV 

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
param_grid = { 
    
    'n_estimators':[510],
    'max_features':['auto','log2'],
    'min_samples_leaf':[2,3]
}
CV_rfc = GridSearchCV(estimator=model, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train) 

" from sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import GridSearchCV\nparam_grid = { \n    'n_estimators':[500,530,560],\n\n}\nCV_rfc = GridSearchCV(estimator=model, param_grid=param_grid, cv= 5)\nCV_rfc.fit(X_train, y_train) "

In [20]:
CV_rfc.best_params_

In [35]:
from sklearn.ensemble import RandomForestClassifier

model_optimal = RandomForestClassifier(max_features = 'auto', min_samples_leaf = 2, n_estimators = 500)
model_optimal.fit(X_train,y_train )
model_optimal.score(X_test,y_test)

0.77

In [37]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,model_optimal.predict(X_test))

array([[91, 17],
       [29, 63]], dtype=int64)

In [102]:
test_phrase = "I love the chef, his one authentic Japanese cool dude! "
x_test = cv.transform([test_phrase]).toarray()


In [103]:
model_optimal.predict(x_test)



array([1], dtype=int64)

In [104]:
model_optimal.predict_proba(x_test)



array([[0.252628, 0.747372]])