# You are being assigned as ML engineer by FSSAI and want to create a model that can predict whether the customer like the resturant or not based on the review posted on the community channel

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pnpreprocessor as preprocessor
%matplotlib inline

In [2]:
data = pd.read_csv('Restaurant_Reviews.tsv',sep='\t')
data

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
Review    1000 non-null object
Liked     1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [4]:
data.describe()

Unnamed: 0,Liked
count,1000.0
mean,0.5
std,0.50025
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [5]:
data.Liked.unique() 

array([1, 0], dtype=int64)

In [6]:
data.Liked.value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [7]:
#Sklearn to create WordVector ! 

from sklearn.feature_extraction.text import CountVectorizer
wordVector = CountVectorizer(analyzer=preprocessor.textProcessing)
finalWordVector = wordVector.fit(data['Review'])
finalWordVector

CountVectorizer(analyzer=<function textProcessing at 0x000001FD44821828>,
                binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [8]:
finalWordVector.vocabulary_ 

{'Wow': 442,
 'Loved': 248,
 'place': 1540,
 'Crust': 112,
 'good': 1079,
 'tasty': 1932,
 'texture': 1945,
 'nasty': 1408,
 'Stopped': 381,
 'late': 1255,
 'May': 265,
 'bank': 541,
 'holiday': 1162,
 'Rick': 342,
 'Steve': 380,
 'recommendation': 1651,
 'loved': 1305,
 'selection': 1755,
 'menu': 1359,
 'great': 1089,
 'prices': 1584,
 'getting': 1061,
 'angry': 480,
 'want': 2074,
 'damn': 792,
 'pho': 1528,
 'Honeslty': 204,
 'didnt': 832,
 'taste': 1928,
 'fresh': 1035,
 'potatoes': 1571,
 'like': 1279,
 'rubber': 1707,
 'could': 752,
 'tell': 1937,
 'made': 1318,
 'ahead': 463,
 'time': 1966,
 'kept': 1238,
 'warmer': 2078,
 'fries': 1040,
 'touch': 1987,
 'Service': 361,
 'prompt': 1596,
 'Would': 441,
 'go': 1072,
 'back': 532,
 'cashier': 663,
 'care': 656,
 'ever': 939,
 'say': 1736,
 'still': 1870,
 'ended': 921,
 'wayyy': 2092,
 'overpriced': 1474,
 'tried': 1999,
 'Cape': 89,
 'Cod': 100,
 'ravoli': 1634,
 'chicken': 688,
 'cranberrymmmm': 768,
 'disgusted': 856,
 'pretty'

In [9]:
bagOfWords = finalWordVector.transform(data['Review'])
bagOfWords

<1000x2159 sparse matrix of type '<class 'numpy.int64'>'
	with 5606 stored elements in Compressed Sparse Row format>

In [10]:
#Convert BOW to word and its frequencies which will act as Feature in ML model

from sklearn.feature_extraction.text import TfidfTransformer
tfidfTransform = TfidfTransformer().fit(bagOfWords)
featureData = tfidfTransform.transform(bagOfWords)

# (Go,0.09) collaborate tfidfTransformer

In [11]:
featureData

<1000x2159 sparse matrix of type '<class 'numpy.float64'>'
	with 5606 stored elements in Compressed Sparse Row format>

In [12]:
#Training model 
#Naive Bayes ....MultinomialNB(text),BinomiaLNB (Binary Classification), BernoulliNB(Geospatial data - Longitude,Latitude,graph)
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(featureData,data['Liked'])

In [13]:
model.score(featureData,data['Liked'])

0.965

In [14]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(data['Liked'],model.predict(featureData))
cm

array([[486,  14],
       [ 21, 479]], dtype=int64)

In [15]:
from sklearn.metrics import classification_report
print(classification_report(data['Liked'],model.predict(featureData)))

              precision    recall  f1-score   support

           0       0.96      0.97      0.97       500
           1       0.97      0.96      0.96       500

    accuracy                           0.96      1000
   macro avg       0.97      0.96      0.96      1000
weighted avg       0.97      0.96      0.96      1000



In [16]:
inputData = input ('Enter the review to check if he liked or not : ')
l1 = preprocessor.textProcessing(inputData)
l2 = finalWordVector.transform(l1)
l3 = tfidfTransform.transform(l2)
prediction = model.predict(l3[0])

print(prediction)

Enter the review to check if he liked or not : food is not good
[1]


In [17]:
#Training model 
#Naive Bayes ....MultinomialNB(text),BinomiaLNB (Binary Classification), BernoulliNB(Geospatial data - Longitude,Latitude,graph)
from sklearn.naive_bayes import BernoulliNB

model1 = BernoulliNB().fit(featureData,data['Liked'])

In [18]:
model1

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [19]:
model1.score(featureData,data['Liked'])

0.949

In [20]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(data['Liked'],model1.predict(featureData))
cm

array([[470,  30],
       [ 21, 479]], dtype=int64)

In [21]:
from sklearn.metrics import classification_report
print(classification_report(data['Liked'],model1.predict(featureData)))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       500
           1       0.94      0.96      0.95       500

    accuracy                           0.95      1000
   macro avg       0.95      0.95      0.95      1000
weighted avg       0.95      0.95      0.95      1000



In [22]:
inputData = input ('Enter the review to check if he liked or not : ')
l1 = preprocessor.textProcessing(inputData)
l2 = finalWordVector.transform(l1)
l3 = tfidfTransform.transform(l2)
prediction = model1.predict(l3[0])

print(prediction)

Enter the review to check if he liked or not : food was nasty and not like it
[1]


In [23]:
from sklearn.model_selection import train_test_split

label = data.iloc[:,1]
features = featureData

X_train, X_test, y_train, y_test = train_test_split(features,
                                                   label,
                                                   test_size=0.2,
                                                   random_state=10)

In [25]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=4)
model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [26]:
model.score(X_train,y_train)

0.64125

In [27]:
model.score(X_test,y_test)

0.57

In [28]:
# Bagging for LogistincReggression 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier

algo = LogisticRegression()
algoKNN = KNeighborsClassifier()

classifierBag = BaggingClassifier(base_estimator=algo,
    n_estimators=101,
    random_state=1)



classifierKNNBag = BaggingClassifier(base_estimator=algoKNN,
    n_estimators=101,
    random_state=1)

classifierBag.fit(X_train,y_train)
classifierKNNBag.fit(X_train,y_train)



BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='auto',
                                                      leaf_size=30,
                                                      metric='minkowski',
                                                      metric_params=None,
                                                      n_jobs=None,
                                                      n_neighbors=5, p=2,
                                                      weights='uniform'),
                  bootstrap=True, bootstrap_features=False, max_features=1.0,
                  max_samples=1.0, n_estimators=101, n_jobs=None,
                  oob_score=False, random_state=1, verbose=0, warm_start=False)

In [29]:
classifierBag.score(X_train,y_train)

0.96

In [30]:
classifierKNNBag.score(X_train,y_train)

0.8575

In [31]:
classifierKNNBag.score(X_test,y_test)

0.71

In [32]:
classifierBag.score(X_test,y_test)

0.775

In [33]:
# Boosing for LogistincReggression 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

algoLR = LogisticRegression()

booster = AdaBoostClassifier(base_estimator=algoLR,
    n_estimators=101,
    random_state=1)
booster.fit(X_train,y_train)



AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=LogisticRegression(C=1.0, class_weight=None,
                                                     dual=False,
                                                     fit_intercept=True,
                                                     intercept_scaling=1,
                                                     l1_ratio=None,
                                                     max_iter=100,
                                                     multi_class='warn',
                                                     n_jobs=None, penalty='l2',
                                                     random_state=None,
                                                     solver='warn', tol=0.0001,
                                                     verbose=0,
                                                     warm_start=False),
                   learning_rate=1.0, n_estimators=101, random_state=1)

In [34]:
booster.score(X_train,y_train)

0.88625

In [35]:
booster.score(X_test,y_test)

0.72

In [36]:
# Fold Validation Technique
# K-fold cross validation technique

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

from sklearn.model_selection import KFold

kfold = KFold(n_splits = 10,
             shuffle = True, # shuffle=True mean randomely records will be selected
             random_state = 1)

i=0
for train,test in kfold.split(features):
    i=i+1
    X_train,X_test = features[train],features[test]
    y_train,y_test = label[train],label[test]
    model.fit(X_train,y_train)
    print('Training score is {} and Testing score is {} for Sample Spilt {}'
          .format(model.score(X_train,y_train),model.score(X_test,y_test),i))

Training score is 0.9633333333333334 and Testing score is 0.77 for Sample Spilt 1
Training score is 0.96 and Testing score is 0.72 for Sample Spilt 2
Training score is 0.9622222222222222 and Testing score is 0.76 for Sample Spilt 3
Training score is 0.9666666666666667 and Testing score is 0.84 for Sample Spilt 4
Training score is 0.9588888888888889 and Testing score is 0.73 for Sample Spilt 5
Training score is 0.96 and Testing score is 0.7 for Sample Spilt 6
Training score is 0.9644444444444444 and Testing score is 0.75 for Sample Spilt 7
Training score is 0.9588888888888889 and Testing score is 0.75 for Sample Spilt 8
Training score is 0.9644444444444444 and Testing score is 0.83 for Sample Spilt 9
Training score is 0.97 and Testing score is 0.82 for Sample Spilt 10
