In [1]:
# Build a model to predict the rating in a review based on the content of the text review, identify any mismatch
# Credentials: kasham1991@gmail.com / karan sharma

# Agenda
# 1. Perform specific data cleanup, 
# 2. Build a rating prediction model using the Random Forest technique and NLTK

In [2]:
# Importing the required libraries
import pandas as pd 
import numpy as np
import re

In [3]:
# loading the data
zomato = pd.read_csv('C://Datasets//ZomatoReviews.csv')
zomato.head()

Unnamed: 0,rating,review_text
0,1.0,"Their service is worst, pricing in menu is dif..."
1,5.0,really appreciate their quality and timing . I...
2,4.0,"Went there on a Friday night, the place was su..."
3,4.0,A very decent place serving good food.\r\nOrde...
4,5.0,One of the BEST places for steaks in the city....


In [4]:
# Statistics of the dataset
# Multiple records in review_text are missing
zomato.describe(include = "all")

Unnamed: 0,rating,review_text
count,27762.0,27748
unique,,10548
top,,good
freq,,278
mean,3.665784,
std,1.284573,
min,1.0,
25%,3.0,
50%,4.0,
75%,5.0,


14 rows are missing the review text - need to get rid of these records

In [5]:
# Dropping the missing values
zomato1 = zomato[~zomato.review_text.isnull()].copy()
zomato1.reset_index(inplace=True, drop=True)

In [6]:
# Checking the shape
zomato.shape, zomato1.shape

((27762, 2), (27748, 2))

In [7]:
# Converting to list
zomato_list = zomato1.review_text.values
len(zomato_list)

27748

In [8]:
# Cleaning the text: Step-by-Step

# 1. Normalizing
# 2. Removing extra line breaks from the text
# 3. Removing stop words
# 4. Removing Punctuation

In [9]:
# Normalizing to lower case
lower = [txt.lower() for txt in zomato_list]
lower[2:4]

['went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food.\r\nordered chilli fish, chicken & pork sizzler.\r\neverything tasted good but pork could have been slightly better cooked.\r\ntried 2 beverages, both were very sweet.']

In [10]:
# Removing extra line breaks
line_break = [" ".join(txt.split()) for txt in lower]
line_break[2:4]

['went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food. ordered chilli fish, chicken & pork sizzler. everything tasted good but pork could have been slightly better cooked. tried 2 beverages, both were very sweet.']

In [11]:
# Tokenization
from nltk.tokenize import word_tokenize
print(word_tokenize(line_break[0]))

['their', 'service', 'is', 'worst', ',', 'pricing', 'in', 'menu', 'is', 'different', 'from', 'bill', '.', 'they', 'can', 'give', 'you', 'a', 'bill', 'with', 'increased', 'pricing', '.', 'even', 'for', 'serving', 'water', ',', 'menu', ',', 'order', 'you', 'need', 'to', 'call', 'them', '3-4', 'times', 'even', 'on', 'a', 'non', 'busy', 'day', '.']


In [12]:
zomato_tokens = [word_tokenize(sent) for sent in line_break]
print(line_break[0])

their service is worst, pricing in menu is different from bill. they can give you a bill with increased pricing. even for serving water,menu, order you need to call them 3-4 times even on a non busy day.


In [13]:
# Removing stop words and punctuation
from nltk.corpus import stopwords
from string import punctuation

In [14]:
stop_nltk = stopwords.words("english")
stop_punct = list(punctuation)
print(stop_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [15]:
# Removing no, not, don, won from stop words
# These words are important for rating 
stop_nltk.remove("no")
stop_nltk.remove("not")
stop_nltk.remove("don")
stop_nltk.remove("won")

In [16]:
# Checking for the same
"no" in stop_nltk

False

In [17]:
stop_final = stop_nltk + stop_punct + ["...", "``","''", "====", "must"]

In [18]:
# Creating a function for the steps mentioned above
def delete(sent):
    return [term for term in sent if term not in stop_final]

delete(zomato_tokens[1])

['really',
 'appreciate',
 'quality',
 'timing',
 'tried',
 'thattil',
 'kutti',
 'dosa',
 "'ve",
 'addicted',
 'dosa',
 'really',
 'chutney',
 'really',
 'good',
 'money',
 'worth',
 'much',
 'better',
 'thattukada',
 'try']

In [19]:
# Final clean list
zomato_clean = [delete(sent) for sent in zomato_tokens]

In [20]:
final_clean = [" ".join(sent) for sent in zomato_clean]
final_clean[:2]
# len(final_clean)

['service worst pricing menu different bill give bill increased pricing even serving water menu order need call 3-4 times even non busy day',
 "really appreciate quality timing tried thattil kutti dosa 've addicted dosa really chutney really good money worth much better thattukada try"]

In [21]:
# Splitting the dataset by 70/30
from sklearn.model_selection import train_test_split
x = final_clean
y = zomato1.rating

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 1)

In [22]:
# Creating TfIdf
from sklearn.feature_extraction.text import TfidfVectorizer
vector = TfidfVectorizer(max_features = 5000)
# len(x_train), len(x_test)

x_train_bow = vector.fit_transform(x_train)
x_test_bow = vector.transform(x_test)
x_train_bow.shape, x_test_bow.shape

((19423, 5000), (8325, 5000))

In [23]:
# Model building with RF and GBR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [24]:
model1 = RandomForestRegressor(random_state = 42, n_estimators = 10)

In [25]:
%%time
model1.fit(x_train_bow, y_train)

Wall time: 1min 8s


RandomForestRegressor(n_estimators=10, random_state=42)

In [26]:
# Making the prediction
# RSME score
y_train_preds = model1.predict(x_train_bow)

from sklearn.metrics import mean_squared_error
mean_squared_error(y_train, y_train_preds)**0.5

0.26487948954941426

In [27]:
# Increasing the number of trees
model2 = RandomForestRegressor(random_state = 42, n_estimators = 20)

In [28]:
%%time
model2.fit(x_train_bow, y_train)

Wall time: 2min 21s


RandomForestRegressor(n_estimators=20, random_state=42)

In [29]:
# RSME post 20 tress as estimators
y_train_preds = model2.predict(x_train_bow)
mean_squared_error(y_train, y_train_preds)**0.5

0.24979450237391482

In [30]:
# Finding the best hyper-parameters for the SVM classifier
# Hyperparameter tuning and GridSearch
# max_features – ‘auto’, ‘sqrt’, ‘log2’
# max_depth – 10, 15, 20, 25
from sklearn.model_selection import GridSearchCV

In [31]:
model3 = RandomForestRegressor(random_state = 42, n_estimators = 30)

In [32]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'max_features': [500, "sqrt", "log2", "auto"],
    'max_depth': [10, 15, 20, 25]
}

In [33]:
# Instantiate the grid search model with stratified 5 fold cross-validation scheme
grid_search = GridSearchCV(estimator = model3, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 1, scoring = "neg_mean_squared_error" )

In [34]:
grid_search.fit(x_train_bow, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  3.0min finished


GridSearchCV(cv=5,
             estimator=RandomForestRegressor(n_estimators=30, random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [10, 15, 20, 25],
                         'max_features': [500, 'sqrt', 'log2', 'auto']},
             scoring='neg_mean_squared_error', verbose=1)

In [35]:
grid_search.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [36]:
grid_search.best_estimator_

RandomForestRegressor(max_depth=25, max_features=500, n_estimators=30,
                      random_state=42)

In [37]:
# Making predictions on the test set with RSME
# The score are higher than before
y_train_pred = grid_search.best_estimator_.predict(x_train_bow)
y_test_pred = grid_search.best_estimator_.predict(x_test_bow)

In [38]:
mean_squared_error(y_train, y_train_pred)**0.5

0.5993171473290868

In [39]:
mean_squared_error(y_test, y_test_pred)**0.5

0.6851577785321694

In [40]:
# Looking for any mismatches
# Creating a crosstab for the same
# Calculating the difference
difference = pd.DataFrame({'review':x_test, 'rating':y_test, 'rating_pred':y_test_pred})
difference

Unnamed: 0,review,rating,rating_pred
2932,tum se na ho paye gaa,1.0,3.622235
13512,went place cousin.the onion rings wat get one ...,5.0,4.502563
17968,amazing place even better food one places bang...,5.0,4.525573
3496,truffles known burgers italian food find alway...,4.0,4.117811
17202,like home food liked gatte curry would definit...,4.0,3.879688
...,...,...,...
22372,one restaurants open late hours koramangala ar...,2.0,2.117060
3525,weekend evening one happening places bangalore...,4.0,4.284457
17028,one worst restaurant ever seen ordered paneer ...,1.0,1.002448
22272,yesterday ordered food dinner outlet place act...,4.0,3.920376


In [41]:
a = difference[(difference.rating - difference.rating_pred)>=2]
# a.shape
a

Unnamed: 0,review,rating,rating_pred
4766,not good,5.0,1.665161
25813,ordered almost veg pizzas economic really good...,5.0,1.872747
20400,paneer tikka veg seekh kebab order made place ...,5.0,2.646035
25821,ordered almost veg pizzas economic really good...,5.0,1.872747
4771,not good,5.0,1.665161
4776,not good,5.0,1.665161
4761,not good,5.0,1.665161
15201,sauce not included,4.0,1.76246
7166,first experience worst ordered egg biriyani zo...,4.0,1.738853
25825,ordered almost veg pizzas economic really good...,5.0,1.872747


In [42]:
# Thank You :) 