A provisional restaurant rating system that estimates the success of an establishment based on its location, type and cuisine, among other factors. The model is a neural network built with Keras and trained on the 'Zomato Bangalore Restaurants' dataset curated by Himanshu Poddar.


In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt 
import numpy as np 
import os 
import pandas as pd 
from keras.utils import to_categorical
from keras import models
from keras import layers
import math
from sklearn.model_selection import train_test_split
import random

In [80]:
# Upper bound on the number of CSV rows that are read.
nRowsRead = 50000
df1 = pd.read_csv('zomato.csv', sep=',', nrows=nRowsRead)
# Tag the frame with the name of the file it was loaded from.
df1.dataframeName = 'zomato.csv'
# Report the shape of what was actually loaded.
nRow = df1.shape[0]
nCol = df1.shape[1]
print(f'There are {nRow} rows and {nCol} columns')


There are 50000 rows and 17 columns


The dataset is loaded and transformed into the required format before being fed to the model. Location is one-hot encoded; restaurant type and cuisine are multi-hot encoded, since a restaurant can list several of each.

In [81]:
# Keep only the six columns used by the following cells, selected by their
# position in the CSV. The later cells treat them as: 0 = rating, 2 = location,
# 3 = restaurant type, 4 = cuisines, 5 = cost; column 1 is passed through as a
# numeric feature.
# NOTE(review): positional selection silently breaks if the CSV layout
# changes -- confirm the indices against the file header.
D = np.array(df1.values)
D = D[:, [5, 6, 8, 9, 11, 12]]
print(D)

# One-hot encoded location
# Give every distinct location name an integer id, in order of first appearance.
LDict = {}
for loc in D[:, 2]:
    if isinstance(loc, str):
        LDict.setdefault(loc, len(LDict))
print(LDict)

# Replace each location name stored in D by its integer id (in place).
for i, loc in enumerate(D[:, 2]):
    if isinstance(loc, str):
        D[i, 2] = LDict[loc]
    else:
        # NOTE(review): rows without a location string are assigned a random
        # known location, so results differ between runs unless `random` is
        # seeded -- consider a dedicated "unknown" category instead.
        D[i, 2] = random.randint(0, len(LDict) - 1)

# Pin the width to len(LDict): the feature-matrix cell sizes X with it.
L_onehot = to_categorical(D[:, 2], num_classes=len(LDict))
print(L_onehot)

[['4.1/5' 775 'Banashankari' 'Casual Dining'
  'North Indian, Mughlai, Chinese' '800']
 ['4.1/5' 787 'Banashankari' 'Casual Dining'
  'Chinese, North Indian, Thai' '800']
 ['3.8/5' 918 'Banashankari' 'Cafe, Casual Dining'
  'Cafe, Mexican, Italian' '800']
 ...
 ['3.6 /5' 19 'Bellandur' 'Food Court' 'Continental, Pizza' '500']
 ['3.0 /5' 155 'Bellandur' 'Casual Dining' 'North Indian, Biryani' '800']
 ['3.0 /5' 13 'Bellandur' 'Quick Bites' 'North Indian, Chinese' '400']]
{'Banashankari': 0, 'Basavanagudi': 1, 'Mysore Road': 2, 'Jayanagar': 3, 'Kumaraswamy Layout': 4, 'Rajarajeshwari Nagar': 5, 'Vijay Nagar': 6, 'Uttarahalli': 7, 'JP Nagar': 8, 'South Bangalore': 9, 'City Market': 10, 'Nagarbhavi': 11, 'Bannerghatta Road': 12, 'BTM': 13, 'Kanakapura Road': 14, 'Bommanahalli': 15, 'CV Raman Nagar': 16, 'Electronic City': 17, 'HSR': 18, 'Marathahalli': 19, 'Sarjapur Road': 20, 'Wilson Garden': 21, 'Shanti Nagar': 22, 'Koramangala 5th Block': 23, 'Koramangala 8th Block': 24, 'Richmond Road':

In [82]:
#Restaurant type dictionary
TDict = {}
cnt = 0
for i in range(len(D[:,3])):
    if(type(D[i,3])==str):
        x = D[i,3].split(',')
        #print(x)
        x = np.asarray(x)
        #print(x)
        for j in range(len(x)):
            x[j] = x[j].strip()
            if x[j] not in TDict:
                TDict.update({x[j] : cnt})
                cnt = cnt+1
print(TDict)

{'Casual Dining': 0, 'Cafe': 1, 'Quick Bites': 2, 'Delivery': 3, 'Mess': 4, 'Dessert Parlor': 5, 'Bakery': 6, 'Pub': 7, 'Takeaway': 8, 'Fine Dining': 9, 'Beverage Shop': 10, 'Sweet Shop': 11, 'Bar': 12, 'Confectionery': 13, 'Kiosk': 14, 'Food Truck': 15, 'Microbrewery': 16, 'Lounge': 17, 'Food Court': 18, 'Dhaba': 19, 'Club': 20, 'Irani Cafee': 21, 'Bhojanalya': 22, 'Pop Up': 23, 'Meat Shop': 24}


In [83]:
#Cost per person
Cost = np.zeros((len(D[:,5]),1))
for i in range(len(D[:,5])):
    if(type(D[i,5]) == str):
        #print(D[i,5])
        D[i,5] = D[i,5].replace(',','')
        if(D[i,5] != 'B'):
            Cost[i] = int(D[i,5])
        else:
            Cost[i] = 0
print(Cost)

[[800.]
 [800.]
 [800.]
 ...
 [500.]
 [800.]
 [400.]]


In [84]:
#ratings
S = np.zeros((len(D[:,0]),1))
for i in range(len(D[:,0])):
    if(type(D[i,0]) == str):
        x = D[i,0].split('/')
        x = np.asarray(x)
        #print(x)
        if(x[0] != 'NEW' and x[0] != '-'):
            S[i] = float(x[0])
        else:
            S[i] = 0
#print(S)

In [85]:
#Cuisine dictionary
CDict = {}
cnt = 0
for i in range(len(D[:,4])):
    if(type(D[i,4])==str):
        x = D[i,4].split(',')
        #print(x)
        x = np.asarray(x)
        #print(x)
        for j in range(len(x)):
            x[j] = x[j].strip()
            if x[j] not in CDict:
                CDict.update({x[j] : cnt})
                cnt = cnt+1
print(CDict)
            
    


{'North Indian': 0, 'Mughlai': 1, 'Chinese': 2, 'Thai': 3, 'Cafe': 4, 'Mexican': 5, 'Italian': 6, 'South Indian': 7, 'Rajasthani': 8, 'Andhra': 9, 'Pizza': 10, 'Continental': 11, 'Momos': 12, 'Beverages': 13, 'Fast Food': 14, 'American': 15, 'French': 16, 'European': 17, 'Bakery': 18, 'Burger': 19, 'Desserts': 20, 'Biryani': 21, 'Street Food': 22, 'Rolls': 23, 'Ice Cream': 24, 'Healthy Food': 25, 'Salad': 26, 'Asian': 27, 'Korean': 28, 'Indonesian': 29, 'Japanese': 30, 'Goan': 31, 'Seafood': 32, 'Kebab': 33, 'Steak': 34, 'Mithai': 35, 'Iranian': 36, 'Sandwich': 37, 'Juices': 38, 'Mangalorean': 39, 'Vietnamese': 40, 'Hyderabadi': 41, 'Bengali': 42, 'Arabian': 43, 'BBQ': 44, 'Tea': 45, 'Afghani': 46, 'Lebanese': 47, 'Finger Food': 48, 'Tibetan': 49, 'Charcoal Chicken': 50, 'Middle Eastern': 51, 'Mediterranean': 52, 'Wraps': 53, 'Kerala': 54, 'Oriya': 55, 'Bihari': 56, 'Roast Chicken': 57, 'Maharashtrian': 58, 'Bohri': 59, 'African': 60, 'Nepalese': 61, 'Turkish': 62, 'Tamil': 63, 'Tex-Me

In [86]:
#Final Data Entry
print(len(D))
X = np.zeros((len(D),len(LDict) + len(TDict) + len(CDict) + 2))
for i in range(len(X)):
    y = np.zeros(len(CDict))
    t = np.zeros(len(TDict))
    if(type(D[i,4])==str):
        x = D[i,4].split(',')
        #print(x)
        x = np.asarray(x)
        #print(x)
        cnt = 0
        for j in range(len(x)):
            x[j] = x[j].strip()
            val = CDict[x[j]]
            y[val] = 1
            cnt = cnt+1
        #Checking if every restaurant has a valid cuisine
        if(np.sum(y)==0):
            print("Indefinite cuisine")
    if(type(D[i,3])==str):
        x1 = D[i,3].split(',')
        #print(x)
        x1 = np.asarray(x1)
        #print(x)
        cnt = 0
        for j in range(len(x1)):
            x1[j] = x1[j].strip()
            val = TDict[x1[j]]
            t[val] = 1
            cnt = cnt+1    
    #print(len(L_onehot[i]))
    X[i,:] = np.r_[L_onehot[i],t,y,D[i,1],Cost[i]]
#print(X)

#Cuisine scores
Y  = np.zeros((len(D),1))
for i in range(len(Y)):
    Y[i] = S[i]
        
print(Y)

50000
[[4.1]
 [4.1]
 [3.8]
 ...
 [3.6]
 [3. ]
 [3. ]]


The data is now ready for training and testing. It is split into training and validation sets.

In [91]:
#Preparing training and test data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.1)
print(X_train.shape[1])

227


The neural network consists of three hidden layers (100, 100 and 10 units) using ReLU activation, followed by a single-unit output layer with linear activation. Dropout (rate 0.3) is applied after the first hidden layer to help the model generalize better.

In [92]:
#Constructing NN model
model = models.Sequential()
model.add(layers.Dense(100, activation = "relu", input_shape=(X_train.shape[1], )))
# Hidden - Layers
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(100, activation = "relu"))
#model.add(layers.Dropout(0.2, noise_shape=None, seed=None))
model.add(layers.Dense(10, activation = "relu"))
# Output- Layer
model.add(layers.Dense(y_train.shape[1], activation = "linear"))
model.summary()

W0701 11:41:21.824674 140193057347328 deprecation.py:506] From /home/naveen/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_25 (Dense)             (None, 100)               22800     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_27 (Dense)             (None, 10)                1010      
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 11        
Total params: 33,921
Trainable params: 33,921
Non-trainable params: 0
_________________________________________________________________


The model is trained with the Adam optimizer and a mean squared error loss function.

In [93]:
#Compiling model
model.compile(
 optimizer = "adam",
 loss = "mean_squared_error",
)

In [94]:
#Training model
results = model.fit(
 X_train, y_train,
 epochs= 10,
 batch_size = 100,
 validation_data = (X_test,y_test )
)


Train on 45000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


The validation loss is quite satisfactory, and is an indication of the overall accuracy of the model.