# <center> RESTAURANT RECOMMENDATION SYSTEM </center>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import spacy
import regex as re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [151]:
data=pd.read_csv('zomato.csv')
print(data.shape)
data.head()

(51717, 17)


Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


**The dataset is fairly large one with more than 50,000 rows and 17 features**

## Data Preprocessing and EDA

### - Checking Null values

In [152]:
# Checking null values

data.isnull().sum()

url                                0
address                            0
name                               0
online_order                       0
book_table                         0
rate                            7775
votes                              0
phone                           1208
location                          21
rest_type                        227
dish_liked                     28078
cuisines                          45
approx_cost(for two people)      346
reviews_list                       0
menu_item                          0
listed_in(type)                    0
listed_in(city)                    0
dtype: int64

As seen above, there are lot of columns where values are null, let us check percentage of samples that are null below.

In [153]:
(data.isnull().mean()*100).sort_values(ascending=False)

dish_liked                     54.291626
rate                           15.033741
phone                           2.335789
approx_cost(for two people)     0.669026
rest_type                       0.438927
cuisines                        0.087012
location                        0.040606
address                         0.000000
name                            0.000000
online_order                    0.000000
book_table                      0.000000
listed_in(city)                 0.000000
votes                           0.000000
listed_in(type)                 0.000000
reviews_list                    0.000000
menu_item                       0.000000
url                             0.000000
dtype: float64

### - Dropping unnecessary columns

In [154]:
data.drop(columns=['dish_liked', 'phone', 'url'], axis=1, inplace=True)

In [155]:
data.head(3)

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari


### - Checking for duplicated rows

In [156]:
data.duplicated().sum()

43

As seen above, there are some duplicated rows which needs to be removed, Let's check the shape of the dataset before and after removing duplicates to confirm.

In [157]:
# Shape of the data before removing duplicates
data.shape

(51717, 14)

In [158]:
data.drop_duplicates(inplace=True)

In [159]:
# shape of the data after remvoing the duplicates
data.shape

(51674, 14)

### - Dropping NA values 

In [160]:
data.dropna(inplace=True, how='any')

In [161]:
# checking the shape of the dataset after removing the duplicates
data.shape

(43499, 14)

### - Checking column names

In [162]:
data.columns.to_list()

['address',
 'name',
 'online_order',
 'book_table',
 'rate',
 'votes',
 'location',
 'rest_type',
 'cuisines',
 'approx_cost(for two people)',
 'reviews_list',
 'menu_item',
 'listed_in(type)',
 'listed_in(city)']

In [163]:
data.rename(columns={'approx_cost(for two people)': 'cost','listed_in(type)': 'type', 'listed_in(city)' : 'city'}, inplace=True)
data.head(3)

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari


In [164]:
#confirming that data has no null values
data.isnull().sum().sum()

0

### - correcting column values

In [165]:
data['cost'].value_counts()

300      5707
400      5531
500      4295
200      3515
600      3341
250      2282
800      2201
700      1863
1,000    1562
150      1419
350      1409
450      1265
1,200     977
1,500     947
750       748
650       748
550       715
100       700
900       677
1,300     514
1,100     509
1,400     472
2,000     355
1,600     266
1,700     247
1,800     203
3,000     162
850       162
2,500     146
2,200      78
1,900      70
2,100      67
950        62
2,800      45
4,000      29
3,500      25
2,400      23
1,350      18
180        17
3,400      13
2,300      11
2,600      10
230        10
1,250       9
130         8
40          8
1,650       6
50          6
1,450       5
199         4
330         4
4,100       4
1,050       4
80          4
2,700       3
70          3
120         2
4,500       2
6,000       2
240         2
3,200       2
560         1
360         1
5,000       1
160         1
3,700       1
Name: cost, dtype: int64

As seen above, there commas in the cost which needs to be replaced with "."

In [166]:
data['cost']=data['cost'].astype(str).apply(lambda x: x.replace(",", ".")).astype(float)

In [167]:
data['cost'].value_counts()

300.00    5707
400.00    5531
500.00    4295
200.00    3515
600.00    3341
250.00    2282
800.00    2201
700.00    1863
1.00      1562
150.00    1419
350.00    1409
450.00    1265
1.20       977
1.50       947
650.00     748
750.00     748
550.00     715
100.00     700
900.00     677
1.30       514
1.10       509
1.40       472
2.00       355
1.60       266
1.70       247
1.80       203
850.00     162
3.00       162
2.50       146
2.20        78
1.90        70
2.10        67
950.00      62
2.80        45
4.00        29
3.50        25
2.40        23
1.35        18
180.00      17
3.40        13
2.30        11
230.00      10
2.60        10
1.25         9
130.00       8
40.00        8
50.00        6
1.65         6
1.45         5
1.05         4
80.00        4
330.00       4
199.00       4
4.10         4
70.00        3
2.70         3
6.00         2
4.50         2
3.20         2
240.00       2
120.00       2
160.00       1
3.70         1
560.00       1
360.00       1
5.00         1
Name: cost

In [168]:
data['rate'].value_counts(normalize=True)

NEW       0.050507
3.9/5     0.048024
3.7/5     0.046162
3.8/5     0.045909
3.9 /5    0.042875
3.8 /5    0.041817
3.7 /5    0.041357
3.6/5     0.040300
4.0/5     0.036713
4.0 /5    0.035564
3.6 /5    0.035242
4.1/5     0.033771
4.1 /5    0.033472
3.5/5     0.032713
3.5 /5    0.030805
3.4/5     0.028667
3.4 /5    0.027518
3.3/5     0.026368
4.2 /5    0.026230
3.3 /5    0.025863
4.2/5     0.023288
3.2/5     0.022920
4.3 /5    0.020920
3.1/5     0.019564
3.2 /5    0.019472
4.3/5     0.017748
3.1 /5    0.016069
4.4 /5    0.014414
3.0/5     0.012483
4.4/5     0.011931
3.0 /5    0.010276
2.9/5     0.009816
4.5 /5    0.009403
2.9 /5    0.008598
2.8/5     0.006943
2.8 /5    0.006391
4.5/5     0.005678
4.6 /5    0.004023
2.7/5     0.003839
2.6/5     0.003218
2.7 /5    0.003127
4.6/5     0.002874
2.6 /5    0.002506
4.7 /5    0.001977
4.7/5     0.001862
-         0.001494
2.5 /5    0.001287
2.5/5     0.001012
4.8 /5    0.000989
2.4/5     0.000828
2.4 /5    0.000690
4.9 /5    0.000690
2.3/5     0.

***As seen above, rating contains /5 in the rating which needs to be removed, also lot of "NEW" values need to be removed from consideration***

In [169]:
data=data[((data['rate']!='NEW') & (data['rate']!='-'))]
data.head()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.0,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [170]:
data['rate']=data['rate'].astype(str).apply(lambda x: x.replace("/5", "").strip()).astype(float)
data['rate'].value_counts()


3.9    3954
3.8    3816
3.7    3807
3.6    3286
4.0    3144
4.1    2925
3.5    2763
3.4    2444
3.3    2272
4.2    2154
3.2    1844
4.3    1682
3.1    1550
4.4    1146
3.0     990
2.9     801
4.5     656
2.8     580
2.7     303
4.6     300
2.6     249
4.7     167
2.5     100
2.4      66
4.8      66
4.9      55
2.3      51
2.2      26
2.1      24
2.0      11
1.8       5
Name: rate, dtype: int64

In [171]:
data['rate_range']=pd.cut(data['rate'], bins=8)
data[['rate', 'rate_range']]

Unnamed: 0,rate,rate_range
0,4.1,"(3.738, 4.125]"
1,4.1,"(3.738, 4.125]"
2,3.8,"(3.738, 4.125]"
3,3.7,"(3.35, 3.738]"
4,3.8,"(3.738, 4.125]"
5,3.8,"(3.738, 4.125]"
6,3.6,"(3.35, 3.738]"
7,4.6,"(4.512, 4.9]"
8,4.0,"(3.738, 4.125]"
9,4.2,"(4.125, 4.512]"


In [172]:
# calculating mean rating per restaurant as a new column
data['mean_rating']=data.groupby('name')['rate'].transform('mean')

In [173]:
data.head(20)

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city,rate_range,mean_rating
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari,"(3.738, 4.125]",4.118182
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari,"(3.738, 4.125]",4.1
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.0,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari,"(3.738, 4.125]",3.8
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari,"(3.35, 3.738]",3.7
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari,"(3.738, 4.125]",3.8
5,"37, 5-1, 4th Floor, Bosco Court, Gandhi Bazaar...",Timepass Dinner,Yes,No,3.8,286,Basavanagudi,Casual Dining,North Indian,600.0,"[('Rated 3.0', 'RATED\n Food 3/5\nAmbience 3/...",[],Buffet,Banashankari,"(3.738, 4.125]",3.8
6,"19/1, New Timberyard Layout, Beside Satellite ...",Rosewood International Hotel - Bar & Restaurant,No,No,3.6,8,Mysore Road,Casual Dining,"North Indian, South Indian, Andhra, Chinese",800.0,"[('Rated 5.0', 'RATED\n Awesome food ??Great ...",[],Buffet,Banashankari,"(3.35, 3.738]",3.6
7,"2469, 3rd Floor, 24th Cross, Opposite BDA Comp...",Onesta,Yes,Yes,4.6,2556,Banashankari,"Casual Dining, Cafe","Pizza, Cafe, Italian",600.0,"[('Rated 5.0', 'RATED\n I personally really l...",[],Cafes,Banashankari,"(4.512, 4.9]",4.410588
8,"1, 30th Main Road, 3rd Stage, Banashankari, Ba...",Penthouse Cafe,Yes,No,4.0,324,Banashankari,Cafe,"Cafe, Italian, Continental",700.0,"[('Rated 3.0', ""RATED\n I had been to this pl...",[],Cafes,Banashankari,"(3.738, 4.125]",4.0
9,"2470, 21 Main Road, 25th Cross, Banashankari, ...",Smacznego,Yes,No,4.2,504,Banashankari,Cafe,"Cafe, Mexican, Italian, Momos, Beverages",550.0,"[('Rated 4.0', ""RATED\n Easy to locate\nVFM 3...",[],Cafes,Banashankari,"(4.125, 4.512]",4.2


### Text preprocessing

#### Below are the text processing actions that will be taken
- Changing to lower case characters
- Removing punctuations
- Stop words removal
- URL removals


In [174]:
len(data['reviews_list'])

41237

In [175]:
data['reviews_list'].iloc[2]

'[(\'Rated 3.0\', "RATED\\n  Ambience is not that good enough and it\'s not a pocket friendly cafe and the quantity is not that good and desserts are too good enough ??.."), (\'Rated 3.0\', "RATED\\n \\nWent there for a quick bite with friends.\\nThe ambience had more of corporate feel. I would say it was unique.\\nTried nachos, pasta churros and lasagne.\\n\\nNachos were pathetic.( Seriously don\'t order)\\nPasta was okayish.\\nLasagne was good.\\nNutella churros were the best.\\nOverall an okayish experience!\\nPeace ??"), (\'Rated 4.0\', "RATED\\n  First of all, a big thanks to the staff of this Cafe. Very polite and courteous.\\n\\nI was there 15mins before their closing time. Without any discomfort or hesitation, the staff welcomed me with a warm smile and said they\'re still open, though they were preparing to close the cafe for the day.\\n\\nQuickly ordered the Thai green curry, which is served with rice. They got it for me within 10mins, hot and freshly made.\\n\\nIt was tasty 

In [176]:
# changing to lower case
data['reviews_list']=data['reviews_list'].apply(lambda x: x.lower())

In [177]:
import string
PUNCH_TO_REMOVE=string.punctuation
PUNCH_TO_REMOVE

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [178]:
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCH_TO_REMOVE))

data["reviews_list"] = data["reviews_list"].apply(lambda text: remove_punctuation(text))

In [179]:
from spacy.lang.en.stop_words import STOP_WORDS

In [180]:
def remove_stopwords(text):
    
    return " ".join([word for word in str(text) if word not in STOP_WORDS])

data['reviews_list']=data['reviews_list'].apply(lambda text: remove_punctuation(text))

In [181]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

data['reviews_list']=data['reviews_list'].apply(lambda text: remove_urls(text))

In [182]:
data[['reviews_list', 'cuisines']].sample(5)

Unnamed: 0,reviews_list,cuisines
3840,rated 40 ratedn this place recently opened in...,"North Indian, Biryani, South Indian"
21160,rated 10 ratedn i would have rated negative i...,"Bengali, North Indian, Chinese"
50370,rated 50 ratedn the best eggless waffles and ...,"Bakery, Desserts"
1977,rated 20 ratedn every chicken gravey tastes t...,"North Indian, Chinese"
6493,rated 40 ratedn went for breakfast on a satur...,South Indian


### Dropping unnecessary columns

In [183]:
data=data.drop(['address','rest_type', 'type', 'menu_item', 'votes'],axis=1)

## Randomly sample the data

In [184]:
df_percent = data.sample(frac=0.5)

### Creating TF-IDF matrix

In [185]:
df_percent.set_index('name', inplace=True)
indices = pd.Series(df_percent.index)

# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)



def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'mean_rating', 'cost'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','mean_rating', 'cost']][df_percent.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines','mean_rating', 'cost'], keep=False)
    df_new = df_new.sort_values(by='mean_rating', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new


In [186]:
recommend('Pai Vihar')

TOP 10 RESTAURANTS LIKE Pai Vihar WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,mean_rating,cost
Gokul Kuteera,"North Indian, Chinese, South Indian",3.9,650.0
Food And You,"North Indian, Gujarati",3.856522,300.0
Cinnamon,"North Indian, Asian, Continental",3.828571,1.0
Agarwal Food Service,"North Indian, Chinese, Biryani",3.65,400.0
Sri Krishna Kitchen,"North Indian, Chinese",3.6125,400.0
Mayura Sagar,"Chinese, North Indian, South Indian",3.6,250.0
Lalchee's Rasoi,"North Indian, Chinese",3.6,500.0
A2B - Adyar Ananda Bhavan,"South Indian, North Indian, Chinese, Street Food",3.392,400.0
A2B - Adyar Ananda Bhavan,"South Indian, North Indian, Chinese, Street Fo...",3.392,400.0
Food Point,North Indian,3.3,450.0
