## Import the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import string
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Load the dataset

In [2]:
df = pd.read_csv('zomato.csv')
df.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


## Cleaning and processing the data

In [3]:
# Deleting Unnecessary Columns like url, dish_liked, phone
zomato = df.drop(['url', 'dish_liked', 'phone'], axis=1)

In [4]:
# Removing the Duplicates
zomato.duplicated().sum()

43

In [5]:
zomato.drop_duplicates(inplace=True)

In [6]:
zomato.dropna(how='any', inplace=True)

In [7]:
# Changing the column names
zomato = zomato.rename(columns={'approx_cost(for two people)': 'cost', 'listed_in(type)': 'type', 'listed_in(city)': 'city'})

In [8]:
# Transformations for cost
zomato['cost'] = zomato['cost'].astype(str) # changing the cost to string
zomato['cost'] = zomato['cost'].apply(lambda x: x.replace(',', '.')) # using lambda function to replace ',' from cost
zomato['cost'] = zomato['cost'].astype(float)

In [9]:
# Removing '/5' from rates
zomato = zomato.loc[zomato.rate != 'NEW']
zomato = zomato.loc[zomato.rate != '-'].reset_index(drop=True)
remove_slash = lambda x: x.replace('/5', '') if type(x) == np.str else x
zomato.rate = zomato.rate.apply(remove_slash).str.strip().astype('float')

In [10]:
# Adjust the column names
zomato.name = zomato.name.apply(lambda x: x.title())
zomato.online_order.replace(('Yes', 'No'), (True, False), inplace=True)
zomato.book_table.replace(('Yes', 'No'), (True, False), inplace=True)

In [11]:
# Computing mean rating
restaurants = list(zomato['name'].unique())

In [12]:
zomato['Mean Rating'] = 0

In [13]:
for i in range(len(restaurants)):
    zomato['Mean Rating'][zomato['name'] == restaurants[i]] = zomato['rate'][zomato['name'] == restaurants[i]].mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zomato['Mean Rating'][zomato['name'] == restaurants[i]] = zomato['rate'][zomato['name'] == restaurants[i]].mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zomato['Mean Rating'][zomato['name'] == restaurants[i]] = zomato['rate'][zomato['name'] == restaurants[i]].mean()


In [14]:
scaler = MinMaxScaler(feature_range=(1,5))
zomato[['Mean Rating']] = scaler.fit_transform(zomato[['Mean Rating']]).round(2)

In [15]:
# Lower casing
zomato["reviews_list"] = zomato["reviews_list"].str.lower()

In [16]:
# Removal of punctuations
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

zomato['reviews_list'] = zomato['reviews_list'].apply(lambda text: remove_punctuation(text))

In [17]:
# removal of stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

zomato['reviews_list'] = zomato['reviews_list'].apply(lambda text: remove_stopwords(text))

In [18]:
zomato.head()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city,Mean Rating
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,rated 40 ratedn beautiful place dine inthe int...,[],Buffet,Banashankari,3.99
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,True,False,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,rated 40 ratedn dinner family turned good choo...,[],Buffet,Banashankari,3.97
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,True,False,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.0,rated 30 ratedn ambience good enough pocket fr...,[],Buffet,Banashankari,3.58
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,False,False,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,rated 40 ratedn great food proper karnataka st...,[],Buffet,Banashankari,3.45
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,False,False,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,rated 40 ratedn good restaurant neighbourhood ...,[],Buffet,Banashankari,3.58


In [19]:
# removal of urls
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

zomato['reviews_list'] = zomato['reviews_list'].apply(lambda text: remove_urls(text))

In [20]:
zomato[['reviews_list', 'cuisines']].sample(5)

Unnamed: 0,reviews_list,cuisines
23498,rated 50 ratedn sorry could take single pictur...,"North Indian, Chinese, Pizza"
15039,rated 30 ratedn always liked ordered chicken r...,"Chinese, North Indian, Rolls"
17393,rated 20 ratedn okay frank like kfc years back...,"Burger, Fast Food"
5859,rated 15 ratedn good choice ordered foodchkb 1...,"Biryani, Hyderabadi, North Indian"
28415,rated 40 ratedn took mac n cheese veggie salad...,"Cafe, Continental, Burger"


In [21]:
# Restaurant names
restaurant_names = list(zomato['name'].unique())

zomato = zomato.drop(['address', 'rest_type', 'type', 'menu_item', 'votes'], axis=1)

df_percent = zomato.sample(frac=0.5)

## TF-IDF Vectorization

In [22]:
# TF-IDF Vectorization
df_percent.set_index('name', inplace=True)
indices = pd.Series(df_percent.index)

# creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

## Building the recommendation system

In [23]:
# building the recommendation function
def recommend(name, cosine_similarities = cosine_similarities):
    
    # create a list to put top restaurants
    recommend_restaurant = []
    
    # find the index of the restairant entered
    idx = indices[indices == name].index[0]
    
    # find the restaurants with a similar cosine-sim vlaue and order them from biggest number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
        
    # creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'Mean Rating', 'cost'])
    
    # create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['cuisines', 'Mean Rating', 'cost']][df_percent.index == each].sample()))
        
    # drop the same named restaurants with and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines', 'Mean Rating', 'cost'], keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)
    
    print('Top %s Restaurants like %s with similar reviews: ' % (str(len(df_new)), name))
    
    return df_new

In [24]:
recommend('Pai Vihar')

Top 5 Restaurants like Pai Vihar with similar reviews: 


Unnamed: 0,cuisines,Mean Rating,cost
Samosa Singh,"Street Food, Fast Food, Rolls, Desserts",3.6,200.0
Shrusti Coffee,"Cafe, South Indian",3.45,150.0
Mayura Sagar,"Chinese, North Indian, South Indian",3.32,250.0
Container Coffee,South Indian,3.11,200.0
Udupi Grand,"South Indian, North Indian, Chinese",2.91,450.0


In [25]:
recommend('Samosa Singh')

Top 4 Restaurants like Samosa Singh with similar reviews: 


Unnamed: 0,cuisines,Mean Rating,cost
Burma Burma,"Asian, Burmese",4.74,1.5
Kartik'S Mithai Shoppe,"Mithai, Street Food",3.77,150.0
Snack Magic,"Fast Food, Beverages",3.45,250.0
Calcutta Victoria Chat House,Street Food,3.06,150.0


In [26]:
recommend('Burma Burma')

Top 8 Restaurants like Burma Burma with similar reviews: 


Unnamed: 0,cuisines,Mean Rating,cost
Asia Kitchen By Mainland China,"Asian, Chinese, Thai, Momos",5.0,1.5
Byg Brewski Brewing Company,"Continental, North Indian, Italian, South Indi...",5.0,1.6
Communiti,"Continental, BBQ, Salad",4.67,1.5
Hammered,"North Indian, Thai, Japanese, Continental, Cafe",4.65,1.3
Brew And Barbeque - A Microbrewery Pub,"Continental, North Indian, BBQ, Steak",4.64,1.4
1131 Bar + Kitchen,"Continental, Asian, Italian, North Indian",4.48,1.5
Btdt? Been There Done That,"European, Continental, Asian, Finger Food",4.23,1.3
Just Khao Suey,Burmese,3.9,600.0
