In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('./data/TA_restaurants_curated.csv')

In [2]:
df = df.iloc[:, 1:7].copy()
df = df.sample(500)
df.head()

Unnamed: 0,Name,City,Cuisine Style,Ranking,Rating,Price Range
64205,Banana's Bar,Luxembourg,['Bar'],414.0,3.5,$$ - $$$
57955,1897 Ltd,London,,12385.0,4.0,
105742,Apetit,Prague,,1473.0,4.0,
87052,Casa da Horta,Oporto,"['European', 'Portuguese', 'Vegetarian Friendl...",449.0,4.5,$
88818,Coco Vika,Oslo,"['Asian', 'Vegetarian Friendly', 'Gluten Free ...",635.0,4.0,$$ - $$$


In [3]:
df = df.dropna()

In [4]:
df['Price Range'] = df['Price Range'].astype('category')
df['Price Range'].value_counts()

$$ - $$$    220
$            72
$$$$         19
Name: Price Range, dtype: int64

In [5]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['Cuisine Style']
for feature in features:
    df[feature] = df[feature].apply(literal_eval)

In [6]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

features = ['City', 'Cuisine Style']
for feature in features:
    df[feature] = df[feature].apply(clean_data)

df.head()

Unnamed: 0,Name,City,Cuisine Style,Ranking,Rating,Price Range
64205,Banana's Bar,luxembourg,[bar],414.0,3.5,$$ - $$$
87052,Casa da Horta,oporto,"[european, portuguese, vegetarianfriendly, veg...",449.0,4.5,$
88818,Coco Vika,oslo,"[asian, vegetarianfriendly, glutenfreeoptions]",635.0,4.0,$$ - $$$
63920,Persian FoodBox,luxembourg,"[middleeastern, persian]",129.0,4.5,$
94148,Palace Cafe,paris,"[european, french, bar, pub]",4752.0,4.0,$$ - $$$


In [7]:
def create_soup(x):
    return ' '.join(x['Cuisine Style']) + ' ' + x['City']

df['soup'] = df.apply(create_soup, axis=1)
df.head()

Unnamed: 0,Name,City,Cuisine Style,Ranking,Rating,Price Range,soup
64205,Banana's Bar,luxembourg,[bar],414.0,3.5,$$ - $$$,bar luxembourg
87052,Casa da Horta,oporto,"[european, portuguese, vegetarianfriendly, veg...",449.0,4.5,$,european portuguese vegetarianfriendly veganop...
88818,Coco Vika,oslo,"[asian, vegetarianfriendly, glutenfreeoptions]",635.0,4.0,$$ - $$$,asian vegetarianfriendly glutenfreeoptions oslo
63920,Persian FoodBox,luxembourg,"[middleeastern, persian]",129.0,4.5,$,middleeastern persian luxembourg
94148,Palace Cafe,paris,"[european, french, bar, pub]",4752.0,4.0,$$ - $$$,european french bar pub paris


In [8]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [9]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [10]:
# Reset index of our main DataFrame and construct reverse mapping as before
df = df.reset_index()
indices = pd.Series(df.index, index=df['Name'])

In [11]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the restaurants indices
    restaurants_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['Name'].iloc[restaurants_indices]

In [12]:
df.head(15)

Unnamed: 0,index,Name,City,Cuisine Style,Ranking,Rating,Price Range,soup
0,64205,Banana's Bar,luxembourg,[bar],414.0,3.5,$$ - $$$,bar luxembourg
1,87052,Casa da Horta,oporto,"[european, portuguese, vegetarianfriendly, veg...",449.0,4.5,$,european portuguese vegetarianfriendly veganop...
2,88818,Coco Vika,oslo,"[asian, vegetarianfriendly, glutenfreeoptions]",635.0,4.0,$$ - $$$,asian vegetarianfriendly glutenfreeoptions oslo
3,63920,Persian FoodBox,luxembourg,"[middleeastern, persian]",129.0,4.5,$,middleeastern persian luxembourg
4,94148,Palace Cafe,paris,"[european, french, bar, pub]",4752.0,4.0,$$ - $$$,european french bar pub paris
5,84958,Lezizel Manti - RiemArcaden,munich,[turkish],1361.0,5.0,$$ - $$$,turkish munich
6,35528,Pulvermuehle,hamburg,"[bar, pizza, diner]",149.0,4.5,$$ - $$$,bar pizza diner hamburg
7,23655,Hanedan,brussels,[turkish],1714.0,4.0,$$ - $$$,turkish brussels
8,117882,Bangkok-Vienna,vienna,"[asian, thai, vegetarianfriendly, veganoptions...",99.0,4.5,$$ - $$$,asian thai vegetarianfriendly veganoptions glu...
9,16011,Gogogi,berlin,"[barbecue, asian, korean, vegetarianfriendly, ...",2216.0,4.0,$$ - $$$,barbecue asian korean vegetarianfriendly vegan...


In [14]:
get_recommendations('Gogogi')

272                           Cooking Papa
86     Samadhi Vegan Vegetarian Restaurant
56                     Restaurant Tim Raue
216                               Melo-Jia
245                                 Orient
270                                Dosanko
8                           Bangkok-Vienna
49                          Malaysian Deli
97                               Thai Thai
125                        an an Asia Food
Name: Name, dtype: object

In [15]:
df[(df['Name'] == 'Gogogi') | (df['Name'] == 'Cooking Papa')]

Unnamed: 0,index,Name,City,Cuisine Style,Ranking,Rating,Price Range,soup
9,16011,Gogogi,berlin,"[barbecue, asian, korean, vegetarianfriendly, ...",2216.0,4.0,$$ - $$$,barbecue asian korean vegetarianfriendly vegan...
272,15356,Cooking Papa,berlin,"[asian, korean]",1561.0,4.5,$$ - $$$,asian korean berlin
