# Data Exploration Notebook

In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Read michelin.csv into a DataFrame; the path is relative, so the file must
# sit in the notebook's working directory.
df = pd.read_csv('michelin.csv')
# Preview the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,Name,Address,Location,Price,Cuisine,Longitude,Latitude,PhoneNumber,Url,WebsiteUrl,Award,GreenStar,FacilitiesAndServices,Description
0,Taïrroir,"6F, 299 Lequn 3rd Road, Zhongshan District, Ta...","Taipei, Taiwan",$$$$,Taiwanese contemporary,121.559303,25.082896,886285000000.0,https://guide.michelin.com/en/taipei-region/ta...,https://www.tairroir.com/,3 Stars,0,"Air conditioning,Wheelchair access","A portmanteau of Taiwan and terroir, Taïrroir ..."
1,JL Studio,"2F, 689, Section 4, Yifeng Road, Nantun Distri...","Taichung, Taiwan",$$$$,"Singaporean, Contemporary",120.62852,24.150486,886423800000.0,https://guide.michelin.com/en/taichung-region/...,https://jlstudiotw.com,3 Stars,0,"Air conditioning,Car park,Wheelchair access","JL stands for Jimmy Lim, a Singaporean chef wh..."
2,Le Palais,"17F, Palais de Chine Hotel, 3, Section 1, Chen...","Taipei, Taiwan",$$$$,Cantonese,121.516889,25.049163,886221800000.0,https://guide.michelin.com/en/taipei-region/ta...,https://www.palaisdechinehotel.com/pdc-en/page...,3 Stars,0,"Air conditioning,Car park,Wheelchair access","After the departure of the Macanese chef, the ..."
3,Addison,"5200 Grand Del Mar Way, San Diego, 92130, USA","San Diego, USA",$$$$,"Contemporary, Californian",-117.198891,32.941297,18583140000.0,https://guide.michelin.com/en/california/us-sa...,https://www.addisondelmar.com/,3 Stars,0,"Air conditioning,Car park,Garden or park,Inter...",Chef William Bradley has helmed the stoves at ...
4,Atelier Crenn,"3127 Fillmore St., San Francisco, 94123, USA","San Francisco, USA",$$$$,"Contemporary, French",-122.43586,37.79835,14154400000.0,https://guide.michelin.com/en/california/san-f...,https://www.ateliercrenn.com/,3 Stars,1,"Air conditioning,Interesting wine list,Wheelch...",At the hands of accomplished Chef Dominique Cr...


In [3]:
# List the column labels as they were read from the file.
df.columns

Index(['Name', 'Address', 'Location', 'Price', 'Cuisine', 'Longitude',
       'Latitude', 'PhoneNumber', 'Url', 'WebsiteUrl', 'Award', 'GreenStar',
       'FacilitiesAndServices', 'Description'],
      dtype='object')

In [4]:
# Normalise every column label to lower case so that later cells can refer
# to columns without worrying about capitalisation.
df.columns = [label.lower() for label in df.columns]

In [5]:
# Print dtypes and non-null counts per column to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15520 entries, 0 to 15519
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   name                   15520 non-null  object 
 1   address                15520 non-null  object 
 2   location               15520 non-null  object 
 3   price                  15519 non-null  object 
 4   cuisine                15519 non-null  object 
 5   longitude              15520 non-null  float64
 6   latitude               15520 non-null  float64
 7   phonenumber            15182 non-null  float64
 8   url                    15520 non-null  object 
 9   websiteurl             13245 non-null  object 
 10  award                  15520 non-null  object 
 11  greenstar              15520 non-null  int64  
 12  facilitiesandservices  14665 non-null  object 
 13  description            15519 non-null  object 
dtypes: float64(3), int64(1), object(10)
memory usage: 1.7+

In [6]:
# Summary statistics for the numeric columns.
# NOTE(review): phonenumber was parsed as float64, so it appears here even
# though its mean/std are not meaningful quantities.
df.describe()

Unnamed: 0,longitude,latitude,phonenumber,greenstar
count,15520.0,15520.0,15182.0,15520.0
mean,16.719998,40.212486,563531400000.0,0.015528
std,61.550179,13.974963,9912866000000.0,0.123645
min,-123.719064,-34.626786,35223690.0,0.0
25%,-0.369541,35.725245,33145330000.0,0.0
50%,6.730856,43.919181,37068550000.0,0.0
75%,15.603101,48.864412,390430100000.0,0.0
max,139.825702,66.166565,971971600000000.0,1.0


## Preprocessing

In [7]:
# Split "City, Country" into separate city and country columns.
# - rsplit with n=1 cuts only at the LAST comma, so the result always has at
#   most two parts and a city name that itself contains a comma cannot break
#   the assignment.
# - strip() removes the blank that follows the comma; without it the country
#   would be stored as " Taiwan" instead of "Taiwan".
# Rows whose location contains no comma keep the whole text as city and get a
# missing country.
location_parts = df['location'].str.rsplit(',', n=1, expand=True)
df['city'] = location_parts[0].str.strip()
df['country'] = location_parts[1].str.strip()

In [8]:
# The price is a run of one repeated currency symbol (e.g. "$$$$").
# Store the symbol itself and the length of the run as separate columns so
# they are easier to reference later.
price_text = df['price']
df['price_currency'] = price_text.str[0]
df['price_amount'] = price_text.str.len()

In [9]:
# Inspect the raw award labels (e.g. "3 Stars", "Selected Restaurants").
df['award']

0                     3 Stars
1                     3 Stars
2                     3 Stars
3                     3 Stars
4                     3 Stars
                 ...         
15515    Selected Restaurants
15516    Selected Restaurants
15517    Selected Restaurants
15518    Selected Restaurants
15519    Selected Restaurants
Name: award, Length: 15520, dtype: object

In [10]:
# Take the first run of digits in the award label as the star count; labels
# without any digit (e.g. "Selected Restaurants") become 0 stars.
star_digits = df['award'].str.extract(r'(\d+)', expand=False)
df['award_stars'] = pd.to_numeric(star_digits).fillna(0).astype(int)


In [11]:
# Facilities and cuisines are stored as comma-separated lists inside a single
# string; expand each list into one indicator column per distinct item.
#
# Whitespace around the commas is removed first. Cuisines are written with a
# blank after the comma ("Contemporary, Californian"), so splitting on a bare
# "," would create "cuisine_Contemporary" and "cuisine_ Contemporary" as two
# different columns for the same cuisine. Missing values stay missing and end
# up as all-zero rows.
extras_tokens = df['facilitiesandservices'].str.strip().str.replace(r'\s*,\s*', ',', regex=True)
extras_dummies = extras_tokens.str.get_dummies(sep=',').add_prefix('extras_')
df_with_extra_cats = pd.concat([df, extras_dummies], axis=1)

cuisine_tokens = df['cuisine'].str.strip().str.replace(r'\s*,\s*', ',', regex=True)
cuisine_dummies = cuisine_tokens.str.get_dummies(sep=',').add_prefix('cuisine_')
df_with_cuisine_cats = pd.concat([df_with_extra_cats, cuisine_dummies], axis=1)

In [12]:
# Alias, not a copy: df_final refers to the same DataFrame object as
# df_with_cuisine_cats (base columns plus the extras_/cuisine_ indicators).
df_final = df_with_cuisine_cats

## Explore

In [13]:
# Bar chart of the 15 countries with the most starred restaurants
# (restaurants holding at least one star).
has_star = df['award_stars'] > 0
df_grouped = df[has_star].groupby('country').size().reset_index(name='count')
df_grouped_top_n = df_grouped.sort_values('count', ascending=False).head(15)
fig = px.bar(df_grouped_top_n, x='country', y='count')
fig.show()

In [14]:
# Bar chart of the 15 cities with the most starred restaurants
# (restaurants holding at least one star).
has_star = df['award_stars'] > 0
df_grouped = df[has_star].groupby('city').size().reset_index(name='count')
df_grouped_top_n = df_grouped.sort_values('count', ascending=False).head(15)
fig = px.bar(df_grouped_top_n, x='city', y='count')
fig.show()

In [15]:
# Interaction of price and stars: rows are the number of price symbols,
# columns are the star count, and each cell is the number of restaurants.
cross_tab = pd.crosstab(index=df['price_amount'], columns=df['award_stars'])

fig = px.imshow(cross_tab, text_auto=True)
fig.show()

In [16]:
# World map of every restaurant holding at least one star, coloured by its
# star count; hovering a point shows the restaurant name.
starred_restaurants = df[df['award_stars'] > 0]
fig = px.scatter_geo(
    starred_restaurants,
    lat="latitude",
    lon='longitude',
    color="award_stars",
    hover_name="name",
    projection="natural earth",
)
fig.show()