In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
import unicodedata
import datetime
import time

# Individual Restaurant Scrape

In [2]:
restaurant_url = 'https://guide.michelin.com/en/california/encino/restaurant/shin-sushi'

In [3]:
restaurant_page = requests.get(restaurant_url)

In [4]:
soup = BeautifulSoup(restaurant_page.text, 'html.parser')

In [5]:
soup.select('title')

[<title>Shin Sushi – Encino - a MICHELIN Guide Restaurant</title>,
 <title>MICHELIN Guide - the official website</title>]

In [6]:
star_dict = {
    'm': 1,
    'n': 2,
    'o': 3
}

utensil_dict = {
    '2': 0, #simple shop
    'ò': 1, #Comfortable
    'ó': 2, #Quite Comfortable
    'ô': 3, #Very Comfortable
    'õ': 4, #Top class comfortable
    'ö': 5, #Luxury in the traditional style
    'A': .5, #Pub serving good food
    'NA': 99
}


def num_stars(site):
    content = site.find('i', 'fa-michelin restaurant-details__classification--list-icon color-primary').contents
    letter = content[0]
    return star_dict[letter] 

In [7]:
def get_name(site):
    content = site.find('h2')
    name = content.contents[0]
    return pd.Series(name)

In [8]:
def get_address(site):
    content = site.find('ul', 'restaurant-details__heading--list').contents
    inner = content[1].contents
    
    if len(inner) > 2:
        inner = content[3].contents
    
    return pd.Series(inner[1])
    

In [9]:
def remove_space_lines(text):
    pattern1 = re.compile(r'\s\s+')
    return re.sub(pattern1, '', text)

In [10]:
def split_money(text):
    currency = text[-3:]
    
    money = text[:-3]    
    money = money.split('-')
    
    min_price = money[0]
    try:
        max_price = money[1]
    except:
        max_price = money[0]
    
    min_price = re.sub(',','', min_price) #Remove commas so they can be converted to floats
    max_price = re.sub(',','', max_price)
    
    min_price = float(min_price)
    max_price = float(max_price)
        
    
    money_series = pd.Series({'min_price': min_price, 'max_price': max_price, 'currency': currency})
    
    return money_series 

In [11]:
def get_price_cuisine(site):
    content = site.find('li', 'restaurant-details__heading-price').contents[0]
    content = remove_space_lines(content)
    content = content.strip()

    
    if re.match(r'.*•.*', content) == None:
        money_series = pd.Series({'min_price': 0, 'max_price': 0, 'currency': 'Not Available'})
        cuisine = pd.Series(content, index = ['cuisine'])
        
    else:
        content = content.split('•')
        money_series = split_money(content[0])
        cuisine = pd.Series(content[1], index = ['cuisine'])

    money_series = money_series.append(cuisine)

    return money_series

    


In [12]:
get_price_cuisine(soup)

min_price           50
max_price           75
currency           USD
cuisine       Japanese
dtype: object

In [13]:
def get_description(site):
    contents = site.find('div', 'js-show-description-text').contents[1].contents[0]
    return pd.Series(unicodedata.normalize('NFKD', contents))




In [14]:
def get_star_comfort(site):
    content = site.find_all('i', "fa-michelin restaurant-details__classification--list-icon color-primary")
    
    #If the restaurant is marked as "delightful," the utensils are also in 'primary' color:
    if len(content) == 2:
        star_content = content[0].contents
        star_letter = star_content[0]
        
        utensil_content = content[1]
        utensil_letter = utensil_content.contents[0]
        
        star_comfort_series = pd.Series({'star': star_dict[star_letter], 'comfort': utensil_dict[utensil_letter],
                                        'delightful': 1})
        

    #If the restaurant does not have 'delightful' distinction, utensils are black:
    if len(content) == 1:
        star_content = content[0].contents
        star_letter = star_content[0]        
        
        content_black = site.find_all('i', "fa-michelin restaurant-details__classification--list-icon color-black")
        
        if len(content_black) != 0:
            utensil_content = content_black[0]
            utensil_letter = utensil_content.contents[0]
        else:
            utensil_letter = 'NA'
            
        star_comfort_series = pd.Series({'star': star_dict[star_letter], 'comfort': utensil_dict[utensil_letter],
                                        'delightful': 0})
    
    
    return star_comfort_series
    
    

In [15]:
get_star_comfort(soup)

star          1
comfort       2
delightful    0
dtype: int64

In [16]:
def get_restaurant_servies(site):
    services = []
    content = site.find_all('div', "restaurant-details__services--content")
    
    for item in content:
        service = item.contents[2].strip()
        services.append(service)
        
    return pd.Series([services])

In [17]:
def determine_attributes(text):
    
    pattern = re.compile(r'.*\+.*')
    
    if re.match(pattern, text) == None:
        return 'website'
    else:
        return 'phone'


In [18]:
def get_restaurant_info(site):
    content = site.find_all('span', 'flex-fill')
    hours_content = site.find('div', 'open__time-hour flex-fill')
    
    
    
    if len(content) == 3:
        phone = content[0].contents[0]
        website = content[1].contents[0]
        hours = hours_content.contents[1].contents[0]
        
    elif len(content) == 2:
        if hours_content == None:
            hours = 'Not Available'
            phone = content[0].contents[0]
            website = content[1].contents[0]
    
        else:
            hours = hours_content.contents[1].contents[0]
            
            mystery = content[0].contents[0]
            
            if determine_attributes(mystery) == 'website':
                website = mystery
                phone = 'Not Available'
            else:
                website = 'Not Available'
                phone = mystery
            
    
    elif len(content) == 1:
        if hours_content == None:
            hours = 'Not Available'
            
            mystery = content[0].contents[0]
            
            if determine_attributes(mystery) == 'website':
                website = mystery
                phone = 'Not Available'
            else:
                website = 'Not Available'
                phone = mystery
        else:
            hours = hours_content.contents[1].contents[0]
            phone = 'Not Available'
            website = 'Not Available'
        
    
    else:
        phone = 'Not Available'
        website = 'Not Available'
        hours = 'Not Available'
    
    info_series = pd.Series({'phone': phone, 'website': website, 'hours': hours})
    
    return info_series   
    
    

In [19]:
content = soup.find_all('span', 'flex-fill')
content


[<span class="flex-fill" x-ms-format-detection="none">+1 818-616-4148</span>,
 <span class="flex-fill">Opening hours</span>]

In [20]:
get_restaurant_info(soup)

phone       +1 818-616-4148
website       Not Available
hours      Dinner Tue - Sat
dtype: object

In [21]:
def get_lon_lat(site):
    content = site.find_all('iframe')
    link = content[-1]['src']
    
    coordinates = link.split('=')[2]
    coordinates = coordinates.split(',')
    
    latitude = float(coordinates[0])
    longitude = float(coordinates[1])
    
    lon_lat_series = pd.Series({'lat': latitude, 'lon': longitude})
    
    return lon_lat_series
    
    
    
    

In [22]:
get_lon_lat(soup)

lat     34.15784
lon   -118.49415
dtype: float64

In [23]:
def get_restaurant_df(url):
    restaurant_page = requests.get(url) #request the html page
    soup = BeautifulSoup(restaurant_page.text, 'html.parser') #parse html page with BeautifulSoup
    
    name = get_name(soup)
    address = get_address(soup)
    price_cuisine = get_price_cuisine(soup)
    description = get_description(soup)
    star_comfort = get_star_comfort(soup)
    services = get_restaurant_servies(soup)
    info = get_restaurant_info(soup)
    location = get_lon_lat(soup)
    
    res_url = pd.Series(url)
    df = pd.concat([name, address, price_cuisine, description, star_comfort, services, info, location, res_url], axis = 0)
    df.index = ['name', 'address', 'min_price', 'max_price', 'currency', 'cuisine', 'description',
               'star', 'comfort', 'delightful', 'services', 'phone', 'website', 'hours', 'lat', 'lon', 'url']
    
    return df
    
    
    

In [24]:
get_restaurant_df(restaurant_url)

name                                                  Shin Sushi
address        16573 Ventura Blvd., Encino, 91436, United States
min_price                                                     50
max_price                                                     75
currency                                                     USD
cuisine                                                 Japanese
description    Set in a nondescript shopping center, this hig...
star                                                           1
comfort                                                        2
delightful                                                     0
services       [New establishment in the guide, Establishment...
phone                                            +1 818-616-4148
website                                            Not Available
hours                                           Dinner Tue - Sat
lat                                                      34.1578
lon                      

# Loop Through All Restaurants

In [25]:
url_head = 'https://guide.michelin.com'

In [34]:
michelin_star_restaurants = 'https://guide.michelin.com/en/restaurants/3-stars-michelin/2-stars-michelin/1-star-michelin/page/67'

In [35]:
def get_links(url):
    page = requests.get(url) #url of a main page listing restaurants
    soup = BeautifulSoup(page.text, 'html.parser')
    
    all_links = soup.find_all(class_ = 'link')
    links = []
    for item in all_links:
        links.append(url_head + item['href'])
    
    return links

In [37]:
len(get_links(michelin_star_restaurants))

0

In [29]:
def get_restaurants(links):
    
    #Gets DF for all restaurants on a 'main-page'
    
    dfs = []

    for restaurant_url in links:
        try:
            restaurant_data = get_restaurant_df(restaurant_url)
            dfs.append(restaurant_data)
        except:
            print('ERROR!')
            print('\n')
            print(restaurant_url)

            dfs.append(restaurant_url)

        time.sleep(.25)
        
    return dfs


In [39]:
all_restaurants = []
page_num = 0
while(True):
    
    page_num += 1
    page_url = 'https://guide.michelin.com/en/restaurants/3-stars-michelin/2-stars-michelin/1-star-michelin/page/{}'.format(page_num)
    
    page_links = get_links(page_url)
    
    if len(page_links) == 0:
        break
    
    add_restaurants = get_restaurants(page_links)
    
    print(page_num)
    all_restaurants.append(add_restaurants)
    

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66


In [40]:
restaurant_df = pd.DataFrame([])

for item in all_restaurants:
    restaurant_df = pd.concat([restaurant_df, pd.DataFrame(item)], axis = 0)

In [41]:
restaurant_df.reset_index(drop = True, inplace = True)
restaurant_df

Unnamed: 0,name,address,min_price,max_price,currency,cuisine,description,star,comfort,delightful,services,phone,website,hours,lat,lon,url
0,Shin Sushi,"16573 Ventura Blvd., Encino, 91436, United States",50.0,75.0,USD,Japanese,"Set in a nondescript shopping center, this hig...",1.0,2.0,0.0,"[New establishment in the guide, Establishment...",+1 818-616-4148,Not Available,Dinner Tue - Sat,34.157840,-118.494150,https://guide.michelin.com/en/california/encin...
1,Hayato,"1320 E. 7th St., Ste. 126, Los Angeles, 90001,...",75.0,150.0,USD,Japanese,The entrance can be a bit tricky to locate wit...,1.0,3.0,0.0,"[New establishment in the guide, Establishment...",+1 213-395-0607,www.hayatorestaurant.com,Dinner Tue - Sat,34.033110,-118.242650,https://guide.michelin.com/en/california/us-lo...
2,Angler,"132 The Embarcadero, San Francisco, 94101, Uni...",75.0,150.0,USD,Contemporary,Pitched as a more casual counterpart to the st...,1.0,3.0,0.0,"[Notable cocktail list, Wheelchair accessible,...",+1 415-872-9442,www.anglerrestaurants.com,Lunch Thu - Sat Dinner nightly,37.793167,-122.392130,https://guide.michelin.com/en/california/san-f...
3,Harbor House,"5600 CA-1, Elk, United States",75.0,150.0,USD,Californian,You’ll drive along winding roads past sloping ...,1.0,3.0,1.0,"[Notable wine list, New establishment in the g...",+1 800-720-7474,www.theharborhouseinn.com,Dinner Thu - Mon,39.135876,-123.719444,https://guide.michelin.com/en/california/elk/r...
4,Addison,"5200 Grand Del Mar Way, San Diego, 92130, Unit...",75.0,150.0,USD,Contemporary,"Among other luxuries, an evening at the Addiso...",1.0,5.0,0.0,"[Notable cocktail list, Wheelchair accessible,...",+1 858-314-1900,www.addisondelmar.com,Dinner Tue - Sat,32.939903,-117.200130,https://guide.michelin.com/en/california/us-sa...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,Candlenut,"Block 17A, Dempsey Road, Singapore",20.0,128.0,SGD,Peranakan,The high ceiling makes the dining room feel op...,1.0,1.0,0.0,"[Car park, Establishment totally or partly res...",+65 1800 304 2288,www.comodempsey.sg,"OPENING HOURS, LAST ORDER: Lunch: 12:00-15:00 ...",1.279771,103.840200,https://guide.michelin.com/en/singapore-region...
2615,Burnt Ends,"20 Teck Lim Road, Singapore",50.0,250.0,SGD,Barbecue,"An obvious choice for meat lovers, it is prize...",1.0,1.0,0.0,[Establishment totally or partly reserved for ...,+65 6224 3933,www.burntends.com.sg,"OPENING HOURS, LAST ORDER: Lunch: 11:45-14:00 ...",1.280501,103.841800,https://guide.michelin.com/en/singapore-region...
2616,Buona Terra,"29 Scotts Road, Singapore",48.0,168.0,SGD,Italian Contemporary,The entrance to this intimate restaurant in a ...,1.0,2.0,0.0,"[Car park, Notable wine list, Establishment to...",+65 6733 0209,www.buonaterra.com.sg,"OPENING HOURS, LAST ORDER: Lunch: 12:00-14:30 ...",1.310424,103.834800,https://guide.michelin.com/en/singapore-region...
2617,Béni,"333A Orchard Road #02-37 Mandarin Gallery, Sin...",88.0,258.0,SGD,French Contemporary,"In the Mandarin Gallery, this small, elegant r...",1.0,1.0,1.0,"[Car park, Establishment totally or partly res...",+65 9159 3177,www.beni-sg.com,"OPENING HOURS, LAST ORDER: Lunch: 12:00-13:30 ...",1.302379,103.836600,https://guide.michelin.com/en/singapore-region...


In [42]:
restaurant_df.to_csv('Michelin_Details_2019.csv')

# Error Debugging (Cleared)

In [451]:
error_series = []

for item in all_restaurants:
    for series in item:
        if len(series) != 15:
            error_series.append(url_head + series['href'])


In [523]:
error_series

['https://guide.michelin.com/en/hong-kong-region/hong-kong/restaurant/kam-s-roast-goose',
 'https://guide.michelin.com/en/hong-kong-region/hong-kong/restaurant/yat-lok',
 'https://guide.michelin.com/en/hong-kong-region/hong-kong/restaurant/tim-ho-wan-sham-shui-po',
 'https://guide.michelin.com/en/catalunya/barcelona/restaurant/tickets',
 'https://guide.michelin.com/en/galicia/santiago-de-compostela/restaurant/casa-marcelo',
 'https://guide.michelin.com/en/aragon/huesca/restaurant/tatau',
 'https://guide.michelin.com/en/catalunya/la-barceloneta/restaurant/dos-palillos',
 'https://guide.michelin.com/en/bangkok-region/bangkok/restaurant/jay-fai',
 'https://guide.michelin.com/en/toscana/viareggio/restaurant/lunasia500267',
 'https://guide.michelin.com/en/clare/lios-duin-bhearna-lisdoonvarna/restaurant/wild-honey-inn',
 'https://guide.michelin.com/en/warwickshire/kenilworth/restaurant/the-cross-at-kenilworth',
 'https://guide.michelin.com/en/greater-london/fulham/restaurant/harwood-arms',
 

In [524]:
test = get_restaurants(error_series)

In [525]:
test

[address                   226 Hennessy Road, Wan Chai, Hong Kong
 min_price                                                      0
 max_price                                                      0
 currency                                           Not Available
 cuisine                            64-200HKD•CantoneseRoastMeats
 description    The Kam family name is synonymous with their f...
 star                                                           1
 comfort                                                        0
 delightful                                                     0
 services       [Air conditioning, Cash only, Establishment to...
 phone                                             +852 2520 1110
 website                                           www.krg.com.hk
 hours                                      Closed: 24-29 January
 lat                                                      22.2778
 lon                                                      114.176
 dtype: ob

# CSV Debug

In [21]:
data = pd.read_csv('Michelin_Details_2019.csv', index_col=0)

In [26]:
website_error = data['website'][data['website'] == 'Opening hours'].index

In [29]:
data.loc[website_error]

Unnamed: 0,address,min_price,max_price,currency,cuisine,description,star,comfort,delightful,services,phone,website,hours,lat,lon
0,"16573 Ventura Blvd., Encino, 91436, United States",50,75,USD,Japanese,"Set in a nondescript shopping center, this hig...",1.0,2.0,0.0,"['New establishment in the guide', 'Establishm...",+1 818-616-4148,Opening hours,Dinner Tue - Sat,34.157840,-118.494150
13,"1315 3rd St. Promenade, Ste. K, Santa Monica, ...",75,150,USD,Contemporary,Secreted away in The Gallery Food Hall (a remo...,1.0,2.0,0.0,"['New establishment in the guide', 'Establishm...",www.dialoguerestaurant.com,Opening hours,Dinner Tue - Sat,34.016790,-118.497480
25,"218 N. Rodeo Dr., Beverly Hills, 90210, United...",75,150,USD,Japanese,Welcome to one of the most expensive dining ex...,2.0,2.0,0.0,"['New establishment in the guide', 'Establishm...",+1 310-247-8939,Opening hours,Dinner Tue - Sat,34.067352,-118.400670
38,"Rua Manoel da Nóbrega 76, Loja 12, São Paulo, ...",120,322,BRL,Japanese,"This miniscule, secluded Japanese restaurant i...",1.0,1.0,0.0,"['Air conditioning', 'Establishment totally or...",+55 11 3266-3819,Opening hours,"Closed 25 December, 31 December-7 January, Car...",-23.568400,-46.649360
44,"Rua Lisboa 55, São Paulo, 05413-000, Brazil",165,365,BRL,Japanese,Leonardo Jun Sakamoto is not just a sushi chef...,1.0,2.0,0.0,"['Air conditioning', 'Establishment totally or...",+55 11 3088-6019,Opening hours,Closed 24 December-11 January and Sunday,-23.563180,-46.677370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2033,"104 E. 30th St., New York, 10016, United States",205,205,USD,Korean,The opening of Atomix in 2018 brought about a ...,2.0,3.0,1.0,"['Air conditioning', 'Notable cocktail list', ...",www.atomixnyc.com,Opening hours,"Closed: Monday, Tuesday-Saturday lunch, Sunday",40.744306,-73.982675
2060,"17 W. 20th St., New York, 10011, United States",200,200,USD,Japanese,Make your way past the front bar area and soli...,1.0,2.0,0.0,"['Air conditioning', 'Notable cocktail list', ...",www.odo.nyc,Opening hours,"Closed: Monday, Tuesday-Saturday lunch, Sunday",40.740705,-73.992990
2108,"4 Redchurch Street, Shoreditch, E1 6JL, United...",25,65,GBP,Traditional British,Brat takes inspiration from the cooking styles...,1.0,1.0,0.0,"['Air conditioning', 'Establishment totally or...",www.bratrestaurant.com,Opening hours,Closed: Sunday dinner,51.524260,-0.076989
2294,"953 W. Fulton Market, Chicago, 60607, United S...",200,200,USD,Contemporary,Whether you come to experience the cuisine of ...,1.0,2.0,0.0,"['Air conditioning', 'Establishment totally or...",www.nextrestaurant.com,Opening hours,"Closed: Monday, Tuesday, Wednesday-Sunday lunch",41.886692,-87.651881
