In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
import unicodedata
import datetime
import time

# Individual Restaurant Scrape

In [629]:
restaurant_url = 'https://guide.michelin.com/en/galicia/santiago-de-compostela/restaurant/casa-marcelo'

In [630]:
restaurant_page = requests.get(restaurant_url)

In [631]:
soup = BeautifulSoup(restaurant_page.text, 'html.parser')

In [632]:
soup.select('title')

[<title>Casa Marcelo – Santiago de Compostela - a MICHELIN Guide Restaurant</title>,
 <title>MICHELIN Guide - the official website</title>]

In [633]:
star_dict = {
    'm': 1,
    'n': 2,
    'o': 3
}

utensil_dict = {
    '2': 0, #simple shop
    'ò': 1, #Comfortable
    'ó': 2, #Quite Comfortable
    'ô': 3, #Very Comfortable
    'õ': 4, #Top class comfortable
    'ö': 5, #Luxury in the traditional style
    'A': .5, #Pub serving good food
    'NA': 99
}


def num_stars(site):
    content = site.find('i', 'fa-michelin restaurant-details__classification--list-icon color-primary').contents
    letter = content[0]
    return star_dict[letter] 

In [634]:
def get_address(site):
    content = site.find('ul', 'restaurant-details__heading--list').contents
    inner = content[1].contents
    
    if len(inner) > 2:
        inner = content[3].contents
    
    return pd.Series(inner[1])
    

In [635]:
def remove_space_lines(text):
    pattern1 = re.compile(r'\s\s+')
    return re.sub(pattern1, '', text)

In [636]:
def split_money(text):
    currency = text[-3:]
    
    money = text[:-3]    
    money = money.split('-')
    
    min_price = money[0]
    try:
        max_price = money[1]
    except:
        max_price = money[0]
    
    money_series = pd.Series({'min_price': min_price, 'max_price': max_price, 'currency': currency})
    
    return money_series 

In [637]:
def get_price_cuisine(site):
    content = site.find('li', 'restaurant-details__heading-price').contents[0]
    content = remove_space_lines(content)
    content = content.strip()

    
    if re.match(r'.*•.*', content) == None:
        money_series = pd.Series({'min_price': 0, 'max_price': 0, 'currency': 'Not Available'})
        cuisine = pd.Series(content, index = ['cuisine'])
        
    else:
        content = content.split('•')
        money_series = split_money(content[0])
        cuisine = pd.Series(content[1], index = ['cuisine'])

    money_series = money_series.append(cuisine)

    return money_series

    


In [638]:
get_price_cuisine(soup)

min_price                0
max_price                0
currency     Not Available
cuisine             Fusion
dtype: object

In [639]:
def get_description(site):
    contents = site.find('div', 'js-show-description-text').contents[1].contents[0]
    return pd.Series(unicodedata.normalize('NFKD', contents))




In [640]:
def get_star_comfort(site):
    content = site.find_all('i', "fa-michelin restaurant-details__classification--list-icon color-primary")
    
    #If the restaurant is marked as "delightful," the utensils are also in 'primary' color:
    if len(content) == 2:
        star_content = content[0].contents
        star_letter = star_content[0]
        
        utensil_content = content[1]
        utensil_letter = utensil_content.contents[0]
        
        star_comfort_series = pd.Series({'star': star_dict[star_letter], 'comfort': utensil_dict[utensil_letter],
                                        'delightful': 1})
        

    #If the restaurant does not have 'delightful' distinction, utensils are black:
    if len(content) == 1:
        star_content = content[0].contents
        star_letter = star_content[0]        
        
        content_black = site.find_all('i', "fa-michelin restaurant-details__classification--list-icon color-black")
        
        if len(content_black) != 0:
            utensil_content = content_black[0]
            utensil_letter = utensil_content.contents[0]
        else:
            utensil_letter = 'NA'
            
        star_comfort_series = pd.Series({'star': star_dict[star_letter], 'comfort': utensil_dict[utensil_letter],
                                        'delightful': 0})
    
    
    return star_comfort_series
    
    

In [641]:
get_star_comfort(soup)

star           1
comfort       99
delightful     0
dtype: int64

In [642]:
def get_restaurant_servies(site):
    services = []
    content = site.find_all('div', "restaurant-details__services--content")
    
    for item in content:
        service = item.contents[2].strip()
        services.append(service)
        
    return pd.Series([services])

In [643]:
def get_restaurant_info(site):
    content = site.find_all('span', 'flex-fill')
    hours_content = site.find('div', 'open__time-hour flex-fill')
    
    try:
        phone = content[0].contents[0]
    except:
        phone = 'Not Available'
    try:
        website = content[1].contents[0]
    except:
        website = 'Not Available'
    try:
        hours = hours_content.contents[1].contents[0]
    except:
        hours = 'Not Available'

    info_series = pd.Series({'phone': phone, 'website': website, 'hours': hours})
    
    return info_series
    
    

In [644]:
get_restaurant_info(soup)

phone                                       +34 981 55 85 80
website                                  www.casamarcelo.net
hours      Closed: 15-29 February, 28 May-4 June, 15-30 N...
dtype: object

In [645]:
def get_lon_lat(site):
    content = site.find_all('iframe')
    link = content[-1]['src']
    
    coordinates = link.split('=')[2]
    coordinates = coordinates.split(',')
    
    latitude = float(coordinates[0])
    longitude = float(coordinates[1])
    
    lon_lat_series = pd.Series({'lat': latitude, 'lon': longitude})
    
    return lon_lat_series
    
    
    
    

In [646]:
get_lon_lat(soup)

lat    42.880834
lon    -8.546647
dtype: float64

In [647]:
def get_restaurant_df(url):
    restaurant_page = requests.get(url) #request the html page
    soup = BeautifulSoup(restaurant_page.text, 'html.parser') #parse html page with BeautifulSoup
    
    address = get_address(soup)
    price_cuisine = get_price_cuisine(soup)
    description = get_description(soup)
    star_comfort = get_star_comfort(soup)
    services = get_restaurant_servies(soup)
    info = get_restaurant_info(soup)
    location = get_lon_lat(soup)
    
    df = pd.concat([address, price_cuisine, description, star_comfort, services, info, location], axis = 0)
    df.index = ['address', 'min_price', 'max_price', 'currency', 'cuisine', 'description',
               'star', 'comfort', 'delightful', 'services', 'phone', 'website', 'hours', 'lat', 'lon']
    
    return df
    
    
    

In [648]:
get_restaurant_df(restaurant_url)

address           Hortas 1, Santiago de Compostela, 15705, Spain
min_price                                                      0
max_price                                                      0
currency                                           Not Available
cuisine                                                   Fusion
description    This attractive gastro - bar is located just a...
star                                                           1
comfort                                                       99
delightful                                                     0
services       [Air conditioning, Establishment totally or pa...
phone                                           +34 981 55 85 80
website                                      www.casamarcelo.net
hours          Closed: 15-29 February, 28 May-4 June, 15-30 N...
lat                                                      42.8808
lon                                                     -8.54665
dtype: object

# Loop Through All Restaurants

In [526]:
url_head = 'https://guide.michelin.com'

In [527]:
michelin_star_restaurants = 'https://guide.michelin.com/en/restaurants/3-stars-michelin/2-stars-michelin/1-star-michelin/page/5'

In [649]:
def get_links(url):
    page = requests.get(url) #url of a main page listing restaurants
    soup = BeautifulSoup(page.text, 'html.parser')
    
    all_links = soup.find_all(class_ = 'link')
    links = []
    for item in all_links:
        links.append(url_head + item['href'])
    
    return links

In [650]:
#get_links(michelin_star_restaurants)

In [651]:
def get_restaurants(links):
    dfs = []

    for restaurant_url in links:
        try:
            restaurant_data = get_restaurant_df(restaurant_url)
            dfs.append(restaurant_data)
        except:
            print('ERROR!')
            dfs.append(restaurant_url)

        time.sleep(.25)
        
    return dfs


In [537]:
all_restaurants = []

page_url = 'https://guide.michelin.com/en/restaurants/3-stars-michelin/2-stars-michelin/1-star-michelin/page/1'
get_restaurants(get_links(page_url))



[address        16573 Ventura Blvd., Encino, 91436, United States
 min_price                                                      0
 max_price                                                      0
 currency                                           Not Available
 cuisine                                        50-75USD•Japanese
 description    Set in a nondescript shopping center, this hig...
 star                                                           1
 comfort                                                        2
 delightful                                                     0
 services       [New establishment in the guide, Establishment...
 phone                                            +1 818-616-4148
 website                                            Opening hours
 hours                                           Dinner Tue - Sat
 lat                                                      34.1578
 lon                                                     -118.494
 dtype: ob

In [652]:
all_restaurants = []
page_num = 0
while(page_num <= 61):
    page_num += 1
    page_url = 'https://guide.michelin.com/en/restaurants/3-stars-michelin/2-stars-michelin/1-star-michelin/page/{}'.format(page_num)
    
    page_links = get_links(page_url)
    print(page_num)
    add_restaurants = get_restaurants(page_links)
    
    if len(add_restaurants) == 0:
        break
    
    all_restaurants.append(add_restaurants)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61


In [666]:
restaurant_df = pd.DataFrame([])

for item in all_restaurants:
    restaurant_df = pd.concat([restaurant_df, pd.DataFrame(item)], axis = 0)

In [668]:
restaurant_df.reset_index(drop = True, inplace = True)
restaurant_df

In [670]:
restaurant_df.to_csv('Michelin_Details_2019.csv')

# Error Debugging (Cleared)

In [451]:
error_series = []

for item in all_restaurants:
    for series in item:
        if len(series) != 15:
            error_series.append(url_head + series['href'])


In [523]:
error_series

['https://guide.michelin.com/en/hong-kong-region/hong-kong/restaurant/kam-s-roast-goose',
 'https://guide.michelin.com/en/hong-kong-region/hong-kong/restaurant/yat-lok',
 'https://guide.michelin.com/en/hong-kong-region/hong-kong/restaurant/tim-ho-wan-sham-shui-po',
 'https://guide.michelin.com/en/catalunya/barcelona/restaurant/tickets',
 'https://guide.michelin.com/en/galicia/santiago-de-compostela/restaurant/casa-marcelo',
 'https://guide.michelin.com/en/aragon/huesca/restaurant/tatau',
 'https://guide.michelin.com/en/catalunya/la-barceloneta/restaurant/dos-palillos',
 'https://guide.michelin.com/en/bangkok-region/bangkok/restaurant/jay-fai',
 'https://guide.michelin.com/en/toscana/viareggio/restaurant/lunasia500267',
 'https://guide.michelin.com/en/clare/lios-duin-bhearna-lisdoonvarna/restaurant/wild-honey-inn',
 'https://guide.michelin.com/en/warwickshire/kenilworth/restaurant/the-cross-at-kenilworth',
 'https://guide.michelin.com/en/greater-london/fulham/restaurant/harwood-arms',
 

In [524]:
test = get_restaurants(error_series)

In [525]:
test

[address                   226 Hennessy Road, Wan Chai, Hong Kong
 min_price                                                      0
 max_price                                                      0
 currency                                           Not Available
 cuisine                            64-200HKD•CantoneseRoastMeats
 description    The Kam family name is synonymous with their f...
 star                                                           1
 comfort                                                        0
 delightful                                                     0
 services       [Air conditioning, Cash only, Establishment to...
 phone                                             +852 2520 1110
 website                                           www.krg.com.hk
 hours                                      Closed: 24-29 January
 lat                                                      22.2778
 lon                                                      114.176
 dtype: ob