In [1]:
import re
import unicodedata
from urllib.parse import parse_qs, urlparse

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [80]:
# Michelin Guide page of one restaurant; this is the page requested and parsed in the cells below.
restaurant_url = 'https://guide.michelin.com/en/california/san-francisco/restaurant/omakase'

In [81]:
# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() fails loudly on a 4xx/5xx answer
# instead of letting an error page be parsed as if it were restaurant data.
restaurant_page = requests.get(restaurant_url, timeout=30)
restaurant_page.raise_for_status()

In [82]:
# Parse the response body into a BeautifulSoup tree using the 'html.parser' backend.
soup = BeautifulSoup(restaurant_page.text, 'html.parser')

In [83]:
# Show the whole parsed document (the output is the full page HTML, so it is very long).
soup


<!DOCTYPE html>

<html class="full-screen-mobile" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0, user-scalable=0" name="viewport"/>
<meta content="" name="author"/>
<meta content="#bd2333" name="theme-color"/>
<meta content="vFUp7XI3G-xKLevkT2qftuscMInOxirdCu0oYr7_uLM" name="google-site-verification">
<meta content="eZwPBJ-0r-9qysyJy_aEPti-m_vr-h_oUJy2qUR1144" name="google-site-verification">
<meta content="https://d3h1lg3ksw6i6b.cloudfront.net/guide/xlarge/477464_5.jpg" itemprop="image">
<meta content="Article" property="og:type">
<meta content="MICHELIN Guide" property="og:site_name"/>
<meta content="https://guide.michelin.com/en/california/san-francisco/restaurant/omakase" property="og:url"/>
<meta content="https://d3h1lg3ksw6i6b.cloudfront.net/guide/xlarge/477464_5.jpg" property="og:image"/>
<meta content="Omakase – San Francisco - a MICHELIN Guide Restaurant" itemprop="name"/>
<meta content="Omakase – San Francisco - a MICHELIN Gui

In [84]:
# List every <title> element found in the page.
soup.select('title')

[<title>Omakase – San Francisco - a MICHELIN Guide Restaurant</title>,
 <title>MICHELIN Guide - the official website</title>]

In [85]:
# Letter found inside a star icon element -> number of stars it stands for
# ('m' = 1, 'n' = 2, 'o' = 3).
star_dict = {glyph: stars for stars, glyph in enumerate('mno', start=1)}

# Character found inside a comfort ("utensil") icon element -> comfort level 0-5.
# NOTE(review): the first key is the integer 2, while every other key is a
# one-character string. Values read out of parsed HTML are strings, so this
# entry presumably can never match a lookup -- confirm the intended glyph.
utensil_dict = {
    2: 0,  # Simple shop
    'ò': 1,  # Comfortable
    'ó': 2,  # Quite comfortable
    'ô': 3,  # Very comfortable
    'õ': 4,  # Top class comfortable
    'ö': 5  # Luxury in the traditional style
}


def num_stars(site):
    """Return the star count encoded by the first primary-coloured rating icon.

    Args:
        site: Parsed page (BeautifulSoup-like object exposing ``find``).

    Returns:
        The ``star_dict`` value for the first child of the first ``<i>``
        element carrying the primary-colour classification icon classes.

    Raises:
        AttributeError: If no such icon exists (``find`` returned ``None``).
        KeyError: If the icon's letter is not a key of ``star_dict``.
    """
    icon = site.find('i', 'fa-michelin restaurant-details__classification--list-icon color-primary')
    # The icon's first child is the single letter that encodes the rating.
    return star_dict[icon.contents[0]]

In [86]:
def get_address(site):
    """Return the address entry from a restaurant page's heading list.

    Args:
        site: Parsed page (BeautifulSoup-like object exposing ``find``).

    Returns:
        The second child of the second child of the
        ``ul.restaurant-details__heading--list`` element. It is returned
        exactly as stored in the tree; nothing is stripped.

    Raises:
        AttributeError: If the heading list is missing from the page.
        IndexError: If the list has fewer children than expected.
    """
    # Read from the `site` argument (not a notebook-level variable) so the
    # function describes whichever page the caller passes in and does not
    # depend on names left over in the kernel.
    heading_children = site.find('ul', 'restaurant-details__heading--list').contents

    # Index 1 is taken as the address item; index 0 is presumably a whitespace
    # text node in the real markup -- confirm against the page.
    address_item = heading_children[1]
    return address_item.contents[1]
    

In [87]:
def remove_space_lines(text):
    """Return ``text`` with every whitespace character removed.

    Spaces, tabs and line breaks are all deleted; every other character,
    including ``+``, is kept.

    Args:
        text: String to clean.

    Returns:
        str: ``text`` without any whitespace.
    """
    # `\s+` matches runs of whitespace only. Writing the quantifier inside a
    # character class (`[\s\s+]`) would also delete literal '+' characters.
    return re.sub(r'\s+', '', text)

In [88]:
def split_money(text):
    """Split a compact price string such as ``'75-150USD'`` into its parts.

    The last three characters are taken as the currency code; the rest is
    split on ``'-'`` into the lower and upper price. When only one price is
    present (no ``'-'``), it is used for both bounds.

    Args:
        text: Whitespace-free price string ending in a three-letter currency.

    Returns:
        pandas.Series: ``min_price``, ``max_price`` and ``currency``, all kept
        as strings.
    """
    currency = text[-3:]
    bounds = text[:-3].split('-')

    min_price = bounds[0]
    # A single listed price has no separator; fall back to it for the upper
    # bound instead of failing with an IndexError.
    max_price = bounds[1] if len(bounds) > 1 else min_price

    return pd.Series({'min_price': min_price, 'max_price': max_price, 'currency': currency})

In [89]:
def get_price_cuisine(site):
    """Return the price range, currency and cuisine shown in the page heading.

    Args:
        site: Parsed page (BeautifulSoup-like object exposing ``find``).

    Returns:
        pandas.Series: The entries produced by ``split_money`` followed by a
        ``cuisine`` entry.

    Raises:
        AttributeError: If the price element is missing from the page.
        IndexError: If the text holds no ``'•'`` separator.
    """
    # Read from the `site` argument so the result describes the page that was
    # passed in rather than whatever `soup` happens to hold in the kernel.
    raw_text = site.find('li', 'restaurant-details__heading-price').contents[0]

    # Drop all whitespace, then split "<price part>•<cuisine>" on the bullet.
    compact = remove_space_lines(raw_text.strip())
    fields = compact.split('•')

    money_series = split_money(fields[0])
    cuisine = pd.Series(fields[1], index=['cuisine'])

    # Series.append was removed in pandas 2.0; pd.concat builds the same Series.
    return pd.concat([money_series, cuisine])
    
    


In [90]:
# Try the helper on the sample page parsed above.
get_price_cuisine(soup)

min_price          75
max_price         150
currency          USD
cuisine      Japanese
dtype: object

In [91]:
def get_description(site):
    """Return the restaurant description text, NFKD-normalised.

    Args:
        site: Parsed page (BeautifulSoup-like object exposing ``find``).

    Returns:
        str: First child of the second child of the
        ``div.js-show-description-text`` element, passed through
        ``unicodedata.normalize('NFKD', ...)`` (which, for example, turns a
        non-breaking space into a plain space).

    Raises:
        AttributeError: If the description element is missing from the page.
    """
    # Read from the `site` argument, not the notebook-level `soup`, so the
    # description belongs to the page the caller passed in.
    description_block = site.find('div', 'js-show-description-text')
    raw_text = description_block.contents[1].contents[0]
    return unicodedata.normalize('NFKD', raw_text)




In [95]:
def get_star_comfort(site):
    """Return the star count, comfort level and "delightful" flag of a page.

    The page is searched for primary-coloured classification icons:

    * two icons: the first is the star icon and the second the utensil
      (comfort) icon, and the restaurant is flagged ``delightful = 1``;
    * one icon: it is the star icon, the utensil icon is the first
      black-coloured classification icon, and ``delightful = 0``.

    Args:
        site: Parsed page (BeautifulSoup-like object exposing ``find_all``).

    Returns:
        pandas.Series: ``star``, ``comfort`` and ``delightful``.

    Raises:
        ValueError: If the page has neither one nor two primary-coloured icons.
        IndexError: If a single primary icon is found but no black icon exists.
        KeyError: If an icon letter is missing from ``star_dict`` /
            ``utensil_dict``.
    """
    icon_class = "fa-michelin restaurant-details__classification--list-icon"
    primary_icons = site.find_all('i', icon_class + " color-primary")

    if len(primary_icons) == 2:
        # "Delightful" restaurants show the utensils in the primary colour too.
        utensil_icon = primary_icons[1]
        delightful = 1
    elif len(primary_icons) == 1:
        # Without the distinction the utensils are rendered in black.
        black_icons = site.find_all('i', icon_class + " color-black")
        utensil_icon = black_icons[0]
        delightful = 0
    else:
        # Fail with a clear message instead of reaching the return statement
        # with no result assigned.
        raise ValueError(
            'expected 1 or 2 primary-coloured classification icons, found %d'
            % len(primary_icons)
        )

    star_letter = primary_icons[0].contents[0]
    utensil_letter = utensil_icon.contents[0]

    return pd.Series({'star': star_dict[star_letter], 'comfort': utensil_dict[utensil_letter],
                      'delightful': delightful})
    
    

In [134]:
def get_restaurant_servies(site):
    """Return the service labels listed on a restaurant page.

    Args:
        site: Parsed page (BeautifulSoup-like object exposing ``find_all``).

    Returns:
        list[str]: For every ``div`` with class
        ``restaurant-details__services--content``, its third child with
        surrounding whitespace stripped, in document order. Empty when the
        page lists no services.
    """
    # Search the `site` argument, not the notebook-level `soup`, so the list
    # always describes the page that was passed in.
    service_blocks = site.find_all('div', "restaurant-details__services--content")

    # contents[2] holds the label text (presumably it follows an icon element
    # in the real markup -- confirm against the page).
    return [block.contents[2].strip() for block in service_blocks]

In [160]:
def get_restaurant_info(site):
    """Collect the phone number, website and opening hours from a page.

    Args:
        site: Parsed page (BeautifulSoup-like object exposing ``find_all``
            and ``find``).

    Returns:
        pandas.Series: ``phone`` and ``website`` (first child of the first and
        second ``span.flex-fill`` elements) and ``hours`` (first child of the
        second child of the ``div`` with classes ``open__time-hour flex-fill``).
    """
    detail_spans = site.find_all('span', 'flex-fill')
    hours_block = site.find('div', 'open__time-hour flex-fill')

    # Entries are evaluated top to bottom: phone, website, then hours.
    fields = {
        'phone': detail_spans[0].contents[0],
        'website': detail_spans[1].contents[0],
        'hours': hours_block.contents[1].contents[0],
    }
    return pd.Series(fields)
    
    

In [166]:
# Try the helper on the sample page parsed above.
get_restaurant_info(soup)

phone        +1 415-865-0633
website    www.omakasesf.com
hours         Dinner nightly
dtype: object

In [179]:
def get_lon_lat(site):
    """Return the coordinates embedded in the page's second iframe URL.

    The iframe ``src`` is expected to carry a ``q=<lat>,<lon>`` query
    parameter. The parameter is looked up by name, so the result does not
    depend on where ``q`` sits among the other query parameters.

    Args:
        site: Parsed page (BeautifulSoup-like object exposing ``find_all``).

    Returns:
        pandas.Series: ``lat`` and ``lon`` as floats (in that order, despite
        the function's name).

    Raises:
        IndexError: If the page has fewer than two iframes, or ``q`` holds
            fewer than two comma-separated values.
        KeyError: If the iframe has no ``src`` or the URL has no ``q`` parameter.
        ValueError: If a coordinate is not a valid number.
    """
    frames = site.find_all('iframe')
    embed_url = frames[1]['src']

    # Parse the query string rather than counting '=' signs, which breaks as
    # soon as the parameters are reordered or another one is added.
    query = parse_qs(urlparse(embed_url).query)
    coordinates = query['q'][0].split(',')

    latitude = float(coordinates[0])
    longitude = float(coordinates[1])

    return pd.Series({'lat': latitude, 'lon': longitude})
    
    
    
    

In [180]:
# Try the helper on the sample page parsed above.
get_lon_lat(soup)

lat     37.77077
lon   -122.40298
dtype: float64

In [162]:
# Scratch exploration: collect every <iframe> element on the page.
a = soup.find_all('iframe')

In [173]:
# Scratch exploration: take the `src` URL of the second iframe.
b = a[1]['src']

In [174]:
# Show the iframe URL.
b

'https://www.google.com/maps/embed/v1/place?key=AIzaSyDvEyVCVpGtn81z5NrMKgdehPsrO9sJiMw&q=37.7707700,-122.4029800'

In [175]:
# Third '='-separated piece of the URL; for this page that is the "lat,lon" text after "q=".
b.split('=')[2]

'37.7707700,-122.4029800'