# resaurantRating

## ETL Group Project

    Germaine Johnson, Jeremy Jones, Reza Abasaltian
    October 27, 2020

### Google Places API - Text Search

In [1]:
# Import dependencies
import requests
import sqlalchemy
import warnings
import pandas as pd

from pprint import pprint
from datetime import datetime, timedelta
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
from bs4 import BeautifulSoup
from pandas.core.common import SettingWithCopyWarning

# Google developer API key import
from api_key import gkey

# Postgres database user and password import
# from config import password
from db_key import user, password

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Set today as current date and time
t = datetime.now()

# Print todays date formatted as mm/dd/yy
date = t.strftime('%m/%d/%y')

# format time to round to the nearest hour in hundreds
time = (t.replace(second=0, microsecond=0, minute=0, hour=t.hour)
                        + timedelta(hours=t.minute//30))
hour = time.strftime('%H'+'00')

print(f'Today is {date} @ {hour} hour.')

Today is 10/26/20 @ 1500 hour.


In [2]:
# assign zip code for the base of our search
target_zip = "77056"

# distance, IN METERS, within which the place results must live from assigned zip code
target_radius = 11111

# type of establishment to filter place results
target_type = "restaurant"

In [3]:
# base url
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"

In [4]:
# set up a dictionary to store all query parameters - for initial page and first 20 listings
params = {
    "query": target_zip,
    "radius": target_radius,
    "type": target_type,
    "key": gkey
}

# run a request using our params dictionary
response = requests.get(base_url, params=params)

# print the response status code
print(f'first google response: {response.status_code}')

first google response: 200


In [5]:
# convert the response to json
response_json = response.json()

In [6]:
print(f'Total {target_type} listings retrieved on first google response: {len(response_json["results"])}')

Total restaurant listings retrieved on first google response: 20


In [7]:
# set up a dictionary to store all query parameters - next page and next 20 listings, second response
params = {
    "query": target_zip,
    "radius": target_radius,
    "type": target_type,
    "key": gkey,
    "pagetoken": response_json['next_page_token']
}

# run a request using our params dictionary
response2 = requests.get(base_url, params=params)

# print the response status code
print(f'second google response: {response2.status_code}')

# convert the response to json
response_json2 = response2.json()

second google response: 200


In [8]:
# set up a dictionary to store all query parameters - next page and next 20 listings, third response
params = {
    "query": target_zip,
    "radius": target_radius,
    "type": target_type,
    "key": gkey,
    "pagetoken": response_json2['next_page_token']
}

# run a request using our params dictionary
response3 = requests.get(base_url, params=params)

# print the response status code
print(f'third google response: {response3.status_code}')

# convert the response to json
response_json3 = response3.json()

third google response: 200


In [9]:
def getPlaces(response_json, i, date, hour):
    places = []
    for result in response_json['results']:
        place = {}
        try:
            place['id'] = i
            place['name'] = result['name']
            address = result['formatted_address']
            s = address.split(', ')
            s2 = s[2].split(' ')   
            place['street'] = s[0]
            place['city'] = s[1]
            place['state'] = s2[0]
            place['zip code'] = s2[1]
            place['avg rating'] = result['rating']
            place['total ratings'] = result['user_ratings_total']
            place['price level'] = result['price_level']
            place['date'] = date
            place['hour'] = hour
            places.append(place)
            i+=1
        
        except (KeyError, IndexError) as e:
            if str(e) == "'price_level'":
                place['price level'] = "NA"
                place['date'] = date
                place['hour'] = hour
                places.append(place)
                print(f'Missing field/result... set NA. {str(e)}, listing {i}')
                i+=1
            else:
                print(f'Missing field/result... skipping. {str(e)}')        
    return places

In [10]:
# call get place function for each response
df_places1 = pd.DataFrame(getPlaces(response_json,0,date,hour))
df_places2 = pd.DataFrame(getPlaces(response_json2,len(df_places1),date,hour))
df_places3 = pd.DataFrame(getPlaces(response_json3,(len(df_places1)+len(df_places2)),date,hour))

Missing field/result... set NA. 'price_level', listing 3
Missing field/result... set NA. 'price_level', listing 5
Missing field/result... set NA. 'price_level', listing 7
Missing field/result... set NA. 'price_level', listing 8
Missing field/result... skipping. list index out of range
Missing field/result... set NA. 'price_level', listing 19
Missing field/result... set NA. 'price_level', listing 28
Missing field/result... set NA. 'price_level', listing 30
Missing field/result... set NA. 'price_level', listing 36
Missing field/result... set NA. 'price_level', listing 37
Missing field/result... set NA. 'price_level', listing 41
Missing field/result... set NA. 'price_level', listing 45
Missing field/result... set NA. 'price_level', listing 51
Missing field/result... set NA. 'price_level', listing 55
Missing field/result... skipping. list index out of range


In [11]:
# Concatenate all 3 API responses
df_places = pd.concat([df_places1, df_places2, df_places3], axis=0)
df_places = df_places.reset_index(drop=True)
df_places.head(10)
print(len(df_places1))
print(len(df_places2))
print(len(df_places3))

19
20
19


In [12]:
# sorted dataframe by total ratings
df_ratings = df_places.sort_values(by='total ratings', ascending=False)
df_ratings = df_ratings.reset_index(drop=True)
print(len(df_ratings))
df_ratings.head()

58


Unnamed: 0,id,name,street,city,state,zip code,avg rating,total ratings,price level,date,hour
0,42,Grand Lux Cafe,5000 Westheimer Rd,Houston,TX,77056,4.2,2885,2,10/26/20,1500
1,24,The Cheesecake Factory,5015 Westheimer Rd,Houston,TX,77056,4.1,2683,2,10/26/20,1500
2,29,Maggiano's Little Italy,2019 Post Oak Blvd,Houston,TX,77056,4.5,2655,2,10/26/20,1500
3,20,North Italia,1700 Post Oak Blvd Ste 190,Houston,TX,77056,4.6,2194,2,10/26/20,1500
4,4,Caracol Restaurant,2200 Post Oak Blvd #160,Houston,TX,77056,4.6,2120,3,10/26/20,1500


In [13]:
#create specific columns from df - google load
ratings_columns = ["id", "name", "street", "city", "state", "zip code", "avg rating", "total ratings", "price level", "date", "hour"]
ratings_transformed = df_ratings[ratings_columns].copy()

#Rename the column headers
ratings_transformed = ratings_transformed.rename(columns={"id": "id",
                                                         "name": "restaurant_name",
                                                         "street": "street_address",
                                                         "city": "city", 
                                                         "state": "state", 
                                                         "zip code": "zip_code", 
                                                         "avg rating": "avg_rating", 
                                                         "total ratings": "total_ratings", 
                                                         "price level": "price_level",
                                                         "date": "date",
                                                         "hour": "hour"})

# Clean the data by dropping duplicates and setting the index
# ratings_transformed.drop_duplicates("id", inplace=True)
ratings_transformed.set_index("id", inplace=True)
ratings_transformed.head()

Unnamed: 0_level_0,restaurant_name,street_address,city,state,zip_code,avg_rating,total_ratings,price_level,date,hour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
42,Grand Lux Cafe,5000 Westheimer Rd,Houston,TX,77056,4.2,2885,2,10/26/20,1500
24,The Cheesecake Factory,5015 Westheimer Rd,Houston,TX,77056,4.1,2683,2,10/26/20,1500
29,Maggiano's Little Italy,2019 Post Oak Blvd,Houston,TX,77056,4.5,2655,2,10/26/20,1500
20,North Italia,1700 Post Oak Blvd Ste 190,Houston,TX,77056,4.6,2194,2,10/26/20,1500
4,Caracol Restaurant,2200 Post Oak Blvd #160,Houston,TX,77056,4.6,2120,3,10/26/20,1500


In [14]:
#database connection
engine = create_engine(f'postgresql://{user}:{password}@localhost:5432/restaurant_db')

In [15]:
engine.table_names()

['google', 'yelp', 'combined_rating']

In [16]:
#already a table set up, dont run again
ratings_transformed.to_sql(name='google', con=engine, if_exists='append', index=True)

In [17]:
inspector = inspect(engine)
inspector.get_table_names()

['google', 'yelp', 'combined_rating']

In [18]:
#confirm data has been loaded to sql table for google
pd.read_sql_query('select * from google', con=engine).head()

Unnamed: 0,id,restaurant_name,street_address,city,state,zip_code,avg_rating,total_ratings,price_level,date,hour
0,42,Grand Lux Cafe,5000 Westheimer Rd,Houston,TX,77056,4.2,2885,2,10/26/20,1500
1,24,The Cheesecake Factory,5015 Westheimer Rd,Houston,TX,77056,4.1,2683,2,10/26/20,1500
2,29,Maggiano's Little Italy,2019 Post Oak Blvd,Houston,TX,77056,4.5,2655,2,10/26/20,1500
3,20,North Italia,1700 Post Oak Blvd Ste 190,Houston,TX,77056,4.6,2194,2,10/26/20,1500
4,4,Caracol Restaurant,2200 Post Oak Blvd #160,Houston,TX,77056,4.6,2120,3,10/26/20,1500


In [19]:
url = []

print(len(df_ratings))

for i in range(len(df_ratings)):
    url_dict = {}
    url_dict['id'] = df_ratings.iloc[i]['id']
    url_dict['name'] = df_ratings.iloc[i]['name']
    url_dict['street'] = df_ratings.iloc[i]['street']
    url_dict['URL'] = (f'https://www.yelp.com/search?find_desc={df_ratings.iloc[i]["name"]}&find_loc={df_ratings.iloc[i]["street"]}, {df_ratings.iloc[i]["city"]}, {df_ratings.iloc[i]["state"]}')
    url.append(url_dict)
    
df_url = pd.DataFrame(url)
df_url.head()

58


Unnamed: 0,id,name,street,URL
0,42,Grand Lux Cafe,5000 Westheimer Rd,https://www.yelp.com/search?find_desc=Grand Lu...
1,24,The Cheesecake Factory,5015 Westheimer Rd,https://www.yelp.com/search?find_desc=The Chee...
2,29,Maggiano's Little Italy,2019 Post Oak Blvd,https://www.yelp.com/search?find_desc=Maggiano...
3,20,North Italia,1700 Post Oak Blvd Ste 190,https://www.yelp.com/search?find_desc=North It...
4,4,Caracol Restaurant,2200 Post Oak Blvd #160,https://www.yelp.com/search?find_desc=Caracol ...


In [20]:
# Call yelp web scraping use from notebook
def getSoupResults(url):    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html5lib')
    results = soup.find_all('li', class_='lemon--li__09f24__1r9wz')
    return results

In [21]:
def getYelpRating(results, nameSelect, idSelect, streetSelect):
    
    restaurant = []
    i = 0
    valid = False

    for result in results:
    
        try:
            rating = result.find('div', class_='lemon--div__09f24__1mboc container__09f24__21w3G hoverable__09f24__2nTf3 margin-t3__09f24__5bM2Z margin-b3__09f24__1DQ9x padding-t3__09f24__-R_5x padding-r3__09f24__1pBFG padding-b3__09f24__1vW6j padding-l3__09f24__1yCJf border--top__09f24__1H_WE border--right__09f24__28idl border--bottom__09f24__2FjZW border--left__09f24__33iol border-color--default__09f24__R1nRO')
        
            for rate in rating:
                business = {}
                business['listing'] = i
                name = rate.find('span', class_='lemon--span__09f24__3997G').text
                s = name.split('.')
                s2 = s[1].lstrip()
                business['name'] = s2
                business['street'] = rate.find('span', class_='lemon--span__09f24__3997G raw__09f24__3Obuy').text
                business['total ratings'] = rate.find('span', class_='lemon--span__09f24__3997G text__09f24__2tZKC reviewCount__09f24__EUXPN text-color--black-extra-light__09f24__38DtK text-align--left__09f24__3Drs0').text
                rating_f = rate.find('span', class_='lemon--span__09f24__3997G display--inline__09f24__3iACj border-color--default__09f24__R1nRO').contents
                rating_fi = str(rating_f[0])
                i1 = rating_fi.find('aria-label')+12
                i2 = rating_fi.find('star rating')-1
                business['avg ratings'] = rating_fi[i1:i2]
                business['price level'] = rate.find('span', class_="lemon--span__09f24__3997G text__09f24__2tZKC priceRange__09f24__2O6le text-color--black-extra-light__09f24__38DtK text-align--left__09f24__3Drs0 text-bullet--after__09f24__1MWoX").text               
                restaurant.append(business)
                i+=1
            
        except (AttributeError, TypeError, IndexError):
            continue

    select = []
    if len(restaurant) > 0:
        df_restaurant = pd.DataFrame(restaurant)    
    
        if df_restaurant.iloc[0]['name'] == nameSelect:
            valid = True
            #print(f'name: {valid}')
            df_restaurant['listing'][0] = idSelect
            df_restaurant = df_restaurant.rename(columns = {'listing': 'id'})
            select.append(df_restaurant.iloc[0,:])
        elif df_restaurant.iloc[0]['street'] == streetSelect:
            valid = True
            #print(f'street: {valid}')
            df_restaurant['listing'][0] = idSelect
            df_restaurant = df_restaurant.rename(columns = {'listing': 'id'})
            select.append(df_restaurant.iloc[0,:])
        
    return select 

In [22]:
def runDelay():
    c = 0
    for i in range(10000):
        c = i
    return c

In [23]:
def processYelp(start, end):
    yelpRating = []
    for i in range(start,end):
        soupResult = getSoupResults(df_url.iloc[i]['URL'])
        yelpRating.append(getYelpRating(soupResult, df_url.iloc[i]["name"], df_url.iloc[i]['id'], df_url.iloc[i]['street']))
        delay = runDelay()
        print(df_url.iloc[i]["name"])
    
    df_yelpRating = pd.DataFrame(yelpRating)
    return df_yelpRating

In [24]:
df_yelp1 = processYelp(0,30)
df_yelp2 = processYelp(30,len(df_url))
df_yelpRating = pd.concat([df_yelp1, df_yelp2], axis=0)

Grand Lux Cafe
The Cheesecake Factory
Maggiano's Little Italy
North Italia
Caracol Restaurant
Kenny & Ziggy's New York Delicatessen
Yia Yia Mary's
Bubba's Texas Burger Shack
Truluck's
Moxie's
Dimassi's Mediterranean Buffet
Peli Peli South African Kitchen - Galleria
Del Frisco's Double Eagle Steakhouse
Shake Shack
MOD Pizza
The Oceanaire Seafood Room
Roostar
Adair Kitchen
Chick-fil-A
Argentina Cafe
Chili's Grill & Bar
Morton's The Steakhouse
The Capital Grille
FIG & OLIVE Tasting Kitchen & Bar
The Burger Palace
Masraff's
Snooze, an A.M. Eatery
HS Green Fresh Food Kitchen
La Table Houston
Daily Grill
Blanco Tacos + Tequila
Alexander the Great
The Annie Café & Bar
Sozo Sushi Lounge
la Madeleine French Bakery & Cafe Houston Galleria
Ekko's Greek American Deli
Sage 400
51fifteen Cuisine & Cocktails
White Oak Kitchen + Drinks
Cafe Ginger Uptown
Chipotle Mexican Grill
Bazille
Sultan Pepper ( HALAL )
Salata
Piatto Ristorante
E-Tao Asian Eatery
KENZ MEDITERRANEAN CUISINE
In D Kitchen
Musaafer
C

In [25]:
df_yelpRating = df_yelpRating.rename(columns = {0: 'Yelp'})
df_yelpRating = df_yelpRating.reset_index(drop=True)
df_yelpRating.head()

Unnamed: 0,Yelp
0,id 42 name ...
1,id 24 name ...
2,id 29 name ...
3,id 20 name ...
4,


In [26]:
yelpRating_clean = []
for i in range(len(df_yelpRating)):
    yelpRating_c = {}
    try:
        yelpRating_c['id'] =  df_yelpRating.iloc[i]['Yelp']['id']
        yelpRating_c['total_ratings'] =  df_yelpRating.iloc[i]['Yelp']['total ratings']
        yelpRating_c['average_rating'] =  df_yelpRating.iloc[i]['Yelp']['avg ratings']
        yelpRating_c['price_level'] =  df_yelpRating.iloc[i]['Yelp']['price level']
        yelpRating_c['name'] =  df_yelpRating.iloc[i]['Yelp']['name']
        yelpRating_clean.append(yelpRating_c)     
    except (TypeError, KeyError):
        continue
        
df_yelpCleanRating = pd.DataFrame(yelpRating_clean)
df_yelpCleanRating

Unnamed: 0,id,total_ratings,average_rating,price_level,name
0,42,908,3.5,$$,Grand Lux Cafe
1,24,672,3.0,$$,The Cheesecake Factory
2,29,668,3.5,$$,Maggiano’s Little Italy
3,20,1657,4.0,$$,North Italia
4,47,1208,4.0,$$,Kenny & Ziggy’s New York Delicatessen
5,54,757,4.0,$,Bubba’s Texas Burger Shack
6,27,764,4.5,$$$,Truluck’s Ocean’s Finest Seafood and Crab
7,13,662,4.0,$$,Moxie’s Grill & Bar
8,44,137,4.0,$$,Dimassi’s Mediterranean Buffet
9,31,421,3.5,$$,Shake Shack


In [27]:
# create specific columns from df - yelp load
yelp_ratings_columns = ["id", "total_ratings", "average_rating", "price_level"]
ratings_transformed = df_yelpCleanRating[yelp_ratings_columns].copy()

# rename the column headers
yelp_ratings_transformed = ratings_transformed.rename(columns={"id": "id",
                                                         "total_ratings": "yelp_total_ratings", 
                                                         "average_rating": "yelp_average_rating",
                                                         "price_level": "yelp_price_level"
                                                         })

# setting the index
yelp_ratings_transformed.set_index("id", inplace=True)
yelp_ratings_transformed.head()

Unnamed: 0_level_0,yelp_total_ratings,yelp_average_rating,yelp_price_level
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,908,3.5,$$
24,672,3.0,$$
29,668,3.5,$$
20,1657,4.0,$$
47,1208,4.0,$$


In [28]:
#database connection
engine = create_engine(f'postgresql://{user}:{password}@localhost:5432/restaurant_db')

In [29]:
engine.table_names()

['google', 'yelp', 'combined_rating']

In [30]:
# dont run after table already created in SQL, you get an error message
yelp_ratings_transformed.to_sql(name='yelp', con=engine, if_exists='append', index=True)

In [31]:
inspector = inspect(engine)
inspector.get_table_names()

['google', 'yelp', 'combined_rating']

In [32]:
#confirm data has been loaded to sql table
pd.read_sql_query('select * from yelp', con=engine).head()

Unnamed: 0,id,yelp_total_ratings,yelp_average_rating,yelp_price_level
0,42,908,3.5,$$
1,24,672,3.0,$$
2,29,668,3.5,$$
3,20,1657,4.0,$$
4,47,1208,4.0,$$


In [33]:
df_combined = pd.merge(df_yelpCleanRating, df_ratings, on=['id'], how='left')
print(len(df_combined))
df_combined.head()

35


Unnamed: 0,id,total_ratings,average_rating,price_level,name_x,name_y,street,city,state,zip code,avg rating,total ratings,price level,date,hour
0,42,908,3.5,$$,Grand Lux Cafe,Grand Lux Cafe,5000 Westheimer Rd,Houston,TX,77056,4.2,2885,2,10/26/20,1500
1,24,672,3.0,$$,The Cheesecake Factory,The Cheesecake Factory,5015 Westheimer Rd,Houston,TX,77056,4.1,2683,2,10/26/20,1500
2,29,668,3.5,$$,Maggiano’s Little Italy,Maggiano's Little Italy,2019 Post Oak Blvd,Houston,TX,77056,4.5,2655,2,10/26/20,1500
3,20,1657,4.0,$$,North Italia,North Italia,1700 Post Oak Blvd Ste 190,Houston,TX,77056,4.6,2194,2,10/26/20,1500
4,47,1208,4.0,$$,Kenny & Ziggy’s New York Delicatessen,Kenny & Ziggy's New York Delicatessen,2327 Post Oak Blvd,Houston,TX,77056,4.6,1953,2,10/26/20,1500


In [34]:
rating_stats = []
for i in range(len(df_combined)):
    stats = {}
    
    stats['id'] = df_combined.iloc[i]['id']
    
    stats['Combined Ratings'] = int(df_combined.iloc[i]['total_ratings']) + int(df_combined.iloc[i]['total ratings'])
    
    stats['Combined Avg Rating'] = round((((int(df_combined.iloc[i]['total_ratings'])*float(df_combined.iloc[i]['average_rating'])) 
                                    + (int(df_combined.iloc[i]['total ratings'])*float(df_combined.iloc[i]['avg rating']))) / stats['Combined Ratings']), 2)
    
    stats['Delta Avg Rating'] = round(stats['Combined Avg Rating'] - float(df_combined.iloc[i]['avg rating']),2)
    
    if df_combined.iloc[i]['price level'] != "NA":
        stats['Average Price Level'] = round((len(df_combined.iloc[i]['price_level']) + int(df_combined.iloc[i]['price level']))/2, 1)
    else:
        stats['Average Price Level'] = round(len(df_combined.iloc[i]['price_level']),1)
    
    rating_stats.append(stats)
    
df_rating_stats = pd.DataFrame(rating_stats)
print(len(df_rating_stats))
df_rating_stats.head()

35


Unnamed: 0,id,Combined Ratings,Combined Avg Rating,Delta Avg Rating,Average Price Level
0,42,3793,4.03,-0.17,2.0
1,24,3355,3.88,-0.22,2.0
2,29,3323,4.3,-0.2,2.0
3,20,3851,4.34,-0.26,2.0
4,47,3161,4.37,-0.23,2.0


In [35]:
#database connection
engine = create_engine(f'postgresql://{user}:{password}@localhost:5432/restaurant_db')
engine.table_names()

['google', 'yelp', 'combined_rating']

In [36]:
# create specific columns from df - combined rating load
df_rating_stats_columns = ["id", "Combined Ratings", "Combined Avg Rating", "Delta Avg Rating", "Average Price Level"]
stats_transformed = df_rating_stats[df_rating_stats_columns].copy()

# rename the column headers
stats_ratings_transformed = stats_transformed.rename(columns={"id": "id",
                                                        "Combined Ratings": "total_ratings",
                                                        "Combined Avg Rating": "average_rating",
                                                        "Delta Avg Rating": "delta_rating",
                                                        "Average Price Level": "average_price_level"
                                                         })

# setting the index
stats_ratings_transformed.set_index("id", inplace=True)
stats_ratings_transformed.head()

Unnamed: 0_level_0,total_ratings,average_rating,delta_rating,average_price_level
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,3793,4.03,-0.17,2.0
24,3355,3.88,-0.22,2.0
29,3323,4.3,-0.2,2.0
20,3851,4.34,-0.26,2.0
47,3161,4.37,-0.23,2.0


In [37]:
stats_ratings_transformed.to_sql(name='combined_rating', con=engine, if_exists='append', index=True)

In [38]:
#confirm data has been loaded to sql table
pd.read_sql_query('select * from combined_rating', con=engine).head()

Unnamed: 0,id,total_ratings,average_rating,delta_rating,average_price_level
0,42,3793,4.03,-0.17,2.0
1,24,3355,3.88,-0.22,2.0
2,29,3323,4.3,-0.2,2.0
3,20,3851,4.34,-0.26,2.0
4,47,3161,4.37,-0.23,2.0
