# resaurantRating

## ETL Group Project

    Germaine Johnson, Jeremy Jones, Reza Abasaltian
    October 27, 2020

### Google Places API - Text Search

In [1]:
# Import dependencies
import requests
from pprint import pprint
import pandas as pd
from datetime import datetime, timedelta
from config import password
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
# Google developer API key
from api_key import gkey
from bs4 import BeautifulSoup
import warnings
from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Set today as current date and time
t = datetime.now()

# Print todays date formatted as mm/dd/yy
date = t.strftime('%m/%d/%y')

# format time to round to the nearest hour in hundreds
time = (t.replace(second=0, microsecond=0, minute=0, hour=t.hour)
                        + timedelta(hours=t.minute//30))
hour = time.strftime('%H'+'00')

print(f'Today is {date} @ {hour} hour.')

Today is 10/25/20 @ 1200 hour.


In [2]:
# assign zip code for the base of our search
target_zip = "77056"

# distance, IN METERS, within which the place results must live from assigned zip code
target_radius = 11111

# type of establishment to filter place results
target_type = "restaurant"

In [3]:
# base url
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"

In [4]:
# set up a dictionary to store all query parameters - for initial page and first 20 listings
params = {
    "query": target_zip,
    "radius": target_radius,
    "type": target_type,
    "key": gkey
}

# run a request using our params dictionary
response = requests.get(base_url, params=params)

# print the response status code
print(f'first response: {response.status_code}')

first response: 200


In [5]:
# convert the response to json
response_json = response.json()

In [6]:
print(f'Total {target_type} listings retrieved on first response: {len(response_json["results"])}')

Total restaurant listings retrieved on first response: 20


In [7]:
# set up a dictionary to store all query parameters - next page and next 20 listings, second response
params = {
    "query": target_zip,
    "radius": target_radius,
    "type": target_type,
    "key": gkey,
    "pagetoken": response_json['next_page_token']
}

# run a request using our params dictionary
response2 = requests.get(base_url, params=params)

# print the response status code
print(f'second response: {response2.status_code}')

# convert the response to json
response_json2 = response2.json()

second response: 200


In [16]:
# set up a dictionary to store all query parameters - next page and next 20 listings, third response
params = {
    "query": target_zip,
    "radius": target_radius,
    "type": target_type,
    "key": gkey,
    "pagetoken": response_json2['next_page_token']
}

# run a request using our params dictionary
response3 = requests.get(base_url, params=params)

# print the response status code
print(f'third response: {response3.status_code}')

# convert the response to json
response_json3 = response3.json()

third response: 200


In [17]:
def getPlaces(response_json, i, date, hour):
    places = []
    for result in response_json['results']:
        place = {}
        try:
            place['id'] = i
            place['name'] = result['name']
            address = result['formatted_address']
            s = address.split(', ')
            s2 = s[2].split(' ')   
            place['street'] = s[0]
            place['city'] = s[1]
            place['state'] = s2[0]
            place['zip code'] = s2[1]
            place['avg rating'] = result['rating']
            place['total ratings'] = result['user_ratings_total']
            place['price level'] = result['price_level']
            place['date'] = date
            place['hour'] = hour
            places.append(place)
            i+=1
        
        except (KeyError, IndexError) as e:
            if str(e) == "'price_level'":
                place['price level'] = "NA"                  
                places.append(place)
                print(f'Missing field/result... set NA. {str(e)}, listing {i}')
                i+=1
            else:
                print(f'Missing field/result... skipping. {str(e)}')        
    return places

In [18]:
# call get place function for each response
df_places1 = pd.DataFrame(getPlaces(response_json,0,date,hour))
df_places2 = pd.DataFrame(getPlaces(response_json2,len(df_places1),date,hour))
df_places3 = pd.DataFrame(getPlaces(response_json3,(len(df_places1)+len(df_places2)),date,hour))

Missing field/result... set NA. 'price_level', listing 0
Missing field/result... set NA. 'price_level', listing 3
Missing field/result... set NA. 'price_level', listing 6
Missing field/result... set NA. 'price_level', listing 9
Missing field/result... set NA. 'price_level', listing 12
Missing field/result... set NA. 'price_level', listing 13
Missing field/result... set NA. 'price_level', listing 22
Missing field/result... set NA. 'price_level', listing 26
Missing field/result... set NA. 'price_level', listing 28
Missing field/result... set NA. 'price_level', listing 38
Missing field/result... set NA. 'price_level', listing 42
Missing field/result... set NA. 'price_level', listing 45
Missing field/result... set NA. 'price_level', listing 49
Missing field/result... set NA. 'price_level', listing 52


In [19]:
# Concatenate all 3 API responses
df_places = pd.concat([df_places1, df_places2, df_places3], axis=0)
df_places = df_places.reset_index(drop=True)
df_places.head(10)

print(len(df_places1))
print(len(df_places2))
print(len(df_places3))

20
20
20


In [20]:
# sorted dataframe by total ratings
df_ratings = df_places.sort_values(by='total ratings', ascending=False)
df_ratings = df_ratings.reset_index(drop=True)
df_ratings.head()


print(len(df_ratings))

60


In [21]:
#create specific columns from df - google load
ratings_columns = ["id", "name", "street", "city", "state", "zip code", "avg rating", "total ratings", "price level", "date", "hour"]
ratings_transformed = df_ratings[ratings_columns].copy()

#Rename the column headers
ratings_transformed = ratings_transformed.rename(columns={"id": "id",
                                                         "name": "restaurant_name",
                                                         "street": "street_address",
                                                         "city": "city", 
                                                         "state": "state", 
                                                         "zip code": "zip_code", 
                                                         "avg rating": "avg_rating", 
                                                         "total ratings": "total_ratings", 
                                                         "price level": "price_level",
                                                         "date": "date",
                                                         "hour": "hour"})

# Clean the data by dropping duplicates and setting the index
#ratings_transformed.drop_duplicates("id", inplace=True)
ratings_transformed.set_index("id", inplace=True)

ratings_transformed.head()

Unnamed: 0_level_0,restaurant_name,street_address,city,state,zip_code,avg_rating,total_ratings,price_level,date,hour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
33,Grand Lux Cafe,5000 Westheimer Rd,Houston,TX,77056,4.2,2882,2,10/25/20,1200
43,The Cheesecake Factory,5015 Westheimer Rd,Houston,TX,77056,4.1,2682,2,10/25/20,1200
8,Maggiano's Little Italy,2019 Post Oak Blvd,Houston,TX,77056,4.5,2654,2,10/25/20,1200
1,North Italia,1700 Post Oak Blvd Ste 190,Houston,TX,77056,4.6,2193,2,10/25/20,1200
2,Caracol Restaurant,2200 Post Oak Blvd #160,Houston,TX,77056,4.6,2118,3,10/25/20,1200


In [22]:
#database connection

engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/restaurant_db')


In [24]:
engine.table_names()

['google']

In [25]:
#already a table set up, dont run again
ratings_transformed.to_sql(name='google', con=engine, if_exists='append', index=True)

In [26]:
inspector = inspect(engine)
inspector.get_table_names()

['google']

In [27]:
#confirm data has been loaded to sql table for google
pd.read_sql_query('select * from google', con=engine).head()

Unnamed: 0,id,restaurant_name,street_address,city,state,zip_code,avg_rating,total_ratings,price_level,date,hour
0,33,Grand Lux Cafe,5000 Westheimer Rd,Houston,TX,77056,4.2,2882,2,10/25/20,1200
1,43,The Cheesecake Factory,5015 Westheimer Rd,Houston,TX,77056,4.1,2682,2,10/25/20,1200
2,8,Maggiano's Little Italy,2019 Post Oak Blvd,Houston,TX,77056,4.5,2654,2,10/25/20,1200
3,1,North Italia,1700 Post Oak Blvd Ste 190,Houston,TX,77056,4.6,2193,2,10/25/20,1200
4,2,Caracol Restaurant,2200 Post Oak Blvd #160,Houston,TX,77056,4.6,2118,3,10/25/20,1200


In [28]:
#produce a csv just to check
#ETL_csv_data = ratings_transformed.to_csv('ETL.csv', index = True) 
#print('\nCSV String:\n', ETL_csv_data) 



CSV String:
 None


In [72]:
url = []

print(len(df_ratings))

for i in range(len(df_ratings)):
    url_dict = {}
    url_dict['id'] = df_ratings.iloc[i]['id']
    url_dict['name'] = df_ratings.iloc[i]['name']
    url_dict['street'] = df_ratings.iloc[i]['street']
    url_dict['URL'] = (f'https://www.yelp.com/search?find_desc={df_ratings.iloc[i]["name"]}&find_loc={df_ratings.iloc[i]["street"]}, {df_ratings.iloc[i]["city"]}, {df_ratings.iloc[i]["state"]}')
    url.append(url_dict)
    
df_url = pd.DataFrame(url)
df_url.head()

60


Unnamed: 0,id,name,street,URL
0,33,Grand Lux Cafe,5000 Westheimer Rd,https://www.yelp.com/search?find_desc=Grand Lu...
1,43,The Cheesecake Factory,5015 Westheimer Rd,https://www.yelp.com/search?find_desc=The Chee...
2,8,Maggiano's Little Italy,2019 Post Oak Blvd,https://www.yelp.com/search?find_desc=Maggiano...
3,1,North Italia,1700 Post Oak Blvd Ste 190,https://www.yelp.com/search?find_desc=North It...
4,2,Caracol Restaurant,2200 Post Oak Blvd #160,https://www.yelp.com/search?find_desc=Caracol ...


In [73]:
# Call yelp web scraping use from notebook
def getSoupResults(url):
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html5lib')
    results = soup.find_all('li', class_='lemon--li__09f24__1r9wz')
    #print(soup.title)
    #print(len(results))

    return results

In [74]:
def getYelpRating(results, nameSelect, idSelect, streetSelect):
    
    restaurant = []
    i = 0
    valid = False

    for result in results:
    
        try:
            rating = result.find('div', class_='lemon--div__09f24__1mboc container__09f24__21w3G hoverable__09f24__2nTf3 margin-t3__09f24__5bM2Z margin-b3__09f24__1DQ9x padding-t3__09f24__-R_5x padding-r3__09f24__1pBFG padding-b3__09f24__1vW6j padding-l3__09f24__1yCJf border--top__09f24__1H_WE border--right__09f24__28idl border--bottom__09f24__2FjZW border--left__09f24__33iol border-color--default__09f24__R1nRO')
        
            for rate in rating:
                business = {}
                business['listing'] = i
                name = rate.find('span', class_='lemon--span__09f24__3997G').text
                s = name.split('.')
                s2 = s[1].lstrip()
                business['name'] = s2
                business['street'] = rate.find('span', class_='lemon--span__09f24__3997G raw__09f24__3Obuy').text
                #print(business['street'])
                business['total ratings'] = rate.find('span', class_='lemon--span__09f24__3997G text__09f24__2tZKC reviewCount__09f24__EUXPN text-color--black-extra-light__09f24__38DtK text-align--left__09f24__3Drs0').text
                business['price level'] = rate.find('span', class_="lemon--span__09f24__3997G text__09f24__2tZKC priceRange__09f24__2O6le text-color--black-extra-light__09f24__38DtK text-align--left__09f24__3Drs0 text-bullet--after__09f24__1MWoX").text
               
                restaurant.append(business)
                i+=1
            
        except (AttributeError, TypeError, IndexError):
            continue
    
    #print(len(restaurant))
    select = []
    if len(restaurant) > 0:
        df_restaurant = pd.DataFrame(restaurant)    
    
        if df_restaurant.iloc[0]['name'] == nameSelect:
            valid = True
            #print(f'name: {valid}')
            df_restaurant['listing'][0] = idSelect
            df_restaurant = df_restaurant.rename(columns = {'listing': 'id'})
            select.append(df_restaurant.iloc[0,:])
        elif df_restaurant.iloc[0]['street'] == streetSelect:
            valid = True
            #print(f'street: {valid}')
            df_restaurant['listing'][0] = idSelect
            df_restaurant = df_restaurant.rename(columns = {'listing': 'id'})
            select.append(df_restaurant.iloc[0,:])
        
    return select 

In [75]:
def runDelay():
    c = 0
    for i in range(10000):
        c = i
    return c

In [76]:
def processYelp(start, end):
    yelpRating = []
    for i in range(start,end):
        soupResult = getSoupResults(df_url.iloc[i]['URL'])
        yelpRating.append(getYelpRating(soupResult, df_url.iloc[i]["name"], df_url.iloc[i]['id'], df_url.iloc[i]['street']))
        delay = runDelay()
        print(df_url.iloc[i]["name"])
    
    df_yelpRating = pd.DataFrame(yelpRating)
    return df_yelpRating

In [77]:
df_yelp1 = processYelp(0,30)
df_yelp2 = processYelp(31,len(df_url))

df_yelpRating = pd.concat([df_yelp1, df_yelp2], axis=0)


Grand Lux Cafe
The Cheesecake Factory
Maggiano's Little Italy
North Italia
Caracol Restaurant
Kenny & Ziggy's New York Delicatessen
Whataburger
Bubba's Texas Burger Shack
Moxie's
Dimassi's Mediterranean Buffet
True Food Kitchen
Peli Peli South African Kitchen - Galleria
La Tapatia Mexican Cafe Galería
Shake Shack
McCormick & Schmick's Seafood & Steaks
MOD Pizza
Chipotle Mexican Grill
Roostar
Adair Kitchen
Argentina Cafe
Luby's
FIG & OLIVE Tasting Kitchen & Bar
Snooze, an A.M. Eatery
Songkran Thai Kitchen
Etoile Cuisine Et Bar
La Table Houston
Daily Grill
Flower Child
Blanco Tacos + Tequila
Smashburger
The Annie Café & Bar
Sozo Sushi Lounge
la Madeleine French Bakery & Cafe Houston Galleria
Ekko's Greek American Deli
The Original Ninfa's Uptown
Sage 400
Five Guys
51fifteen Cuisine & Cocktails
White Oak Kitchen + Drinks
Cafe Ginger Uptown
Chipotle Mexican Grill
Kolache Factory
Sultan Pepper ( HALAL )
Subway
Merchant
KENZ MEDITERRANEAN CUISINE
Los Tios
Musaafer
7 Star Asian Cafe
Merus Gri

In [78]:
df_yelpRating = df_yelpRating.rename(columns = {0: 'Yelp'})
df_yelpRating = df_yelpRating.reset_index(drop=True)
df_yelpRating.head()

0
1
2
3
4


In [79]:
yelpRating_clean = []
for i in range(len(df_yelpRating)):
    yelpRating_c = {}
    try:
        yelpRating_c['id'] =  df_yelpRating.iloc[i]['Yelp']['id']
        yelpRating_c['total_ratings'] =  df_yelpRating.iloc[i]['Yelp']['total ratings']
        yelpRating_c['price_level'] =  df_yelpRating.iloc[i]['Yelp']['price level']
        yelpRating_c['name'] =  df_yelpRating.iloc[i]['Yelp']['name']
        yelpRating_clean.append(yelpRating_c)     
    except (TypeError):
        continue
        
df_yelpCleanRating = pd.DataFrame(yelpRating_clean)
df_yelpCleanRating

KeyError: 'Yelp'

In [63]:
##create specific columns from df - yelp load
yelp_ratings_columns = ["id", "total_ratings", "price_level", "name"]
ratings_transformed = df_yelpCleanRating[yelp_ratings_columns].copy()

#Rename the column headers
yelp_ratings_transformed = ratings_transformed.rename(columns={"id": "id",
                                                         "total_ratings": "yelp_total_ratings", 
                                                         "price_level": "yelp_price_level",
                                                         "name": "name"
                                                         })

# Clean the data by dropping duplicates and setting the index
#ratings_transformed.drop_duplicates("id", inplace=True)
yelp_ratings_transformed.set_index("id", inplace=True)

yelp_ratings_transformed.head()


NameError: name 'df_yelpCleanRating' is not defined

In [64]:
#database connection

engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/restaurant_db')

In [65]:
engine.table_names()

['google', 'yelp', 'aggregate']

In [66]:
# dont run after table already created in SQL, you get an error message
yelp_ratings_transformed.to_sql(name='yelp', con=engine, if_exists='append', index=True)

NameError: name 'yelp_ratings_transformed' is not defined

In [67]:
inspector = inspect(engine)
inspector.get_table_names()

['google', 'yelp', 'aggregate']

In [68]:
#confirm data has been loaded to sql table
pd.read_sql_query('select * from yelp', con=engine).head(50)

Unnamed: 0,id,yelp_total_ratings,yelp_price_level,name


In [69]:

aggregate_df = pd.merge(df_ratings, yelp_ratings_transformed, on=['id','name'])
aggregate_df.head(40)

NameError: name 'yelp_ratings_transformed' is not defined

In [70]:
##create specific columns from df - yelp load
aggregate_columns = ["id", "name", "street", "city", "state", "zip code", "total ratings", "price level", "date", "hour", "yelp_total_ratings", "yelp_price_level"]
aggregate_clean = aggregate_df[aggregate_columns].copy()

#Rename the column headers
aggregate_clean = aggregate_clean.rename(columns={"id": "id",
                                                         "name": "name",
                                                         "street": "street",
                                                         "city": "city",
                                                         "state": "state",
                                                         "zip code": "zip_code",
                                                         "total ratings": "total_ratings",
                                                         "price level": "price_level",
                                                         "date": "date",
                                                         "hour": "hour",
                                                         "yelp_total_ratings": "yelp_total_ratings", 
                                                         "yelp_price_level": "yelp_price_level"                                                   
                                                         })

# Clean the data by dropping duplicates and setting the index
#ratings_transformed.drop_duplicates("id", inplace=True)
aggregate_clean.set_index("id", inplace=True)

aggregate_clean.head()

NameError: name 'aggregate_df' is not defined

In [87]:
#database connection

engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/restaurant_db')

In [88]:
engine.table_names()

['aggregate', 'google', 'yelp']

In [89]:
#produce a csv
a_csv_data = aggregate_df.to_csv('a.csv', index = True) 
print('\nCSV String:\n', a_csv_data) 


NameError: name 'aggregate_df' is not defined

In [71]:
# dont run after table already created in SQL, you get an error message
aggregate_df.to_sql(name='aggregate', con=engine, if_exists='append', index=True)

NameError: name 'aggregate_df' is not defined

In [91]:
#import from a database - SQL
engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/restaurant_db')

In [92]:
inspector = inspect(engine)
inspector.get_table_names()

['aggregate', 'google', 'yelp']

In [93]:
#confirm data has been loaded to sql table
pd.read_sql_query('select * from aggregate', con=engine).head(50)

Unnamed: 0,id,name,street,city,state,zip_code,avg_rating,total_ratings,price_level,date,hour,yelp_total_ratings,yelp_price_level


In [95]:

#add two columns if works then move SQL cells down below otherwise add in SQL
aggregate_df['variance'] = (aggregate_df['total_ratings'], df_yelpCleanRating['yelp_total_ratings'])
print (aggregate_df)

NameError: name 'aggregate_df' is not defined