# resaurantRating

## ETL Group Project

    Germaine Johnson, Jeremy Jones, Reza Abasaltian
    October 27, 2020

### Google Places API - Text Search

In [42]:
# Import dependencies
import requests
from pprint import pprint
import pandas as pd
from datetime import datetime, timedelta
from config import password
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
# Google developer API key
from api_key import gkey
from bs4 import BeautifulSoup
import warnings
from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Set today as current date and time
t = datetime.now()

# Print todays date formatted as mm/dd/yy
date = t.strftime('%m/%d/%y')

# format time to round to the nearest hour in hundreds
time = (t.replace(second=0, microsecond=0, minute=0, hour=t.hour)
                        + timedelta(hours=t.minute//30))
hour = time.strftime('%H'+'00')

print(f'Today is {date} @ {hour} hour.')

Today is 10/24/20 @ 1500 hour.


In [2]:
# assign zip code for the base of our search
target_zip = "77056"

# distance, IN METERS, within which the place results must live from assigned zip code
target_radius = 11111

# type of establishment to filter place results
target_type = "restaurant"

In [3]:
# base url
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"

In [4]:
# set up a dictionary to store all query parameters - for initial page and first 20 listings
params = {
    "query": target_zip,
    "radius": target_radius,
    "type": target_type,
    "key": gkey
}

# run a request using our params dictionary
response = requests.get(base_url, params=params)

# print the response status code
print(f'first response: {response.status_code}')

first response: 200


In [5]:
# convert the response to json
response_json = response.json()

In [6]:
print(f'Total {target_type} listings retrieved on first response: {len(response_json["results"])}')

Total restaurant listings retrieved on first response: 20


In [7]:
# set up a dictionary to store all query parameters - next page and next 20 listings, second response
params = {
    "query": target_zip,
    "radius": target_radius,
    "type": target_type,
    "key": gkey,
    "pagetoken": response_json['next_page_token']
}

# run a request using our params dictionary
response2 = requests.get(base_url, params=params)

# print the response status code
print(f'second response: {response2.status_code}')

# convert the response to json
response_json2 = response2.json()

second response: 200


In [13]:
# set up a dictionary to store all query parameters - next page and next 20 listings, third response
params = {
    "query": target_zip,
    "radius": target_radius,
    "type": target_type,
    "key": gkey,
    "pagetoken": response_json2['next_page_token']
}

# run a request using our params dictionary
response3 = requests.get(base_url, params=params)

# print the response status code
print(f'third response: {response3.status_code}')

# convert the response to json
response_json3 = response3.json()

third response: 200


In [14]:
def getPlaces(response_json, i, date, hour):
    places = []
    for result in response_json['results']:
        place = {}
        try:
            place['id'] = i
            place['name'] = result['name']
            address = result['formatted_address']
            s = address.split(', ')
            s2 = s[2].split(' ')   
            place['street'] = s[0]
            place['city'] = s[1]
            place['state'] = s2[0]
            place['zip code'] = s2[1]
            place['avg rating'] = result['rating']
            place['total ratings'] = result['user_ratings_total']
            place['price level'] = result['price_level']
            place['date'] = date
            place['hour'] = hour
            places.append(place)
            i+=1
        
        except (KeyError, IndexError) as e:
            if str(e) == "'price_level'":
                place['price level'] = "NA"                  
                places.append(place)
                print(f'Missing field/result... set NA. {str(e)}, listing {i}')
                i+=1
            else:
                print(f'Missing field/result... skipping. {str(e)}')        
    return places

In [15]:
# call get place function for each response
df_places1 = pd.DataFrame(getPlaces(response_json,0,date,hour))
df_places2 = pd.DataFrame(getPlaces(response_json2,len(df_places1),date,hour))
df_places3 = pd.DataFrame(getPlaces(response_json3,(len(df_places1)+len(df_places2)),date,hour))

Missing field/result... set NA. 'price_level', listing 1
Missing field/result... set NA. 'price_level', listing 7
Missing field/result... set NA. 'price_level', listing 10
Missing field/result... set NA. 'price_level', listing 11
Missing field/result... set NA. 'price_level', listing 21
Missing field/result... set NA. 'price_level', listing 27
Missing field/result... set NA. 'price_level', listing 32
Missing field/result... set NA. 'price_level', listing 35
Missing field/result... skipping. list index out of range
Missing field/result... set NA. 'price_level', listing 39
Missing field/result... set NA. 'price_level', listing 45
Missing field/result... set NA. 'price_level', listing 49
Missing field/result... set NA. 'price_level', listing 53
Missing field/result... set NA. 'price_level', listing 58


In [16]:
# Concatenate all 3 API responses
df_places = pd.concat([df_places1, df_places2, df_places3], axis=0)
df_places = df_places.reset_index(drop=True)
df_places.head(10)

print(len(df_places1))
print(len(df_places2))
print(len(df_places3))

20
19
20


In [17]:
# sorted dataframe by total ratings
df_ratings = df_places.sort_values(by='total ratings', ascending=False)
df_ratings = df_ratings.reset_index(drop=True)
df_ratings.head()


print(len(df_ratings))

59


In [18]:
#create specific columns from df - google load
ratings_columns = ["id", "name", "street", "city", "state", "zip code", "avg rating", "total ratings", "price level", "date", "hour"]
ratings_transformed = df_ratings[ratings_columns].copy()

#Rename the column headers
ratings_transformed = ratings_transformed.rename(columns={"id": "id",
                                                         "name": "restaurant_name",
                                                         "street": "street_address",
                                                         "city": "city", 
                                                         "state": "state", 
                                                         "zip code": "zip_code", 
                                                         "avg rating": "avg_rating", 
                                                         "total ratings": "total_ratings", 
                                                         "price level": "price_level",
                                                         "date": "date",
                                                         "hour": "hour"})

# Clean the data by dropping duplicates and setting the index
#ratings_transformed.drop_duplicates("id", inplace=True)
ratings_transformed.set_index("id", inplace=True)

ratings_transformed.head()

Unnamed: 0_level_0,restaurant_name,street_address,city,state,zip_code,avg_rating,total_ratings,price_level,date,hour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
38,Grand Lux Cafe,5000 Westheimer Rd,Houston,TX,77056,4.2,2882,2,10/24/20,1400
46,The Cheesecake Factory,5015 Westheimer Rd,Houston,TX,77056,4.1,2681,2,10/24/20,1400
8,Maggiano's Little Italy,2019 Post Oak Blvd,Houston,TX,77056,4.5,2653,2,10/24/20,1400
0,North Italia,1700 Post Oak Blvd Ste 190,Houston,TX,77056,4.6,2193,2,10/24/20,1400
43,Caracol Restaurant,2200 Post Oak Blvd #160,Houston,TX,77056,4.6,2118,3,10/24/20,1400


In [28]:
#database connection

engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/restaurant_db')


In [29]:
engine.table_names()

['yelp', 'google']

In [30]:
#already a table set up, dont run again
ratings_transformed.to_sql(name='google', con=engine, if_exists='append', index=True)

In [31]:
#import from a database - SQL
engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/restaurant_db')

In [32]:
inspector = inspect(engine)
inspector.get_table_names()

['yelp', 'google']

In [33]:
#confirm data has been loaded to sql table for google
pd.read_sql_query('select * from google', con=engine).head()

Unnamed: 0,id,restaurant_name,street_address,city,state,zip_code,avg_rating,total_ratings,price_level,date,hour
0,38,Grand Lux Cafe,5000 Westheimer Rd,Houston,TX,77056,4.2,2882,2,10/24/20,1400
1,46,The Cheesecake Factory,5015 Westheimer Rd,Houston,TX,77056,4.1,2681,2,10/24/20,1400
2,8,Maggiano's Little Italy,2019 Post Oak Blvd,Houston,TX,77056,4.5,2653,2,10/24/20,1400
3,0,North Italia,1700 Post Oak Blvd Ste 190,Houston,TX,77056,4.6,2193,2,10/24/20,1400
4,43,Caracol Restaurant,2200 Post Oak Blvd #160,Houston,TX,77056,4.6,2118,3,10/24/20,1400


In [34]:
#produce a csv
#ETL_csv_data = ratings_transformed.to_csv('ETL.csv', index = True) 
#print('\nCSV String:\n', ETL_csv_data) 


In [35]:
url = []

print(len(df_ratings))

for i in range(len(df_ratings)):
    url_dict = {}
    url_dict['id'] = df_ratings.iloc[i]['id']
    url_dict['name'] = df_ratings.iloc[i]['name']
    url_dict['street'] = df_ratings.iloc[i]['street']
    url_dict['URL'] = (f'https://www.yelp.com/search?find_desc={df_ratings.iloc[i]["name"]}&find_loc={df_ratings.iloc[i]["street"]}, {df_ratings.iloc[i]["city"]}, {df_ratings.iloc[i]["state"]}')
    url.append(url_dict)
    
df_url = pd.DataFrame(url)
df_url.head()

59


Unnamed: 0,id,name,street,URL
0,38,Grand Lux Cafe,5000 Westheimer Rd,https://www.yelp.com/search?find_desc=Grand Lu...
1,46,The Cheesecake Factory,5015 Westheimer Rd,https://www.yelp.com/search?find_desc=The Chee...
2,8,Maggiano's Little Italy,2019 Post Oak Blvd,https://www.yelp.com/search?find_desc=Maggiano...
3,0,North Italia,1700 Post Oak Blvd Ste 190,https://www.yelp.com/search?find_desc=North It...
4,43,Caracol Restaurant,2200 Post Oak Blvd #160,https://www.yelp.com/search?find_desc=Caracol ...


In [36]:
# Call yelp web scraping use from notebook
def getSoupResults(url):
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html5lib')
    results = soup.find_all('li', class_='lemon--li__09f24__1r9wz')
    #print(soup.title)
    #print(len(results))

    return results

In [37]:
def getYelpRating(results, nameSelect, idSelect, streetSelect):
    
    restaurant = []
    i = 0
    valid = False

    for result in results:
    
        try:
            rating = result.find('div', class_='lemon--div__09f24__1mboc container__09f24__21w3G hoverable__09f24__2nTf3 margin-t3__09f24__5bM2Z margin-b3__09f24__1DQ9x padding-t3__09f24__-R_5x padding-r3__09f24__1pBFG padding-b3__09f24__1vW6j padding-l3__09f24__1yCJf border--top__09f24__1H_WE border--right__09f24__28idl border--bottom__09f24__2FjZW border--left__09f24__33iol border-color--default__09f24__R1nRO')
        
            for rate in rating:
                business = {}
                business['listing'] = i
                name = rate.find('span', class_='lemon--span__09f24__3997G').text
                s = name.split('.')
                s2 = s[1].lstrip()
                business['name'] = s2
                business['street'] = rate.find('span', class_='lemon--span__09f24__3997G raw__09f24__3Obuy').text
                #print(business['street'])
                business['total ratings'] = rate.find('span', class_='lemon--span__09f24__3997G text__09f24__2tZKC reviewCount__09f24__EUXPN text-color--black-extra-light__09f24__38DtK text-align--left__09f24__3Drs0').text
                business['price level'] = rate.find('span', class_="lemon--span__09f24__3997G text__09f24__2tZKC priceRange__09f24__2O6le text-color--black-extra-light__09f24__38DtK text-align--left__09f24__3Drs0 text-bullet--after__09f24__1MWoX").text
               
                restaurant.append(business)
                i+=1
            
        except (AttributeError, TypeError, IndexError):
            continue
    
    #print(len(restaurant))
    select = []
    if len(restaurant) > 0:
        df_restaurant = pd.DataFrame(restaurant)    
    
        if df_restaurant.iloc[0]['name'] == nameSelect:
            valid = True
            #print(f'name: {valid}')
            df_restaurant['listing'][0] = idSelect
            df_restaurant = df_restaurant.rename(columns = {'listing': 'id'})
            select.append(df_restaurant.iloc[0,:])
        elif df_restaurant.iloc[0]['street'] == streetSelect:
            valid = True
            #print(f'street: {valid}')
            df_restaurant['listing'][0] = idSelect
            df_restaurant = df_restaurant.rename(columns = {'listing': 'id'})
            select.append(df_restaurant.iloc[0,:])
        
    return select 

In [38]:
def runDelay():
    c = 0
    for i in range(1000):
        c = i
    return c

In [47]:
def processYelp(start, end):
    yelpRating = []
    for i in range(start,end):
        soupResult = getSoupResults(df_url.iloc[i]['URL'])
        yelpRating.append(getYelpRating(soupResult, df_url.iloc[i]["name"], df_url.iloc[i]['id'], df_url.iloc[i]['street']))
        delay = runDelay()
        print(df_url.iloc[i]["name"])
    
    df_yelpRating = pd.DataFrame(yelpRating)
    return df_yelpRating

In [48]:
df_yelp1 = processYelp(0,30)
df_yelp2 = processYelp(31,len(df_url))

df_yelpRating = pd.concat([df_yelp1, df_yelp2], axis=0)


Grand Lux Cafe
The Cheesecake Factory
Maggiano's Little Italy
North Italia
Caracol Restaurant
Kenny & Ziggy's New York Delicatessen
Bubba's Texas Burger Shack
Truluck's
Moxie's
Dimassi's Mediterranean Buffet
True Food Kitchen
Peli Peli South African Kitchen - Galleria
La Tapatia Mexican Cafe Galería
Shake Shack
McCormick & Schmick's Seafood & Steaks
MOD Pizza
Roostar
Adair Kitchen
Chick-fil-A
Argentina Cafe
Morton's The Steakhouse
The Capital Grille
FIG & OLIVE Tasting Kitchen & Bar
The Burger Palace
Masraff's
HS Green Fresh Food Kitchen
Songkran Thai Kitchen
Etoile Cuisine Et Bar
La Table Houston
Flower Child
Smashburger
Post Oak Grill
Alexander the Great
Sozo Sushi Lounge
Ekko's Greek American Deli
The Original Ninfa's Uptown
Sage 400
Uptown Sushi
Corelli's Italian Cafe
51fifteen Cuisine & Cocktails
Cafe Ginger Uptown
Chipotle Mexican Grill
Sultan Pepper ( HALAL )
Burganic Hub
Merchant
KENZ MEDITERRANEAN CUISINE
In D Kitchen
Los Tios
7 Star Asian Cafe
Merus Grill by J. Alexander's
Ch

In [79]:
#import csv from yelp
#csv_file = os.path.join("..", "Resources", "yelp_data.csv")
#yelp_data_df = pd.read_csv(csv_file)
#yelp_data_df.head()


df_yelpRating = df_yelpRating.rename(columns = {0: 'Yelp'})
df_yelpRating = df_yelpRating.reset_index(drop=True)
df_yelpRating.head()

Unnamed: 0,index,Yelp
0,0,id 38 name ...
1,1,id 46 name ...
2,2,id 8 name ...
3,3,id 0 name ...
4,4,


In [87]:
yelpRating_clean = []
for i in range(len(df_yelpRating)):
    yelpRating_c = {}
    try:
        yelpRating_c['id'] =  df_yelpRating.iloc[i]['Yelp']['id']
        yelpRating_c['total_ratings'] =  df_yelpRating.iloc[i]['Yelp']['total ratings']
        yelpRating_c['price_level'] =  df_yelpRating.iloc[i]['Yelp']['price level']
        yelpRating_clean.append(yelpRating_c)     
    except (TypeError):
        continue
        
df_yelpCleanRating = pd.DataFrame(yelpRating_clean)
df_yelpCleanRating

Unnamed: 0,id,total_ratings,price_level
0,38,908,$$
1,46,672,$$
2,8,668,$$
3,0,1657,$$
4,34,1206,$$
5,12,757,$
6,3,763,$$$
7,6,663,$$
8,31,137,$$
9,17,1000,$$


In [91]:
##create specific columns from df - yelp load
yelp_ratings_columns = ["id", "total_ratings", "price_level"]
ratings_transformed = df_yelpCleanRating[yelp_ratings_columns].copy()

#Rename the column headers
yelp_ratings_transformed = ratings_transformed.rename(columns={"id": "id",
                                                         "total_ratings": "total_ratings", 
                                                         "price_level": "price_level"
                                                         })

# Clean the data by dropping duplicates and setting the index
#ratings_transformed.drop_duplicates("id", inplace=True)
yelp_ratings_transformed.set_index("id", inplace=True)

yelp_ratings_transformed.head()


Unnamed: 0_level_0,total_ratings,price_level
id,Unnamed: 1_level_1,Unnamed: 2_level_1
38,908,$$
46,672,$$
8,668,$$
0,1657,$$
34,1206,$$


In [92]:
#database connection

engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/restaurant_db')

In [93]:
engine.table_names()

['google', 'yelp']

In [95]:
yelp_ratings_transformed.to_sql(name='yelp', con=engine, if_exists='append', index=True)

In [96]:
#import from a database - SQL
engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/restaurant_db')

In [97]:
inspector = inspect(engine)
inspector.get_table_names()

['google', 'yelp']

In [98]:
#confirm data has been loaded to sql table
pd.read_sql_query('select * from yelp', con=engine).head()

Unnamed: 0,id,total_ratings,price_level
0,38,908,$$
1,46,672,$$
2,8,668,$$
3,0,1657,$$
4,34,1206,$$
