## Perform basic cleaning on Yelp Dataset 8 and write to SQL table.
* Turn json objects into rows. 
* Discard non-restaurant rows.
* Convert to pandas dataframe. 
* Write dataframe to SQL table. 

In [1]:
import json
import pandas as pd
import psycopg2
import sqlalchemy
import logging

from sqlalchemy import create_engine, inspect

PSQL = 'postgres://{}@localhost:5432/rpred'

logging.basicConfig(level=logging.INFO)

In [3]:
# Yelp's json files are screwy, so I have to fix them a little bit.
# {}{}{}{} --> [{},{},{},{}]
# Then discard anything that isn't a restaurant. 
def clean_restaurant_json(filename):
    logging.info('Reading {}'.format(filename))
    json_str = ""
    with open(filename, "r") as file1:
        json_str = file1.read()
    items = []
    len_json = len(json_str.split("\n"))
    for row in json_str.split("\n"):
        try:
            row = json.loads(row)
        except:
            logging.info("Unable to decode item: {}".format(row))
            continue
        #if "Restaurants" in row['categories']:
        #    items.append(row)
        if is_restaurant(row):
            items.append(row)
    logging.info('{} valid items out of {}'.format(len(items), len_json))
    return items
    #return [json.loads(i) for i in items]

def is_restaurant(row):
    arr = row['categories']
    return ("Restaurants" in arr or "Food" in arr or 
            "Cafe" in arr or "Bakeries" in arr) and ("Grocery" not in arr)

def get_sql_connect(cred_path="../credentials/localhost/jessica.txt"):
    cred = ""
    with open(cred_path) as file:
        cred = file.read().strip("\n")
    
    cnx = create_engine(PSQL.format(cred), isolation_level='AUTOCOMMIT')
    return cnx

def write_to_sql(df, table_name, cnx):
    logging.info('Sample row:')
    logging.info(str(df.sample(1)))
    logging.info('Write data to table {}'.format(table_name))
    df.to_sql(table_name, cnx, if_exists='replace', index=False, 
        dtype={'attributes' : sqlalchemy.types.JSON, 'hours' : sqlalchemy.types.JSON})
    logging.info('Done!')
    
cnx = get_sql_connect()

In [4]:
yelp_arr = clean_restaurant_json("../data/yelp8/yelp_academic_dataset_business.json")

df_features = pd.DataFrame(yelp_arr)
df_features.head()

write_to_sql(df_features, 'yelp_8', cnx)

INFO:root:Reading ../data/yelp8/yelp_academic_dataset_business.json
INFO:root:Unable to decode item: 
INFO:root:32978 valid items out of 85902
INFO:root:Sample row:
INFO:root:                                             attributes  \
6066  {'Alcohol': 'full_bar', 'Takes Reservations': ...   

                 business_id  \
6066  WbAkmFYQr57zHz0Mjmj2sg   

                                             categories       city  \
6066  [Bars, Pool Halls, Nightlife, American (New), ...  Las Vegas   

                                           full_address  \
6066  5025 S Eastern Ave\nSoutheast\nLas Vegas, NV 8...   

                                                  hours   latitude  \
6066  {'Monday': {'close': '00:00', 'open': '00:00'}...  36.097945   

       longitude                  name neighborhoods  open  review_count  \
6066 -115.119864  Putter's Bar & Grill   [Southeast]  True             3   

      stars state      type  
6066    2.5    NV  business  
INFO:root:Write data to tab

## Perform basic cleaning on Yelp Dataset 11 and write to SQL table.
* Turn json objects into rows. 
* Discard non-restaurant rows.
* Convert to pandas dataframe. 
* Write dataframe to SQL table. 

In [5]:
yelp_target_arr = clean_restaurant_json("../data/yelp11/business.json")

df_target = pd.DataFrame(yelp_target_arr)

write_to_sql(df_target, 'yelp_11', cnx)

INFO:root:Reading ../data/yelp11/business.json
INFO:root:Unable to decode item: 
INFO:root:65800 valid items out of 174568
INFO:root:Sample row:
INFO:root:                      address  \
18277  12900 W Thunderbird Rd   

                                              attributes  \
18277  {'GoodForMeal': {'dessert': False, 'latenight'...   

                  business_id                         categories       city  \
18277  umdGWFEra3Bkeo2wbTNz4A  [Restaurants, Fast Food, Burgers]  El Mirage   

                                                   hours  is_open   latitude  \
18277  {'Monday': '5:00-23:00', 'Tuesday': '5:00-23:0...        1  33.610633   

        longitude        name neighborhood postal_code  review_count  stars  \
18277 -112.337106  McDonald's                    85335             4    2.0   

      state  
18277    AZ  
INFO:root:Write data to table yelp_11
INFO:root:Done!
