Perform basic cleaning on Yelp Dataset 8 and write the result to a SQL table.
* Turn JSON objects into rows.
* Discard non-restaurant rows.
* Convert to a pandas DataFrame.
* Write the DataFrame to a SQL table.

In [64]:
import json
import pandas as pd
import psycopg2
import sqlalchemy
import logging

from sqlalchemy import create_engine, inspect

# SQLAlchemy URL template; the "{}" placeholder is filled via PSQL.format(...)
# with the credential that goes before the "@".
# The scheme is "postgresql": SQLAlchemy 1.4 removed the old "postgres" alias,
# while "postgresql" is accepted by every SQLAlchemy version.
PSQL = 'postgresql://{}@localhost:5432/rpred'

# Show INFO-level messages so the loader's progress lines appear in the cell output.
logging.basicConfig(level=logging.INFO)

In [73]:
# Yelp ships each dataset file as newline-delimited JSON: one object per line
# ({}{}{}{}) rather than a single JSON array ([{},{},{},{}]), so every line
# has to be parsed on its own.
# Anything that isn't a restaurant is then discarded.
def clean_restaurant_json(filename):
    """Load the restaurant records from a newline-delimited JSON file.

    Every non-blank line of ``filename`` is treated as one record. Each line
    is screened with ``is_restaurant`` on its raw text, and only the lines
    that pass are parsed with ``json.loads``.

    Args:
        filename: Path of the file to read.

    Returns:
        A list holding one parsed JSON value per accepted line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If an accepted line is not valid JSON.
    """
    logging.info('Reading {}'.format(filename))
    items = []
    total = 0
    # Stream the file rather than reading it whole and splitting it twice.
    # Iterating a text handle breaks lines only at newlines, i.e. the same
    # boundaries as split("\n"); str.splitlines() would also break on
    # characters such as U+2028 that may legally appear inside JSON strings.
    # JSON interchange files are UTF-8, so do not depend on the locale default.
    with open(filename, "r", encoding="utf-8") as file1:
        for line in file1:
            row = line.rstrip("\n")
            if not row.strip():
                # Blank lines (e.g. the empty tail after a final newline)
                # are not records and must not inflate the total.
                continue
            total += 1
            if is_restaurant(row):
                items.append(json.loads(row))
    logging.info('{} valid items out of {}'.format(len(items), total))
    return items

def is_restaurant(row):
    """Heuristically decide whether a raw JSON line describes a restaurant.

    This is a plain substring test on the unparsed text, so a keyword counts
    wherever it occurs in the line, not only inside a particular field.

    Args:
        row: One line of text (a serialized record).

    Returns:
        True when ``row`` contains at least one of "Restaurants", "Food "
        (note the trailing space), "Cafe" or "Bakeries" and does not contain
        "Grocery"; False otherwise.
    """
    wanted = ("Restaurants", "Food ", "Cafe", "Bakeries")
    # Membership test instead of find() > 0: find() returns 0 for a match at
    # the very start of the string, which the old comparison treated as a miss.
    return any(keyword in row for keyword in wanted) and "Grocery" not in row

def write_to_file(json, filename):
    """Write the text ``json`` to ``filename``, replacing any previous content.

    Args:
        json: The string to write. Inside this function the name shadows the
            imported ``json`` module, which is not used here.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, mode="w") as out:
        out.write(json)

In [74]:
# Load the business file, keeping only the rows that pass the restaurant filter.
business_path = "../data/yelp8/yelp_academic_dataset_business.json"
yelp_json = clean_restaurant_json(business_path)

print(len(yelp_json))

# Build one DataFrame row per parsed record and preview the first few.
df = pd.DataFrame(yelp_json)
df.head()

INFO:root:Reading ../data/yelp8/yelp_academic_dataset_business.json
INFO:root:27913 valid items out of 85902


27913


Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,"{'Take-out': True, 'Drive-Thru': False, 'Good ...",5UmKMjUEUNdYWqANhGckJw,"[Fast Food, Restaurants]",Dravosburg,"4734 Lebanon Church Rd\nDravosburg, PA 15034","{'Friday': {'close': '21:00', 'open': '11:00'}...",40.354327,-79.900706,Mr Hoagie,[],True,7,3.5,PA,business
1,"{'Alcohol': 'full_bar', 'Noise Level': 'averag...",mVHrayjG3uZ_RLHkLj-AMg,"[Bars, American (New), Nightlife, Lounges, Res...",Braddock,"414 Hawkins Ave\nBraddock, PA 15104","{'Tuesday': {'close': '19:00', 'open': '10:00'...",40.40883,-79.866211,Emil's Lounge,[],True,26,4.5,PA,business
2,"{'Alcohol': 'full_bar', 'Noise Level': 'loud',...",KayYbHCt-RkbGcPdGOThNg,"[Bars, American (Traditional), Nightlife, Rest...",Carnegie,"141 Hawthorne St\nGreentree\nCarnegie, PA 15106","{'Monday': {'close': '02:00', 'open': '11:00'}...",40.415486,-80.067549,Alexion's Bar & Grill,[Greentree],True,23,4.0,PA,business
3,"{'Take-out': True, 'Drive-Thru': False, 'Good ...",wJr6kSA5dchdgOdwH6dZ2w,"[Burgers, Breakfast & Brunch, American (Tradit...",Carnegie,"2100 Washington Pike\nCarnegie, PA 15106","{'Monday': {'close': '02:00', 'open': '08:00'}...",40.387512,-80.093215,Kings Family Restaurant,[],True,10,3.5,PA,business
4,"{'Alcohol': 'full_bar', 'Noise Level': 'averag...",fNGIbpazjTRdXgwRY_NIXA,"[Bars, American (Traditional), Nightlife, Loun...",Carnegie,"1201 Washington Ave\nCarnegie, PA 15106","{'Monday': {'close': '23:00', 'open': '11:00'}...",40.396469,-80.084942,Rocky's Lounge,[],True,10,4.0,PA,business


In [54]:
# Read the credential that goes before the "@" in the connection URL from a
# local file, so it is never written into the notebook itself.
with open("../credentials/localhost/jessica.txt") as credfile:
    # strip() rather than strip("\n"): a stray "\r" or space left around the
    # credential would otherwise be embedded in the connection URL.
    cred = credfile.read().strip()

cnx = create_engine(PSQL.format(cred), isolation_level='AUTOCOMMIT')

# Replace any existing yelp_8 table with the current frame (index not written).
# The 'attributes' and 'hours' columns are declared as JSON columns.
df.to_sql('yelp_8', cnx, if_exists='replace', index=False,
          dtype={'attributes': sqlalchemy.types.JSON, 'hours': sqlalchemy.types.JSON})

In [55]:
# Row count of the current frame.
# NOTE(review): the saved output of this cell (3919) does not match the 27913
# rows loaded earlier in this notebook, and the execution counts are out of
# order (In [54]/[55] after In [73]/[74]) -- re-run the notebook top to bottom
# before trusting this number.
df.shape[0]

3919