In [1]:
import great_expectations as ge
import pandas as pd
import json
from flatten_json import flatten

In [2]:
# Raw input file; the conversion cell below parses it one line at a time with
# json.loads, i.e. it is expected to hold one JSON document per line.
input_file = "data/yelp_academic_dataset_business.json"
# Destination for the converted data: a single JSON array of flattened records.
output_file = "data/yelp_academic_dataset_business_clean.json"

In [4]:
# Convert the line-delimited JSON in `input_file` into one JSON array of
# flattened records in `output_file`.
#
# The input is streamed line by line in a single pass, so the whole file is
# never held in memory and there is no need to rewind and re-read it just to
# find the last record.
with open(input_file, "r", encoding="utf-8") as in_file, \
        open(output_file, "w", encoding="utf-8") as out_file:

    # Start with a square bracket
    out_file.write('[')

    wrote_record = False
    for line in in_file:
        # Skip blank lines (for example an empty line at the end of the file)
        # instead of letting json.loads fail on them.
        if not line.strip():
            continue

        # Write the ",\n" separator *before* every record except the first.
        # This yields the same layout as appending it after every record
        # except the last, without having to know which record is last.
        if wrote_record:
            out_file.write(',\n')

        # Flatten the nested dictionaries and serialise the flat record
        flat_json = flatten(json.loads(line))
        out_file.write(json.dumps(flat_json))
        wrote_record = True

    # Close with a square bracket; an input with no records produces "[]"
    out_file.write(']')

In [5]:
# Load the flattened JSON array written by the conversion step. Reusing
# `output_file` (instead of repeating the path literal) keeps the reader and
# the writer pointing at the same file if the path is ever changed.
df = ge.read_json(output_file)

In [6]:
# Show the dimensions of the loaded frame as (rows, columns).
df.shape

(160585, 60)

Split the data frame in half and write each half to a JSON file

In [24]:
# Row index at which the frame is split: half the row count, rounded down.
n = len(df) // 2

In [25]:
# Display the split index computed above.
n

80292

In [26]:
# First half of the rows (positions 0 .. n-1), all columns.
df_profiling = df.iloc[:n]

In [27]:
# Remaining rows (position n onwards), all columns.
df_validating = df.iloc[n:]

In [28]:
# Dimensions of the first half as (rows, columns).
df_profiling.shape

(80292, 60)

In [29]:
# Dimensions of the second half as (rows, columns).
df_validating.shape

(80293, 60)

Drop unflattened columns and 100% null columns

In [21]:
# Percentage of missing values in each column of the profiling half:
# count the nulls per column, scale by 100, then divide by the row count.
(
    df_profiling
    .isna()
    .sum()
    .mul(100)
    .div(len(df_profiling))
)

business_id                                0.000000
name                                       0.000000
address                                    0.000000
city                                       0.000000
state                                      0.000000
postal_code                                0.000000
latitude                                   0.000000
longitude                                  0.000000
stars                                      0.000000
review_count                               0.000000
is_open                                    0.000000
attributes_RestaurantsTableService        87.976386
attributes_WiFi                           63.236686
attributes_BikeParking                    52.467245
attributes_BusinessParking                39.088577
attributes_BusinessAcceptsCreditCards     25.292682
attributes_RestaurantsReservations        71.768047
attributes_WheelchairAccessible           81.622079
attributes_Caters                         75.189309
attributes_O

In [30]:
# Columns to remove from the profiling frame before it is written out.
unflat_cols = [
    "attributes_BusinessParking",
    "attributes_Ambience",
    "attributes_GoodForMeal",
    "attributes",
    "hours",
]

In [32]:
# Remove the columns listed in `unflat_cols` from the profiling half.
# `df_profiling` is a slice of `df`, and dropping in place on a slice makes
# pandas emit SettingWithCopyWarning; rebinding the name to the frame returned
# by drop() avoids mutating the slice.
# NOTE(review): no visible cell applies the same drop to `df_validating`, so
# the two halves appear to be written out with different column sets -
# confirm that this is intended.
df_profiling = df_profiling.drop(columns=unflat_cols)

In [34]:
# Write the profiling half to disk as JSON.
df_profiling.to_json("data/yelp_profiling.json")

In [35]:
# Write the validating half to disk as JSON.
df_validating.to_json("data/yelp_validating.json")

In [36]:
# Read the profiling file back in and display it as the cell output.
ge.read_json("data/yelp_profiling.json")

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,attributes_BYOB,attributes_CoatCheck,attributes_Smoking,attributes_DriveThru,attributes_BYOBCorkage,attributes_Corkage,attributes_RestaurantsCounterService,attributes_AgesAllowed,attributes_DietaryRestrictions,attributes_Open24Hours
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,...,,,,,,,,,,
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,...,,,,,,,,,,
2,bvN78flM8NLprQ1a1y5dRg,The Reclaimory,4720 Hawthorne Ave,Portland,OR,97214,45.511907,-122.613693,4.5,13,...,,,,,,,,,,
3,oaepsyvc0J17qwi8cfrOWg,Great Clips,2566 Enterprise Rd,Orange City,FL,32763,28.914482,-81.295979,3.0,8,...,,,,,,,,,,
4,PE9uqAjdw0E4-8mjGl3wVA,Crossfit Terminus,1046 Memorial Dr SE,Atlanta,GA,30316,33.747027,-84.353424,4.0,14,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80287,imQokyYSd-v190ulfo4cPA,Hidden Treasures,36 Jfk St,Cambridge,MA,02138,42.372674,-71.119869,3.0,7,...,,,,,,,,,,
80288,nIYYxJawFN4Xp7fINiWxLg,Harrison-Pearson Associates,"4014 Medical Pkwy, Ste 100",Austin,TX,78756,30.309836,-97.742965,3.5,15,...,,,,,,,,,,
80289,rTPMd1rHG6AUtd_8wSLnHg,9 Elm American Bistro,9 Elm St,Danvers,MA,01923,42.565378,-70.936095,4.5,208,...,False,,,,,,,,,
80290,rs6b8eaLr0v_ZXfxw9uESw,Mad Rooster's,8600 Fm 150,Kyle,TX,78640,30.047692,-97.986905,5.0,5,...,,,,,,,,,,
