# Load Data

In [50]:
import gc
import json

import pandas as pd
import progressbar

## Load Reviews
First, load the reviews. We read the file with `chunksize=num_reviews` and keep only the first chunk, which limits the number of reviews loaded.

In [51]:
# Maximum number of reviews to keep from the review file (size of the first chunk read).
num_reviews = 50_000

In [52]:
# Stream the line-delimited JSON file in chunks of `num_reviews` rows and keep
# only the first chunk, so the remainder of the file is never parsed.
review_reader = pd.read_json('data/review.json', lines=True, orient='columns', chunksize=num_reviews)
try:
    review = next(iter(review_reader))
finally:
    # The chunked reader keeps the file open until it is closed explicitly.
    review_reader.close()
review = review.set_index('review_id')
review.head()

Unnamed: 0_level_0,business_id,cool,date,funny,stars,text,useful,user_id
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Q1sbwvVQXV2734tPgoKj4Q,ujmEBvifdJM6h6RLv4wQIg,0,2013-05-07 04:34:36,1,1,Total bill for this horrible service? Over $8G...,6,hG7b0MtEbXx5QzbzE6C_VA
GJXCdrto3ASJOqKeVWPi6Q,NZnhc2sEQy3RmzKTZnqtwQ,0,2017-01-14 21:30:33,0,5,I *adore* Travis at the Hard Rock's new Kelly ...,0,yXQM5uF2jS6es16SJzNHfg
2TzJjDVDEuAW6MR5Vuc1ug,WTqjgwHlXbSFevF32_DJVw,0,2016-11-09 20:09:03,0,5,I have to say that this office really has it t...,3,n6-Gk65cPZL6Uz8qRm3NYw
yi0R0Ugj_xUx_Nek0-_Qig,ikCg8xy5JIg_NGPx-MSIDA,0,2018-01-09 20:56:38,0,5,Went in for a lunch. Steak sandwich was delici...,0,dacAIZ6fTM6mqwW5uxkskg
11a8sVPMUFtaC7_ABRkmtw,b1b1eb3uo-w561D0ZfCEiQ,0,2018-01-30 23:07:38,0,1,Today was my second out of three sessions I ha...,7,ssoyf2_x0EQMed6fgHeMyQ


## Load Supplementary Data
Next, load the corresponding tables. In order to save memory, we only load the data that is referenced in the reviews table.

In [53]:
# Distinct business and user ids that occur in the sampled reviews.
business_ids = review['business_id'].unique()
user_ids = review['user_id'].unique()

In [54]:
def _as_lookup(allowed):
    """Return a container with fast membership tests for the allowed values."""
    # Strings keep their substring semantics; everything else is hashed once.
    if isinstance(allowed, (str, bytes)):
        return allowed
    try:
        return frozenset(allowed)
    except TypeError:
        # Unhashable elements: fall back to the caller's container as-is.
        return allowed


def _is_allowed(value, allowed):
    """Tell whether ``value`` is one of the allowed values."""
    try:
        return value in allowed
    except TypeError:
        if isinstance(allowed, frozenset):
            # An unhashable value (e.g. a list) cannot be a member of a hashed set.
            return False
        raise


def load_data(filename, filters, stop_when_done):
    """Load the rows of ``data/<filename>.json`` that pass every filter.

    The file is read line by line; each line is parsed as one JSON object.

    Parameters
    ----------
    filename : str
        Base name of the file; ``"data/" + filename + ".json"`` is opened.
    filters : iterable of (column, allowed_values) pairs
        A row is kept only if ``row[column]`` is among ``allowed_values`` for
        every pair. With no filters, every row is kept.
    stop_when_done : bool
        If true, stop reading as soon as the number of kept rows reaches the
        size of the smallest set of allowed values. This is only correct when
        each allowed value matches at most one row of the file; the caller is
        responsible for that. With no filters there is nothing to count
        against, so the whole file is read.

    Returns
    -------
    pandas.DataFrame
        One row per kept object, indexed 0..n-1 in file order.

    Raises
    ------
    KeyError
        If a parsed row lacks one of the filtered columns.
    """
    # Testing membership against a list/ndarray scans it for every line of the
    # file; hashing the allowed values once makes each test constant-time.
    lookups = [(column, _as_lookup(allowed)) for column, allowed in filters]

    # Early-stop target, computed once up front instead of relying on a
    # variable leaked from the filter loop.
    target = None
    if stop_when_done:
        target = min((len(allowed) for _, allowed in lookups), default=None)

    bar = progressbar.ProgressBar(widgets=[progressbar.AnimatedMarker(), " ", progressbar.Counter(), " ", progressbar.BouncingBar(), " ", progressbar.Timer()])
    rows = {}
    with open("data/"+filename+".json", encoding='utf-8') as f:
        for line in f:
            obj = json.loads(line)
            if all(_is_allowed(obj[column], allowed) for column, allowed in lookups):
                # Keys are consecutive integers so the frame is indexed in file order.
                rows[len(rows)] = obj
                if target is not None and len(rows) == target:
                    break
            bar.update(len(rows))
    bar.finish()
    return pd.DataFrame.from_dict(rows, 'index')

In [55]:
# Businesses referenced by the sampled reviews, keyed by business_id.
# stop_when_done=True lets the reader stop once enough rows have been collected.
business = load_data('business', [('business_id', business_ids)], True).set_index('business_id')
business.head()

| 10658 |                  #                            | Elapsed Time: 0:00:07


Unnamed: 0_level_0,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,2818 E Camino Acequia Drive,Phoenix,AZ,85016,33.522143,-112.018481,3.0,5,0,{'GoodForKids': 'False'},"Golf, Active Life",
QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,128,1,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W..."
gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,170,1,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-..."
HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,"4209 Stuart Andrew Blvd, Ste F",Charlotte,NC,28217,35.190012,-80.887223,4.0,4,1,"{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...","Plumbing, Shopping, Local Services, Home Servi...","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."
5t3KVdMnFgAYmSl1wYLhmA,The Kilted Buffalo Langtree,"119 Landings Dr, Ste 101",Mooresville,NC,28117,35.52741,-80.868003,3.5,9,1,"{'BusinessParking': '{'garage': False, 'street...","Bars, Nightlife, Pubs, Barbers, Beauty & Spas,...","{'Monday': '10:0-1:0', 'Tuesday': '10:0-1:0', ..."


In [56]:
# Users who wrote the sampled reviews, keyed by user_id.
# stop_when_done=True lets the reader stop once enough rows have been collected.
user = load_data('user', [('user_id', user_ids)], True).set_index('user_id')
user.head()

| 42987 |                #                              | Elapsed Time: 0:05:40


Unnamed: 0_level_0,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,average_stars,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
dD0gZpBctWGdWo9WlGuhlA,Angela,17,2014-05-22 15:57:30,30,4,14,,"RZ6wS38wnlXyj-OOdTzBxA, l5jxZh1KsgI8rMunm-GN6A...",5,4.85,...,0,0,0,0,0,2,0,0,1,0
MM4RJAeH6yuaN8oZDSt0RA,Nancy,361,2013-10-23 07:02:50,1114,279,665,2015201620172018,"mbwrZ-RS76V1HoJ0bF_Geg, g64lOV39xSLRZO0aQQ6DeQ...",39,4.08,...,1,0,0,1,16,57,80,80,25,5
T0gWkTHWRChVUe_Dn1F8nw,Tanya,859,2005-07-20 22:38:17,1630,693,1244,2006200720082009201020112012,"ctr_BlCf3Ogny-vLs8E9tQ, c6HT44PKCaXqzN_BdgKPCw...",57,4.21,...,16,7,7,3,31,72,95,95,34,2
NQffx45eJaeqhFcMadKUQA,Trace,124,2008-12-10 22:59:45,202,70,185,,"N-xeG3U6rUkjVtQ0o-5YZA, wnO99pBbGqwqOoTQM25iCw...",15,4.53,...,3,0,2,0,12,8,14,14,3,5
gvXtMj3XuPr0xHjgmlmtng,Peter,47,2014-01-05 20:45:54,57,26,34,20172018,"CfGCj80EdA-xS-mTWlAn4Q, JgD2Rk9K07MkZgG7Nb9YzA...",9,3.6,...,0,0,0,0,4,11,5,5,4,3


In [57]:
# Check-in rows for the sampled businesses, keyed by business_id.
# stop_when_done=False: the whole file is scanned.
checkin = load_data('checkin', [('business_id', business_ids)], False).set_index('business_id')
checkin.head()

| 9711 |                                 #              | Elapsed Time: 0:00:54


Unnamed: 0_level_0,date
business_id,Unnamed: 1_level_1
--I7YYLada0tSLkORTHb5Q,"2014-11-07 00:51:45, 2014-11-10 23:51:38, 2014..."
--U98MNlDym2cLn36BBPgQ,"2011-10-05 22:50:41, 2012-04-11 00:06:36, 2012..."
--wIGbLEhlpl_UeAIyDmZQ,2015-06-06 20:01:06
-000aQFeK6tqVLndf7xORg,2018-10-17 21:16:27
-092wE7j5HZOogMLAh40zA,"2010-07-28 21:07:28, 2010-07-31 21:16:54, 2010..."


In [58]:
# Photo rows for the sampled businesses, keyed by business_id.
# stop_when_done=False: the whole file is scanned.
photo = load_data('photo', [('business_id', business_ids)], False).set_index('business_id')
photo.head()

| 16319 |                                             # | Elapsed Time: 0:01:00


Unnamed: 0_level_0,caption,photo_id,label
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
50Anorn0DJXFhBr9a9_gHQ,,5IiIo5UKEW0lWqZ6sWrY_A,inside
296PZdxSrtH08EUwCsOKMw,,1eDvPC4F8cGvuI2lGpIoEw,inside
7eQoxWr9RzyCB9IFvqHUPA,,GuvDS21yJ5efL1Zo1EzGPA,outside
GCRvrxMSC1nzShyM4Y-guQ,Bar Dancers,hkV_CrgjTeJBTAWJWAO46w,inside
jGH9DMTUojegjQZ4anb1kQ,Eddie V's Edgewater Grill,D__lKlNV-3Ha_eps3Y9_CA,inside


In [59]:
# NOTE(review): loading of the tip table is disabled and no `tip` frame is
# exported at the end of the notebook — TODO confirm whether this cell should
# be restored or deleted.
# tip = load_data('tip', [('business_id', business_ids), ('user_id', user_ids)], False)
# tip.head()

In [60]:
# Run a full garbage-collection pass before exporting; the call returns the
# number of unreachable objects it found.
gc.collect()

42

## Export Data

We export the data so that it can be loaded more easily in future sessions.

In [61]:
# Every export shares the same "data/<num_reviews>" prefix so that samples of
# different sizes do not overwrite each other.
export_prefix = "data/" + str(num_reviews)
for frame, suffix in (
    (review, "_review.pkl"),
    (business, "_business.pkl"),
    (user, "_user.pkl"),
    (checkin, "_checkin.pkl"),
    (photo, "_photo.pkl"),
):
    frame.to_pickle(export_prefix + suffix)