# Load Data

In [1]:
import gc
import json

import pandas as pd
import progressbar

## Load Reviews
First, load the reviews. We read the file in chunks of `num_reviews` rows and keep only the first chunk, which limits the number of reviews selected.

In [2]:
# Maximum number of reviews to load; also used as the prefix of the exported file names.
num_reviews = 5_000

In [3]:
# With `chunksize`, `read_json` returns a lazy reader instead of a DataFrame;
# only its first chunk (at most `num_reviews` rows) is needed.
review_reader = pd.read_json('data/review.json', lines=True, orient='columns', chunksize=num_reviews)
try:
    review = next(iter(review_reader))
finally:
    # Close explicitly: abandoning the reader after one chunk would leave the file open.
    review_reader.close()
review = review.set_index('review_id')
review.head()

Unnamed: 0_level_0,business_id,cool,date,funny,stars,text,useful,user_id
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Q1sbwvVQXV2734tPgoKj4Q,ujmEBvifdJM6h6RLv4wQIg,0,2013-05-07 04:34:36,1,1,Total bill for this horrible service? Over $8G...,6,hG7b0MtEbXx5QzbzE6C_VA
GJXCdrto3ASJOqKeVWPi6Q,NZnhc2sEQy3RmzKTZnqtwQ,0,2017-01-14 21:30:33,0,5,I *adore* Travis at the Hard Rock's new Kelly ...,0,yXQM5uF2jS6es16SJzNHfg
2TzJjDVDEuAW6MR5Vuc1ug,WTqjgwHlXbSFevF32_DJVw,0,2016-11-09 20:09:03,0,5,I have to say that this office really has it t...,3,n6-Gk65cPZL6Uz8qRm3NYw
yi0R0Ugj_xUx_Nek0-_Qig,ikCg8xy5JIg_NGPx-MSIDA,0,2018-01-09 20:56:38,0,5,Went in for a lunch. Steak sandwich was delici...,0,dacAIZ6fTM6mqwW5uxkskg
11a8sVPMUFtaC7_ABRkmtw,b1b1eb3uo-w561D0ZfCEiQ,0,2018-01-30 23:07:38,0,1,Today was my second out of three sessions I ha...,7,ssoyf2_x0EQMed6fgHeMyQ


## Load Supplementary Data
Next, load the corresponding tables (businesses, users and check-ins). In order to save memory, we only load the rows that are referenced in the reviews table.

In [4]:
# Distinct keys referenced by the sampled reviews; used below to filter the other tables.
business_ids = review['business_id'].unique()
user_ids = review['user_id'].unique()

In [5]:
def load_data(filename, filters, stop_when_done):
    """Load the rows of a JSON Lines file that satisfy every filter.

    Reads ``data/<filename>.json`` one line at a time and keeps only the
    objects whose values pass all of the given filters. Blank lines are
    skipped. Progress (number of rows kept so far) is shown with a progress bar.

    Parameters
    ----------
    filename : str
        Base name of the file inside ``data/``, without the ``.json`` suffix.
    filters : iterable of (str, iterable)
        ``(column, allowed_values)`` pairs. A row is kept only if
        ``row[column]`` is one of ``allowed_values`` for every pair. The
        allowed values must be hashable. With no filters, every row is kept.
    stop_when_done : bool
        If true (and at least one filter is given), stop reading as soon as
        every allowed value of every filter has appeared in a kept row.
        This is tracked per value rather than by counting rows, so repeated
        keys in the file or duplicated allowed values cannot end the scan
        too early or prevent it from ending.

    Returns
    -------
    pandas.DataFrame
        One row per kept object, in file order, indexed from 0.

    Raises
    ------
    KeyError
        If a row has no entry for one of the filter columns.
    """
    bar = progressbar.ProgressBar(widgets=[progressbar.AnimatedMarker(), " ", progressbar.Counter(), " ", progressbar.BouncingBar(), " ", progressbar.Timer()])
    # Sets give constant-time membership tests; scanning the raw id arrays for
    # every line of the file would cost O(len(values)) per line.
    allowed = [(column, set(values)) for column, values in filters]
    # Per filter, the allowed values that have not been seen in a kept row yet.
    pending = [set(values) for _, values in allowed]
    # Without filters nothing is ever "done", so early stopping is disabled.
    can_stop_early = stop_when_done and bool(allowed)
    df_dict = {}
    with open("data/"+filename+".json", encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            obj = json.loads(line)
            if all(obj[column] in values for column, values in allowed):
                df_dict[len(df_dict)] = obj
                if can_stop_early:
                    for (column, _), remaining in zip(allowed, pending):
                        remaining.discard(obj[column])
                    if not any(pending):
                        break
            bar.update(len(df_dict))
    bar.finish()
    return pd.DataFrame.from_dict(df_dict, 'index')

In [6]:
# Keep only the businesses referenced by the sampled reviews; passing True for
# stop_when_done lets load_data stop before the end of the file.
business_filters = [('business_id', business_ids)]
business = load_data('business', business_filters, True).set_index('business_id')
business.head()

| 2925 |                #                               | Elapsed Time: 0:00:01


Unnamed: 0_level_0,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,2818 E Camino Acequia Drive,Phoenix,AZ,85016,33.522143,-112.018481,3.0,5,0,{'GoodForKids': 'False'},"Golf, Active Life",
gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,170,1,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-..."
HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,"4209 Stuart Andrew Blvd, Ste F",Charlotte,NC,28217,35.190012,-80.887223,4.0,4,1,"{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...","Plumbing, Shopping, Local Services, Home Servi...","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."
fweCYi8FmbJXHCqLnwuk8w,Marco's Pizza,5981 Andrews Rd,Mentor-on-the-Lake,OH,44060,41.70852,-81.359556,4.0,16,1,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...","Italian, Restaurants, Pizza, Chicken Wings","{'Monday': '10:0-0:0', 'Tuesday': '10:0-0:0', ..."
A98xW4qb7vOTguggHFs7Ng,Hot Yoga Wellness,1455 16th Avenue,Richmond Hill,ON,L4B 3G6,43.861503,-79.388499,4.0,4,1,"{'ByAppointmentOnly': 'False', 'BikeParking': ...","Fitness & Instruction, Active Life, Yoga","{'Monday': '16:0-23:0', 'Tuesday': '16:0-23:0'..."


In [7]:
# Keep only the users who wrote the sampled reviews; passing True for
# stop_when_done lets load_data stop before the end of the file.
user_filters = [('user_id', user_ids)]
user = load_data('user', user_filters, True).set_index('user_id')
user.head()

| 4787 |                #                               | Elapsed Time: 0:00:20


Unnamed: 0_level_0,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,average_stars,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B2CkkEX341HLK3zbn3qcgQ,Carlos,4,2014-06-22 06:54:32,4,1,0,,"k7QixRwahZavaQ_Rhd9s_w, _gLR-74C8aY-lIP_u3vFow...",0,3.0,...,0,0,0,0,0,0,0,0,0,0
xZAmw5gihOVO4duMN2Ju6Q,Flynn,70,2013-03-25 20:22:45,127,44,28,,"hRbdn-DZAuqYv88bezKrZQ, q3IYh428CVOkU_W90sM3cg...",0,3.21,...,0,0,0,0,2,6,0,0,0,0
yK_WOwDLP8c87hNXVWZBhg,J,10,2010-04-17 19:34:14,7,1,2,,"dQfQok80y8YNwkZCh99Yeg, zeH8s6mOs9b2X5Ae2knhDw...",1,3.42,...,0,0,0,0,3,0,0,0,0,0
ryPjpbhGIQWnKGeF3QwUSA,Chelsea,20,2011-11-04 22:08:01,26,2,4,,"8K6b_JTadlTDQK23SCSRfw, Fbin3Swmqp6MagtaS2Tzkw...",3,4.43,...,0,0,0,0,0,0,1,1,0,0
cn-RDjSKQ5xR8HL_8tYXyw,Daniel,37,2012-07-09 07:55:22,44,20,21,,"Y-e7_xHTYwsqjeW1rmLUfA, fbCNVmV-PJQyDtbT9zgbFw...",0,3.84,...,2,0,0,0,0,1,0,0,0,0


In [8]:
# Keep only the check-ins of businesses referenced by the sampled reviews.
# stop_when_done is False here, so the whole file is always scanned.
checkin_filters = [('business_id', business_ids)]
checkin = load_data('checkin', checkin_filters, False).set_index('business_id')
checkin.head()

| 2775 |                                          #     | Elapsed Time: 0:00:13


Unnamed: 0_level_0,date
business_id,Unnamed: 1_level_1
-000aQFeK6tqVLndf7xORg,2018-10-17 21:16:27
-1xuC540Nycht_iWFeJ-dw,"2010-04-25 12:27:32, 2010-06-15 16:47:42, 2010..."
-4-MzST67P_jnX4mh3MIcw,"2011-10-15 00:47:37, 2011-12-07 21:00:41, 2011..."
-4TMQnQJW1yd6NqGRDvAeA,"2012-09-16 02:04:06, 2012-09-23 03:33:28, 2012..."
-9YyInW1wapzdNZrhQJ9dg,"2010-03-02 02:47:57, 2010-03-02 15:58:19, 2010..."


In [9]:
# tip = load_data('tip', [('business_id', business_ids), ('user_id', user_ids)], False)
# tip.head()

In [10]:
# Run a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

7

## Export Data

We export the data as pickle files so that it is easier to load in future sessions.

In [11]:
# Every exported file shares the "data/<num_reviews>" prefix so that samples of
# different sizes do not overwrite each other.
export_prefix = "data/" + str(num_reviews)
for frame, suffix in (
    (review, "_review.pkl"),
    (business, "_business.pkl"),
    (user, "_user.pkl"),
    (checkin, "_checkin.pkl"),
):
    frame.to_pickle(export_prefix + suffix)