In [1]:
import time
import re
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from sklearn.preprocessing import LabelEncoder

import scipy
from scipy.sparse import csr_matrix

data_path = './data/'
if not os.path.exists(data_path):
    os.makedirs(data_path)

## Loading Data

In [2]:
start_time = time.time()
business = pd.read_json("../yelp_dataset/yelp_academic_dataset_business.json", 
                      orient = 'records',
                      lines = True, chunksize = 10000).read()
end_time = time.time()
print(end_time - start_time)

2.21366810798645


In [3]:
display(business.head())
print(business.shape)
print("Number of business:  {}".format(business['business_id'].nunique()))

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
2,bvN78flM8NLprQ1a1y5dRg,The Reclaimory,4720 Hawthorne Ave,Portland,OR,97214,45.511907,-122.613693,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Antiques, Fashion, Used, Vintage & Consignment...","{'Thursday': '11:0-18:0', 'Friday': '11:0-18:0..."
3,oaepsyvc0J17qwi8cfrOWg,Great Clips,2566 Enterprise Rd,Orange City,FL,32763,28.914482,-81.295979,3.0,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Beauty & Spas, Hair Salons",
4,PE9uqAjdw0E4-8mjGl3wVA,Crossfit Terminus,1046 Memorial Dr SE,Atlanta,GA,30316,33.747027,-84.353424,4.0,14,1,"{'GoodForKids': 'False', 'BusinessParking': '{...","Gyms, Active Life, Interval Training Gyms, Fit...","{'Monday': '16:0-19:0', 'Tuesday': '16:0-19:0'..."


(160585, 14)
Number of business:  160585


In [4]:
restaurants = business.loc[business['categories'].apply(lambda x: False if x is None else 'Restaurants' in x ),].copy()
print('Number of restaurants:  ', restaurants.shape[0])

Number of restaurants:   50763


### Review

In [5]:
start_time = time.time()
reviews = pd.read_json("../yelp_dataset/yelp_academic_dataset_review.json", 
                      orient = 'records',
                      lines = True, chunksize = 100000).read()
end_time = time.time()
print(end_time - start_time)

69.77719354629517


In [6]:
display(reviews.head())
print(reviews.shape)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4,3,1,1,Apparently Prides Osteria had a rough summer a...,2014-10-11 03:34:02
1,8bFej1QE5LXp4O05qjGqXA,YoVfDbnISlW0f7abNQACIg,RA4V8pr014UyUbDvI-LW2A,4,1,0,0,This store is pretty good. Not as great as Wal...,2015-07-03 20:38:25
2,NDhkzczKjLshODbqDoNLSg,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,5,0,0,0,I called WVM on the recommendation of a couple...,2013-05-28 20:38:06
3,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2,1,1,1,I've stayed at many Marriott and Renaissance M...,2010-01-08 02:29:15
4,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,4,0,0,0,The food is always great here. The service fro...,2011-07-28 18:05:01


(8635403, 9)


In [7]:
rreviews = reviews[reviews['business_id'].isin(restaurants['business_id'])]
print(rreviews.shape)

(5574795, 9)


### User

In [8]:
start_time = time.time()
users = pd.read_json("../yelp_dataset/yelp_academic_dataset_user.json", 
                      orient = 'records',
                      lines = True, chunksize = 100000).read()
end_time = time.time()
print(end_time - start_time)

38.17113542556763


In [9]:
display(users.head())
print(users.shape)
print("Number of Users:  {}".format(users['user_id'].nunique()))

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,q_QQ5kBBwlCcbL1s4NVK3g,Jane,1220,2005-03-14 20:26:35,15038,10030,11291,200620072008200920102011201220132014,"xBDpTUbai0DXrvxCe3X16Q, 7GPNBO496aecrjJfW6UWtg...",1357,...,163,190,361,147,1212,5691,2541,2541,815,323
1,dIIKEfOgo0KqUfGQvGikPg,Gabi,2136,2007-08-10 19:01:51,21272,10289,18046,"2007,2008,2009,2010,2011,2012,2013,2014,2015,2...","XPzYf9_mwG2eXYP2BAGSTA, 2LooM5dcIk2o01nftYdPIg...",1025,...,87,94,232,96,1187,3293,2205,2205,472,294
2,D6ErcUnFALnCQN4b1W_TlA,Jason,119,2007-02-07 15:47:53,188,128,130,20102011,"GfB6sC4NJQvSI2ewbQrDNA, jhZtzZNNZJOU2YSZ6jPlXQ...",16,...,1,3,0,0,5,20,31,31,3,1
3,JnPIjvC0cmooNDfsa9BmXg,Kat,987,2009-02-09 16:14:29,7234,4722,4035,200920102011201220132014,"HQZPQhKMwRAyS6BCselVWQ, kP2U1s_sjQfHO9grxiyDTA...",420,...,129,93,219,90,1120,4510,1566,1566,391,326
4,37Hc8hr3cw0iHLoPzLK6Ow,Christine,495,2008-03-03 04:57:05,1577,727,1124,200920102011,"-Q88pZUcrfN0BLBDp-bkAQ, etPn4Pv1Gc4cRZjRgB_BOw...",47,...,19,32,16,15,77,131,310,310,98,44


(2189457, 22)
Number of Users:  2189457


In [10]:
rusers = users[users['user_id'].isin(rreviews['user_id'].unique())]
print(rusers.shape)

(1551310, 22)


### Tips

In [11]:
start_time = time.time()
tips = pd.read_json("../yelp_dataset/yelp_academic_dataset_tip.json", 
                      orient = 'records',
                      lines = True, chunksize = 100000).read()
end_time = time.time()
print(end_time - start_time)

4.816680908203125


In [12]:
display(tips.head())
print(tips.shape)

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,WCjg0jdHXMlwbqS9tZUx8Q,ENwBByjpoa5Gg7tKgxqwLg,Carne asada chips...,2011-07-22 19:07:35,0
1,42-Z02y9bABShAGZhuSzrQ,jKO4Og6ucdX2-YCTKQVYjg,Best happy hour from 3pm to 6pm! $1 off martin...,2014-09-10 07:33:29,0
2,5u7E3LYp_3eB8dLuUBazXQ,9Bto7mky640ocgezVKSfVg,"Nice people, skilled staff, clean location - b...",2013-12-13 23:23:41,0
3,wDWoMG5N9oI4DJ-p7z8EBg,XWFjKtRGZ9khRGtGg2ZvaA,"1/2-price bowling & the ""Very"" Old Fashion are...",2017-07-11 23:07:16,0
4,JmuFlorjjRshHTKzTwNtgg,mkrx0VhSMU3p3uhyJGCoWA,"Solid gold's. Great sauna. Great staff, too. E...",2016-11-30 08:46:36,0


(1162119, 5)


In [13]:
rtips = tips[tips['business_id'].isin(restaurants['business_id'])]
print(rtips.shape)

(810577, 5)


## Taking Subsets

In [14]:
sparsity_score = lambda r, u, b: r.shape[0] / (u.shape[0] * b.shape[0])
print("Current sparsity:  ", sparsity_score(rreviews, rusers, restaurants))

Current sparsity:   7.079181214365046e-05


In [15]:
# print(restaurants['review_count'].describe())
# plt.plot(np.arange(0, 1, 0.001), np.quantile(restaurants['review_count'], np.arange(0, 1, 0.001)))

In [16]:
# Users leave more than 10 reviews
# Restaurants at least receive 114 (third quantile) reviews

# Users leave more than 50 reviews
# Restaurants at least receive 500 (third quantile) reviews

# Subset restaurants
restaurants_sub = restaurants[restaurants['review_count'] > 500]
state_count = restaurants.groupby(['state'])['business_id'].count()
restaurants_sub = restaurants_sub[restaurants_sub['state'].isin(state_count.index[state_count > 5])]
rreviews_business = rreviews[rreviews['business_id'].isin(restaurants_sub['business_id'])]

# Subset Users
user_review_count = rreviews_business.groupby(['user_id'])['review_id'].count()
rusers_sub = rusers[rusers['user_id'].isin(user_review_count.index[user_review_count > 50])]

# Subset Reviews
rreviews_users = rreviews[rreviews['user_id'].isin(user_review_count.index[user_review_count > 50])]
rreviews_union = rreviews.loc[set(rreviews_business.index).union(rreviews_users.index), ]
rreviews_inter = rreviews.loc[set(rreviews_business.index).intersection(rreviews_users.index), ]


# Subset tips
tips_business = tips[tips['business_id'].isin(restaurants_sub['business_id'])]
tips_users = tips[tips['user_id'].isin(rusers_sub['user_id'])]
tips_union = tips.loc[set(tips_business.index).union(set(tips_users.index)),]

print("Number of restaurants:  ", restaurants_sub.shape[0])
print("Number of users:  ", rusers_sub.shape[0])
print("Number of reviews:  ", rreviews_inter.shape[0])
print("Sparsity:  ", sparsity_score(rreviews_inter, rusers_sub, restaurants_sub))

Number of restaurants:   1702
Number of users:   908
Number of reviews:   67167
Sparsity:   0.04346208399550671


## Preprocess Restaurants

### Separting Restaurants Attributes

In [17]:
attributes = defaultdict(int)
for i in restaurants_sub['attributes'].apply(lambda x: None if x is None else x.keys()):
    if i is not None:
        for k in i:
            attributes[k] += 1
attributes

defaultdict(int,
            {'NoiseLevel': 1687,
             'BikeParking': 1678,
             'RestaurantsAttire': 1677,
             'BusinessAcceptsCreditCards': 1658,
             'BusinessParking': 1697,
             'RestaurantsReservations': 1679,
             'GoodForKids': 1691,
             'RestaurantsTakeOut': 1694,
             'Caters': 1681,
             'WiFi': 1688,
             'RestaurantsDelivery': 1696,
             'HasTV': 1676,
             'RestaurantsPriceRange2': 1701,
             'Alcohol': 1686,
             'Music': 560,
             'BusinessAcceptsBitcoin': 534,
             'GoodForDancing': 502,
             'DogsAllowed': 1341,
             'BestNights': 541,
             'RestaurantsGoodForGroups': 1685,
             'OutdoorSeating': 1687,
             'HappyHour': 1246,
             'RestaurantsTableService': 1147,
             'GoodForMeal': 1635,
             'WheelchairAccessible': 841,
             'Ambience': 1677,
             'BYOBCorkage

In [21]:
# Select attributes having more than 1000 restaurants

selected_attributes = [k for k in attributes if attributes[k] > restaurants_sub.shape[0] * 0.3]
print("selected attributes:  ", selected_attributes)
for at in selected_attributes:
    restaurants["{}_{}".format('attr', at)] = 0
    selected_idx = restaurants_sub['categories'].apply(lambda x: False if x is None else at in x)
    restaurants_sub.loc[selected_idx, "{}_{}".format('attr', at)] = 1

selected attributes:   ['NoiseLevel', 'BikeParking', 'RestaurantsAttire', 'BusinessAcceptsCreditCards', 'BusinessParking', 'RestaurantsReservations', 'GoodForKids', 'RestaurantsTakeOut', 'Caters', 'WiFi', 'RestaurantsDelivery', 'HasTV', 'RestaurantsPriceRange2', 'Alcohol', 'Music', 'BusinessAcceptsBitcoin', 'DogsAllowed', 'BestNights', 'RestaurantsGoodForGroups', 'OutdoorSeating', 'HappyHour', 'RestaurantsTableService', 'GoodForMeal', 'WheelchairAccessible', 'Ambience', 'BYOBCorkage', 'ByAppointmentOnly']


In [22]:
restaurants_sub.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,attr_HappyHour,attr_RestaurantsTableService,attr_GoodForMeal,attr_Ambience,attr_Music,attr_BusinessAcceptsBitcoin,attr_BestNights,attr_WheelchairAccessible,attr_BYOBCorkage,attr_ByAppointmentOnly
29,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,Boston,MA,2128,42.363442,-71.025781,3.5,856,...,,,,,,,,,,
113,XDv29FffNd2dWnDOtZP-wg,Sapporo Ramen,1815 Massachusetts Ave,Cambridge,MA,2140,42.387212,-71.118532,3.5,635,...,,,,,,,,,,
147,bP6goJODwRnM3AVy45Kn9w,Papi's Cuban & Caribbean Grill,216 Ponce De Leon Ave NE,Atlanta,GA,30308,33.772758,-84.380375,4.0,1001,...,,,,,,,,,,
149,vecuat0jOia-CJveW3ngDw,Schmidt's Sausage Haus,240 E Kossuth St,Columbus,OH,43206,39.946268,-82.991044,4.0,1354,...,,,,,,,,,,
312,6Y0lQh4O-9JCutgyuMNq0g,Loca Luna,550 Amsterdam Ave NE,Atlanta,GA,30306,33.788608,-84.369091,3.5,658,...,,,,,,,,,,


### Separating Restaurants Categories

In [23]:
category_dict = defaultdict(int)
for cat in restaurants_sub['categories']:
    for c in cat.split(','):
        category_dict[c.strip()] += 1
sorted(category_dict.items(), key=lambda x: x[1], reverse = True)

[('Restaurants', 1702),
 ('Nightlife', 686),
 ('Bars', 654),
 ('Food', 571),
 ('American (New)', 443),
 ('Breakfast & Brunch', 378),
 ('American (Traditional)', 357),
 ('Seafood', 238),
 ('Sandwiches', 202),
 ('Cocktail Bars', 186),
 ('Mexican', 160),
 ('Event Planning & Services', 158),
 ('Burgers', 146),
 ('Italian', 145),
 ('Coffee & Tea', 129),
 ('Japanese', 126),
 ('Vegetarian', 125),
 ('Desserts', 115),
 ('Wine Bars', 113),
 ('Pizza', 112),
 ('Beer', 108),
 ('Wine & Spirits', 108),
 ('Southern', 107),
 ('Sushi Bars', 103),
 ('Asian Fusion', 102),
 ('Steakhouses', 100),
 ('Specialty Food', 93),
 ('Salad', 92),
 ('Barbeque', 89),
 ('Cafes', 87),
 ('Gluten-Free', 86),
 ('Bakeries', 83),
 ('Venues & Event Spaces', 81),
 ('Diners', 78),
 ('Pubs', 77),
 ('Arts & Entertainment', 73),
 ('Lounges', 72),
 ('Caterers', 71),
 ('Chinese', 70),
 ('Tex-Mex', 70),
 ('Vegan', 65),
 ('Thai', 62),
 ('Tapas/Small Plates', 55),
 ('Gastropubs', 54),
 ('French', 54),
 ('Noodles', 51),
 ('Tapas Bars', 5

In [29]:
selected_categories = [x[0] for x in category_dict.items() 
                       if x[1] > restaurants_sub.shape[0]*0.05 and x[0] not in ['Restaurants', 'Food']]
# selected_categories = set(selected_categories).union(set(['']))
print("Selected categories:  ", selected_categories)

for sc in selected_categories:
    col_name = re.sub(r"[^a-zA-Z0-9()]+", '_', sc.lower()).replace('(', '').replace(')', '')
    restaurants_sub["{}_{}".format('cat', col_name)] = 0
    selected_idx = restaurants_sub['categories'].apply(lambda x: False if x is None else sc in x)
    restaurants_sub.loc[selected_idx, "{}_{}".format('cat', col_name)] = 1
restaurants_sub.head()

Selected categories:   ['Sandwiches', 'Breakfast & Brunch', 'Seafood', 'Italian', 'Beer', 'Wine & Spirits', 'Cocktail Bars', 'Gluten-Free', 'Nightlife', 'Bars', 'Salad', 'Japanese', 'Desserts', 'Event Planning & Services', 'American (Traditional)', 'Southern', 'Sushi Bars', 'American (New)', 'Coffee & Tea', 'Burgers', 'Mexican', 'Barbeque', 'Cafes', 'Wine Bars', 'Steakhouses', 'Asian Fusion', 'Pizza', 'Vegetarian', 'Specialty Food']


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,cat_burgers,cat_mexican,cat_barbeque,cat_cafes,cat_wine_bars,cat_steakhouses,cat_asian_fusion,cat_pizza,cat_vegetarian,cat_specialty_food
29,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,Boston,MA,2128,42.363442,-71.025781,3.5,856,...,0,0,0,0,0,0,0,0,0,0
113,XDv29FffNd2dWnDOtZP-wg,Sapporo Ramen,1815 Massachusetts Ave,Cambridge,MA,2140,42.387212,-71.118532,3.5,635,...,0,0,0,0,0,0,0,0,0,0
147,bP6goJODwRnM3AVy45Kn9w,Papi's Cuban & Caribbean Grill,216 Ponce De Leon Ave NE,Atlanta,GA,30308,33.772758,-84.380375,4.0,1001,...,0,0,0,0,0,0,0,0,0,0
149,vecuat0jOia-CJveW3ngDw,Schmidt's Sausage Haus,240 E Kossuth St,Columbus,OH,43206,39.946268,-82.991044,4.0,1354,...,0,0,0,0,0,0,0,0,0,0
312,6Y0lQh4O-9JCutgyuMNq0g,Loca Luna,550 Amsterdam Ave NE,Atlanta,GA,30306,33.788608,-84.369091,3.5,658,...,0,0,0,0,0,0,0,0,0,0


In [30]:
restaurants_sub.to_csv('../data/restaurants2.csv', index = False)

## Preprocess Users

In [31]:
rusers_sub = rusers_sub.reset_index(drop=True).copy()
rusers_sub['elite'] = rusers_sub['elite'].apply(lambda x: 1 if len(x) > 0 else 0)

In [32]:
rusers_sub

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,1jXmzuIFKxTnEnR0pxO0Hg,Clara,299,2010-10-01 17:29:36,381,106,121,1,"VGfzq5na6LZUwxwWO5eVLA, 35uHDsVOEsWbLdEg8Ttobg...",23,...,8,2,6,0,17,47,30,30,4,1
1,CQUDh80m48xnzUkx-X5NAw,David,4205,2008-12-29 21:03:01,21059,8906,14640,1,"WnJlu4mpNtVxNQ2SM6GmvQ, 3BqKBuvY09lissdY_soI6w...",575,...,149,140,66,287,1034,1529,1572,1572,492,180
2,e_YQl5LBR7Gdrp_1vdj1yQ,Shaina,671,2007-07-31 19:21:58,1509,646,870,1,"tuyr32-Drc86VsXdPZaBcQ, oIUOyrMfDCE0gUXjt3F15A...",57,...,14,10,22,18,69,101,129,129,37,7
3,D8TvYz5Cy5-4V-LuD5nXvQ,Chiqui,478,2011-08-05 14:19:44,2115,1544,1614,0,"InDrrRvQ9f732YGPX4Ixmg, G-mMQLhruEVkevL5gPhzhg...",61,...,18,4,4,1,164,281,348,348,76,92
4,Vlab9b73R5qPLIv6tE4DJA,Tara,955,2007-03-19 18:33:14,3140,2130,1800,1,"O8siifzoJNwb-Wfjyf_veg, LboXWlcBzR-jmrnH-FpNbw...",90,...,21,12,17,4,54,106,175,175,60,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
903,V8oCtUzsdHA_Z9QpGaHCBg,Helen,666,2012-04-09 06:16:20,1166,290,507,1,"OQRWbRdi5Ki-seb4uBi7FA, otENkVmXbClU-aZSTZ8dIQ...",104,...,6,5,1,0,19,38,36,36,32,59
904,ezM7052Nk608iuDnunmTfA,Justin,532,2014-08-20 00:57:09,3373,1730,2637,1,"S5208fDRftxFMl5EzGTjow, cW7cD0n5EwDK_kyfIKV3-g...",54,...,11,6,2,0,53,413,227,227,126,128
905,C_eD0VF2OZi2rvUUQom3hQ,Anna,304,2011-06-23 22:44:10,502,45,215,1,"unkC6mSSTTnwJHEpUv7muQ, p-nsSxznzJgeAmAWSLzf5A...",23,...,4,0,0,0,9,12,12,12,4,7
906,koFIut32RUnYVY08N96slQ,Shannon,174,2015-05-20 00:09:22,891,453,642,1,"2VM9xpyjz_4EJ1RAUx-ugA, q-O7E8f0kv7vE_l4vljZ3g...",27,...,2,3,0,0,13,21,56,56,54,5


In [33]:
selected_users = set(rusers_sub['user_id'])
f = open('data/friend_network.txt', 'w')
for i in range(rusers_sub.shape[0]):
    uid = rusers_sub.loc[i, 'user_id'].strip()
    friends = rusers_sub.loc[i, 'friends']
    for fid in friends.split(','):
        if fid in selected_users:
            f.write("{} {}\n".format(uid, fid))
f.close()

In [37]:
rusers = rusers_sub.drop(['name', 'friends'], axis = 1)
rusers.to_csv("../data/users2.csv", index = False)

### Tips & Reviews

In [35]:
tips_union.to_csv("../data/tips2.csv", index = False)
rreviews_union.to_csv('../data/reviews2.csv', index = False)