In [1]:
import time
import re
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from sklearn.preprocessing import LabelEncoder

import scipy
from scipy.sparse import csr_matrix

data_path = './data/'
if not os.path.exists(data_path):
    os.makedirs(data_path)

## Loading Data

In [2]:
start_time = time.time()
business = pd.read_json("yelp_dataset/yelp_academic_dataset_business.json", 
                      orient = 'records',
                      lines = True, chunksize = 10000).read()
end_time = time.time()
print(end_time - start_time)

2.3642706871032715


In [3]:
display(business.head())
print(business.shape)
print("Number of business:  {}".format(business['business_id'].nunique()))

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
2,bvN78flM8NLprQ1a1y5dRg,The Reclaimory,4720 Hawthorne Ave,Portland,OR,97214,45.511907,-122.613693,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Antiques, Fashion, Used, Vintage & Consignment...","{'Thursday': '11:0-18:0', 'Friday': '11:0-18:0..."
3,oaepsyvc0J17qwi8cfrOWg,Great Clips,2566 Enterprise Rd,Orange City,FL,32763,28.914482,-81.295979,3.0,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Beauty & Spas, Hair Salons",
4,PE9uqAjdw0E4-8mjGl3wVA,Crossfit Terminus,1046 Memorial Dr SE,Atlanta,GA,30316,33.747027,-84.353424,4.0,14,1,"{'GoodForKids': 'False', 'BusinessParking': '{...","Gyms, Active Life, Interval Training Gyms, Fit...","{'Monday': '16:0-19:0', 'Tuesday': '16:0-19:0'..."


(160585, 14)
Number of business:  160585


In [4]:
restaurants = business.loc[business['categories'].apply(lambda x: False if x is None else 'Restaurants' in x ),].copy()
print('Number of restaurants:  ', restaurants.shape[0])

Number of restaurants:   50763


### Review

In [5]:
start_time = time.time()
reviews = pd.read_json("yelp_dataset/yelp_academic_dataset_review.json", 
                      orient = 'records',
                      lines = True, chunksize = 100000).read()
end_time = time.time()
print(end_time - start_time)

69.24591445922852


In [6]:
display(reviews.head())
print(reviews.shape)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4,3,1,1,Apparently Prides Osteria had a rough summer a...,2014-10-11 03:34:02
1,8bFej1QE5LXp4O05qjGqXA,YoVfDbnISlW0f7abNQACIg,RA4V8pr014UyUbDvI-LW2A,4,1,0,0,This store is pretty good. Not as great as Wal...,2015-07-03 20:38:25
2,NDhkzczKjLshODbqDoNLSg,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,5,0,0,0,I called WVM on the recommendation of a couple...,2013-05-28 20:38:06
3,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2,1,1,1,I've stayed at many Marriott and Renaissance M...,2010-01-08 02:29:15
4,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,4,0,0,0,The food is always great here. The service fro...,2011-07-28 18:05:01


(8635403, 9)


In [8]:
rreviews = reviews[reviews['business_id'].isin(restaurants['business_id'])]
print(rreviews.shape)

(5574795, 9)


### User

In [9]:
start_time = time.time()
users = pd.read_json("yelp_dataset/yelp_academic_dataset_user.json", 
                      orient = 'records',
                      lines = True, chunksize = 100000).read()
end_time = time.time()
print(end_time - start_time)

38.01014232635498


In [10]:
display(users.head())
print(users.shape)
print("Number of Users:  {}".format(users['user_id'].nunique()))

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,q_QQ5kBBwlCcbL1s4NVK3g,Jane,1220,2005-03-14 20:26:35,15038,10030,11291,200620072008200920102011201220132014,"xBDpTUbai0DXrvxCe3X16Q, 7GPNBO496aecrjJfW6UWtg...",1357,...,163,190,361,147,1212,5691,2541,2541,815,323
1,dIIKEfOgo0KqUfGQvGikPg,Gabi,2136,2007-08-10 19:01:51,21272,10289,18046,"2007,2008,2009,2010,2011,2012,2013,2014,2015,2...","XPzYf9_mwG2eXYP2BAGSTA, 2LooM5dcIk2o01nftYdPIg...",1025,...,87,94,232,96,1187,3293,2205,2205,472,294
2,D6ErcUnFALnCQN4b1W_TlA,Jason,119,2007-02-07 15:47:53,188,128,130,20102011,"GfB6sC4NJQvSI2ewbQrDNA, jhZtzZNNZJOU2YSZ6jPlXQ...",16,...,1,3,0,0,5,20,31,31,3,1
3,JnPIjvC0cmooNDfsa9BmXg,Kat,987,2009-02-09 16:14:29,7234,4722,4035,200920102011201220132014,"HQZPQhKMwRAyS6BCselVWQ, kP2U1s_sjQfHO9grxiyDTA...",420,...,129,93,219,90,1120,4510,1566,1566,391,326
4,37Hc8hr3cw0iHLoPzLK6Ow,Christine,495,2008-03-03 04:57:05,1577,727,1124,200920102011,"-Q88pZUcrfN0BLBDp-bkAQ, etPn4Pv1Gc4cRZjRgB_BOw...",47,...,19,32,16,15,77,131,310,310,98,44


(2189457, 22)
Number of Users:  2189457


In [11]:
rusers = users[users['user_id'].isin(rreviews['user_id'].unique())]
print(rusers.shape)

(1551310, 22)


### Tips

In [12]:
start_time = time.time()
tips = pd.read_json("yelp_dataset/yelp_academic_dataset_tip.json", 
                      orient = 'records',
                      lines = True, chunksize = 100000).read()
end_time = time.time()
print(end_time - start_time)

4.769688129425049


In [13]:
display(tips.head())
print(tips.shape)

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,WCjg0jdHXMlwbqS9tZUx8Q,ENwBByjpoa5Gg7tKgxqwLg,Carne asada chips...,2011-07-22 19:07:35,0
1,42-Z02y9bABShAGZhuSzrQ,jKO4Og6ucdX2-YCTKQVYjg,Best happy hour from 3pm to 6pm! $1 off martin...,2014-09-10 07:33:29,0
2,5u7E3LYp_3eB8dLuUBazXQ,9Bto7mky640ocgezVKSfVg,"Nice people, skilled staff, clean location - b...",2013-12-13 23:23:41,0
3,wDWoMG5N9oI4DJ-p7z8EBg,XWFjKtRGZ9khRGtGg2ZvaA,"1/2-price bowling & the ""Very"" Old Fashion are...",2017-07-11 23:07:16,0
4,JmuFlorjjRshHTKzTwNtgg,mkrx0VhSMU3p3uhyJGCoWA,"Solid gold's. Great sauna. Great staff, too. E...",2016-11-30 08:46:36,0


(1162119, 5)


In [14]:
rtips = tips[tips['business_id'].isin(restaurants['business_id'])]
print(rtips.shape)

(810577, 5)


## Taking Subsets

In [15]:
sparsity_score = lambda r, u, b: r.shape[0] / (u.shape[0] * b.shape[0])
print("Current sparsity:  ", sparsity_score(rreviews, rusers, restaurants))

Current sparsity:   7.079181214365046e-05


In [16]:
# print(restaurants['review_count'].describe())
# plt.plot(np.arange(0, 1, 0.001), np.quantile(restaurants['review_count'], np.arange(0, 1, 0.001)))

In [17]:
user_review_count = rreviews.groupby(['user_id'])['review_id'].count()
rusers_sub = rusers[rusers['user_id'].isin(user_review_count.index[user_review_count > 10])]

In [19]:
# Users leave more than 10 reviews
# Restaurants at least receive 114 (third quantile) reviews

# Subset restaurants
restaurants_sub = restaurants[restaurants['review_count'] > 114]
state_count = restaurants.groupby(['state'])['business_id'].count()
restaurants_sub = restaurants_sub[restaurants_sub['state'].isin(state_count.index[state_count > 5])]
rreviews_business = rreviews[rreviews['business_id'].isin(restaurants_sub['business_id'])]

# Subset Users
user_review_count = rreviews_business.groupby(['user_id'])['review_id'].count()
rusers_sub = rusers[rusers['user_id'].isin(user_review_count.index[user_review_count > 10])]

# Subset Reviews
rreviews_users = rreviews[rreviews['user_id'].isin(user_review_count.index[user_review_count>10])]
rreviews_union = rreviews.loc[set(rreviews_business.index).union(rreviews_users.index), ]
rreviews_inter = rreviews.loc[set(rreviews_business.index).intersection(rreviews_users.index), ]


# Subset tips
tips_business = tips[tips['business_id'].isin(restaurants_sub['business_id'])]
tips_users = tips[tips['user_id'].isin(rusers_sub['user_id'])]
tips_union = tips.loc[set(tips_business.index).union(set(tips_users.index)),]

print("Number of restaurants:  ", restaurants_sub.shape[0])
print("Number of users:  ", rusers_sub.shape[0])
print("Number of reviews:  ", rreviews_inter.shape[0])
print("Sparsity:  ", sparsity_score(rreviews_inter, rusers_sub, restaurants_sub))

Number of restaurants:   12665
Number of users:   60851
Number of reviews:   1674121
Sparsity:   0.0021722706300724864


## Preprocess Restaurants

### Separting Restaurants Attributes

In [20]:
attributes = defaultdict(int)
for i in restaurants_sub['attributes'].apply(lambda x: None if x is None else x.keys()):
    if i is not None:
        for k in i:
            attributes[k] += 1
attributes

defaultdict(int,
            {'RestaurantsTakeOut': 12563,
             'RestaurantsAttire': 12138,
             'GoodForKids': 12300,
             'BikeParking': 11958,
             'OutdoorSeating': 12506,
             'Ambience': 12348,
             'Caters': 12100,
             'RestaurantsReservations': 12393,
             'RestaurantsDelivery': 12550,
             'HasTV': 12348,
             'RestaurantsGoodForGroups': 12359,
             'BusinessAcceptsCreditCards': 11907,
             'NoiseLevel': 12220,
             'ByAppointmentOnly': 2101,
             'RestaurantsPriceRange2': 12581,
             'WiFi': 12406,
             'BusinessParking': 12605,
             'Alcohol': 12378,
             'GoodForMeal': 11241,
             'DogsAllowed': 6285,
             'Music': 2983,
             'BusinessAcceptsBitcoin': 2880,
             'GoodForDancing': 2337,
             'BestNights': 2701,
             'HappyHour': 6871,
             'RestaurantsTableService': 7768,
     

In [21]:
# Select attributes having more than 1000 restaurants

selected_attributes = [k for k in attributes if attributes[k] > 1000]
for at in selected_attributes:
    restaurants["{}_{}".format('attr', at)] = 0
    selected_idx = restaurants_sub['categories'].apply(lambda x: False if x is None else at in x)
    restaurants_sub.loc[selected_idx, "{}_{}".format('attr', at)] = 1

In [22]:
restaurants_sub.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,attr_BestNights,attr_HappyHour,attr_RestaurantsTableService,attr_WheelchairAccessible,attr_CoatCheck,attr_Smoking,attr_BYOBCorkage,attr_Corkage,attr_DriveThru,attr_BYOB
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,...,,,,,,,,,,
5,D4JtQNTI4X3KcbzacDJsMw,Bob Likes Thai Food,3755 Main St,Vancouver,BC,V5V,49.251342,-123.101333,3.5,169,...,,,,,,,,,,
13,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,Orlando,FL,32806,28.513265,-81.374707,4.5,135,...,,,,,,,,,,
29,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,Boston,MA,02128,42.363442,-71.025781,3.5,856,...,,,,,,,,,,
41,NRPemqVb4qpWFF0Avq_6OQ,Eurasia Sushi Bar & Seafood,"7101 W Hwy 71, Ste C-13",Austin,TX,78735,30.234533,-97.877262,4.5,395,...,,,,,,,,,,


### Separating Restaurants Categories

In [23]:
category_dict = defaultdict(int)
for cat in restaurants_sub['categories']:
    for c in cat.split(','):
        category_dict[c.strip()] += 1
sorted(category_dict.items(), key=lambda x: x[1], reverse = True)

[('Restaurants', 12665),
 ('Food', 3873),
 ('Nightlife', 3868),
 ('Bars', 3735),
 ('American (Traditional)', 2391),
 ('American (New)', 2326),
 ('Breakfast & Brunch', 1922),
 ('Sandwiches', 1536),
 ('Seafood', 1279),
 ('Italian', 1148),
 ('Pizza', 1140),
 ('Mexican', 1133),
 ('Burgers', 1046),
 ('Japanese', 985),
 ('Coffee & Tea', 934),
 ('Event Planning & Services', 886),
 ('Sushi Bars', 863),
 ('Cocktail Bars', 850),
 ('Salad', 833),
 ('Chinese', 751),
 ('Asian Fusion', 731),
 ('Desserts', 693),
 ('Cafes', 672),
 ('Vegetarian', 625),
 ('Pubs', 580),
 ('Beer', 560),
 ('Wine & Spirits', 560),
 ('Steakhouses', 557),
 ('Wine Bars', 536),
 ('Thai', 531),
 ('Specialty Food', 527),
 ('Gluten-Free', 524),
 ('Caterers', 516),
 ('Bakeries', 515),
 ('Mediterranean', 491),
 ('Sports Bars', 490),
 ('Barbeque', 476),
 ('Vegan', 450),
 ('Fast Food', 404),
 ('Diners', 380),
 ('Arts & Entertainment', 373),
 ('Lounges', 368),
 ('Vietnamese', 362),
 ('Soup', 346),
 ('Tex-Mex', 342),
 ('Southern', 331),

In [24]:
selected_categories = [x[0] for x in category_dict.items() if x[1] > 500 and x[0] != 'Restaurants']
selected_categories

for sc in selected_categories:
    col_name = re.sub(r"[^a-zA-Z0-9()]+", '_', sc.lower()).replace('(', '').replace(')', '')
    restaurants_sub["{}_{}".format('cat', col_name)] = 0
    selected_idx = restaurants_sub['categories'].apply(lambda x: False if x is None else sc in x)
    restaurants_sub.loc[selected_idx, "{}_{}".format('cat', col_name)] = 1
restaurants_sub.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,cat_pizza,cat_caterers,cat_pubs,cat_event_planning_services,cat_mexican,cat_steakhouses,cat_wine_bars,cat_specialty_food,cat_asian_fusion,cat_burgers
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,...,0,0,0,0,0,0,0,0,0,0
5,D4JtQNTI4X3KcbzacDJsMw,Bob Likes Thai Food,3755 Main St,Vancouver,BC,V5V,49.251342,-123.101333,3.5,169,...,0,0,0,0,0,0,0,0,0,0
13,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,Orlando,FL,32806,28.513265,-81.374707,4.5,135,...,0,0,0,0,0,0,0,0,0,0
29,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,Boston,MA,02128,42.363442,-71.025781,3.5,856,...,0,0,0,0,0,0,0,0,0,0
41,NRPemqVb4qpWFF0Avq_6OQ,Eurasia Sushi Bar & Seafood,"7101 W Hwy 71, Ste C-13",Austin,TX,78735,30.234533,-97.877262,4.5,395,...,0,0,0,0,0,0,0,0,0,0


In [25]:
restaurants_sub.to_csv('data/restaurants.csv', index = False)

## Preprocess Users

In [26]:
rusers_sub = rusers_sub.reset_index(drop=True).copy()
rusers_sub['elite'] = rusers_sub['elite'].apply(lambda x: 1 if len(x) > 0 else 0)

In [27]:
rusers_sub

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,q_QQ5kBBwlCcbL1s4NVK3g,Jane,1220,2005-03-14 20:26:35,15038,10030,11291,1,"xBDpTUbai0DXrvxCe3X16Q, 7GPNBO496aecrjJfW6UWtg...",1357,...,163,190,361,147,1212,5691,2541,2541,815,323
1,dIIKEfOgo0KqUfGQvGikPg,Gabi,2136,2007-08-10 19:01:51,21272,10289,18046,1,"XPzYf9_mwG2eXYP2BAGSTA, 2LooM5dcIk2o01nftYdPIg...",1025,...,87,94,232,96,1187,3293,2205,2205,472,294
2,JnPIjvC0cmooNDfsa9BmXg,Kat,987,2009-02-09 16:14:29,7234,4722,4035,1,"HQZPQhKMwRAyS6BCselVWQ, kP2U1s_sjQfHO9grxiyDTA...",420,...,129,93,219,90,1120,4510,1566,1566,391,326
3,37Hc8hr3cw0iHLoPzLK6Ow,Christine,495,2008-03-03 04:57:05,1577,727,1124,1,"-Q88pZUcrfN0BLBDp-bkAQ, etPn4Pv1Gc4cRZjRgB_BOw...",47,...,19,32,16,15,77,131,310,310,98,44
4,n-QwITZYrXlKQRiV30MqNg,Natasha,229,2008-06-25 14:53:17,476,101,140,1,"2ptwW5l68069vNtW8J-WOg, 4TVFuz8L4TsDtx_ObSin9g...",17,...,4,3,0,0,10,15,24,24,16,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60846,Xvik-lYFpaPnW8lpzHd89A,Karin,49,2011-06-18 21:57:49,9,2,4,0,38Nmkq0zyrFKqSrTyEhsVA,0,...,0,0,0,0,0,0,0,0,0,1
60847,1uMNrBqq9Au2w_xXN1HuPA,Graham Wellington,15,2016-10-31 20:09:44,19,14,0,0,,0,...,1,0,0,0,1,1,0,0,0,0
60848,pSSTYS9dkkqPGpcck5mFcQ,David,184,2016-12-05 00:03:34,102,20,52,0,,3,...,0,0,0,0,0,1,3,3,1,1
60849,fcl2-83fUbIvzMQCZKWKKA,Josh,54,2007-02-23 17:56:54,39,1,14,0,"lRLCZGMx--uxPI_9up5fGA, UlYXVV0Ls5UwJzUqt_m7mQ...",3,...,1,0,0,0,0,0,2,2,2,0


In [28]:
selected_users = set(rusers_sub['user_id'])
f = open('data/friend_network.txt', 'w')
for i in range(rusers_sub.shape[0]):
    uid = rusers_sub.loc[i, 'user_id'].strip()
    friends = rusers_sub.loc[i, 'friends']
    for fid in friends.split(','):
        if fid in selected_users:
            f.write("{} {}".format(uid, fid))
f.close()

In [29]:
rusers = rusers_sub.drop(['name', 'friends'], axis = 1)
rusers.to_csv("data/users.csv", index = False)

### Tips & Reviews

In [30]:
tips_union.to_csv("data/tips.csv", index = False)
rreviews_union.to_csv('data/reviews.csv', index = False)