In [2]:
import json
import pandas as pd
import datetime

In [None]:
def load_json_lines(path):
    """Read a newline-delimited JSON file into a DataFrame.

    Every non-blank line of ``path`` is parsed with ``json.loads`` and becomes
    one row of the returned frame. Blank lines (for example a trailing empty
    line at the end of the file) are skipped instead of raising a JSON decode
    error.
    """
    with open(path) as f:
        return pd.DataFrame([json.loads(line) for line in f if line.strip()])


# One frame per dataset file; all four go through the same loader so a fix to
# the parsing logic only has to be made once.
business_data = load_json_lines('data/yelp_academic_dataset_business.json')
user_data = load_json_lines('data/yelp_academic_dataset_user.json')
review_data = load_json_lines('data/yelp_academic_dataset_review.json')
tip_data = load_json_lines('data/yelp_academic_dataset_tip.json')

In [None]:
# First generate the common portion of the dataframe, from user data:
#   * total number of reviews per user is kept as is
#   * elite is collapsed into a count (the length of each user's `elite` entry)
#   * number of fans is kept as is
#   * yelping_since is collapsed into an integer (account age in days)

elite = [len(elite_entry) for elite_entry in user_data['elite']]

# Take one reference timestamp up front so every account age is measured
# against the same instant instead of calling now() once per user.
# NOTE: the ages still depend on the day the notebook is run.
reference_time = datetime.datetime.now()
account_age = [(reference_time - since).days
               for since in pd.to_datetime(user_data['yelping_since'])]

In [None]:
# Look for "profile" in compliments - This represents profile likes
# Look for "useful" in votes - This represents the number of reviews the user has found useful

compliments = user_data['compliments']
votes = user_data['votes']

# Iterate over the values themselves rather than indexing by range(len(...)):
# label-based lookups like compliments[ind] only work while the frame keeps
# its default 0..n-1 index. dict.get supplies 0 when the key is absent.
profile = [entry.get('profile', 0) for entry in compliments]
useful_votes = [entry.get('useful', 0) for entry in votes]

In [None]:
# Look at tip data to get, per user:
#   * total number of tips
#   * total number of tip likes
grouped_tips_users = tip_data.groupby('user_id')

# Aggregate each quantity in a single vectorised pass over the groups instead
# of looping over every group in Python once per quantity. Both results are
# indexed by user_id, so they line up when combined into one frame.
gen_tip_df = pd.DataFrame(
    {
        'num_tips': grouped_tips_users.size(),
        'num_likes': grouped_tips_users['likes'].sum(),
    },
    columns=['num_tips', 'num_likes'],
)
gen_tip_df.index.name = 'user_id'

# Plain-list views of the same data, kept under their original names in case
# other cells still refer to them.
user_ids = gen_tip_df.index.tolist()
num_tips = gen_tip_df['num_tips'].tolist()
num_likes = gen_tip_df['num_likes'].tolist()

gen_tip_df.head()

In [None]:
# Assemble the per-user feature table column by column. Building it from named
# columns (rather than zipping everything into row tuples and renaming
# afterwards) keeps each column's dtype, works the same on Python 2 and 3, and
# fails loudly if any of the inputs has a different length instead of silently
# truncating to the shortest one.
user_df = pd.DataFrame(
    {
        'user_id': user_data['user_id'].values,
        'review_count': user_data['review_count'].values,
        'elite': elite,
        'account_age': account_age,
        'fans': user_data['fans'].values,
        'profile': profile,
        'useful_votes': useful_votes,
    },
    columns=['user_id', 'review_count', 'elite', 'account_age', 'fans',
             'profile', 'useful_votes'],
).set_index('user_id')

user_df.head()

In [None]:
# Outer-join the user features with the per-user tip aggregates on their
# shared user_id index, then replace every value missing on either side
# with 0.
common_df = user_df.merge(
    gen_tip_df, how='outer', left_index=True, right_index=True
).fillna(0)
common_df.head()

In [None]:
import itertools
from collections import Counter

# Flatten every business's category list into one stream of labels and tally
# how often each label occurs across all businesses.
all_category_labels = itertools.chain.from_iterable(business_data['categories'])
category_counts = Counter(all_category_labels)

# The 20 most frequent labels (only rendered when this is the last expression
# evaluated in its cell).
category_counts.most_common(20)

def find_best(cat_list, counts=None):
    """Return the label in ``cat_list`` with the highest overall count.

    Parameters
    ----------
    cat_list : iterable of str
        Category labels attached to a single business.
    counts : mapping, optional
        Maps a label to its occurrence count. Defaults to the notebook-level
        ``category_counts``. Labels missing from the mapping count as 0.

    Returns
    -------
    The label with the largest count, or ``None`` when ``cat_list`` is empty.
    Ties are resolved in favour of the label that appears first in
    ``cat_list``.
    """
    if counts is None:
        counts = category_counts
    cat_list = list(cat_list)
    if not cat_list:
        return None
    # max() returns the first maximal element, which matches what a stable
    # descending sort followed by [0] would pick, without sorting everything.
    return max(cat_list, key=lambda cat: counts.get(cat, 0))

# Reduce each business to a single category chosen by find_best; businesses
# that list no categories at all get None.
categories = [find_best(cat_list) if len(cat_list) > 0 else None
              for cat_list in business_data['categories']]

# Quick look at how many distinct categories remain, and which ones.
# print(...) with a single argument behaves the same on Python 2 and 3.
distinct_categories = set(categories)
print(len(distinct_categories))
print(distinct_categories)

# Build the business_id -> category table from named columns.
business_df = pd.DataFrame(
    {
        'business_id': business_data['business_id'].values,
        'category': categories,
    },
    columns=['business_id', 'category'],
)
# Drop businesses that ended up without a category, then key by business_id.
business_df = business_df[business_df['category'].notnull()]
business_df = business_df.set_index('business_id')

In [None]:
def add_category(records, categories=None):
    """Attach each business's category to ``records``.

    Parameters
    ----------
    records : DataFrame
        Must contain a ``business_id`` column.
    categories : DataFrame, optional
        Indexed by business id. Defaults to the notebook-level
        ``business_df``.

    Returns
    -------
    DataFrame
        ``records`` right-joined onto ``categories`` via ``business_id``, with
        every missing value replaced by 0.
    """
    if categories is None:
        categories = business_df
    # NOTE(review): how='right' keeps every row of `categories`, so a business
    # with no matching row in `records` still yields one row whose `records`
    # columns are all filled with 0. Confirm this is intended rather than
    # how='inner'.
    merged = pd.merge(records, categories, left_on='business_id',
                      right_index=True, how='right')
    return merged.fillna(0)


# Add categories to business data
business_data_aug = add_category(business_data)

# Add categories to tip data
tip_data_aug = add_category(tip_data)

# Add categories to review data
review_data_aug = add_category(review_data)
