In [1]:
%matplotlib inline
import pandas as pd
import simplejson as json  # faster json parsing
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
with open('yelp_academic_dataset_user.json') as f:
    user_data = pd.DataFrame(json.loads(line) for line in f)

with open('yelp_academic_dataset_review.json') as f:
    review_data = pd.DataFrame(json.loads(line) for line in f)

with open('yelp_academic_dataset_business.json') as f:
    business_data = pd.DataFrame(json.loads(line) for line in f)

In [105]:
# for each user, produce a matrix of all numeric values
# collapse elite to total counts
# collapse compliments to a sum
# collapse votes to a sum
# collapse yelping since to a integer (# of days old)
# keep fans as-is
# hold out average stars and review count - those will be used for ground truth
elite = [len(x) for x in user_data['elite']]
compliments = [sum(x.values()) for x in user_data['compliments']]
votes = [sum(x.values()) for x in user_data['votes']]
account_age = [(datetime.datetime.now() - x).days for x in pd.to_datetime(user_data['yelping_since'])]
user_df = pd.DataFrame(zip(*[user_data.user_id, elite, compliments, votes, account_age, user_data.fans]))
user_df.columns = ['user_id', 'elite', 'compliments', 'votes', 'account_age', 'fans']
user_df = user_df.set_index('user_id')

In [38]:
# now we can bring in their review data on a per-business basis. First, we need a list of all possible business types.
len(set([frozenset(x) for x in business_data['categories']]))

10116

In [54]:
# This is far too many! This is due to multiple categories for a business. How many categories do we have?
import itertools
categories = set(itertools.chain.from_iterable(business_data['categories']))
print len(categories)

1017


In [56]:
# 1017 categories is still too many. Another problem is that we don't know the exact hierarchy of categories.
# for example, there are places that have both the terms 'cafes' and 'restaurants'
# we also have a decision to make here regarding granularity - the more categories we allow, the fewer reviews we
# have to use as features.

# To derive the hierarchy, maybe I can create a ordered list by count, then pick the most common for each business

In [69]:
from collections import Counter
category_counts = Counter(itertools.chain.from_iterable(business_data['categories']))
category_counts.most_common(20)

[('Restaurants', 26729),
 ('Shopping', 12444),
 ('Food', 10143),
 ('Beauty & Spas', 7490),
 ('Health & Medical', 6106),
 ('Home Services', 5866),
 ('Nightlife', 5507),
 ('Automotive', 4888),
 ('Bars', 4727),
 ('Local Services', 4041),
 ('Active Life', 3455),
 ('Fashion', 3395),
 ('Event Planning & Services', 3237),
 ('Fast Food', 3154),
 ('Pizza', 2881),
 ('Mexican', 2705),
 ('Hotels & Travel', 2673),
 ('Sandwiches', 2666),
 ('American (Traditional)', 2608),
 ('Arts & Entertainment', 2447)]

In [80]:
def find_best(cat_list):
    counts = [[x, category_counts[x]] for x in cat_list]
    return sorted(counts, key=lambda (cat, count): -count)[0][0]

categories = [find_best(cat_list) if len(cat_list) > 0 else None for cat_list in business_data['categories']]
print len(set(categories))
print set(categories)

22
set(['Beauty & Spas', 'Arts & Entertainment', 'Pets', 'Home Services', 'Shopping', 'Food', 'Automotive', 'Religious Organizations', 'Local Flavor', 'Hotels & Travel', None, 'Local Services', 'Nightlife', 'Restaurants', 'Active Life', 'Public Services & Government', 'Health & Medical', 'Financial Services', 'Mass Media', 'Professional Services', 'Education', 'Event Planning & Services'])


In [91]:
# alright, we are getting somewhere!
# now we assign these categories to each business, and discard those with no categories
business_df = pd.DataFrame(zip(*[business_data.business_id, categories]))
business_df.columns = ['business_id', 'category']
business_df = business_df[~business_df.category.isnull()]
business_df = business_df.set_index('business_id')
# we leave out the star rating for now, as that will be part of our evaluation criteria

In [159]:
# the final set of features we need is to construct a DataFrame containing is the average scores a 
# user assigns to each category, the variance of these assignments, the total # of reviews in that category a 
# user produces, and the total # of votes they received in that category.
# the structure of this dataframe will be in a flat format:
#
# user_id    business_id     mu     var     total    total_votes
#   1             A          1       0        2         0
#

d = []
# use these columns as the key to a dictionary and begin populating
for user_id, user_reviews in review_data.groupby('user_id'):
    for business_id, business_reviews in user_reviews.groupby('business_id'):
        try:
            category = business_df.ix[business_id].category
        except KeyError:
            continue  # some businesses are not in the database it seems
        if category == None:
            continue
        mu = np.mean(user_reviews.stars)
        var = np.var(user_reviews.stars)
        total = len(user_reviews)
        total_votes = sum([sum(x.values()) for x in user_reviews.votes])
        d.append([user_id, business_id, mu, var, total, total_votes])

df = pd.DataFrame(d)
df.columns = ['user_id', 'business_id', 'mu', 'var', 'total', 'total_votes']

In [165]:
df.head()

Unnamed: 0,user_id,business_id,mu,var,total,total_votes
0,---teJGnwK07UO6_oJfbRw,0lWQSXID3T7K3DbjX7CpqQ,1.0,0.0,1,0
1,--0HEXd4W6bJI8k7E0RxTA,ZvvTPVqChi-mQd1JV6VM5w,5.0,0.0,2,3
2,--0HEXd4W6bJI8k7E0RxTA,bcBMAa0UQpNLFvvdZ4dxtQ,5.0,0.0,2,3
3,--0KsjlAThNWua2Pr4HStQ,DoKUOUwAsWrlRY6ehzQV_w,4.166667,0.472222,6,25
4,--0KsjlAThNWua2Pr4HStQ,NjHwPn2d3TL_xxnheXoSmw,4.166667,0.472222,6,25
