In [1]:
%pylab inline
import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from sklearn.pipeline import FeatureUnion
import simplejson as json
from scipy.sparse import coo_matrix
from datetime import datetime

#### Helper Function

In [4]:
from sklearn import base
from sklearn.feature_extraction import DictVectorizer # for One_Hot_Encoder

def Value_To_Dict(val):
    return {val:1}

def List_To_Dict(the_list):
    return {category:1 for category in the_list}
    
def Flatten_Dict(d, prekey = ''):
    flat_dict = {}
    for key in d:
        if isinstance(d[key], bool) and d[key]:
            flat_dict.update({prekey+'_'+key:1})
        elif isinstance(d[key], str):
            flat_dict.update({prekey+'_'+key+'_'+d[key]:1})
        elif isinstance(d[key], dict):
            flat_dict.update(Flatten_Dict(d[key], prekey=prekey+'_'+key))
    return flat_dict

class One_Hot_Encoder(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, colnames, value_type = 'value', sparse = True):
        if value_type == 'value':
            self.apply_function_ = Value_To_Dict
        elif value_type == 'list':
            self.apply_function_ = List_To_Dict
        elif value_type == 'dict':
            self.apply_function_ = Flatten_Dict
        self.colnames_ = colnames
        self.dv_ = DictVectorizer(sparse = sparse)

    def fit(self, X, y = None):
        self.dv_.fit(X[self.colnames_].apply(self.apply_function_))
        return self

    def transform(self, X):
        return self.dv_.transform(X[self.colnames_].apply(self.apply_function_))
class Column_Selector(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, colnames):
        self.colnames_ = colnames

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        return pd.DataFrame(X[self.colnames_])

#### Data Setup

In [5]:
# Loading the dataset
# Loading the reviews dataset
reviews_frame = pd.read_csv('data/filtered_review.csv')

# Loading business and user dataset
def get_data(line, columns):
    d = json.loads(line)
    return dict((key, d[key]) for key in columns)

# Loading user data
columns = ('user_id', 'name','average_stars')
with open('input/yelp_academic_dataset_user.json', 'rb') as f:
    user_frame = pd.DataFrame(get_data(line, columns) for line in f)
user_frame = user_frame.sort_values('user_id')

# Loading business data
columns = ('business_id', 'name','categories','attributes','city','stars')
with open('input/yelp_academic_dataset_business.json', 'rb') as f:
    business = pd.DataFrame(get_data(line, columns) for line in f)

business = business.sort_values('business_id')

# Trimming the dataset by city
business_by_city = business['city'] == "Pittsburgh"
business = business[business_by_city]

# Trimming the dataset by the category 'Restaurants'
business = business[business.categories.notnull()]
business = business[business["categories"].apply(str).str.contains("Restaurants")]

business = business[business.attributes.notnull()]
business_frame = business

print (len(business_frame))

2256


In [6]:
reviews_frame.head()

Unnamed: 0,business_id,stars,user_id,text,name,categories,attributes,city
0,YO8sWa0wYChH6DQWnE6NFg,4,djSJ6a9gsLSdj-7BoyNNQA,You're not gonna find a much better breakfast ...,P&G's Pamela's Diner,"Restaurants, Breakfast & Brunch","{'Alcohol': 'none', 'Ambience': ""{'romantic': ...",Pittsburgh
1,YO8sWa0wYChH6DQWnE6NFg,4,Tj58A0_D-WsD-UirmpYSEQ,"It's true. The pancakes here are incredible, ...",P&G's Pamela's Diner,"Restaurants, Breakfast & Brunch","{'Alcohol': 'none', 'Ambience': ""{'romantic': ...",Pittsburgh
2,YO8sWa0wYChH6DQWnE6NFg,1,lwCt_LgGAbPQpNRpaDR_Fg,Pamela's is not anything great (heavy and grea...,P&G's Pamela's Diner,"Restaurants, Breakfast & Brunch","{'Alcohol': 'none', 'Ambience': ""{'romantic': ...",Pittsburgh
3,YO8sWa0wYChH6DQWnE6NFg,3,Figs8mO8s_aPRq2W7W0cYQ,I've been here a couple of times for brunch. I...,P&G's Pamela's Diner,"Restaurants, Breakfast & Brunch","{'Alcohol': 'none', 'Ambience': ""{'romantic': ...",Pittsburgh
4,YO8sWa0wYChH6DQWnE6NFg,4,iRRjcenJiFkGO92I3RvW-A,Everything was great except the coffee. I had ...,P&G's Pamela's Diner,"Restaurants, Breakfast & Brunch","{'Alcohol': 'none', 'Ambience': ""{'romantic': ...",Pittsburgh


#### Feature Extraction

In [7]:
encoding_category = One_Hot_Encoder('categories', 'list', sparse=False)
encoding_attribute = One_Hot_Encoder('attributes', 'dict', sparse=False)
encoding_city= One_Hot_Encoder('city', 'value', sparse=False)
rating = Column_Selector(['stars'])
encoding_union = FeatureUnion([ ('cat', encoding_category),('attr', encoding_attribute),('city', encoding_city), ('rating', rating) ])
encoding_union.fit(business_frame)

FeatureUnion(n_jobs=1,
       transformer_list=[('cat', One_Hot_Encoder(colnames=None, sparse=None, value_type=None)), ('attr', One_Hot_Encoder(colnames=None, sparse=None, value_type=None)), ('city', One_Hot_Encoder(colnames=None, sparse=None, value_type=None)), ('rating', Column_Selector(colnames=None))],
       transformer_weights=None)

#### Building Matrix based on user's review history

In [8]:
user = 'djSJ6a9gsLSdj-7BoyNNQA'

reviews_given_by_user = reviews_frame.ix[reviews_frame.user_id == user]
reviews_given_by_user['stars'] = reviews_given_by_user['stars'] - float(user_frame.average_stars[user_frame.user_id == user])
reviews_given_by_user = reviews_given_by_user.sort_values('business_id')

# list of ids of the businesses reviewed by the user
reviewed_business_id_list = reviews_given_by_user['business_id'].tolist()
reviewed_business = business_frame[business_frame['business_id'].isin(reviewed_business_id_list)]
reviewed_business = reviewed_business.sort_values('business_id')

features = encoding_union.transform(reviewed_business)
profile = np.matrix(reviews_given_by_user.stars) * features

#### Calculating cosine similarity of the unreviewed reviews with the user's profile

In [9]:

# test_frame = business_frame[0:1000]
test_frame = business_frame
test_frame = test_frame.sort_values('business_id')
business_id_list = test_frame['business_id'].tolist()
features = encoding_union.transform(test_frame)
similarity = np.asarray(profile * features.T) * 1./(norm(profile) * norm(features, axis = 1))

In [10]:
similarity

array([[-0.81176734, -0.64844794, -0.63335977, ..., -0.56421303,
        -0.75105149, -0.73183188]])

#### Output the recommended restaurants

In [11]:
index_arr = (-similarity).argsort()[:10][0][0:10]
print ('Hi ' + user_frame.name[user_frame.user_id == user].values[0] + '\nCheck out these restaurants: ')
for i in index_arr:
    resturant = business_frame[business_frame.business_id == business_id_list[i]]
    print (str(resturant['name'].values[0]))

Hi Mackenzie
Check out these restaurants: 
Phong May House
Little Caesars
Vinny's Pizza
Fu Lai Chinese Restaurant
Little Caesars Pizza
Ron's Pizza Palace
Papa John's Pizza
Chubby's Pizza & Hoagies Northside
Hong Kong Taste
Vincenzos Pizza & Pasta
