In [2]:
import numpy as np
import pandas as pd
from sklearn import base
from sklearn.feature_extraction import DictVectorizer # for One_Hot_Encoder
import ast
from numpy.linalg import norm
from sklearn.pipeline import Pipeline, FeatureUnion
#import transformers
from scipy.sparse import coo_matrix
import heapq
import csv
import simplejson as json
from datetime import datetime
from sklearn.cross_validation import train_test_split



In [3]:
''' ----------------
Functions used in One_Hot_Encoder
----------------- '''
def Value_To_Dict(val):
    return {val:1}

def List_To_Dict(the_list):
    return {category:1 for category in the_list}
    
def Flatten_Dict(d, prekey = ''):
    flat_dict = {}
    for k, v in d.items():
        if isinstance(v, bool) and v:
            flat_dict.update({prekey+'_'+k:1})
        elif isinstance(v, str):
            flat_dict.update({prekey+'_'+k+'_'+v:1})
        elif isinstance(v, dict):
            flat_dict.update(Flatten_Dict(v, prekey=prekey+'_'+k))
    return flat_dict

def flatten(structure, key="", path="", flattened=None):
    if flattened is None:
        flattened = {}
    if type(structure) not in(dict, list):
        flattened[((path + "_") if path else "") + key] = structure
    elif isinstance(structure, list):
        for i, item in enumerate(structure):
            flatten(item, "%d" % i, path + "_" + key, flattened)
    else:
        for new_key, value in structure.items():
            flatten(value, new_key, path + "_" + key, flattened)
    return flattened
''' -------------------
Converts a feature column values into a One-Hot Encoding matrix. If
feature values are lists or (nested) dicts, a column for each list 
entry or dict (sub)key is created.
Inputs: colnames is a string of the column name
        value_type is the type (value, list or dict) of feature values
        sparse indicates whether the matrix is sparse
Dependencies: sklearn.feature_extraction.DictVectorizer
              sklearn.base
------------------- '''
class One_Hot_Encoder(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, colnames, value_type = 'value', sparse = True):
        if value_type == 'value':
            self.apply_function_ = Value_To_Dict
        elif value_type == 'list':
            self.apply_function_ = List_To_Dict
        elif value_type == 'dict':
            self.apply_function_ = flatten
        self.colnames_ = colnames
        self.dv_ = DictVectorizer(sparse = sparse)

    def fit(self, X, y = None):
        self.dv_.fit(X[self.colnames_].apply(self.apply_function_))
        return self

    def transform(self, X):
        return self.dv_.transform(X[self.colnames_].apply(self.apply_function_))






''' -------------------
Selects and returns the specified column(s)
Inputs: colnames is a list of column(s) to select
Dependencies: sklearn.base
------------------- '''
class Column_Selector(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, colnames):
        self.colnames_ = colnames

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        return pd.DataFrame(X[self.colnames_])


In [7]:
# from src.load_data import df_review

print '**Loading data...'

# LOAD DATA FOR TYPE = dataset_type
#fileheading = 'yelp_dataset_challenge/yelp_academic_dataset_'

def get_data(line, cols):
    d = json.loads(line)
    return dict((key, d[key]) for key in cols)
# 
# def CustomParser(data):
#     j1 = json.loads(data)
#     return j1

# Load business data
# cols = ('business_id', 'name')
# with open(fileheading + 'business.json') as f:
#     df_business = pd.DataFrame(get_data(line, cols) for line in f)
def string_to_dict(dict_string):
    # Convert to proper json format
#     print type(ast.literal_eval(dict_string))
#     print dict_string.__class__
    if dict_string is not np.nan:
        dict_string = ast.literal_eval(dict_string)
#     dict_string = dict_string.replace("[", "{").replace("]", "}").replace("'", "\"")
#     print ast.literal_eval(dict_string)
#     d = json.loads(dict_string)
#     dict = {}
#     print dict_string
# #     list = dict_string.split
#     d = json.loads(dict_string)
#     print dict_string.__class__
    return dict_string

df_business = pd.read_csv("C:/desktop/ml-project/res_city/sc/southcarolina.csv")
df_business = df_business.sort('business_id')
# print type(df_business['attributes'][0])
# df_business.join(df_business['attributes'].apply(json.loads).apply(pd.Series)) 
# df_business.attributes = df_business.attributes.astype(dict)
df_business.attributes = df_business.attributes.apply(string_to_dict)
print type(df_business['attributes'][0])
df_business.index = range(len(df_business))

# Load user data
# cols = ('user_id', 'name')
# with open(fileheading + 'user.json') as f:
#     df_user = pd.DataFrame(get_data(line, cols) for line in f)
df_user = pd.read_csv("C:/desktop/ml-project/res_city/sc/sc_user.csv")
df_user = df_user.sort('user_id')
df_user.index = range(len(df_user))

# Load review data
# cols = ('user_id', 'business_id', 'stars')
# with open(fileheading + 'review.json') as f:
#     df_review = pd.DataFrame(get_data(line, cols) for line in f)
df_review = pd.read_csv("C:/desktop/ml-project/res_city/sc/sc_review.csv")
data_load_time = datetime.now()
print 'Data was loaded at ' + data_load_time.time().isoformat()

# # Load data
# try:
#     data_load_time
# except NameError:
#     execfile('src/load_data.py')
# else:
#     print 'Data was loaded at ' + data_load_time.time().isoformat()

# Personalized recommendation for a specific user
user = 'iB4nSMuClUa3dgUMsPW7-w'

# ----------------
# CONTENT BASED FILTERING
# ----------------
print '*** Using Content-based Filtering for Recommendation ***'
print '** Initializing feature extraction for user ' + user

# Extract features of each business: category, attribute, average rating
OHE_cat = One_Hot_Encoder('categories', 'list', sparse=False)
OHE_attr= One_Hot_Encoder('attributes', 'dict', sparse=False)
OHE_city= One_Hot_Encoder('city', 'value', sparse=False)
rating = Column_Selector(['stars'])
# OHE_union = FeatureUnion([ ('cat', OHE_cat), ('attr', OHE_attr), ('city', OHE_city), ('rating', rating) ])
OHE_union = OHE_attr
OHE_union.fit(df_business)
print 'Done'

# Generate profile: weighted average of features for business she has reviewed
print '**Getting businesses...'
reviewed_businesses = df_review.ix[df_review.user_id == user]
reviewed_businesses['stars'] = reviewed_businesses['stars'] - float(df_user.average_stars[df_user.user_id == user])
idx_reviewed = [pd.Index(df_business.business_id).get_loc(b) for b in reviewed_businesses.business_id]

print '**Creating profile...'
features = OHE_union.transform(df_business.ix[idx_reviewed])
profile = np.matrix(reviewed_businesses.stars) * features
print 'Done'

# Given un-reviewed business, compute cosine similarity to user's profile
print '**Computing similarity to all businesses...'
idx_new = range(100) 
#[pd.Index(df_business.business_id).get_loc(b) for b in df_business.business_id if b not in reviewed_businesses.business_id]
features = OHE_union.transform(df_business.ix[idx_new])
similarity = np.asarray(profile * features.T) * 1./(norm(profile) * norm(features, axis = 1))
print 'Done'

# Output: recommend the most similar business
idx_recommendation = similarity.argmax()
print '\n**********'
print 'Hi ' + df_user.name[df_user.user_id == user].iget_value(0) + '!'
print 'We recommend you to visit ' + df_business.name[idx_recommendation] + ' located at '
print df_business.address[idx_recommendation]
print '**********'

## -------------------
## COLLABORATIVE FILTERING
## -------------------
print '*** Using Collaborative Filtering for Recommendation ***'

df_review['stars'] = df_review.groupby('business_id')['stars'].transform(lambda x : x - x.mean())

def get_idx(user_id): 
    global running_index
    running_index = running_index + 1
    return pd.Series(np.zeros(len(user_id)) + running_index) 
# For speed, get_idx assumes df_review and df_user contain the same users, and is fed in sorted order.
running_index = -1 
df_review['user_idx'] = df_review.groupby('user_id')['user_id'].transform(get_idx)

# Work in terms of sparse matrix
print '** Processing utility matrix...'

def convert_to_sparse(group):
    ratings = coo_matrix( (np.array(group['stars']), (np.array(group['user_idx']), np.zeros(len(group)))), 
                          shape = (len(df_user), 1) ).tocsc()
    return ratings / np.sqrt(float(ratings.T.dot(ratings).toarray()))

utility = df_review.groupby('business_id')[['stars', 'user_idx']].apply(convert_to_sparse) 

# Get top recommendatiokns
print '** Generating recommendations...'

def cosine_similarity(v1, v2):
    return float(v1.T.dot(v2).toarray())

def get_recommended_businesses(n, business_id):
    util_to_match = utility[utility.index == business_id]
    similarity = utility.apply(lambda x: cosine_similarity(util_to_match.values[0], x))
    similarity.sort(ascending=False)
    return similarity[1:(n+1)]

fav_business = df_review.business_id[ df_review.stars[ df_review.user_id == user ].argmax() ]

rec = pd.DataFrame(get_recommended_businesses(5, fav_business), columns=['similarity'])
rec['name'] = [ df_business.name[ df_business.business_id == business_id ].values[0] for business_id in rec.index]
print 'Done'

# Output recommendation
print 'Hi ' + df_user.name[df_user.user_id == user].values[0] + '!\nCheck out these businesses!'
for name in rec.name:
    print name

**Loading data...
<type 'list'>
Data was loaded at 10:19:39.018000
*** Using Content-based Filtering for Recommendation ***
** Initializing feature extraction for user iB4nSMuClUa3dgUMsPW7-w
Done
**Getting businesses...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


**Creating profile...
Done
**Computing similarity to all businesses...
Done

**********
Hi Jeff!
We recommend you to visit Los Aztecas located at 
100 Fort Mill Sq
**********
*** Using Collaborative Filtering for Recommendation ***




** Processing utility matrix...


ValueError: cannot convert float NaN to integer

In [8]:
pd.series()

AttributeError: 'module' object has no attribute 'series'

In [9]:
business = pd.read_csv("C:/desktop/ml-project/yelp_academic_dataset/csv files/business.csv")

In [28]:
business['city'].unique().shape[0]

674L

In [12]:
len(business)

48485

In [13]:
business.columns

Index([u'neighborhood', u'business_id', u'hours', u'is_open', u'address',
       u'attributes', u'categories', u'city', u'review_count', u'name',
       u'longitude', u'state', u'stars', u'latitude', u'postal_code', u'type'],
      dtype='object')

In [23]:
business['stars'].value_counts()

3.5    12177
4.0    12141
3.0     9036
4.5     5447
2.5     4757
2.0     2489
5.0     1282
1.5      873
1.0      283
Name: stars, dtype: int64

In [29]:
review = pd.read_csv("C:/desktop/ml-project/yelp_academic_dataset/csv files/yelp_academic_dataset_review.csv")

In [30]:
user= pd.read_csv("C:/desktop/ml-project/yelp_academic_dataset/csv files/yelp_academic_dataset_user.csv")

In [31]:
len(review)

4153150

In [32]:
review.columns

Index([u'funny', u'user_id', u'review_id', u'text', u'business_id', u'stars',
       u'date', u'useful', u'type', u'cool'],
      dtype='object')

In [34]:
review['user_id'].unique().shape[0]

1029432L

In [35]:
data = pd.merge(business, review, on = 'business_id', how = 'left')

In [36]:
newdata = pd.merge(data, user, on = 'user_id', how = 'left')

In [37]:
newdata.columns

Index([u'neighborhood', u'business_id', u'hours', u'is_open', u'address',
       u'attributes', u'categories', u'city', u'review_count_x', u'name_x',
       u'longitude', u'state', u'stars_x', u'latitude', u'postal_code',
       u'type_x', u'funny_x', u'user_id', u'review_id', u'text', u'stars_y',
       u'date', u'useful_x', u'type_y', u'cool_x', u'yelping_since',
       u'useful_y', u'compliment_photos', u'compliment_list',
       u'compliment_funny', u'compliment_plain', u'review_count_y', u'elite',
       u'fans', u'type', u'compliment_note', u'funny_y', u'compliment_writer',
       u'compliment_cute', u'average_stars', u'compliment_more', u'friends',
       u'compliment_hot', u'cool_y', u'name_y', u'compliment_profile',
       u'compliment_cool'],
      dtype='object')

In [43]:
newdata['review_id'].unique().shape[0]

2540795L

In [44]:
df_business = pd.read_csv("C:/desktop/ml-project/res_city/sc/southcarolina.csv")

In [47]:
df_business['business_id'].unique().shape[0]

181L

In [49]:
newdata['attributes'].unique()

array([ '[\'Alcohol: none\', "Ambience: {\'romantic\': False, \'intimate\': False, \'classy\': False, \'hipster\': False, \'touristy\': False, \'trendy\': False, \'upscale\': False, \'casual\': False}", \'BikeParking: True\', \'BusinessAcceptsCreditCards: True\', "BusinessParking: {\'garage\': False, \'street\': False, \'validated\': False, \'lot\': False, \'valet\': False}", \'Caters: True\', \'GoodForKids: True\', "GoodForMeal: {\'dessert\': False, \'latenight\': False, \'lunch\': False, \'dinner\': False, \'breakfast\': False, \'brunch\': False}", \'HasTV: True\', \'NoiseLevel: quiet\', \'OutdoorSeating: False\', \'RestaurantsAttire: casual\', \'RestaurantsDelivery: True\', \'RestaurantsGoodForGroups: True\', \'RestaurantsPriceRange2: 1\', \'RestaurantsReservations: False\', \'RestaurantsTableService: False\', \'RestaurantsTakeOut: True\', \'WiFi: free\']',
       '[\'Alcohol: none\', "Ambience: {\'romantic\': False, \'intimate\': False, \'classy\': False, \'hipster\': False, \'dive