In [1]:
import numpy as np
import pandas as pd
import random

from collections import Counter
from scipy.sparse import csr_matrix
import scipy.sparse

import sklearn
from sklearn.decomposition import TruncatedSVD

In [13]:
# load user and business data

r = pd.read_csv("review_r_and_f", index_col="review_id")
print(r.shape, len(r.index))

b = pd.read_csv("business_r_and_f.csv", index_col="business_id")
print(b.shape, len(b.index))

(2314623, 8) 2314623
(26945, 183) 26945


In [18]:
# map 'city' column from business to review
r['city'] = r['business_id'].map(b['city'])
r.columns

Index(['business_id', 'cool', 'date', 'funny', 'stars', 'text', 'useful',
       'user_id', 'city'],
      dtype='object')

In [19]:
# drop irrelevant columns
r.drop(['cool', 'date', 'funny', 'text', 'useful'], axis=1, inplace=True)
r.head()

Unnamed: 0_level_0,business_id,stars,user_id,city
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
x7mDIiDB3jEiPGPHOmDzyw,iCQpiavjjPzJ5_3gPD5Ebg,2,msQe1u7Z_XuqjGoqhB0J5g,Las Vegas
dDl8zu1vWPdKGihJrwQbpw,pomGBqfbxcqPv14c3XH-ZQ,5,msQe1u7Z_XuqjGoqhB0J5g,Las Vegas
LZp4UX5zK3e-c5ZGSeo3kA,jtQARsP6P-LbkyjbO1qNGg,1,msQe1u7Z_XuqjGoqhB0J5g,Las Vegas
Er4NBWCmCD4nM8_p1GRdow,elqbBhBfElMNSrjFqW3now,2,msQe1u7Z_XuqjGoqhB0J5g,Las Vegas
jsDu6QEJHbwP2Blom1PLCA,Ums3gaP2qM3W1XcA5r6SsQ,5,msQe1u7Z_XuqjGoqhB0J5g,Las Vegas


In [20]:
# reorder columns
col = ['business_id', 'user_id', 'stars', 'city']
r = r[col]
r.head()

Unnamed: 0_level_0,business_id,user_id,stars,city
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
x7mDIiDB3jEiPGPHOmDzyw,iCQpiavjjPzJ5_3gPD5Ebg,msQe1u7Z_XuqjGoqhB0J5g,2,Las Vegas
dDl8zu1vWPdKGihJrwQbpw,pomGBqfbxcqPv14c3XH-ZQ,msQe1u7Z_XuqjGoqhB0J5g,5,Las Vegas
LZp4UX5zK3e-c5ZGSeo3kA,jtQARsP6P-LbkyjbO1qNGg,msQe1u7Z_XuqjGoqhB0J5g,1,Las Vegas
Er4NBWCmCD4nM8_p1GRdow,elqbBhBfElMNSrjFqW3now,msQe1u7Z_XuqjGoqhB0J5g,2,Las Vegas
jsDu6QEJHbwP2Blom1PLCA,Ums3gaP2qM3W1XcA5r6SsQ,msQe1u7Z_XuqjGoqhB0J5g,5,Las Vegas


In [51]:
r.groupby('business_id')['stars'].count().sort_values(ascending=True).head()

business_id
qj4KHXGgwyutHpvWRLI87g    1
-LG_SXZrWLTnfSXRPZQvzA    2
EACF3CoVCrTkF1xvoaIlpw    3
FJDfjRKQU4vLOrZhtFz62A    3
dA1aI3CqagvBQceLhFUJDg    3
Name: stars, dtype: int64

In [50]:
r.groupby('stars')['business_id'].count().sort_values(ascending=False)

stars
5    923879
4    611659
3    312266
1    255307
2    211512
Name: business_id, dtype: int64

In [48]:
print(r.business_id.nunique(), r.user_id.nunique())

26945 712070


In [52]:
print(r[r['city'] == 'Las Vegas'].business_id.nunique(), r[r['city'] == 'Las Vegas'].user_id.nunique())

7573 393985


In [56]:
print(r[r['city'] == 'Phoenix'].business_id.nunique(), r[r['city'] == 'Phoenix'].user_id.nunique())

4697 147098


In [57]:
print(r[r['city'] == 'Scottsdale'].business_id.nunique(), r[r['city'] == 'Scottsdale'].user_id.nunique())

1847 90499


In [58]:
print(r[r['city'] == 'Charlotte'].business_id.nunique(), r[r['city'] == 'Charlotte'].user_id.nunique())

3253 62917


In [59]:
print(r[r['city'] == 'Toronto'].business_id.nunique(), r[r['city'] == 'Toronto'].user_id.nunique())

9575 84361


In [215]:
# save
ub_ph.to_csv('/data/ub_ph')

#### Try excluding businesses with less than 5 reviews

In [143]:
# business ratings count
bratings = r.groupby('business_id').size().sort_values(ascending=False)
sum(bratings > 5)/r.business_id.nunique()

0.8410094637223975

In [144]:
sum(bratings > 10)/r.business_id.nunique()

0.7002041195026907

In [172]:
for c in cities:
    brat = r[r['city'] == c].groupby('business_id').size().sort_values(ascending=False)
    print(c, ":", sum(brat > 4)/r[r['city'] == c].business_id.nunique())

Las Vegas : 0.9144328535586954
Phoenix : 0.8986587183308494
Scottsdale : 0.9339469409853817
Charlotte : 0.8874884721795266
Toronto : 0.8370757180156658


In [180]:
for c in cities:
    brat = r[r['city'] == c].groupby('business_id').size().sort_values(ascending=False)
    print(c, ":", sum(brat > 4))

Las Vegas : 6925
Phoenix : 4221
Scottsdale : 1725
Charlotte : 2887
Toronto : 8015


In [158]:
for c in cities:
    brat = r[r['city'] == c].groupby('business_id').size().sort_values(ascending=False)
    print(c, ":", sum(brat > 9)/r[r['city'] == c].business_id.nunique())

Las Vegas : 0.800475373035785
Phoenix : 0.7462209921226315
Scottsdale : 0.836491608012994
Charlotte : 0.7239471257300953
Toronto : 0.6306005221932115


#### Try excluding users with only 1 review

In [149]:
# user ratings count
uratings = r.groupby('user_id').size().sort_values(ascending=False)
sum(uratings > 1)/r.user_id.nunique()

0.4258345387391689

In [155]:
cities = ['Las Vegas', 'Phoenix', 'Scottsdale', 'Charlotte', 'Toronto']

for c in cities:
    urat = r[r['city'] == c].groupby('user_id').size().sort_values(ascending=False)
    print(c, ":", sum(urat > 1)/r[r['city'] == c].user_id.nunique())

Las Vegas : 0.4014315265809612
Phoenix : 0.38180668669866347
Scottsdale : 0.34784914750439233
Charlotte : 0.3915952763164169
Toronto : 0.48594729792202557


#### Try excluding users with no positive stars

In [196]:
# users with no positive rating
u_negative = list(set(r['user_id']) - set(r[r['stars'] < 4].groupby('user_id').size().sort_values(ascending=False).index))
len(u_negative)

366811

In [201]:
u_lv_ph_positive = r[~(r['user_id'].isin(u_negative)) & (r['city'].str.contains('Las Vegas|Phoenix'))]
print(u_lv_ph_positive.business_id.nunique(), u_lv_ph_positive.user_id.nunique())

12229 254312


#### Try excluding users who haven't been to both cities

In [27]:
u_lv = Counter([u for u in r['user_id'][r['city'] == 'Las Vegas']])
u_ph = Counter([u for u in r['user_id'][r['city'] == 'Phoenix']])

In [28]:
# get intersection of users in Las Vegas and Phoenix
u_lv_and_ph = list(set([u for u in u_lv.keys()]) & set([u for u in u_ph.keys()]))
len(u_lv_and_ph)

15127

In [29]:
u_lv_ph = r[(r['user_id'].isin(u_lv_and_ph)) & (r['city'].str.contains('Las Vegas|Phoenix'))]
print(u_lv_ph.business_id.nunique(), u_lv_ph.user_id.nunique())

10238 15127


In [203]:
# user business matrix excluding users who has left no positive star
#ub_lv = r[(r['business_id'].isin(bratings[bratings > 4].index)) & (r['city'] == 'Las Vegas')].pivot_table(
#values='stars', index='business_id', columns='user_id', fill_value=0)

# user business matrix excluding users who has only been to one of the city
ub_lv = u_lv_ph.pivot_table(values='stars', index='business_id', columns='user_id', fill_value=0)

ub_lv.head()

user_id,--3WaS23LcIXtxyFULJHTA,--41c9Tl0C9OGewIR7Qyzg,--4q8EyqThydQm-eKZpS-A,--CIuK7sUpaNzalLAlHJKA,--ty7Z9fEt08E3dS3_qoSA,-0AvEZuSWLtUpasFb4fmfg,-0IiMAZI2SsQ7VmyzJjokQ,-0_qOZXT_xwI7RdGZEAynA,-0aInSHjCWLfiNqfgmWnow,-0udWcFQEt2M8kM3xcIofw,...,zwpP3Lt8USKS91Ds52G8UA,zx26BmC67gLmQrAgY-nT1g,zx3y74_pvIRuQSVIrgzCew,zx6limw6-cG5APUL6LCDVA,zxCOwzZmH1s0KbQbhPIdBw,zxMHVPoMWY5myDNWYAKt8g,zy5vPwlDT95fMMF4c7bRkg,zy7D8MZ8NwXO8uDraQby8g,zzOd64i66_Z5roGjXmwCXA,zzaq5Fn1U2Feut3dGxidNg
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--9e1ONYQuAa-CB_Rrw7Tw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--g-a85VwrdZJNf0R95GcQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-050d_XIor1NpCuWkbIVaQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-0BxAGlIk5DJAGVkpqBXxg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-0WegMt6Cy966qlDKhu6jA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [206]:
# ugh named this wrong; but should really be ub_lv_ph
ub_lv_ph = ub_lv.copy()
ub_lv_ph.shape

(10238, 15127)

In [216]:
# save
ub_lv_ph.to_csv('data/ub_lv_ph')

In [63]:
# build a user-business matrix
ub_ph = r[r['city'] == 'Phoenix'].pivot_table(values='stars', index='business_id', columns='user_id', fill_value=0)
ub_ph.head()

user_id,---94vtJ_5o_nikEs6hUjg,---cu1hq55BP9DWVXXKHZg,--2HUmLkcNHZp0xw6AMBPg,--3WaS23LcIXtxyFULJHTA,--41c9Tl0C9OGewIR7Qyzg,--4q8EyqThydQm-eKZpS-A,--4rAAfZnEIAKJE80aIiYg,--5XzJ2pRsVVJiJUfzZlgQ,--BIMVqCxdg5AbJu-ysD8g,--CIuK7sUpaNzalLAlHJKA,...,zzcRUD3kTXhHQXJIhxlKoQ,zzhOHf6bPcrFkzyWKTyp0g,zzitZPIJBmyxt8DefrJygg,zzjQuRnY_Kgr_zaw6A-ACQ,zzkgqTZLZxLdIRtMBOUSeQ,zzmbLv7WZPxhGbpk0BdPPQ,zznZTSALzMXf3cjggfNdBQ,zzpV8L0FOeyejQwomFSDPA,zzzAgDDWe4PTnJuwhi3XQQ,zzzTrQ2bkvteEngjkXCTAA
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--g-a85VwrdZJNf0R95GcQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-050d_XIor1NpCuWkbIVaQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-0WegMt6Cy966qlDKhu6jA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-0aIra_B6iALlfqAriBSYA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-0tgMGl7D9B10YjSN2ujLA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Inspect user data

In [160]:
u.head()

Unnamed: 0_level_0,name,review_count,average_stars,yelping_since,friends,fans,cool,elite,funny,useful,...,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,friend_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
lzlZwIpuSWXEnNS91wxjHw,Susan,1,2.0,2015-09-28,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
XvLBr-9smbI0m_a7dXtB7w,Daipayan,2,5.0,2015-09-05,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
QPT4Ud4H5sJVr68yXhoWFw,Andy,1,4.0,2016-07-21,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
s4FoIXE_LSGviTHBe8dmcg,Shashank,3,3.0,2017-06-18,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
nnB0AE1Cxp_0154xkhXelw,Mike,11,2.77,2014-07-11,,1,0,,0,8,...,0,0,0,0,0,0,0,0,0,0


In [161]:
u[u['review_count'] == 0]

Unnamed: 0_level_0,name,review_count,average_stars,yelping_since,friends,fans,cool,elite,funny,useful,...,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,friend_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
zf_GuCZfW4PzaCZeQmnfLw,marathonm,0,4.09,2007-08-01,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
7ieFGBmuwf3zAhJREFuSCw,h,0,4.16,2006-04-09,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
TlPw3-NxLEcq-4RncidNLg,mightym,0,3.72,2010-07-22,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
MEQAAU6FlZmqbAh1s54kRA,Kim,0,3.90,2009-04-20,"plb8ktY-CS1z-hAPIUNFkQ, T_uvJIHzn4kS0ii6xfzgzA...",1,7,,1,22,...,1,3,0,1,0,0,0,0,0,32
2XSfXylobBnDK8dbdDWrxA,Lolo,0,3.35,2010-08-09,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
zRGMRSiYWeiYEniPR53l_A,DerR,0,3.49,2010-11-14,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
KwAKvmk3z5S1aMxv_jbNoQ,andybry,0,3.78,2008-12-18,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
eGFRj_0LH2DGIwY9WF8jSw,jou,0,3.65,2011-06-27,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
WZpOesRUy0SMfU4c5RG9vA,M,0,2.70,2009-01-20,"Tcg6slnVIrbPmP_yLLeGjA, -JPY-Ca7ij19WGAKJVcZsQ",0,0,,0,0,...,1,0,0,0,1,0,0,0,0,2
FEcTkPic4BGx9gF8JSctgQ,BjörnOverme,0,4.48,2011-05-29,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0


In [163]:
u[(u['review_count'] == 0) & (u['friend_count'] == 0)]

Unnamed: 0_level_0,name,review_count,average_stars,yelping_since,friends,fans,cool,elite,funny,useful,...,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,friend_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
zf_GuCZfW4PzaCZeQmnfLw,marathonm,0,4.09,2007-08-01,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
7ieFGBmuwf3zAhJREFuSCw,h,0,4.16,2006-04-09,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
TlPw3-NxLEcq-4RncidNLg,mightym,0,3.72,2010-07-22,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
2XSfXylobBnDK8dbdDWrxA,Lolo,0,3.35,2010-08-09,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
zRGMRSiYWeiYEniPR53l_A,DerR,0,3.49,2010-11-14,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
KwAKvmk3z5S1aMxv_jbNoQ,andybry,0,3.78,2008-12-18,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
eGFRj_0LH2DGIwY9WF8jSw,jou,0,3.65,2011-06-27,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
FEcTkPic4BGx9gF8JSctgQ,BjörnOverme,0,4.48,2011-05-29,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
BzgMoLrtoRmheawNz7znqA,berlinga,0,4.46,2008-03-14,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0
zxs7YgsUpveJ7j1qFBWq0Q,lu,0,4.15,2010-08-20,,0,0,,0,0,...,0,0,0,0,0,0,0,0,0,0


In [171]:
r[r.index.isin(u[u['review_count'] == 1].index)]

Unnamed: 0_level_0,business_id,user_id,stars,city
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4P6BUkdQpobImZhCIY8Q_A,6Z6IyosSMciZtwk8hRLRag,NX_cc7A7UZGK-EKB6ITetQ,5,Las Vegas
5XrFbIQXoXwxaMCCN_PCtA,tjw1yJdmatifLYHbsI9HGA,YHWsLBS8jzZiPjKHMFOaAA,3,Toronto
eW_Zg9Hl-8S99Rybqm5Dbg,hqvqLihniz0ynit9c36JHg,qfQSP1X7HFfh2XF80KqrAg,5,Las Vegas
CvF5jQHP0uFLyuAFNNgK_g,A7waf6G3cvnLfAqKeLL8DA,UMRPnn-PmNn8noRrfBAC4g,5,Toronto
YUAMxrEKIvyjqqrmjzY_gA,RESDUcs7fIiihp38-d6_6g,PnTSCTTvI1TKSmuv0omw4Q,5,Las Vegas
lZEzZxB_rzZJHxTrAMq96A,D-2t7fHQu6hNLjjfRHSZaA,LHZuJrGEI9p83PeDvDVXXA,5,Scottsdale
3JqnSeP6vNdVieprugglvw,pyqnGlIfP9Zw8LLxBjyAiA,vNKNUyfd2TxzSV7ENg7H_w,5,Las Vegas
swnXgq4b4eYj-2GmdCdydA,n-AQFKbW6MnpamdtA6ECpQ,HrduXLtltyC8BKy5s4jNQQ,1,Charlotte
0MfEh7uGfDoGsMdxMDmoqg,86d1M8MMrb94n5djgKWqEQ,_MnXOJuyvWYdrWaralTcwQ,4,Toronto
_CWX8JJFqMEKeym4iZSc5Q,yTWSFoGjkK6pl7kJm8f0Dg,sq3gM1TxI8ys_rbz1a4gxw,4,Toronto


### SVD Matrix Factorization

In [208]:
# make sparse matrix
ub_lv_ph_sparse = csr_matrix(ub_lv_ph)

In [214]:
# save the csr for later use
scipy.sparse.save_npz('data/ub_lv_ph_sparse', ub_lv_ph_sparse)

In [3]:
# load csr
ub = scipy.sparse.load_npz('data/ub_lv_ph_sparse.npz')
ub

<10238x15127 sparse matrix of type '<class 'numpy.int64'>'
	with 169900 stored elements in Compressed Sparse Row format>

#### Find out how many components is needed

In [6]:
# initiate an SVD object
SVD = TruncatedSVD(n_components=ub.shape[1]-1, random_state=42)
#svd_mat = SVD.fit_transform(ub_ph)

ub_svd = SVD.fit(ub)
#corr_mat = np.corrcoef(svd_mat)

In [7]:
# create list of explained variances
svd_var_ratios = ub_svd.explained_variance_ratio_

In [8]:
# Create a function
def select_n_components(var_ratio, goal_var: float) -> int:
    # Set initial variance explained so far
    total_variance = 0.0
    
    # Set initial number of features
    n_components = 0
    
    # For the explained variance of each feature:
    for explained_variance in var_ratio:
        
        # Add the explained variance to the total
        total_variance += explained_variance
        
        # Add one to the number of components
        n_components += 1
        
        # If we reach our goal level of explained variance
        if total_variance >= goal_var:
            # End the loop
            break
            
    # Return the number of components
    return n_components

In [9]:
# run function
select_n_components(svd_var_ratios, 0.95)

2868

In [55]:
# do the same with transposed
SVD = TruncatedSVD(n_components=ub.shape[0]-1, random_state=42)
ub_svd = SVD.fit(ub.transpose())
svd_var_ratios = ub_svd.explained_variance_ratio_
select_n_components(svd_var_ratios, 0.95)

2867

#### Create correlation matrix

In [10]:
# initiate an SVD object
SVD = TruncatedSVD(n_components=2868, random_state=42)
svd_mat = SVD.fit_transform(ub)
corr_mat = np.corrcoef(svd_mat)

In [11]:
corr_mat[:5,:5]

array([[ 1.00000000e+00,  9.97285847e-05,  2.96663451e-02,
         2.96348032e-02,  1.66017218e-02],
       [ 9.97285847e-05,  1.00000000e+00,  4.47369411e-02,
        -1.09237336e-03,  2.96200440e-04],
       [ 2.96663451e-02,  4.47369411e-02,  1.00000000e+00,
         3.50603791e-04,  2.43472512e-02],
       [ 2.96348032e-02, -1.09237336e-03,  3.50603791e-04,
         1.00000000e+00,  4.72716255e-03],
       [ 1.66017218e-02,  2.96200440e-04,  2.43472512e-02,
         4.72716255e-03,  1.00000000e+00]])

In [45]:
# result is a square business-business matrix
print(corr_mat.shape)

(10238, 10238)


In [56]:
# create user-user correlation matrix
SVD_u = TruncatedSVD(n_components=2867, random_state=42)
svd_mat_u = SVD_u.fit_transform(ub.transpose())
corr_mat_u = np.corrcoef(svd_mat_u)
print(corr_mat_u.shape)

(15127, 15127)


In [64]:
# load the original matrix
ub_lv_ph = pd.read_csv('data/ub_lv_ph', index_col='business_id')
ub_lv_ph

Unnamed: 0_level_0,--3WaS23LcIXtxyFULJHTA,--41c9Tl0C9OGewIR7Qyzg,--4q8EyqThydQm-eKZpS-A,--CIuK7sUpaNzalLAlHJKA,--ty7Z9fEt08E3dS3_qoSA,-0AvEZuSWLtUpasFb4fmfg,-0IiMAZI2SsQ7VmyzJjokQ,-0_qOZXT_xwI7RdGZEAynA,-0aInSHjCWLfiNqfgmWnow,-0udWcFQEt2M8kM3xcIofw,...,zwpP3Lt8USKS91Ds52G8UA,zx26BmC67gLmQrAgY-nT1g,zx3y74_pvIRuQSVIrgzCew,zx6limw6-cG5APUL6LCDVA,zxCOwzZmH1s0KbQbhPIdBw,zxMHVPoMWY5myDNWYAKt8g,zy5vPwlDT95fMMF4c7bRkg,zy7D8MZ8NwXO8uDraQby8g,zzOd64i66_Z5roGjXmwCXA,zzaq5Fn1U2Feut3dGxidNg
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--9e1ONYQuAa-CB_Rrw7Tw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--g-a85VwrdZJNf0R95GcQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-050d_XIor1NpCuWkbIVaQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-0BxAGlIk5DJAGVkpqBXxg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-0WegMt6Cy966qlDKhu6jA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-0aIra_B6iALlfqAriBSYA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-0tgMGl7D9B10YjSN2ujLA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-1UMR00eXtwaeh59pEiDjA,0,0,0,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-1m9o3vGRA8IBPNvNqKLmA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-1vfRrlnNnNJ5boOVghMPA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
pd.DataFrame(corr_mat, index=ub_lv_ph.index, columns=ub_lv_ph.index)

business_id,--9e1ONYQuAa-CB_Rrw7Tw,--g-a85VwrdZJNf0R95GcQ,-050d_XIor1NpCuWkbIVaQ,-0BxAGlIk5DJAGVkpqBXxg,-0WegMt6Cy966qlDKhu6jA,-0aIra_B6iALlfqAriBSYA,-0tgMGl7D9B10YjSN2ujLA,-1UMR00eXtwaeh59pEiDjA,-1m9o3vGRA8IBPNvNqKLmA,-1vfRrlnNnNJ5boOVghMPA,...,zttcrQP4MxNS5X5itzStXg,zusXbyXnJfS1Q-pZE9elmQ,zuwba6QEBIDZT0tJZmNhdQ,zvsJSNdg9UXEIL6FKiGa3Q,zwNC-Ow4eIMan2__bS9-rg,zwddv6HjTjDx9fOr3MFtwQ,zwmps5SXn30g-f5wqg_r9A,zxVw9U56heInhfyprhkxIg,zzsU528uoRB6qZUGhKDa6w,zzzaIBwimxVej4tY6qFOUQ
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--9e1ONYQuAa-CB_Rrw7Tw,1.000000e+00,9.972858e-05,0.029666,0.029635,0.016602,-0.000227,0.000042,0.017442,2.526335e-07,-0.000145,...,0.000073,-0.000035,-0.000073,-0.000005,0.034365,-0.000163,0.011417,-0.000137,-0.000124,-0.000032
--g-a85VwrdZJNf0R95GcQ,9.972858e-05,1.000000e+00,0.044737,-0.001092,0.000296,-0.002292,-0.003259,0.000043,8.578470e-04,-0.004422,...,-0.001473,-0.000940,-0.000115,0.002901,-0.000903,-0.002208,-0.000739,-0.006433,-0.005138,-0.001196
-050d_XIor1NpCuWkbIVaQ,2.966635e-02,4.473694e-02,1.000000,0.000351,0.024347,0.000584,0.039988,0.029662,-8.771990e-04,0.000093,...,-0.000130,0.000301,0.000221,0.000284,0.008232,0.000610,0.015238,-0.000530,0.000483,-0.000352
-0BxAGlIk5DJAGVkpqBXxg,2.963480e-02,-1.092373e-03,0.000351,1.000000,0.004727,-0.003316,-0.000530,-0.000189,-4.958490e-04,-0.009625,...,-0.000941,-0.003795,0.001310,-0.002982,-0.000771,-0.001015,0.001435,-0.000569,-0.000077,0.000786
-0WegMt6Cy966qlDKhu6jA,1.660172e-02,2.962004e-04,0.024347,0.004727,1.000000,-0.000756,0.000490,0.015033,-5.059583e-03,0.000016,...,-0.000582,0.001907,0.001778,0.000744,0.000137,0.001120,-0.004749,-0.002551,0.003352,-0.001247
-0aIra_B6iALlfqAriBSYA,-2.266604e-04,-2.292321e-03,0.000584,-0.003316,-0.000756,1.000000,-0.000076,0.000157,7.814958e-04,-0.001121,...,-0.001913,0.000845,0.004263,-0.002691,-0.000106,-0.000193,0.000037,0.001542,-0.000535,0.000792
-0tgMGl7D9B10YjSN2ujLA,4.241144e-05,-3.259191e-03,0.039988,-0.000530,0.000490,-0.000076,1.000000,-0.000030,-5.571636e-04,-0.001091,...,-0.000097,-0.000265,-0.000242,-0.001207,0.000125,0.000430,0.047517,0.003680,0.003134,-0.000638
-1UMR00eXtwaeh59pEiDjA,1.744188e-02,4.302254e-05,0.029662,-0.000189,0.015033,0.000157,-0.000030,1.000000,-1.883643e-04,-0.000195,...,-0.000207,0.000048,-0.000238,0.000097,0.000042,-0.000078,-0.000122,-0.000474,0.000085,-0.000375
-1m9o3vGRA8IBPNvNqKLmA,2.526335e-07,8.578470e-04,-0.000877,-0.000496,-0.005060,0.000781,-0.000557,-0.000188,1.000000e+00,-0.007923,...,-0.000934,0.000570,0.001073,0.002419,0.001704,-0.002690,-0.001941,-0.008888,-0.000186,0.001096
-1vfRrlnNnNJ5boOVghMPA,-1.445703e-04,-4.421744e-03,0.000093,-0.009625,0.000016,-0.001121,-0.001091,-0.000195,-7.922679e-03,1.000000,...,-0.000602,0.000225,0.004435,0.000153,0.000004,-0.001136,-0.002498,0.004572,-0.006734,-0.000621


In [68]:
# save the business-business and user-user correlation matrix
pd.DataFrame(corr_mat, index=ub_lv_ph.index, columns=ub_lv_ph.index).to_csv('results/CF_business')
pd.DataFrame(corr_mat_u, index=ub_lv_ph.columns, columns=ub_lv_ph.columns).to_csv('results/CF_user')

### Recommending a Business

In [114]:
random.choice(u_lv_and_ph)

'-hJ9Q0OSYqdOTVBLOMdg7A'

In [21]:
# return list of business_id the user has left review
r[r['user_id'] == '-hJ9Q0OSYqdOTVBLOMdg7A']

Unnamed: 0_level_0,business_id,user_id,stars,city
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
08qJLkJdAVY6L_3R0Jqffw,Tp07u-0AOFRfQhtNpAXaQA,-hJ9Q0OSYqdOTVBLOMdg7A,4,Phoenix
DcLdUnDi5ATXvywe4yJMXw,pSQFynH1VxkfSmehRXlZWw,-hJ9Q0OSYqdOTVBLOMdg7A,2,Phoenix
BoguEYl3yC1bVU0KjdRqIw,NG2mL-wQQfBdsXsyUbw8pA,-hJ9Q0OSYqdOTVBLOMdg7A,4,Las Vegas
o8F3m8mQiuWYTbJEmFIyyw,pomGBqfbxcqPv14c3XH-ZQ,-hJ9Q0OSYqdOTVBLOMdg7A,5,Las Vegas
e96mj8MSNvJ3OWoA_90x9Q,Lau3N654yNyWdIIybea1uA,-hJ9Q0OSYqdOTVBLOMdg7A,3,Las Vegas
fu0nJ48BBtuGbitGgnoJOw,C3gKBIsZKR3bp43BbOu8qA,-hJ9Q0OSYqdOTVBLOMdg7A,4,Phoenix
oT6crDH5T2rzZKeZg3orFQ,9avLLw9uke50m8qO9doyMQ,-hJ9Q0OSYqdOTVBLOMdg7A,5,Las Vegas
zlysQ4yI60iafYn-_GZpAQ,jgtEoeE2uoA27J_UvbE20g,-hJ9Q0OSYqdOTVBLOMdg7A,5,Las Vegas
gGtN0IzeSh-BBmIcRNMJfQ,qJEtXdrOQeo9py0Wqr6LKQ,-hJ9Q0OSYqdOTVBLOMdg7A,5,Las Vegas
eE6KFM3a8IoKRf8bK9Liyw,079CV1EE5WLdQqVEVYFeHQ,-hJ9Q0OSYqdOTVBLOMdg7A,4,Phoenix


In [24]:
ub_pd = pd.read_csv('data/ub_lv_ph', index_col='business_id')
ub_pd.head()

Unnamed: 0_level_0,--3WaS23LcIXtxyFULJHTA,--41c9Tl0C9OGewIR7Qyzg,--4q8EyqThydQm-eKZpS-A,--CIuK7sUpaNzalLAlHJKA,--ty7Z9fEt08E3dS3_qoSA,-0AvEZuSWLtUpasFb4fmfg,-0IiMAZI2SsQ7VmyzJjokQ,-0_qOZXT_xwI7RdGZEAynA,-0aInSHjCWLfiNqfgmWnow,-0udWcFQEt2M8kM3xcIofw,...,zwpP3Lt8USKS91Ds52G8UA,zx26BmC67gLmQrAgY-nT1g,zx3y74_pvIRuQSVIrgzCew,zx6limw6-cG5APUL6LCDVA,zxCOwzZmH1s0KbQbhPIdBw,zxMHVPoMWY5myDNWYAKt8g,zy5vPwlDT95fMMF4c7bRkg,zy7D8MZ8NwXO8uDraQby8g,zzOd64i66_Z5roGjXmwCXA,zzaq5Fn1U2Feut3dGxidNg
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--9e1ONYQuAa-CB_Rrw7Tw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--g-a85VwrdZJNf0R95GcQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-050d_XIor1NpCuWkbIVaQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-0BxAGlIk5DJAGVkpqBXxg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-0WegMt6Cy966qlDKhu6jA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
# generate list of all business_id
business_list = list(ub_pd.index)

# extract the correlation of the specific business_id
corr_example = corr_mat[business_list.index('Tp07u-0AOFRfQhtNpAXaQA')]
#list(ub[(corr_example < 1.0) & (corr_example > 0.9)])

# return dictionary
dict_example = dict(zip(list(ub_pd.index), list(corr_example)))

In [35]:
# create Las Vegas business list from the intersection
b_lv_intersect = set(u_lv_ph.business_id[u_lv_ph['city'] == 'Las Vegas'])

In [41]:
# subset only Las Vegas
dict_example_subset = {x: dict_example[x] for x in b_lv_intersect}
sorted(dict_example_subset.items(), key=lambda kv: kv[1], reverse=True)

[('i0ToFQWsWPzFEC1eAxBh6Q', 0.17584771765011203),
 ('NG2mL-wQQfBdsXsyUbw8pA', 0.14925638910423733),
 ('hmgbkFngXKjpWqJItOPYpA', 0.14348642697264236),
 ('9avLLw9uke50m8qO9doyMQ', 0.1188264836679445),
 ('L-q2fn9CDKmWX5c2hXUD5w', 0.11418424064388745),
 ('3Gw8YcAjo3AcMbtbX1TGSA', 0.10052927464498415),
 ('Z3y47L3EY-axAEVbwmXZaA', 0.09839533072103013),
 ('8tMy7yEeNVYylVVFRlPmdQ', 0.09728087146271658),
 ('gyV9Ar7o-K-VfDkRh7ETRQ', 0.09110276885624914),
 ('JVTIY3KAIaIiL0fxY82SoQ', 0.08253619730200429),
 ('zm-nB9xWL0RWZ-zoL7JNuQ', 0.08174081310391973),
 ('UDE4VL0k8Zrko7RkJ4agGQ', 0.08102943401008346),
 ('s4sr0anCKMyQw53xNv6DQA', 0.07113645171317556),
 ('JgkfjCL1NfMBC_M_hax_3Q', 0.0705797559958959),
 ('igU-mCXzyn-gmwSzJgeMEQ', 0.06912266754067066),
 ('UNMBMKI9FjnO9flxPxV-1w', 0.06556049143285897),
 ('0WPXDs4ini3D7jpiy1Daqw', 0.06212422427912537),
 ('KhWl4Mwhm_Oqq0zIIz-0wQ', 0.06168590774616813),
 ('2B46bRpDh49eDyjXGhL_ZQ', 0.059620890324238396),
 ('w55sCX7ZkLIJNhl_RhwjEg', 0.0593730121545443),
 (

In [43]:
# compare with the user's actual visits
# return list of business_id the user has left review
r[(r['user_id'] == '-hJ9Q0OSYqdOTVBLOMdg7A') & (r['city'] == 'Las Vegas')]

Unnamed: 0_level_0,business_id,user_id,stars,city
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BoguEYl3yC1bVU0KjdRqIw,NG2mL-wQQfBdsXsyUbw8pA,-hJ9Q0OSYqdOTVBLOMdg7A,4,Las Vegas
o8F3m8mQiuWYTbJEmFIyyw,pomGBqfbxcqPv14c3XH-ZQ,-hJ9Q0OSYqdOTVBLOMdg7A,5,Las Vegas
e96mj8MSNvJ3OWoA_90x9Q,Lau3N654yNyWdIIybea1uA,-hJ9Q0OSYqdOTVBLOMdg7A,3,Las Vegas
oT6crDH5T2rzZKeZg3orFQ,9avLLw9uke50m8qO9doyMQ,-hJ9Q0OSYqdOTVBLOMdg7A,5,Las Vegas
zlysQ4yI60iafYn-_GZpAQ,jgtEoeE2uoA27J_UvbE20g,-hJ9Q0OSYqdOTVBLOMdg7A,5,Las Vegas
gGtN0IzeSh-BBmIcRNMJfQ,qJEtXdrOQeo9py0Wqr6LKQ,-hJ9Q0OSYqdOTVBLOMdg7A,5,Las Vegas
o9RCF0g9MWHYwnj02MfYfQ,UDE4VL0k8Zrko7RkJ4agGQ,-hJ9Q0OSYqdOTVBLOMdg7A,5,Las Vegas
nX4ADyrvk2oEcXrAxmjzuw,L-q2fn9CDKmWX5c2hXUD5w,-hJ9Q0OSYqdOTVBLOMdg7A,5,Las Vegas
Cd7CAMibHCl-pKklvzVBeQ,FNe5PPA9pyj8FjcDefCBpg,-hJ9Q0OSYqdOTVBLOMdg7A,5,Las Vegas
K_Egq7OwwNZI-RigEPJsVw,2iTsRqUsPGRH1li1WVRvKQ,-hJ9Q0OSYqdOTVBLOMdg7A,5,Las Vegas


### Define function to iterate through high-rated businesses a user has been to

In [None]:
def recommender(user_no):
    
    #get all book isbn
    business_id = df_predict.index

    #user predicted rating to all books
    user_predicted_rating = df_predict[user_no]

    #combine book rating and book detail
    user_rating_book = pd.concat([user_predicted_rating,business.set_index('business_id')], axis=1)

    #books already read by user
    already_read = u_lv_ph[u_lv_ph['user_id'].isin([user_no])]['business_id']

    #recommendation without books being read by user
    all_rec = user_rating_book[~user_rating_book.index.isin(already_read)]

    return all_rec.sort_values(by=[user_no], ascending=False).iloc[0:10]

In [None]:
def recommender(user_no, city='Las Vegas'):
    
    #get all business_id
    business_id = df_predict.index
    b_all = list(ub_pd.index)
    
    #get list of business_id user has been in another city, that is 4 or 5 star rating
    b_fav = list(r[(r['user_id'] == user_no) & (r['city'] != city) & r['stars'] >= 4]['business_id'])
    
    #iterate
    dict_example = dict(zip(list(ub_pd.index), list(corr_example)))
    b_lv_intersect = set(u_lv_ph.business_id[u_lv_ph['city'] == 'Las Vegas'])
    dict_example_subset = {x: dict_example[x] for x in b_lv_intersect}
    sorted(dict_example_subset.items(), key=lambda kv: kv[1], reverse=True)
    
    for b in b_fav:
        corr_ex = corr_mat[b_all.index(b)]
        res = 

    #user predicted rating to all books
    user_predicted_rating = df_predict[user_no]
    corr_example = corr_mat[business_list.index('Tp07u-0AOFRfQhtNpAXaQA')

    #combine book rating and book detail
    user_rating_book = pd.concat([user_predicted_rating,business.set_index('business_id')], axis=1)

    #books already read by user
    already_read = u_lv_ph[u_lv_ph['user_id'].isin([user_no])]['business_id']

    #recommendation without books being read by user
    all_rec = user_rating_book[~user_rating_book.index.isin(already_read)]

    return all_rec.sort_values(by=[user_no], ascending=False).iloc[0:10]