In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy.linalg import svd
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler

In [9]:
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)
pd.set_option("display.precision", 4)
pd.options.display.max_colwidth = 500

In [42]:
aspect_df = pickle.load(open('aspect_df.p', 'rb'))
aspect_df = aspect_df.sample(frac=0.5, replace=False, random_state=42)

In [16]:
df_subset = pickle.load(open('df.p', 'rb'))

In [54]:
scaler = MinMaxScaler(feature_range=(0, 1))

In [55]:
group_aspect = aspect_df.groupby('business_id').mean().reset_index()
features = scaler.fit_transform(group_aspect.iloc[:, 1:6])
group_aspect.iloc[:, 1:6] = features

In [66]:
features = scaler.fit_transform(aspect_df.iloc[:, 2:6])
aspect_df.iloc[:, 2:6] = features

In [67]:
aspect_df['calc_rating'] = (aspect_df['price_score']+aspect_df['ambience_score']+aspect_df['service_score']+aspect_df['food_score'])*aspect_df['sentiment_score']

In [22]:
df_user = pd.pivot_table(aspect_df, values=['calc_rating'], index=['business_id', 'user_id'], aggfunc=np.mean).unstack()
df_user.columns = df_user.columns.droplevel()

In [23]:
df_user = df_user.fillna(-100)

In [24]:
food_usage, usage, usage_user = svd(df_user, full_matrices=False)

In [25]:
usages = [f'usage_{i}' for i in range(len(usage))]
usages = pd.Series(data=usage, index=usages)

In [26]:
df_food_usage = pd.DataFrame(data=food_usage, index=df_user.index, columns=usages.index)

In [27]:
df_usage_user = pd.DataFrame(data=usage_user, index=usages.index, columns=df_user.columns)
df_user_usage = df_usage_user.T

In [28]:
explained_varaince = (usages / usages.sum()).cumsum()
(explained_varaince <= 0.8).sum()

1160

In [29]:
dists_users = pairwise_distances(df_user_usage.iloc[:, :1144], metric='cosine')
dists_users = pd.DataFrame(dists_users, index=df_user_usage.index, columns=df_user_usage.index)

In [30]:
def get_user_neighbors(user, neighborhood=10):
    '''
    Given a user, return the closest users by the `dists_users` matrix
    '''
    neighbors = dists_users[user].sort_values()[:neighborhood]
    
    return neighbors

In [31]:
top_users = (df_user > 0).sum(axis=0).sort_values(ascending=False).index[:10]
top_users

Index(['6Ki3bAL0wx9ymbdJqbSWMA', 'Lfv4hefW1VbvaC2gatTFWA',
       'rCWrxuRC8_pfagpchtHp6A', 'fmzIm7RxEdii5Jz44PtO7g',
       '4wp4XI9AxKNqJima-xahlg', '8AwcaBJjiMpQ__FPxktwwQ',
       'd0D7L-vfQDIADolnPAcb9A', '4m9NXICYBC5i9t4aTt-I6w',
       'ACwBMSJzgW6vOvV7vOrk8Q', '5JVY32_bmTBfIGpCCsnAfw'],
      dtype='object', name='user_id')

In [32]:
user = 'rCWrxuRC8_pfagpchtHp6A'

n_neighbors = 60

neighbors = get_user_neighbors(user, n_neighbors+1)
neighbors = neighbors[1:]  # We don't want to include the user themself
neighbors

user_id
8tZg2zUibRdMpi4J07x8lQ    0.4429
MJK355xlNSmNIMQR9uiMhw    0.4761
uV6BVZvfZlkXE_kDl5TYvA    0.6184
cMICMnkK9tY8LZuo3c0wmw    0.6218
-GuvrWICxU0Geuzf6C6ueg    0.6219
JTxAi7AjAI9Kyomz_sjrGA    0.6221
Fzn6uxciZpmDqdqumbllsw    0.6224
oA9_53PYHfZ0hfj6yiLvXw    0.6459
RMyS4XeIhDTVTwZWrgIYQQ    0.6669
jE72VXCulJovxqa-1-xRKQ    0.6708
ova6bEhtQdJBoDpv-SXGBg    0.6737
5N8gnPF1Z-gNRSW61ml4wQ    0.6745
GELAYOjfMP3UFzsvGW_9hw    0.6749
q24Yt8KSOY9mmDFJQCWfnQ    0.6754
Bx9eipleQL2MXYnIF2NgQQ    0.6754
LYofqaVL9Gi0bD2MH8Mp0g    0.6754
rG3DBzCbrdduHPn6sWAGxA    0.6759
v2f0S4pYC9K9z9vSWvssdQ    0.6782
4PnnMO-RjQupXGy1peKBOA    0.6810
l3gaXTfWJfawjBRBUdsC-g    0.7378
-hH7gZo6LNoNmr4vSJXPnQ    0.7459
EVUcN4EEO-3gqhcsOTA2HQ    0.7459
harBoh7qn8PeCiz2haZ0nA    0.7459
0enntixMjOfDclGF-ivgzA    0.7460
I26_9YN1UBHuRb01AFCVmw    0.7461
TCtvPbF143x-3WsVpO8q2A    0.7461
bCEEgcAFFs23vt9XOj4_7g    0.7462
T2GVsPR6hK_TtUXtQXK3kw    0.7557
1LUpRfjduqrBbLpQU4IQcw    0.7717
e9eEpVE3BlwUQKVVYYv-FA    0.7780
Ha

In [33]:
untried_food = df_user[user] < -50
df_rec = df_user.loc[untried_food, neighbors.index].copy()

df_rec = df_rec.replace(-100, np.nan)  # Replace the un-rated beers with NAN values
df_rec = df_rec.dropna(how='all')  # Only keep rows where at least one neighbor rated the beer

df_rec

user_id,8tZg2zUibRdMpi4J07x8lQ,MJK355xlNSmNIMQR9uiMhw,uV6BVZvfZlkXE_kDl5TYvA,cMICMnkK9tY8LZuo3c0wmw,-GuvrWICxU0Geuzf6C6ueg,JTxAi7AjAI9Kyomz_sjrGA,Fzn6uxciZpmDqdqumbllsw,oA9_53PYHfZ0hfj6yiLvXw,RMyS4XeIhDTVTwZWrgIYQQ,jE72VXCulJovxqa-1-xRKQ,ova6bEhtQdJBoDpv-SXGBg,5N8gnPF1Z-gNRSW61ml4wQ,GELAYOjfMP3UFzsvGW_9hw,q24Yt8KSOY9mmDFJQCWfnQ,Bx9eipleQL2MXYnIF2NgQQ,LYofqaVL9Gi0bD2MH8Mp0g,rG3DBzCbrdduHPn6sWAGxA,v2f0S4pYC9K9z9vSWvssdQ,4PnnMO-RjQupXGy1peKBOA,l3gaXTfWJfawjBRBUdsC-g,-hH7gZo6LNoNmr4vSJXPnQ,EVUcN4EEO-3gqhcsOTA2HQ,harBoh7qn8PeCiz2haZ0nA,0enntixMjOfDclGF-ivgzA,I26_9YN1UBHuRb01AFCVmw,TCtvPbF143x-3WsVpO8q2A,bCEEgcAFFs23vt9XOj4_7g,T2GVsPR6hK_TtUXtQXK3kw,1LUpRfjduqrBbLpQU4IQcw,e9eEpVE3BlwUQKVVYYv-FA,HaJKQ4UgxVxV2fHjSL4muw,zIG_6Qupb_KcofxUhvWQxQ,-2d53Lq30jLAKAfB6UOMeA,4-l4Bzfi1HWMAnAZPcDQeA,8pMD14LKEbn3TLQf6z3dCg,bXLK0qX7Z_wHqIqN6P-61Q,VVuEWxu6EOlM983QI5L_xw,14aFc0knShstCJUEczUbwA,2P_6qkzuh6BUbukeTCM_bQ,H3WIdW9E-QUjZXPwEBdG2g,hOVKuWWPSZqWnZd7nodFBw,z8Pw1e7Q2S-01XmnkXLENw,eG0lzb04HP4IIUhL3I2eXw,IConfV9eyeT_KcnT3-ek9A,ck6zktLIoKeXNczooBaNmA,4FKi9pf5EIXFzsH36R7sWA,uOYhRRceTon7sgG7uOY_UA,dahbN08U85CHJNpCazrTrQ,f61VnlgdQcNvdxS5uHPcoA,ZrLYsX1yGYSpcZ2DNQbAvQ,Z_FzGGu5vggFwelcMy17oA,tgzykwiRJZyyLTUCqwBNEQ,F6VWtEmBci_w_lqcmnGJ1A,e5pyf1lm0H6t-6P8sq3tBQ,MClukJ-vTxTvi3vSUlur2A,Qtpzo30r83Jaqr4CX4-Urg,7D0V9E-9feDNFUkq581jlQ,fcNckRiv5_57ZW03t3HYmw,Ml_csiUbdMHpmgfcExSyiA,CNXwJj_a8qVRbfY1fC8bDQ
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
6PaLW1K478xyePOxUI56vg,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.895,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
WqV7zB_AbmUm9ww4f5W0kA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.8891,,,,,,,,


In [34]:
avg_food_rating = np.nanmean(df_rec, axis=1)
avg_food_rating = pd.Series(index=df_rec.index, data=avg_food_rating)
avg_food_rating.sort_values(ascending=False, inplace=True)

# Return up to `n_recommendations`
n_recommendations = 5
avg_food_rating[:n_recommendations]

business_id
6PaLW1K478xyePOxUI56vg    4.8950
WqV7zB_AbmUm9ww4f5W0kA    4.8891
dtype: float64

In [35]:
def get_recommendations(user, n_recommendations=5, n_neighbors=300):
    neighbors = get_user_neighbors(user, n_neighbors+1)
    neighbors = neighbors[1:]  # We don't want to include the user themself
    
    untried_food = df_user[user] < -40
    df_rec = df_user.loc[untried_food, neighbors.index].copy()

    df_rec = df_rec.replace(-100, np.nan)  # Replace the un-rated beers with NAN values
    df_rec = df_rec.dropna(how='all')  # Only keep rows where at least one neighbor rated the beer
    
    # Get the average rating among the top `n_neighbors` of our user
    avg_food_rating = np.nanmean(df_rec, axis=1)
    avg_food_rating = pd.Series(index=df_rec.index, data=avg_food_rating)
    avg_food_rating.sort_values(ascending=False, inplace=True)

    # Return up to `n_recommendations`
    return avg_food_rating[:n_recommendations]

In [51]:
def get_recommendation_eval(user, n_recommendations=15, n_neighbors=300):

    # Only get information for the top 100 beers
    df_eval = df_subset[df_subset.business_id.isin(df_user.index)]
    df_eval = df_eval[['business_id', 'business_name', 'business_city', 'business_categories', 'business_stars']].copy() \
                .drop_duplicates() \
                .set_index('business_id')

    # Get the user rank for each beer 
    df_eval['user_rank'] = df_user[user]

    # Calculate recommendations
    recommendations = get_recommendations(user, n_recommendations, n_neighbors)

    # Assign the recommendations to the dataframe (these have the same indices)
    df_eval['recommended_rank'] = recommendations

    # Only look at beers that have either ranked or recommended
    df_eval = df_eval[df_eval['recommended_rank'] > 0]

    return df_eval.sort_values(['business_name', 'business_categories', 'business_stars'])

In [49]:
get_recommendation_eval('5uXd58gs7dt9eqNw5vRawQ')

Unnamed: 0_level_0,business_name,business_city,business_categories,business_stars,user_rank,recommended_rank
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2EhBiUnUTl_H2s4DXdKCTg,Bistro 19,Pittsburgh,"Bistros, Restaurants, American (New)",4.0,-100.0,5.0157
kBd3_wYt0BX-RfS8ymUTfQ,Brugge On North,Pittsburgh,"Restaurants, American (Traditional)",4.0,-100.0,5.0056
E3mP15gcpSBys8PGeJsuvQ,Buon Cibo,Murrysville,"Seafood, Restaurants, Italian",4.0,-100.0,5.0053
in39cUCMBRjpwxw7qNSOfg,Condado Tacos - Lawrenceville,Pittsburgh,"Tex-Mex, Nightlife, Tacos, Restaurants, Mexican, Bars",4.0,-100.0,5.0107
Ltf3TUXESd0V9emwx8NXWA,Dive Bar and Grille,Pittsburgh,"Burgers, Nightlife, Bars, Restaurants, Breakfast & Brunch, American (Traditional)",4.0,-100.0,5.0112
ytlpNwWei-ESC15Vl0ZRlg,Eureka Chocolates and Gifts,Pittsburgh,"Candy Stores, Coffee & Tea, Specialty Food, Shopping, Cards & Stationery, Arts & Crafts, Chocolatiers & Shops, Flowers & Gifts, Desserts, Food, Event Planning & Services",5.0,-100.0,5.0098
Y8ceMeW3U0OxO-cRtRQbOA,Helicon Brewing,Oakdale,"Breweries, Food",4.0,-100.0,5.0307
0NoHJtug9xefI2OnsANaMA,Juniper Grill - Peters Township,McMurray,"American (New), Bars, Nightlife, Restaurants, Food, Smokehouse, Southern",3.5,-100.0,5.0049
eNsM5ezjhPQWI6XlUGrUhw,Lin's Asian Fusion,Pittsburgh,"Asian Fusion, Restaurants",3.5,-100.0,5.0058
Mh7C6p0KsuZLCB0vjrPgbQ,Modern Cafe,Pittsburgh,"Sandwiches, Beer Bar, Pubs, Restaurants, Bars, Sports Bars, Nightlife",4.0,-100.0,5.0065


In [65]:
df_subset[df_subset.user_id=='i79grgTjhFxqaQs9jcZKPA'][['business_id', 'business_name', 'business_categories','review_text', 'review_stars', 'business_stars', 'business_city']]

Unnamed: 0,business_id,business_name,business_categories,review_text,review_stars,business_stars,business_city
142915,XItYW5ul3OW_AqpT2nDbBQ,Park Bruges,"Restaurants, French, Belgian, Religious Organizations","One of my favorites. Bloody Marys for brunch and a great beer selection. Mussels get all the press here, but my fav is the Bruges burger with coleslaw!!",5.0,4.0,Pittsburgh
142916,Hdnx6cZBo0JfZopnQDWVYg,Teppanyaki Kyoto,"Restaurants, Japanese","I'm really torn, because I'd like a business like this to exist on Bryant and do well, but aside from the atmosphere and friendly service, I'm not really sure what the fascination is. Maybe if the pricing was lower... (maybe I'd find one or two staples that tasted OK) everything comes out of a pre-packaged frozen bag and cooked straight from frozen state. It doesn't seem like there's much authentic about it, maybe this is how it's done in other restaurants, but don't do it right in front of ...",2.0,4.0,Pittsburgh
142917,7aZf5c1UNotq4MabBXMZLA,Sichuan Gourmet,"Restaurants, Szechuan, Chinese, Soup, Seafood","Ok, ok, ok... So we ordered delivery on Christmas Eve, so I gave an extra star, because... Baby Jesus. But this was the most disgusting delivery food we've had in a long time. Vegetables were boiled down - which could explain why the containers were HALF FULL. Mostly luke warm at best. Spring rolls were 90% dough. General Tso's was soggy (I'll chalk that up to delivery, but have had somewhat decent breading from other places on delivery...) I'll just stop there, 'cause it's not worth my time.",2.0,3.5,Pittsburgh
142918,XNWXyFgCGaAwvg9QML2Ypw,Mad Mex - Shadyside,"Mexican, Tex-Mex, Restaurants","The plus side... open late. Do not go there if your expecting Mexican. Our food came out quickly, but still managed to taste like it was under a heat lamp for 15 min. The chips were definitely out of a bag, and the salsa was an effort on their part. Almost everything was an effort to eat. The only things standing between them and TGIFriday is the ""flare"" and the cactus for a door handle. Oh, and I guess TGIFriday probably wouldn't be playing any Fujia & Miagi - you know, the typical Mariachi.",2.0,3.5,Pittsburgh
