In [1]:
import numpy as np
import pandas as pd

# Load item and rating info

In [2]:
items = pd.read_excel('office.xlsx', sheet_name='Items', index_col=0)

In [3]:
ratings = pd.read_excel('office.xlsx', sheet_name='Ratings', index_col=0).stack().reset_index()
ratings.columns = ['Product', 'User', 'Rating']
ratings['User'] = ratings['User'].astype(int)
ratings['Rating'] = ratings['Rating'].astype(int)
ratings.sort_values(by=['User', 'Product'], inplace=True)
ratings.head()

Unnamed: 0,Product,User,Rating
37,72,64,4
426,1240,64,5
564,1300,64,4
573,1317,64,5
620,1327,64,3


In [4]:
good_ratings = ratings.loc[(ratings['Rating']==4) | (ratings['Rating']==5)]

# Load predicted rating

In [210]:
cbf = pd.read_excel('office.xlsx', sheet_name='CBF', index_col=0)
itemitem = pd.read_excel('office.xlsx', sheet_name='Item-Item', index_col=0)
useruser = pd.read_excel('office.xlsx', sheet_name='User-User', index_col=0).dropna(axis=1)

In [201]:
def get_rec(predicted_ratings, k=5, combined=False, get_product=True, stacked=True):
    predicted_ratings.columns = predicted_ratings.columns.astype(int)
    if combined:
        rec = predicted_ratings.apply(lambda user: get_rec_combined(user, k).index) if get_product else predicted_ratings.apply(lambda user: get_rec_combined(user, k).values) 
    else:
        rec = predicted_ratings.apply(lambda user: user.nlargest(k).index) if get_product else predicted_ratings.apply(lambda user: user.nlargest(k).values) 
    if not stacked:
        return rec
    else:
        rec = rec.stack().reset_index(level=1)
        rec.columns = ['User', 'Product'] if get_product else ['User', 'Rating']
        rec.sort_values(by=['User'], inplace=True)
        return rec

In [7]:
def get_metrics(rec, good_ratings, items):
    precision = len(pd.merge(rec, good_ratings[['Product', 'User']])) / 500
    price_div = items.loc[rec['Product'], 'Price'].std()
    avail_avg = items.loc[rec['Product'], 'Availability'].mean()
    return precision, price_div, avail_avg

# Get performance results for each algorithm

In [211]:
get_metrics(get_rec(cbf), good_ratings, items)

(0.104, 31.761838890123908, 0.5853634540762697)

In [212]:
get_metrics(get_rec(itemitem), good_ratings, items)

(0.214, 36.276372726040655, 0.6133728647426742)

In [213]:
get_metrics(get_rec(useruser), good_ratings, items)

(0.52, 40.496623332098146, 0.6964598020812371)

# Building hybrids

In [214]:
cbf = cbf[useruser.columns]
itemitem = itemitem[useruser.columns]

In [215]:
cbf.shape, itemitem.shape, useruser.shape

((200, 99), (200, 99), (200, 99))

## Sort items by rating

In [38]:
combined_ratings = pd.concat([cbf, itemitem, useruser])

In [71]:
def get_rec_combined(user, k=5):
    sorted_ratings = user.sort_values(ascending=False)
    sorted_non_duplicated = sorted_ratings.loc[~sorted_ratings.index.duplicated()]
    return sorted_non_duplicated.iloc[:k]

In [76]:
get_metrics(get_rec(combined_ratings, combined=True), good_ratings, items)

(0.288, 36.22843228446957, 0.630450961536144)

## Get items that appear in most recommenders

In [187]:
rec_cbf = get_rec(cbf)
rec_ii = get_rec(itemitem)
rec_uu = get_rec(useruser)

In [188]:
combined_rec = pd.concat([rec_cbf, rec_ii, rec_uu])

In [189]:
rating_cbf = get_rec(cbf, get_product=False)
rating_ii = get_rec(itemitem, get_product=False)
rating_uu = get_rec(useruser, get_product=False)

In [190]:
combined_rating = pd.concat([rating_cbf, rating_ii, rating_uu])

In [191]:
combined_rec['Rating'] = combined_rating['Rating']

In [192]:
combined_rec = combined_rec.groupby(['User', 'Product'])['Rating'].agg(['size', 'mean']).reset_index()

In [197]:
combined_rec = combined_rec.groupby('User').apply(lambda user: user.sort_values(by=['size', 'mean'], ascending=False)[:5]).reset_index(level=0, drop=True)

In [199]:
get_metrics(combined_rec[['User', 'Product']], good_ratings, items)

(0.308, 36.47239996721786, 0.6340136963630004)

## Weighted hybrid

In [216]:
p_cbf = 0.104
p_ii = 0.214
p_uu = 0.52

In [219]:
weighed_ratings = (cbf * p_cbf + itemitem * p_ii + useruser * p_uu) / (p_cbf + p_ii + p_uu)

In [220]:
get_metrics(get_rec(weighed_ratings), good_ratings, items)

(0.42, 35.994930360903346, 0.6900636798560479)