# Import Packages

In [1]:
from tqdm import tqdm
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, als, bias, item_knn as knn
from lenskit.metrics import topn
from lenskit.metrics.predict import rmse, mae
import pickle
import pandas as pd
%matplotlib inline

import pyarrow.parquet as pq

import numpy as np

# Data Processing

In [2]:
train = pq.read_table('cf_train_new.pq').to_pandas()

usr_idx_dict = {}
for idx, usr in enumerate(train.user_id.unique()):
    usr_idx_dict[usr] = idx

train['user'] = train.user_id.map(usr_idx_dict)

track_idx_dict = {}
for idx, track in enumerate(train.track_id.unique()):
    track_idx_dict[track] = idx

train['item'] = train.track_id.map(track_idx_dict)

train = train.rename({'count':'rating'}, axis = 1)

train.head()

Unnamed: 0,user_id,rating,track_id,user,item
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,TRIQAUQ128F42435AD,0,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,TRIRLYL128F42539D1,0,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,2,TRMHBXZ128F4238406,0,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,TRYQMNI128F147C1C7,0,3
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,TRAHZNE128F9341B86,0,4


In [3]:
val = pq.read_table('cf_test.pq').to_pandas()

# usr_idx_dict = {}
# for idx, usr in enumerate(val.user_id.unique()):
#     usr_idx_dict[usr] = idx

val['user'] = val.user_id.map(usr_idx_dict)

# track_idx_dict = {}
# for idx, track in enumerate(val.track_id.unique()):
#     track_idx_dict[track] = idx

val['item'] = val.track_id.map(track_idx_dict)

val = val.rename({'count':'rating'}, axis = 1)

val.head()

Unnamed: 0,user_id,rating,track_id,user,item
0,00007a02388c208ea7176479f6ae06f8224355b3,1,TRXYDST128F92EC024,1029318,54969.0
1,00007a02388c208ea7176479f6ae06f8224355b3,2,TRJREQL128F92EF09A,1029318,2086.0
2,00007a02388c208ea7176479f6ae06f8224355b3,3,TRFXKPH128E0793B8E,1029318,3020.0
3,00007a02388c208ea7176479f6ae06f8224355b3,1,TRMSLFG128F93172F0,1029318,13582.0
4,00007a02388c208ea7176479f6ae06f8224355b3,1,TRYNPHN128F92EF091,1029318,2130.0


In [4]:
truth = val[['user', 'item', 'rating']].dropna()

sub_ids = truth.user.unique()

In [5]:
truth['user'] = truth['user'].map(int)
truth['item'] = truth['item'].map(int)
truth.head()

Unnamed: 0,user,item,rating
0,1029318,54969,1
1,1029318,2086,2
2,1029318,3020,3
3,1029318,13582,1
4,1029318,2130,1


In [6]:
truth_listed = pd.DataFrame({'item':truth.groupby('user').item.apply(list)}).reset_index()

truth_listed.head()

Unnamed: 0,user,item
0,1029318,"[54969, 2086, 3020, 13582, 2130, 47446, 2644, ..."
1,1029319,"[116519, 8584, 72204, 183817, 132142, 89172, 1..."
2,1029320,"[16568, 3756, 1130, 11, 22, 200, 3366, 4607, 1..."
3,1029321,"[5673, 329872, 286165, 30764, 809, 796, 30751,..."
4,1029322,"[12628, 25977, 7882, 67393, 7979, 20684, 24574..."


In [7]:
len(truth_listed.user.unique())

100000

# ALS model - single machine

In [5]:
# algo_als = als.ImplicitMF(90, iterations = 30, reg = 0.01, weight = 40, method = 'lu') 
# #tunable: features, iterations, reg, weight

# fittable = util.clone(algo_als)

# fittable = Recommender.adapt(fittable)

# fittable.fit(train, n_jobs = -1)

<lenskit.algorithms.basic.TopN at 0x147573263b80>

In [8]:
with open('model_val_200.pkl', 'rb') as f:
    fittable = pickle.load(f)

In [9]:
recs = batch.recommend(fittable, sub_ids, 500, n_jobs = 4)
recs['user'] = recs['user'].map(int)
recs.head()

Unnamed: 0,item,score,user,rank
0,8992,0.066132,1029318,1
1,9452,0.057833,1029318,2
2,6713,0.054853,1029318,3
3,10191,0.049022,1029318,4
4,6712,0.048682,1029318,5


In [10]:
recs_listed = pd.DataFrame({'item':recs.groupby('user').item.apply(list)}).reset_index()

In [11]:
recs_listed.head()

Unnamed: 0,user,item
0,1029318,"[8992, 9452, 6713, 10191, 6712, 14357, 27163, ..."
1,1029319,"[744, 185, 863, 839, 675, 847, 15298, 701, 133..."
2,1029320,"[1247, 1303, 6481, 3398, 3366, 1746, 7631, 295..."
3,1029321,"[9508, 1463, 1685, 25868, 3511, 8138, 1340, 19..."
4,1029322,"[7749, 448, 7979, 3628, 7882, 7673, 460, 24016..."


In [12]:
recs_listed.to_pickle('recs_cf_200.pkl')

In [15]:
# len(truth_listed[truth_listed['user'] == 1010207].item.values[0])

# truth_listed.user.values[0]

# [truth_listed[truth_listed['user'] == 1010207].item.values[0]]

In [13]:
def pk(k, rec_df, truth_df):
    '''
    compute precision at k
    '''
    ct_usr = 0
    for usr in tqdm(truth_df.user.values):
        ct_rec = 0
        rec_all = rec_df[rec_df['user'] == usr].item.values[0][:k]
        val_all = truth_df[truth_df['user'] == usr].item.values[0]
        ttl = [rec_item in val_all for rec_item in rec_all]
        ct_rec = sum(ttl)
        ct_usr += ct_rec / k
    return ct_usr / len(truth_df.user.values)      

In [14]:
pk(500, recs_listed, truth_listed)

100%|██████████| 100000/100000 [02:01<00:00, 820.01it/s]


0.011210979999995166

In [15]:
def meanAP(rec_df, truth_df):
    ct_usr = 0
    for usr in tqdm(truth_df.user.values):
        ct_rec = 0
        rec_all = rec_df[rec_df['user'] == usr].item.values[0]
        val_all = truth_df[truth_df['user'] == usr].item.values[0]
        ttl = [rec_item in val_all for rec_item in rec_all]
        ttl = [v/(j+1) for j,v in enumerate(ttl)]
        ct_rec += sum(ttl)
        ct_usr += ct_rec / len(val_all)
        
    return ct_usr / len(truth_df.user.values)

In [16]:
meanAP(recs_listed, truth_listed)

100%|██████████| 100000/100000 [02:07<00:00, 783.13it/s]


0.04352420877165051

In [17]:
def ndcg(k, rec_df, truth_df):
    ct_usr = 0
    for usr in tqdm(truth_df.user.values):
        rec_all = rec_df[rec_df['user'] == usr].item.values[0]
        val_all = truth_df[truth_df['user'] == usr].item.values[0]
        n = min(max(len(rec_all), len(val_all)), k)
        idcg_n = min(len(val_all), k)
        idcg = sum([1/(np.log(j+2)) for j in range(idcg_n)])
        ttl = [rec_item in val_all for rec_item in rec_all[:n]]
        ttl = sum([v/np.log(j+2) for j, v in enumerate(ttl)])
        ttl *= 1/idcg
        ct_usr += ttl
    return ct_usr / len(truth_df.user.values)

In [18]:
ndcg(500,recs_listed, truth_listed)

100%|██████████| 100000/100000 [05:13<00:00, 318.65it/s]


0.24081916426642405

In [None]:
# combined = recs.join(truth, on = 'user', how = 'inner', lsuffix = '_rec', rsuffix = '_val')

# #combined = combined.dropna()

# len(combined.user.unique())

# combined.head()

# m_rmse = combined.groupby('user').apply(lambda df: rmse(df.score, df.rating))

# m_mae = combined.groupby('user').apply(lambda df: mae(df.score, df.rating))

# m_ndcg = topn.ndcg(recs, truth)


# m_rmse.describe()

# m_mae.describe()

# m_ndcg

# combined.loc[combined['item_rec'] != combined['item_val'], 'score'] = 0
# # combined_bias['score'][combined_bias['item_rec'] != combined_bias['item_val']] = 0

# m_rmse = combined.groupby('user').apply(lambda df: rmse(df.score, df.rating))

# m_mae = combined.groupby('user').apply(lambda df: mae(df.score, df.rating))

# m_ndcg = topn.ndcg(recs, truth)


# m_rmse.describe()

# m_mae.describe()

# m_ndcg

# Popularity baseline Model

In [16]:
# bias_errs = {}

# # search
# # damping_params = range(0, 301, 50)
# damping_params = range(550, 801, 50)
# for damp in damping_params:
#     bias_errs[str(damp)] = []
#     algo_bias = bias.Bias(items = True, users = True, damping = damp) #tunable: damping

#     fittable_bias = util.clone(algo_bias)

#     fittable_bias = Recommender.adapt(fittable_bias)

#     fittable_bias.fit(train, n_jobs = -1)

#     recs_bias = batch.recommend(fittable_bias, sub_ids, 500, n_jobs = 4)
#     recs_bias['user'] = recs_bias['user'].map(int)
#     # recs_bias.head()

#     recs_bias_listed = pd.DataFrame({'item':recs_bias.groupby('user').item.apply(list)}).reset_index()
#     bias_errs[str(damp)].append(pk(500, recs_bias_listed, truth_listed))
#     bias_errs[str(damp)].append(meanAP(recs_bias_listed, truth_listed))
#     bias_errs[str(damp)].append(ndcg(500,recs_bias_listed, truth_listed))

# # with open('broad_bias_errs.pkl', 'rb') as f:
# #     be2 = pickle.load(f)
# # for kk, vv in be2.items():
# #     print(kk, vv)

In [26]:
# for kk, vv in bias_errs.items():
#     print(kk, vv)

350 [0.001206999999999995, 0.0008415124671512697, 0.017014216311997087]
400 [0.0012439999999999958, 0.0008986458215045353, 0.017740772308239873]
450 [0.0012637999999999963, 0.0009570828751731549, 0.018252331042585022]
500 [0.0013548000000000004, 0.0010114033708371734, 0.019488620023233233]
550 [0.0014354000000000031, 0.0010508896009842248, 0.02056803929095203]
600 [0.0014624000000000032, 0.0011045797707726426, 0.021097409096712493]
650 [0.0014752000000000033, 0.0011639741300835903, 0.021482521759672257]
700 [0.0015016000000000042, 0.0012171644284973671, 0.02197270689039014]
750 [0.0015238000000000055, 0.0012451335710780935, 0.022388475391666926]
800 [0.0015364000000000066, 0.0012685335549458096, 0.022670488792884563]


In [25]:
# with open('broad_bias_errs_3.pkl', 'wb') as f:
#     pickle.dump(bias_errs, f)

In [8]:
algo_bias = bias.Bias(items = True, users = True, damping = (1000000, 10000000)) #tunable: damping

#fittable_bias = util.clone(algo_bias)
fittable_bias = algo_bias
fittable_bias = Recommender.adapt(fittable_bias)

fittable_bias.fit(train, n_jobs = -1)

<lenskit.algorithms.basic.TopN at 0x155479801940>

In [9]:
recs_bias = batch.recommend(fittable_bias, sub_ids, 500, n_jobs = 4)
recs_bias['user'] = recs_bias['user'].map(int)
recs_bias.head()

Unnamed: 0,item,score,user,rank
0,202,2.926611,1029318,1
1,199,2.916619,1029318,2
2,218,2.906885,1029318,3
3,206,2.895803,1029318,4
4,5017,2.892731,1029318,5


In [10]:
recs_bias_listed = pd.DataFrame({'item':recs_bias.groupby('user').item.apply(list)}).reset_index()

In [11]:
recs_bias_listed.head()

Unnamed: 0,user,item
0,1029318,"[202, 199, 218, 206, 5017, 1824, 222, 2674, 22..."
1,1029319,"[202, 199, 218, 206, 5017, 1824, 222, 2674, 22..."
2,1029320,"[202, 199, 218, 206, 5017, 1824, 222, 2674, 22..."
3,1029321,"[202, 199, 218, 206, 5017, 1824, 222, 2674, 22..."
4,1029322,"[202, 199, 218, 206, 5017, 1824, 222, 2674, 22..."


In [12]:
recs_bias_listed.to_pickle('recs_bias_final.pkl')

In [17]:
pk(500, recs_bias_listed, truth_listed)

100%|██████████| 100000/100000 [02:03<00:00, 809.57it/s]


0.003123780000000688

In [18]:
meanAP(recs_bias_listed, truth_listed)

100%|██████████| 100000/100000 [02:09<00:00, 772.06it/s]


0.011919438084520916

In [19]:
ndcg(500,recs_bias_listed, truth_listed)

100%|██████████| 100000/100000 [05:22<00:00, 310.25it/s]


0.06723044157342824

In [None]:
# combined_bias = recs_bias.join(truth, on = ['user'], how = 'inner', lsuffix = '_rec', rsuffix = '_val')

# combined_bias = combined_bias.dropna()

# len(combined_bias.user.unique())

# combined_bias.head()

# m_rmse_bias = combined_bias.groupby('user').apply(lambda df: rmse(df.score, df.rating))

# m_mae_bias = combined_bias.groupby('user').apply(lambda df: mae(df.score, df.rating))

# m_ndcg_bias = topn.ndcg(recs_bias, truth)

# # rla = topn.RecListAnalysis()

# # rla.add_metric(topn.ndcg)

# # m_ndcg = rla.compute(recs_bias, truth)

# m_rmse_bias.describe()

# m_mae_bias.describe()

# m_ndcg_bias

# combined_bias.loc[combined_bias['item_rec'] != combined_bias['item_val'], 'score'] = 0
# # combined_bias['score'][combined_bias['item_rec'] != combined_bias['item_val']] = 0

# m_rmse_bias = combined_bias.groupby('user').apply(lambda df: rmse(df.score, df.rating))

# m_mae_bias = combined_bias.groupby('user').apply(lambda df: mae(df.score, df.rating))

# m_ndcg_bias = topn.ndcg(recs_bias, truth)

# # rla = topn.RecListAnalysis()

# # rla.add_metric(topn.ndcg)

# # m_ndcg = rla.compute(recs_bias, truth)

# m_rmse_bias.describe()

# m_mae_bias.describe()

# m_ndcg_bias