# Import Packages

In [1]:
from tqdm import tqdm
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, als, bias, item_knn as knn
from lenskit.metrics import topn
from lenskit.metrics.predict import rmse, mae
import pickle
import pandas as pd
%matplotlib inline

import pyarrow.parquet as pq

import numpy as np

# Data Processing

In [2]:
train = pq.read_table('cf_train_new.pq').to_pandas()

usr_idx_dict = {}
for idx, usr in enumerate(train.user_id.unique()):
    usr_idx_dict[usr] = idx

train['user'] = train.user_id.map(usr_idx_dict)

track_idx_dict = {}
for idx, track in enumerate(train.track_id.unique()):
    track_idx_dict[track] = idx

train['item'] = train.track_id.map(track_idx_dict)

train = train.rename({'count':'rating'}, axis = 1)

train.head()

Unnamed: 0,user_id,rating,track_id,user,item
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,TRIQAUQ128F42435AD,0,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,TRIRLYL128F42539D1,0,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,2,TRMHBXZ128F4238406,0,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,TRYQMNI128F147C1C7,0,3
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,TRAHZNE128F9341B86,0,4


In [3]:
val = pq.read_table('cf_val.pq').to_pandas()

# usr_idx_dict = {}
# for idx, usr in enumerate(val.user_id.unique()):
#     usr_idx_dict[usr] = idx

val['user'] = val.user_id.map(usr_idx_dict)

# track_idx_dict = {}
# for idx, track in enumerate(val.track_id.unique()):
#     track_idx_dict[track] = idx

val['item'] = val.track_id.map(track_idx_dict)

val = val.rename({'count':'rating'}, axis = 1)

val.head()

Unnamed: 0,user_id,rating,track_id,user,item
0,0007140a3796e901f3190f12e9de6d7548d4ac4a,1,TRUFCYO128F422B898,1019318,3011.0
1,0007140a3796e901f3190f12e9de6d7548d4ac4a,2,TROBZPR128F14808FF,1019318,92073.0
2,0007140a3796e901f3190f12e9de6d7548d4ac4a,1,TRFGZUW128F92FC2AB,1019318,8868.0
3,0007140a3796e901f3190f12e9de6d7548d4ac4a,1,TRHTCXG12903CC2F60,1019318,1746.0
4,0007140a3796e901f3190f12e9de6d7548d4ac4a,1,TRKFBTS128EF34E530,1019318,29411.0


In [4]:
truth = val[['user', 'item', 'rating']].dropna()

sub_ids = truth.user.unique()

# ALS model - single machine

In [None]:
#final eval 200 20 0.1 1

In [5]:
algo_als = als.ImplicitMF(200, iterations = 20, reg = 0.1, weight = 1, method = 'lu') #try no regularization and 0.001
#tunable: features, iterations, reg, weight

fittable = util.clone(algo_als)

fittable = Recommender.adapt(fittable)

fittable.fit(train, n_jobs = -1)

<lenskit.algorithms.basic.TopN at 0x145f17cc4730>

In [18]:
with open('model_val_200.pkl', 'wb') as f:
    pickle.dump(fittable, f, protocol = pickle.HIGHEST_PROTOCOL)

In [19]:
with open('model_val_200.pkl', 'rb') as f:
    ff2 = pickle.load(f)

In [None]:
rr2 = batch.recommend(ff2, sub_ids[:2], 500, njobs = 4)

In [28]:
rr2

Unnamed: 0,item,score,user,rank
0,2551,0.126198,1019318,1
1,1685,0.149134,1019319,1


In [6]:
recs = batch.recommend(fittable, sub_ids, 500, n_jobs = 4)
recs['user'] = recs['user'].map(int)
recs.head()

Unnamed: 0,item,score,user,rank
0,2551,0.126198,1019318,1
1,17565,0.118739,1019318,2
2,4960,0.113786,1019318,3
3,9583,0.109301,1019318,4
4,11072,0.10373,1019318,5


In [7]:
truth['user'] = truth['user'].map(int)
truth['item'] = truth['item'].map(int)
truth.head()

Unnamed: 0,user,item,rating
0,1019318,3011,1
1,1019318,92073,2
2,1019318,8868,1
3,1019318,1746,1
4,1019318,29411,1


In [8]:
val.head()

Unnamed: 0,user_id,rating,track_id,user,item
0,0007140a3796e901f3190f12e9de6d7548d4ac4a,1,TRUFCYO128F422B898,1019318,3011.0
1,0007140a3796e901f3190f12e9de6d7548d4ac4a,2,TROBZPR128F14808FF,1019318,92073.0
2,0007140a3796e901f3190f12e9de6d7548d4ac4a,1,TRFGZUW128F92FC2AB,1019318,8868.0
3,0007140a3796e901f3190f12e9de6d7548d4ac4a,1,TRHTCXG12903CC2F60,1019318,1746.0
4,0007140a3796e901f3190f12e9de6d7548d4ac4a,1,TRKFBTS128EF34E530,1019318,29411.0


In [9]:
recs_listed = pd.DataFrame({'item':recs.groupby('user').item.apply(list)}).reset_index()

In [10]:
recs_listed.head()

Unnamed: 0,user,item
0,1019318,"[2551, 17565, 4960, 9583, 11072, 11092, 29423,..."
1,1019319,"[1685, 1246, 10165, 6169, 13216, 10222, 6434, ..."
2,1019320,"[366, 1276, 2594, 2494, 378, 2008, 12577, 1595..."
3,1019321,"[1780, 2707, 2709, 1706, 410, 14943, 1465, 161..."
4,1019322,"[12046, 4652, 9738, 4634, 10037, 251, 8502, 15..."


In [11]:
truth_listed = pd.DataFrame({'item':truth.groupby('user').item.apply(list)}).reset_index()

In [12]:
truth_listed.head()

Unnamed: 0,user,item
0,1019318,"[3011, 92073, 8868, 1746, 29411, 29423, 11667,..."
1,1019319,"[60775, 14821, 47393, 52175, 156297, 22462]"
2,1019320,"[378, 12629, 418, 7673, 16820, 9336]"
3,1019321,"[1780, 22800, 14943, 2239, 3303, 85910, 10222,..."
4,1019322,"[13185, 66586, 3057, 1158, 15007, 47018, 14870..."


In [13]:
len(truth_listed.user.unique())

10000

In [None]:
# len(truth_listed[truth_listed['user'] == 1010207].item.values[0])

# truth_listed.user.values[0]

# [truth_listed[truth_listed['user'] == 1010207].item.values[0]]

In [14]:
with open('recs_200_val.pkl', 'wb') as f:
    pickle.dump(recs_listed, f, protocol = pickle.HIGHEST_PROTOCOL)

In [15]:
def pk(k, rec_df, truth_df):
    '''
    compute precision at k
    '''
    ct_usr = 0
    for usr in tqdm(truth_df.user.values):
        ct_rec = 0
        rec_all = rec_df[rec_df['user'] == usr].item.values[0][:k]
        val_all = truth_df[truth_df['user'] == usr].item.values[0]
        ttl = [rec_item in val_all for rec_item in rec_all]
        ct_rec = sum(ttl)
        ct_usr += ct_rec / k
    return ct_usr / len(truth_df.user.values)      

In [16]:
pk(500, recs_listed, truth_listed)

100%|██████████| 10000/10000 [00:09<00:00, 1002.52it/s]


0.011219600000000194

In [21]:
def meanAP(rec_df, truth_df):
    ct_usr = 0
    for usr in tqdm(truth_df.user.values):
        ct_rec = 0
        rec_all = rec_df[rec_df['user'] == usr].item.values[0]
        val_all = truth_df[truth_df['user'] == usr].item.values[0]
        ttl = [rec_item in val_all for rec_item in rec_all]
        ttl = [v/(j+1) for j,v in enumerate(ttl)]
        ct_rec += sum(ttl)
        ct_usr += ct_rec / len(val_all)
        
    return ct_usr / len(truth_df.user.values)

In [22]:
meanAP(recs_listed, truth_listed)

100%|██████████| 10000/10000 [00:10<00:00, 948.16it/s]


0.0439124768155222

In [23]:
def ndcg(k, rec_df, truth_df):
    ct_usr = 0
    for usr in tqdm(truth_df.user.values):
        rec_all = rec_df[rec_df['user'] == usr].item.values[0]
        val_all = truth_df[truth_df['user'] == usr].item.values[0]
        n = min(max(len(rec_all), len(val_all)), k)
        idcg_n = min(len(val_all), k)
        idcg = sum([1/(np.log(j+2)) for j in range(idcg_n)])
        ttl = [rec_item in val_all for rec_item in rec_all[:n]]
        ttl = sum([v/np.log(j+2) for j, v in enumerate(ttl)])
        ttl *= 1/idcg
        ct_usr += ttl
    return ct_usr / len(truth_df.user.values)

In [24]:
ndcg(500,recs_listed, truth_listed)

100%|██████████| 10000/10000 [00:29<00:00, 343.32it/s]


0.2420377314444504

In [None]:
# combined = recs.join(truth, on = 'user', how = 'inner', lsuffix = '_rec', rsuffix = '_val')

# #combined = combined.dropna()

# len(combined.user.unique())

# combined.head()

# m_rmse = combined.groupby('user').apply(lambda df: rmse(df.score, df.rating))

# m_mae = combined.groupby('user').apply(lambda df: mae(df.score, df.rating))

# m_ndcg = topn.ndcg(recs, truth)


# m_rmse.describe()

# m_mae.describe()

# m_ndcg

# combined.loc[combined['item_rec'] != combined['item_val'], 'score'] = 0
# # combined_bias['score'][combined_bias['item_rec'] != combined_bias['item_val']] = 0

# m_rmse = combined.groupby('user').apply(lambda df: rmse(df.score, df.rating))

# m_mae = combined.groupby('user').apply(lambda df: mae(df.score, df.rating))

# m_ndcg = topn.ndcg(recs, truth)


# m_rmse.describe()

# m_mae.describe()

# m_ndcg

# Popularity baseline Model

In [None]:
bias_errs = {}

In [None]:
# search
# damping_params = range(0, 301, 50)
damping_params = range(550, 801, 50)
for damp in damping_params:
    bias_errs[str(damp)] = []
    algo_bias = bias.Bias(items = True, users = True, damping = damp) #tunable: damping

    fittable_bias = util.clone(algo_bias)

    fittable_bias = Recommender.adapt(fittable_bias)

    fittable_bias.fit(train, n_jobs = -1)

    recs_bias = batch.recommend(fittable_bias, sub_ids, 500, n_jobs = 4)
    recs_bias['user'] = recs_bias['user'].map(int)
    # recs_bias.head()

    recs_bias_listed = pd.DataFrame({'item':recs_bias.groupby('user').item.apply(list)}).reset_index()
    bias_errs[str(damp)].append(pk(500, recs_bias_listed, truth_listed))
    bias_errs[str(damp)].append(meanAP(recs_bias_listed, truth_listed))
    bias_errs[str(damp)].append(ndcg(500,recs_bias_listed, truth_listed))

In [None]:
# with open('broad_bias_errs.pkl', 'rb') as f:
#     be2 = pickle.load(f)
# for kk, vv in be2.items():
#     print(kk, vv)

In [None]:
for kk, vv in bias_errs.items():
    print(kk, vv)

In [None]:
with open('broad_bias_errs_3.pkl', 'wb') as f:
    pickle.dump(bias_errs, f)

In [None]:
algo_bias = bias.Bias(items = True, users = True, damping = (1000000, 10000000)) #tunable: damping

#fittable_bias = util.clone(algo_bias)
fittable_bias = algo_bias
fittable_bias = Recommender.adapt(fittable_bias)

fittable_bias.fit(train, n_jobs = -1)

In [None]:
recs_bias = batch.recommend(fittable_bias, sub_ids, 500, n_jobs = 4)
recs_bias['user'] = recs_bias['user'].map(int)
recs_bias.head()

In [None]:
recs_bias_listed = pd.DataFrame({'item':recs_bias.groupby('user').item.apply(list)}).reset_index()

In [None]:
recs_bias_listed.head()

In [None]:
pk(500, recs_bias_listed, truth_listed)

In [None]:
meanAP(recs_bias_listed, truth_listed)

In [None]:
ndcg(500,recs_bias_listed, truth_listed)

In [None]:
# combined_bias = recs_bias.join(truth, on = ['user'], how = 'inner', lsuffix = '_rec', rsuffix = '_val')

# combined_bias = combined_bias.dropna()

# len(combined_bias.user.unique())

# combined_bias.head()

# m_rmse_bias = combined_bias.groupby('user').apply(lambda df: rmse(df.score, df.rating))

# m_mae_bias = combined_bias.groupby('user').apply(lambda df: mae(df.score, df.rating))

# m_ndcg_bias = topn.ndcg(recs_bias, truth)

# # rla = topn.RecListAnalysis()

# # rla.add_metric(topn.ndcg)

# # m_ndcg = rla.compute(recs_bias, truth)

# m_rmse_bias.describe()

# m_mae_bias.describe()

# m_ndcg_bias

# combined_bias.loc[combined_bias['item_rec'] != combined_bias['item_val'], 'score'] = 0
# # combined_bias['score'][combined_bias['item_rec'] != combined_bias['item_val']] = 0

# m_rmse_bias = combined_bias.groupby('user').apply(lambda df: rmse(df.score, df.rating))

# m_mae_bias = combined_bias.groupby('user').apply(lambda df: mae(df.score, df.rating))

# m_ndcg_bias = topn.ndcg(recs_bias, truth)

# # rla = topn.RecListAnalysis()

# # rla.add_metric(topn.ndcg)

# # m_ndcg = rla.compute(recs_bias, truth)

# m_rmse_bias.describe()

# m_mae_bias.describe()

# m_ndcg_bias