In [26]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set_theme(style="whitegrid")

import pandas as pd
%matplotlib inline

import pyarrow.parquet as pq
import numpy as np

import lenskit
from lenskit import batch, topn, util
from lenskit.algorithms import Recommender, als, bias, item_knn as knn
from lenskit import topn

from lenskit.algorithms.bias import Bias
from tqdm import tqdm

In [2]:
# load data
train = pq.read_table('/scratch/work/courses/DSGA1004-2021/MSD/cf_train_new.parquet').to_pandas()

usr_idx_dict = {}
for idx, usr in enumerate(train.user_id.unique()):
    usr_idx_dict[usr] = idx

train['user'] = train.user_id.map(usr_idx_dict)

track_idx_dict = {}
for idx, track in enumerate(train.track_id.unique()):
    track_idx_dict[track] = idx

train['item'] = train.track_id.map(track_idx_dict)

train = train.rename({'count':'rating'}, axis = 1)

train.head()

Unnamed: 0,user_id,rating,track_id,user,item
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,TRIQAUQ128F42435AD,0,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,TRIRLYL128F42539D1,0,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,2,TRMHBXZ128F4238406,0,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,TRYQMNI128F147C1C7,0,3
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,TRAHZNE128F9341B86,0,4


In [6]:
val = pq.read_table('/scratch/work/courses/DSGA1004-2021/MSD/cf_validation.parquet').to_pandas()

# usr_idx_dict = {}
# for idx, usr in enumerate(val.user_id.unique()):
#     usr_idx_dict[usr] = idx

val['user'] = val.user_id.map(usr_idx_dict)

# track_idx_dict = {}
# for idx, track in enumerate(val.track_id.unique()):
#     track_idx_dict[track] = idx

val['item'] = val.track_id.map(track_idx_dict)

val = val.rename({'count':'rating'}, axis = 1)

val.head()

Unnamed: 0,user_id,rating,track_id,user,item
0,0007140a3796e901f3190f12e9de6d7548d4ac4a,1,TRUFCYO128F422B898,1019318,3011.0
1,0007140a3796e901f3190f12e9de6d7548d4ac4a,2,TROBZPR128F14808FF,1019318,92073.0
2,0007140a3796e901f3190f12e9de6d7548d4ac4a,1,TRFGZUW128F92FC2AB,1019318,8868.0
3,0007140a3796e901f3190f12e9de6d7548d4ac4a,1,TRHTCXG12903CC2F60,1019318,1746.0
4,0007140a3796e901f3190f12e9de6d7548d4ac4a,1,TRKFBTS128EF34E530,1019318,29411.0


In [7]:
test = pq.read_table('/scratch/work/courses/DSGA1004-2021/MSD/cf_test.parquet').to_pandas()


test['user'] = test.user_id.map(usr_idx_dict)

test['item'] = test.track_id.map(track_idx_dict)

test = test.rename({'count':'rating'}, axis = 1)
test.head()

Unnamed: 0,user_id,rating,track_id,user,item
0,00007a02388c208ea7176479f6ae06f8224355b3,1,TRXYDST128F92EC024,1029318,54969.0
1,00007a02388c208ea7176479f6ae06f8224355b3,2,TRJREQL128F92EF09A,1029318,2086.0
2,00007a02388c208ea7176479f6ae06f8224355b3,3,TRFXKPH128E0793B8E,1029318,3020.0
3,00007a02388c208ea7176479f6ae06f8224355b3,1,TRMSLFG128F93172F0,1029318,13582.0
4,00007a02388c208ea7176479f6ae06f8224355b3,1,TRYNPHN128F92EF091,1029318,2130.0


In [10]:
truth = test[['user', 'item', 'rating']].dropna()

sub_ids = truth.user.unique()

### Load functions

In [12]:
def pk(k, rec_df, truth_df):
    '''
    compute precision at k
    '''
    ct_usr = 0
    for usr in tqdm(truth_df.user.values):
        ct_rec = 0
        rec_all = rec_df[rec_df['user'] == usr].item.values[0][:k]
        val_all = truth_df[truth_df['user'] == usr].item.values[0]
        ttl = [rec_item in val_all for rec_item in rec_all]
        ct_rec = sum(ttl)
        ct_usr += ct_rec / k
    return ct_usr / len(truth_df.user.values)

In [13]:
def meanAP(rec_df, truth_df):
    ct_usr = 0
    for usr in tqdm(truth_df.user.values):
        ct_rec = 0
        rec_all = rec_df[rec_df['user'] == usr].item.values[0]
        val_all = truth_df[truth_df['user'] == usr].item.values[0]
        ttl = [rec_item in val_all for rec_item in rec_all]
        ttl = [v/(j+1) for j,v in enumerate(ttl)]
        ct_rec += sum(ttl)
        ct_usr += ct_rec / len(val_all)
        
    return ct_usr / len(truth_df.user.values)

In [14]:
def ndcg(k, rec_df, truth_df):
    ct_usr = 0
    for usr in tqdm(truth_df.user.values):
        rec_all = rec_df[rec_df['user'] == usr].item.values[0]
        val_all = truth_df[truth_df['user'] == usr].item.values[0]
        n = min(max(len(rec_all), len(val_all)), k)
        idcg_n = min(len(val_all), k)
        idcg = sum([1/(np.log(j+2)) for j in range(idcg_n)])
        ttl = [rec_item in val_all for rec_item in rec_all[:n]]
        ttl = sum([v/np.log(j+2) for j, v in enumerate(ttl)])
        ttl *= 1/idcg
        ct_usr += ttl
    return ct_usr / len(truth_df.user.values)

## Model

In [18]:
algo_bias = bias.Bias(items = True, users = True, damping = (1000000, 10000000)) #tunable: damping

#fittable_bias = util.clone(algo_bias)
fittable_bias = algo_bias
fittable_bias = Recommender.adapt(fittable_bias)

fittable_bias.fit(train, n_jobs = -1)

<lenskit.algorithms.basic.TopN at 0x14692728d520>

In [19]:
recs_bias = batch.recommend(fittable_bias, sub_ids, 500, n_jobs = 4)
recs_bias['user'] = recs_bias['user'].map(int)
recs_bias.head()

Unnamed: 0,item,score,user,rank
0,202,2.926611,1029318,1
1,199,2.916619,1029318,2
2,218,2.906885,1029318,3
3,206,2.895803,1029318,4
4,5017,2.892731,1029318,5


In [20]:
recs_bias_listed = pd.DataFrame({'item':recs_bias.groupby('user').item.apply(list)}).reset_index()

In [21]:
recs_bias_listed.to_pickle('recs_bias_final.pkl')

In [22]:
truth_listed = pd.DataFrame({'item':truth.groupby('user').item.apply(list)}).reset_index()

truth_listed.head()

Unnamed: 0,user,item
0,1029318,"[54969.0, 2086.0, 3020.0, 13582.0, 2130.0, 474..."
1,1029319,"[116519.0, 8584.0, 72204.0, 183817.0, 132142.0..."
2,1029320,"[16568.0, 3756.0, 1130.0, 11.0, 22.0, 200.0, 3..."
3,1029321,"[5673.0, 329872.0, 286165.0, 30764.0, 809.0, 7..."
4,1029322,"[12628.0, 25977.0, 7882.0, 67393.0, 7979.0, 20..."


In [27]:
pk(500, recs_bias_listed, truth_listed)

100%|██████████| 100000/100000 [02:16<00:00, 735.10it/s]


0.003123780000000688

In [28]:
meanAP(recs_bias_listed, truth_listed)

100%|██████████| 100000/100000 [02:22<00:00, 703.96it/s]


0.011919438084520916

In [29]:
ndcg(500,recs_bias_listed, truth_listed)

100%|██████████| 100000/100000 [05:31<00:00, 301.61it/s]


0.06723044157342824