In [2]:
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, als, bias, item_knn as knn
from lenskit.metrics import topn
from lenskit.metrics.predict import rmse, mae
import pickle
import pandas as pd
from utils import *
import numpy as np
from lenskit.algorithms.svd import BiasedSVD

In [4]:
# Load the ratings table; the 'Unnamed: 0' column is discarded straight away.
data = pd.read_csv('../data/amazon_video.csv').drop('Unnamed: 0', axis=1)

# Re-encode users as consecutive integers, numbered in order of first appearance.
usr_idx_dict = {usr: idx for idx, usr in enumerate(data.user.unique())}
data['user'] = data.user.map(usr_idx_dict)

# Same re-encoding for items.
item_idx_dict = {track: idx for idx, track in enumerate(data.item.unique())}
data['item'] = data.item.map(item_idx_dict)


In [67]:
# data_sparse = data.sample(frac=0.6)
# val_sparse_user = data_sparse.sample(frac=0.3)
# val_sparse_user = list(set(val_sparse_user.user))
# val_sparse = data_sparse[data_sparse.user.isin(val_sparse_user)]
# data_sparse = data_sparse[~data_sparse.user.isin(val_sparse_user)]

In [126]:
# Sanity check: how many index labels of `sent` also occur in `data_sparse`.
# NOTE(review): both frames are assigned in a later cell of this notebook, so this
# cell cannot run first on a fresh kernel.
sent.index.isin(data_sparse.index).sum()

0

In [132]:
# Build and export the train / validation splits.
#   * a random 60% of the rows keeps its true `rating`
#   * the remaining 40% is used twice: with `senti_rating_finetune` standing in for the
#     rating (data_sent) and with the rating blanked out (data_sparse_o)
#   * in each of the two parts, 30% of the distinct users are held out, together with
#     all of their rows in that part, as validation data

SPLIT_SEED = 42          # fixed seed so the exported files can be regenerated exactly
TRUE_RATING_FRAC = 0.6   # share of rows that keep their true rating
VAL_USER_FRAC = 0.3      # share of distinct users held out in each part

rng = np.random.RandomState(SPLIT_SEED)


def _sample_val_users(users, frac, rng):
    """Return floor(len(users) * frac) distinct entries of `users`, drawn without replacement.

    The ids themselves are passed to `choice`. Passing only `len(users)` would draw from
    range(len(users)), which need not be ids that actually occur in `users`.
    """
    n_val = int(np.floor(len(users) * frac))
    return rng.choice(users, size=n_val, replace=False)


# 60% true ratings
sampled = data.sample(frac=TRUE_RATING_FRAC, random_state=rng)
distinct_users_sparse = sampled.user.unique().tolist()

# sample 30% users along with their items
val_sparse_user = _sample_val_users(distinct_users_sparse, VAL_USER_FRAC, rng)
in_val_sparse = sampled.user.isin(val_sparse_user)
val_sparse = sampled[in_val_sparse]
data_sparse = sampled[~in_val_sparse]


# 40% sparse: every row that was not drawn into the 60% sample above
# (data_sparse and val_sparse together are exactly `sampled`)
non_sparse = data.drop(index=sampled.index)
distinct_users_nonsparse = non_sparse.user.unique().tolist()

val_nonsparse_user = _sample_val_users(distinct_users_nonsparse, VAL_USER_FRAC, rng)
in_val_nonsparse = non_sparse.user.isin(val_nonsparse_user)
val_nonsparse = non_sparse[in_val_nonsparse]

# Explicit copy: the rating column is overwritten below, and assigning into a filtered
# frame triggers SettingWithCopyWarning.
non_sparse = non_sparse[~in_val_nonsparse].copy()

# NOTE(review): the two held-out user sets are drawn independently, so a user held out
# in one part can still contribute training rows from the other part -- confirm this
# is intended.

# Sentiment variant of the 40% part: drop the true rating and expose
# `senti_rating_finetune` under the name `rating` instead.
sent = data.loc[non_sparse.index].drop(columns=['rating'])
sent = sent.rename(columns={'senti_rating_finetune':'rating'})

# Blanked variant of the 40% part: every rating greater than -10 becomes NaN.
non_sparse['rating'] = non_sparse.rating.mask(non_sparse.rating>-10)

data_sent = pd.concat([data_sparse, sent], axis=0)

data_sparse_o = pd.concat([data_sparse, non_sparse], axis=0)

# export validation set
val_data = pd.concat([val_sparse, val_nonsparse], axis=0).drop(columns=['senti_rating_finetune'])
val_data.to_parquet('../data/val_data.pq')

data_sparse_o.to_parquet('../data/train_sparse.pq')

data_sent.to_parquet('../data/train_sent.pq')

In [133]:
# Persist `data_sparse` as the sparse training file.
# NOTE(review): the previous cell writes `data_sparse_o` to this same path, so this call
# replaces that file -- confirm which frame train_sparse.pq is meant to hold.
train_sparse_path = '../data/train_sparse.pq'
data_sparse.to_parquet(train_sparse_path)

In [25]:
# Show every row that belongs to user 60155.
data.loc[data.user == 60155]

Unnamed: 0,user,item,rating,senti_rating_finetune
97019,60155,5095,5,5
97137,60155,6554,4,4
97696,60155,30822,3,5
97960,60155,45678,5,5
97972,60155,45683,4,4
...,...,...,...,...
276256,60155,12313,5,5
276423,60155,37874,4,4
276504,60155,28233,4,5
276809,60155,18209,3,5


In [3]:
# Number of (non-null) item entries recorded for each user.
item_counts = data['item'].groupby(data['user']).count()

In [8]:
# Keep the ids of users with more than 5 counted items, then report how many there are.
has_enough_items = item_counts > 5
sub_ids = item_counts.loc[has_enough_items].index.values
len(sub_ids)

6733

In [12]:
def _rows_for_users(frame, user_ids):
    """Stack the rows of `frame` belonging to each id in `user_ids`, one id after another.

    Rows come out grouped by user in the order of `user_ids`, and in their original
    order within each user. All per-user slices are collected first and concatenated
    once: growing a frame with repeated DataFrame.append copies it on every iteration,
    and DataFrame.append no longer exists in pandas 2.x.

    Returns an empty DataFrame when `user_ids` is empty.
    """
    frame_users = frame.user.values  # hoisted: identical for every id
    pieces = [frame.iloc[frame_users == u, :] for u in user_ids]
    # pd.concat rejects an empty list, so fall back to an empty frame in that case.
    return pd.concat(pieces) if pieces else pd.DataFrame()


# NOTE(review): `data_truth` and `data_bert` are not assigned in the cells visible here;
# they have to come from elsewhere in the notebook or from earlier kernel state.
data_sub_truth = _rows_for_users(data_truth, sub_ids)

data_sub_bert = _rows_for_users(data_bert, sub_ids)

# BiasedSVD

In [19]:
# BiasedSVD configuration: 20 features, damping 2, bias enabled.
# (The variable keeps the name `algo_als` although it holds the SVD model.)
algo_als = BiasedSVD(features=20, damping=2, bias=True)

# Fit a clone rather than `algo_als` itself, adapted into a recommender first.
fittable = Recommender.adapt(util.clone(algo_als))

fittable.fit(data_bert, n_jobs=-1)

<lenskit.algorithms.ranking.TopN at 0x14eb7af030d0>

In [20]:
# Recommend K items for every user in `sub_ids` and hand the lists to get_metrics
# together with `data_sub_bert`.
# NOTE(review): `data_sub_bert` is sliced from `data_bert`, the frame the model was fit
# on. If the recommender skips items a user has already rated, no recommendation can
# match and every metric is 0, as in the recorded output -- confirm whether held-out
# data was intended here.
K = 10
recs = batch.recommend(fittable, sub_ids, K, n_jobs=10)
recs = recs.assign(user=recs['user'].map(int))  # user ids as plain ints
get_metrics(K, recs, data_sub_bert)

100%|██████████| 6733/6733 [00:05<00:00, 1276.50it/s]
100%|██████████| 6733/6733 [00:05<00:00, 1266.91it/s]
100%|██████████| 6733/6733 [00:05<00:00, 1160.75it/s]


{'pk': 0.0, 'meanAP': 0.0, 'NDCG': 0.0}

# BiasedMF ALS

In [21]:
# BiasedMF configuration: 30 features, 20 iterations, reg 0.01, damping 2, bias enabled,
# method 'cd'.
algo_als = als.BiasedMF(
    features=30,
    iterations=20,
    reg=0.01,
    damping=2,
    bias=True,
    method='cd',
)

# Fit a clone rather than `algo_als` itself, adapted into a recommender first.
fittable = Recommender.adapt(util.clone(algo_als))

fittable.fit(data_bert, n_jobs=-1)

<lenskit.algorithms.ranking.TopN at 0x14eb7bd0eb50>

In [None]:
# Save the fitted recommender to als.pkl using the highest pickle protocol available.
with open('als.pkl', 'wb') as model_file:
    pickle.dump(fittable, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Restore the recommender from als.pkl.
# Unpickling can execute code stored in the file, so only load pickles you created yourself.
with open('als.pkl', 'rb') as model_file:
    fittable = pickle.load(model_file)

In [22]:
# Recommend K items for every user in `sub_ids` and hand the lists to get_metrics
# together with `data_sub_bert`.
# NOTE(review): `data_sub_bert` is sliced from `data_bert`, the frame the model was fit
# on. If the recommender skips items a user has already rated, no recommendation can
# match and every metric is 0, as in the recorded output -- confirm whether held-out
# data was intended here.
K = 10
recs = batch.recommend(fittable, sub_ids, K, n_jobs=10)
recs = recs.assign(user=recs['user'].map(int))  # user ids as plain ints
get_metrics(K, recs, data_sub_bert)

100%|██████████| 6733/6733 [00:05<00:00, 1267.42it/s]
100%|██████████| 6733/6733 [00:05<00:00, 1257.13it/s]
100%|██████████| 6733/6733 [00:05<00:00, 1148.84it/s]


{'pk': 0.0, 'meanAP': 0.0, 'NDCG': 0.0}

In [None]:
# Save the recommendation table to recs_raw.pkl using the highest pickle protocol available.
with open('recs_raw.pkl', 'wb') as recs_file:
    pickle.dump(recs, recs_file, protocol=pickle.HIGHEST_PROTOCOL)