In [1]:
import math
import pandas as pd
from sklearn.model_selection import KFold
from surprise import accuracy, Dataset, Reader, KNNWithMeans, NMF
from surprise.model_selection import cross_validate, train_test_split
from collections import defaultdict
from config import *

## Preprocessing

In [2]:
def filter_interaction(df_inter, min_interactions=3):
    """Drop the interactions of users that have too few rows.

    Parameters
    ----------
    df_inter : pandas.DataFrame
        Interaction table; must contain a ``user_id`` column.
    min_interactions : int, default 3
        Minimum number of rows a user needs in ``df_inter`` to be kept
        (inclusive bound).

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining rows and a fresh ``0..n-1`` index.
        The input frame is not modified.

    Notes
    -----
    Prints the frame shape before and after filtering.
    """
    print('origin shape: ', df_inter.shape)
    # value_counts() is indexed by user_id, so the index of the surviving
    # entries is exactly the set of users to keep.
    vc_user = df_inter['user_id'].value_counts()
    kept_users = vc_user[vc_user >= min_interactions].index
    df_filtered = df_inter.loc[df_inter['user_id'].isin(kept_users)].reset_index(drop=True)
    # The bound is inclusive, so the label says ">=" rather than ">".
    print(f'>= {min_interactions} interactions: ', df_filtered.shape)
    return df_filtered

In [3]:
# douban book
# Tab-separated file without a header row: name the positional columns,
# treat both ids as strings, then drop users with too few interactions.
df_douban_inter = (
    pd.read_csv(data_path_douban + 'user_book.dat', sep='\t', header=None)
    .rename(columns={0: 'user_id', 1: 'item_id', 2: 'rating'})
    .astype({'user_id': 'str', 'item_id': 'str'})
    .pipe(filter_interaction)
)

origin shape:  (792062, 3)
> 3 interactions:  (790197, 3)


In [4]:
# movie lens
# Tab-separated file without a header row. Only the first three positional
# columns are named; any further column keeps its default integer label.
df_movie_inter = (
    pd.read_csv(data_path_movie + 'user_movie.dat', sep='\t', header=None)
    .rename(columns={0: 'user_id', 1: 'item_id', 2: 'rating'})
    .astype({'user_id': 'str', 'item_id': 'str'})
    .pipe(filter_interaction)
)

origin shape:  (100000, 4)
> 3 interactions:  (100000, 4)


In [5]:
# 5-fold splitter used for the cross-validation loops below; rows are
# shuffled before splitting, seeded with RANDOM_STATE from config.
Kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

## Collaborative Filtering & MF

In [6]:
def init_cf_mf():
    """Build a fresh, unfitted set of the five recommenders compared here.

    Returns
    -------
    tuple
        ``(algo_UCF_s, algo_UCF_p, algo_ICF_s, algo_ICF_p, algo_NMF)``:
        user-based (``UCF``) and item-based (``ICF``) ``KNNWithMeans``
        models, each once with ``cosine`` (``_s``) and once with
        ``pearson_baseline`` (``_p``) similarity, followed by an ``NMF``
        model seeded with ``RANDOM_STATE``.
    """
    def _knn(sim_name, user_based):
        # One KNNWithMeans per (similarity, user/item) combination.
        sim_options = {"name": sim_name, "user_based": user_based}
        if sim_name == "pearson_baseline":
            # Explicitly set the shrinkage parameter of pearson_baseline to 0.
            sim_options["shrinkage"] = 0
        return KNNWithMeans(sim_options=sim_options)

    algo_UCF_s = _knn("cosine", True)
    algo_UCF_p = _knn("pearson_baseline", True)
    algo_ICF_s = _knn("cosine", False)
    algo_ICF_p = _knn("pearson_baseline", False)

    # Seed NMF with the same constant as the KFold splitter so that a
    # Restart & Run All reproduces the reported numbers.
    algo_NMF = NMF(random_state=RANDOM_STATE)
    return algo_UCF_s, algo_UCF_p, algo_ICF_s, algo_ICF_p, algo_NMF

In [7]:
def calc_eval_metrics(predictions, k=10, threshold=3.5):
    """Compute RMSE plus user-averaged Recall@k and NDCG@k.

    Parameters
    ----------
    predictions : list of 5-tuples
        Each entry is unpacked as ``(uid, iid, true_r, est, details)``;
        only ``uid``, ``true_r`` and ``est`` are used here.
    k : int, default 10
        Number of top-ranked (by estimated rating) items considered per user.
    threshold : float, default 3.5
        An item is relevant when its true rating is strictly greater than
        this value; it is counted as a hit only if its estimated rating is
        strictly greater as well.

    Returns
    -------
    list
        ``[rmse, mean_recall, mean_ndcg]``, where the last two are plain
        averages over the users present in ``predictions``.
    """
    rmse = accuracy.rmse(predictions)

    # Group (estimated, true) rating pairs by user.
    dict_user_ratings = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        dict_user_ratings[uid].append((est, true_r))

    ls_recall = []
    ls_ndcg = []
    for ratings in dict_user_ratings.values():
        # list.sort() accepts `key` as a keyword only.
        ratings.sort(key=lambda pair: pair[0], reverse=True)   # predicted ranking
        true_ratings = sorted(ratings, key=lambda pair: pair[1], reverse=True)  # ideal ranking

        # recall: relevant-and-recommended items in the top k / all relevant items
        num_rel = sum(1 for _, true_r in ratings if true_r > threshold)
        num_rel_top_k = sum(
            1 for est, true_r in ratings[:k]
            if true_r > threshold and est > threshold
        )
        ls_recall.append(num_rel_top_k / num_rel if num_rel != 0 else 0)

        # ndcg: math.log() takes the base positionally, so use log2 directly.
        dcg = sum(
            true_r / math.log2(rank + 2)
            for rank, (_, true_r) in enumerate(ratings[:k])
        )
        idcg = sum(
            true_r / math.log2(rank + 2)
            for rank, (_, true_r) in enumerate(true_ratings[:k])
        )
        # A zero ideal DCG means there is no gain to rank; score it as 0
        # instead of dividing by zero.
        ls_ndcg.append(dcg / idcg if idcg != 0 else 0.0)

    mean_recall = sum(ls_recall) / len(ls_recall)
    mean_ndcg = sum(ls_ndcg) / len(ls_ndcg)

    return [rmse, mean_recall, mean_ndcg]

## douban

In [8]:
# 1 fold => 3m 13.4s

# Labels line up positionally with the tuple returned by init_cf_mf().
MODEL_NAMES = ('UCF_s', 'UCF_p', 'ICF_s', 'ICF_p', 'NMF')
RATING_COLS = ['user_id', 'item_id', 'rating']

# The rating scale does not change between folds, so build the reader once.
reader = Reader(rating_scale=(1, 5))

ls_result = []
for i, (train_idx, test_idx) in enumerate(Kf.split(df_douban_inter)):
    print(f'\n***** Fold {i} *****')
    # New model instances for every fold.
    models = dict(zip(MODEL_NAMES, init_cf_mf()))

    df_train = df_douban_inter.iloc[train_idx]
    df_test = df_douban_inter.iloc[test_idx]

    train_set = Dataset.load_from_df(df_train[RATING_COLS], reader=reader)
    train_set = train_set.build_full_trainset()
    # Explicit (user, item, rating) tuples, so the test set does not depend
    # on the frame having exactly these three columns in this order.
    test_set = list(df_test[RATING_COLS].itertuples(index=False, name=None))

    # Keep the original phase order: fit everything, then predict, then score.
    for algo in models.values():
        algo.fit(train_set)

    fold_predictions = {name: algo.test(test_set) for name, algo in models.items()}

    # One row per model: [model label, fold index, rmse, recall, ndcg].
    ls_result.extend(
        [name, i] + calc_eval_metrics(preds)
        for name, preds in fold_predictions.items()
    )


***** Fold 0 *****
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
