In [23]:
import math
import pandas as pd
from sklearn.model_selection import KFold
from surprise import accuracy, Dataset, Reader, KNNWithMeans, NMF
from surprise.model_selection import cross_validate, train_test_split
from collections import defaultdict
from config import *

In [18]:
# Shared 5-fold splitter; shuffling is seeded with RANDOM_STATE so folds repeat across runs.
Kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

## Import Data

In [None]:
# All three interaction tables are read the same way: both id columns are
# forced to `str`, every other column keeps the dtype pandas infers.
_inter_dtypes = {'user_id': str, 'item_id': str}

df_douban_inter = pd.read_csv(data_preprocessing + 'douban_inter.csv', dtype=_inter_dtypes)
df_movie_inter = pd.read_csv(data_preprocessing + 'movie_inter.csv', dtype=_inter_dtypes)
df_yelp_inter = pd.read_csv(data_preprocessing + 'yelp_inter.csv', dtype=_inter_dtypes)

## Collaborative Filtering & MF
* [Surprise Basic Usage](https://surprise.readthedocs.io/en/stable/getting_started.html)
* [Surprise Prediction Algorithms List](https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html)
* [Surprise Similarity Options](https://surprise.readthedocs.io/en/stable/prediction_algorithms.html)
* [Surprise Accuracy RMSE](https://surprise.readthedocs.io/en/stable/accuracy.html)
* [Surprise Recall](https://surprise.readthedocs.io/en/stable/FAQ.html)
* [NDCG 計算方式 (how NDCG is computed)](https://ithelp.ithome.com.tw/articles/10299050)

In [8]:
def init_cf_mf():
    """Create a fresh, unfitted set of the five models compared in this notebook.

    Returns:
        Tuple ``(algo_UCF_s, algo_UCF_p, algo_ICF_s, algo_ICF_p, algo_NMF)``:
        user-based and item-based ``KNNWithMeans`` with cosine and
        pearson_baseline similarity, followed by an ``NMF`` model.
    """
    def _knn(user_based, pearson):
        # pearson_baseline is configured with shrinkage 0; cosine takes no extra option.
        if pearson:
            sim_options = {"name": "pearson_baseline", "user_based": user_based, 'shrinkage': 0}
        else:
            sim_options = {"name": "cosine", "user_based": user_based}
        return KNNWithMeans(sim_options=sim_options)

    algo_UCF_s = _knn(user_based=True, pearson=False)
    algo_UCF_p = _knn(user_based=True, pearson=True)
    algo_ICF_s = _knn(user_based=False, pearson=False)
    algo_ICF_p = _knn(user_based=False, pearson=True)

    # NMF starts from random factors; seed it with the same RANDOM_STATE the
    # fold splitter uses so repeated runs give the same metrics.
    algo_NMF = NMF(random_state=RANDOM_STATE)
    return algo_UCF_s, algo_UCF_p, algo_ICF_s, algo_ICF_p, algo_NMF

In [9]:
def calc_cf_eval_metrics(predictions, k=10, threshold=3.5):
    """Compute RMSE, mean Recall@k and mean NDCG@k for a list of predictions.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``;
            only ``uid``, ``true_r`` and ``est`` are used for the ranking metrics.
        k: number of top items per user, ranked by estimated rating.
        threshold: an item is relevant when its true rating is strictly greater
            than this value; a top-k item is a hit only when both its true and
            its estimated rating exceed it.

    Returns:
        ``[rmse, mean_recall, mean_ndcg]``. Both means are unweighted averages
        over the users present in ``predictions``. A user with no relevant item
        contributes recall 0, and a user whose ideal DCG is 0 contributes NDCG 0.
    """
    rmse = accuracy.rmse(predictions)

    # Group (estimated, true) rating pairs per user.
    ratings_by_user = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        ratings_by_user[uid].append((est, true_r))

    def _dcg(ranked):
        # Linear gain (the true rating) discounted by log2(rank + 1), ranks starting at 1.
        return sum(true_r / math.log(pos + 2, 2) for pos, (_, true_r) in enumerate(ranked[:k]))

    ls_recall = []
    ls_ndcg = []
    for user_ratings in ratings_by_user.values():
        by_est = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)
        by_true = sorted(by_est, key=lambda pair: pair[1], reverse=True)

        # recall
        num_rel = sum(1 for _, true_r in by_est if true_r > threshold)
        num_rel_top_k = sum(
            1 for est, true_r in by_est[:k] if true_r > threshold and est > threshold
        )
        ls_recall.append(num_rel_top_k / num_rel if num_rel != 0 else 0)

        # ndcg -- guard the division: the ideal DCG is 0 when every true rating
        # in the ideal top-k is 0, which previously raised ZeroDivisionError.
        idcg = _dcg(by_true)
        ls_ndcg.append(_dcg(by_est) / idcg if idcg != 0 else 0.0)

    mean_recall = sum(ls_recall) / len(ls_recall)
    mean_ndcg = sum(ls_ndcg) / len(ls_ndcg)

    return [rmse, mean_recall, mean_ndcg]

In [10]:
# 1 fold => 3m 13.4s
def calc_all_cf_model(df_inter, data_name):
    """Cross-validate every CF / MF model on one interaction table.

    For each fold produced by the module-level ``Kf`` splitter, a fresh set of
    models from ``init_cf_mf`` is fitted on the training rows and scored on the
    held-out rows with ``calc_cf_eval_metrics``.

    Args:
        df_inter: DataFrame containing at least the columns ``user_id``,
            ``item_id`` and ``rating``; any other column is ignored.
        data_name: label written into every result row to identify the dataset.

    Returns:
        List of rows ``[model, fold, data_name, rmse, recall, ndcg]``; five rows
        per fold in the order UCF_s, UCF_p, ICF_s, ICF_p, NMF.
    """
    # Imported locally under an alias because a later cell rebinds the global
    # name `Dataset` to lightfm's class; this keeps the function re-runnable.
    from surprise import Dataset as SurpriseDataset

    rating_cols = ['user_id', 'item_id', 'rating']
    model_names = ['UCF_s', 'UCF_p', 'ICF_s', 'ICF_p', 'NMF']
    reader = Reader(rating_scale=(1, 5))

    ls_result = []
    for i, (train_idx, test_idx) in enumerate(Kf.split(df_inter)):
        print(f'\n***** Fold {i} *****')
        algos = init_cf_mf()

        df_train = df_inter.iloc[train_idx]
        df_test = df_inter.iloc[test_idx]

        train_set = SurpriseDataset.load_from_df(df_train[rating_cols], reader=reader)
        train_set = train_set.build_full_trainset()

        # Build the test set from the same three columns as the train set, as
        # (user, item, rating) tuples, instead of relying on the raw column
        # layout of the CSV (an extra column would break the 3-tuple unpacking).
        test_set = list(df_test[rating_cols].itertuples(index=False, name=None))

        # Fit and score one model at a time so only one prediction list is
        # alive at once; models are processed in the same order as before.
        for model_name, algo in zip(model_names, algos):
            algo.fit(train_set)
            predictions = algo.test(test_set)
            ls_result.append([model_name, i, data_name] + calc_cf_eval_metrics(predictions))
    return ls_result

In [11]:
# Accumulator for the metric rows returned by calc_all_cf_model for each dataset.
all_result = []

### douban

In [12]:
# Evaluate every model on the douban interactions and append the rows to all_result.
ls_result = calc_all_cf_model(df_douban_inter, 'douban')
all_result.extend(ls_result)


***** Fold 0 *****


KeyboardInterrupt: 

### MovieLens

In [None]:
# Evaluate every model on the movie interactions and append the rows to all_result.
ls_result = calc_all_cf_model(df_movie_inter, 'movie_len')
all_result.extend(ls_result)


***** Fold 0 *****
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9587
RMSE: 0.9543
RMSE: 0.9469
RMSE: 0.9440
RMSE: 0.9674

***** Fold 1 *****
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9554
RMSE: 0.9488
RMSE: 0.9441
RMSE: 0.9427
RMSE: 0.9628

***** Fold 2 *****
Computing the cosine similarity matrix

### Yelp

In [None]:
# Evaluate every model on the yelp interactions and append the rows to all_result.
ls_result = calc_all_cf_model(df_yelp_inter, 'yelp')
all_result.extend(ls_result)


***** Fold 0 *****
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0823
RMSE: 1.1398
RMSE: 1.0850
RMSE: 1.1391
RMSE: 1.1286

***** Fold 1 *****
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0746
RMSE: 1.1275
RMSE: 1.0826
RMSE: 1.1390
RMSE: 1.1301

***** Fold 2 *****
Computing the cosine similarity matrix

In [None]:
# Persist every collected metric row; column order matches the rows built by calc_all_cf_model.
result_columns = ['model', 'kfold', 'data', 'RMSE', 'Recall', 'NDCG']
df_cf_mf_result = pd.DataFrame(all_result, columns=result_columns)
df_cf_mf_result.to_csv('output/CF_MF_result.csv')

## Factorization Machine & BPR

In [19]:
# Show the douban interaction table (pandas elides the middle rows of a long frame).
df_douban_inter

Unnamed: 0,user_id,item_id,rating
0,10855,938,4
1,10027,3,3
2,741,2426,5
3,453,1263,4
4,11665,7717,5
...,...,...,...
790192,12832,2650,5
790193,7823,3050,4
790194,9347,18017,5
790195,10942,1443,4


In [None]:
# NOTE(review): `Dataset` here rebinds the name imported from `surprise` in the
# first cell; once this cell has run, the global `Dataset` is LightFM's class.
from lightfm import LightFM
from lightfm.data import Dataset
