In [1]:
import math
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.model_selection import KFold
from surprise import accuracy, Dataset, Reader, KNNWithMeans, NMF
from surprise.model_selection import cross_validate, train_test_split
from collections import defaultdict
from config import *


In [2]:
# Shared 5-fold shuffled splitter, seeded with RANDOM_STATE; both the CF/MF and
# the FM experiments below iterate over `Kf.split(...)`.
Kf = KFold(shuffle=True, n_splits=5, random_state=RANDOM_STATE)

## Import Data

In [3]:
# Load the three interaction tables. `user_id` / `item_id` are read as strings
# so that pandas does not infer a numeric dtype for them.
df_douban_inter, df_movie_inter, df_yelp_inter = (
    pd.read_csv(data_preprocessing + f'{dataset}_inter.csv', dtype={'user_id': str, 'item_id': str})
    for dataset in ('douban', 'movie', 'yelp')
)

In [4]:
# Douban user side information; every column is read as text.
df_douban_user_features = pd.read_csv(data_preprocessing + 'douban_user_features.csv', dtype=str)
# Turn the raw value into a "location:<value>" token.
df_douban_user_features['location'] = df_douban_user_features['location'].radd('location:')

# Douban item side information; every column after the first is rewritten to
# "<column>:<value>", which keeps equal raw values from different columns
# distinguishable.
df_douban_item_features = pd.read_csv(data_preprocessing + 'douban_item_features.csv', dtype=str)
for col in df_douban_item_features.columns[1:]:
    df_douban_item_features[col] = f'{col}:' + df_douban_item_features[col]

In [5]:
# Movie user / item side information, read entirely as text.
df_movie_user_features = pd.read_csv(data_preprocessing + "movie_user_features.csv", dtype=str)
df_movie_item_features = pd.read_csv(data_preprocessing + 'movie_item_features.csv', dtype=str)

# Rewrite every column after the first to "<column>:<value>", which keeps equal
# raw values from different columns distinguishable.
for frame in (df_movie_user_features, df_movie_item_features):
    for col in frame.columns[1:]:
        frame[col] = f'{col}:' + frame[col]

In [6]:
# Yelp user / item side information, read entirely as text.
df_yelp_user_features = pd.read_csv(data_preprocessing + "yelp_user_features.csv", dtype=str)
df_yelp_item_features = pd.read_csv(data_preprocessing + "yelp_item_features.csv", dtype=str)

# Rewrite every column after the first to "<column>:<value>", which keeps equal
# raw values from different columns distinguishable.
for frame in (df_yelp_user_features, df_yelp_item_features):
    for col in frame.columns[1:]:
        frame[col] = f'{col}:' + frame[col]

## Collaborative Filtering & MF
* [Surprise Basic Usage](https://surprise.readthedocs.io/en/stable/getting_started.html)
* [Surprise Prediction Algorithms List](https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html)
* [Surprise Similarity Options](https://surprise.readthedocs.io/en/stable/prediction_algorithms.html)
* [Surprise Accuracy RMSE](https://surprise.readthedocs.io/en/stable/accuracy.html)
* [Surprise Recall](https://surprise.readthedocs.io/en/stable/FAQ.html)
* [NDCG 計算方式 (how NDCG is calculated)](https://ithelp.ithome.com.tw/articles/10299050)

In [7]:
def init_cf_mf():
    """Build fresh, unfitted Surprise models for one cross-validation fold.

    Returns
    -------
    tuple
        ``(algo_UCF_s, algo_UCF_p, algo_ICF_s, algo_ICF_p, algo_NMF)`` where

        * ``UCF`` / ``ICF`` are user-based / item-based ``KNNWithMeans``,
        * ``_s`` uses cosine similarity and ``_p`` uses ``pearson_baseline``
          similarity with ``shrinkage`` set to 0,
        * ``algo_NMF`` is ``NMF`` with ``FACTORS`` factors and ``EPOCHS``
          epochs, seeded with ``RANDOM_STATE``.
    """
    def make_knn(similarity, user_based):
        # Only the pearson_baseline variants carry the extra `shrinkage` option.
        sim_options = {"name": similarity, "user_based": user_based}
        if similarity == "pearson_baseline":
            sim_options["shrinkage"] = 0
        return KNNWithMeans(sim_options=sim_options)

    algo_UCF_s = make_knn("cosine", user_based=True)
    algo_UCF_p = make_knn("pearson_baseline", user_based=True)
    algo_ICF_s = make_knn("cosine", user_based=False)
    algo_ICF_p = make_knn("pearson_baseline", user_based=False)

    # NMF starts from randomly initialised factors; seed it with the same
    # RANDOM_STATE the fold splitter uses so repeated runs are comparable.
    algo_NMF = NMF(n_factors=FACTORS, n_epochs=EPOCHS, random_state=RANDOM_STATE)
    return algo_UCF_s, algo_UCF_p, algo_ICF_s, algo_ICF_p, algo_NMF

In [8]:
def calc_cf_eval_metrics(predictions, k=10, threshold=3.5):
    """Compute RMSE, mean Recall@k and mean NDCG@k for a list of predictions.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each element is unpacked as ``(uid, iid, true_rating, estimate, details)``;
        only the user id, the true rating and the estimate are used here.
    k : int, default 10
        Number of top-ranked items (by estimate) considered per user.
    threshold : float, default 3.5
        A true rating strictly above this value is "relevant". A top-k item
        counts as a hit only when both its true rating and its estimate
        exceed the threshold.

    Returns
    -------
    list
        ``[rmse, mean_recall, mean_ndcg]``; recall and NDCG are averaged over
        users. A user without relevant items contributes recall 0, and a user
        whose ideal DCG is 0 contributes NDCG 0.
    """
    rmse = accuracy.rmse(predictions)

    # Group (estimate, true rating) pairs per user.
    dict_user_ratings = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        dict_user_ratings[uid].append((est, true_r))

    ls_recall = []
    ls_ndcg = []
    for ratings in dict_user_ratings.values():
        # Model ranking (by estimate) and ideal ranking (by true rating).
        ratings.sort(key=lambda pair: pair[0], reverse=True)
        ideal = sorted(ratings, key=lambda pair: pair[1], reverse=True)

        # recall
        num_rel = sum(1 for _, true_r in ratings if true_r > threshold)
        num_rel_top_k = sum(
            1 for est, true_r in ratings[:k] if true_r > threshold and est > threshold
        )
        ls_recall.append(num_rel_top_k / num_rel if num_rel != 0 else 0)

        # ndcg: gain is the true rating, discounted by log2(rank + 1)
        dcg = sum(true_r / math.log2(rank + 2) for rank, (_, true_r) in enumerate(ratings[:k]))
        idcg = sum(true_r / math.log2(rank + 2) for rank, (_, true_r) in enumerate(ideal[:k]))
        # The ideal gain is 0 when all of the user's top-k true ratings are 0;
        # report 0 instead of dividing by zero.
        ls_ndcg.append(dcg / idcg if idcg != 0 else 0)

    mean_recall = sum(ls_recall) / len(ls_recall) if ls_recall else 0
    mean_ndcg = sum(ls_ndcg) / len(ls_ndcg) if ls_ndcg else 0

    return [rmse, mean_recall, mean_ndcg]

In [9]:
# Timing note from the original run: 1 fold => 3m 13.4s
def calc_all_cf_model(df_inter, data_name):
    """Cross-validate the five Surprise baselines on one interaction table.

    For every fold produced by the module-level ``Kf`` splitter, fresh models
    from ``init_cf_mf()`` are fitted on the training rows, asked to predict the
    held-out rows, and scored with ``calc_cf_eval_metrics``.

    Parameters
    ----------
    df_inter : pandas.DataFrame
        Interactions with ``user_id``, ``item_id`` and ``rating`` columns.
    data_name : str
        Label stored in every result row.

    Returns
    -------
    list of list
        One row ``[model, fold, data_name, rmse, recall, ndcg]`` per model and
        fold, in the order UCF_s, UCF_p, ICF_s, ICF_p, NMF within each fold.
    """
    # Imported locally under an alias: a later cell rebinds the global name
    # `Dataset` to `lightfm.data.Dataset`, so the bare name would no longer
    # refer to Surprise's loader if this function is re-run afterwards.
    from surprise import Dataset as SurpriseDataset

    model_names = ['UCF_s', 'UCF_p', 'ICF_s', 'ICF_p', 'NMF']
    rating_cols = ['user_id', 'item_id', 'rating']
    reader = Reader(rating_scale=(1, 5))

    ls_result = []
    for i, (train_idx, test_idx) in enumerate(Kf.split(df_inter)):
        print(f'\n***** Fold {i} *****')
        algos = init_cf_mf()

        df_train = df_inter.iloc[train_idx]
        df_test = df_inter.iloc[test_idx]

        train_set = SurpriseDataset.load_from_df(df_train[rating_cols], reader=reader)
        train_set = train_set.build_full_trainset()
        # Rows of (user_id, item_id, rating), selected explicitly so the test
        # set uses the same columns, in the same order, as the training set.
        test_set = df_test[rating_cols].values

        # Same sequence as before: fit everything, then predict, then score.
        for algo in algos:
            algo.fit(train_set)
        fold_predictions = [algo.test(test_set) for algo in algos]
        for model_name, predictions in zip(model_names, fold_predictions):
            ls_result.append([model_name, i, data_name] + calc_cf_eval_metrics(predictions))
    return ls_result

In [10]:
# Accumulator for the per-fold metric rows of the CF/MF runs below.
all_result = list()

### douban

In [11]:
# Cross-validate every CF/MF baseline on the Douban interactions and keep the rows.
ls_result = calc_all_cf_model(df_inter=df_douban_inter, data_name='douban')
all_result += ls_result


***** Fold 0 *****
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.7171
RMSE: 0.7271
RMSE: 0.7102
RMSE: 0.7227
RMSE: 1.0990

***** Fold 1 *****
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.7134
RMSE: 0.7248
RMSE: 0.7076
RMSE: 0.7204
RMSE: 1.0971

***** Fold 2 *****
Computing the cosine similarity matrix

### MovieLens

In [12]:
# Cross-validate every CF/MF baseline on the movie interactions.
# The dataset label is 'movie_lens' -- the same label the FM/BPR section uses
# for this dataset -- so rows of the two result files can be matched on `data`.
ls_result = calc_all_cf_model(df_movie_inter, 'movie_lens')
all_result.extend(ls_result)


***** Fold 0 *****
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9587
RMSE: 0.9543
RMSE: 0.9469
RMSE: 0.9440
RMSE: 1.3074

***** Fold 1 *****
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9554
RMSE: 0.9488
RMSE: 0.9441
RMSE: 0.9427
RMSE: 1.2997

***** Fold 2 *****
Computing the cosine similarity matrix

### Yelp

In [13]:
# Cross-validate every CF/MF baseline on the Yelp interactions and keep the rows.
ls_result = calc_all_cf_model(df_inter=df_yelp_inter, data_name='yelp')
all_result += ls_result


***** Fold 0 *****
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0823
RMSE: 1.1398
RMSE: 1.0850
RMSE: 1.1391
RMSE: 1.3201

***** Fold 1 *****
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0746
RMSE: 1.1275
RMSE: 1.0826
RMSE: 1.1390
RMSE: 1.3182

***** Fold 2 *****
Computing the cosine similarity matrix

In [14]:
# Persist the CF/MF rows. `index=False` keeps the meaningless 0..n-1 row index
# out of the file, matching how the FM results are written further down.
(
    pd.DataFrame(all_result, columns=['model', 'kfold', 'data', 'RMSE', 'Recall', 'NDCG'])
    .to_csv('output/CF_MF_result.csv', index=False)
)

## Factorization Machine & BPR
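* 會將數值型特徵直接全展開 ^_^ — the feature files are read as strings, so a numeric column is not kept as a number: every distinct value becomes its own `column:value` categorical feature.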

In [15]:
from lightfm import LightFM
from lightfm.data import Dataset

In [16]:
def init_fm_mf():
    """Build fresh, unfitted LightFM models for one cross-validation fold.

    Returns
    -------
    tuple
        ``(model_fm, model_fm_bpr, model_mf_bpr)``, each with ``FACTORS``
        components and seeded with ``RANDOM_STATE``:

        * ``model_fm`` keeps LightFM's default loss,
        * ``model_fm_bpr`` and ``model_mf_bpr`` both use the ``'bpr'`` loss.
          They are configured identically; the caller fits the first with
          user/item features and the second without.
    """
    # Seeding fixes the random initialisation so repeated runs start from the
    # same parameters.
    # NOTE(review): fitting with several threads may still introduce
    # run-to-run variation -- confirm if exact reproducibility is needed.
    shared = {'no_components': FACTORS, 'random_state': RANDOM_STATE}
    model_fm = LightFM(**shared)
    model_fm_bpr = LightFM(loss='bpr', **shared)
    model_mf_bpr = LightFM(loss='bpr', **shared)
    return model_fm, model_fm_bpr, model_mf_bpr

In [17]:
def calc_fm_evaluation(df_test, pred, k=10, threshold=3.5, score_threshold=None):
    """Compute mean Recall@k and mean NDCG@k from per-row model scores.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Held-out interactions with ``user_id`` and ``rating`` columns.
    pred : sequence of float
        One score per row of ``df_test``, in row order.
    k : int, default 10
        Number of top-ranked items (by score) considered per user.
    threshold : float, default 3.5
        A true rating strictly above this value is "relevant".
    score_threshold : float or None, default None
        Optional minimum score a top-k item must exceed to count as a hit.
        The default ``None`` judges hits by the true rating only, because the
        scores passed in by this notebook are LightFM ranking scores, which
        are not on the rating scale. Passing ``score_threshold=threshold``
        reproduces the earlier behaviour of this function.

    Returns
    -------
    list
        ``[mean_recall, mean_ndcg]`` averaged over users. A user without
        relevant items contributes recall 0, and a user whose ideal DCG is 0
        contributes NDCG 0.

    Raises
    ------
    ValueError
        If ``pred`` and ``df_test`` have different lengths.
    """
    user_ids = df_test['user_id'].tolist()
    true_ratings = df_test['rating'].tolist()
    if len(pred) != len(user_ids):
        raise ValueError(
            f'pred has {len(pred)} entries but df_test has {len(user_ids)} rows'
        )

    # Pair rows and scores by position, so the result does not depend on
    # df_test carrying a 0..n-1 index.
    dict_user_ratings = defaultdict(list)
    for uid, score, true_r in zip(user_ids, pred, true_ratings):
        dict_user_ratings[uid].append((score, true_r))

    ls_recall = []
    ls_ndcg = []
    for ratings in dict_user_ratings.values():
        # Model ranking (by score) and ideal ranking (by true rating).
        ratings.sort(key=lambda pair: pair[0], reverse=True)
        ideal = sorted(ratings, key=lambda pair: pair[1], reverse=True)

        # recall
        num_rel = sum(1 for _, true_r in ratings if true_r > threshold)
        num_rel_top_k = sum(
            1
            for score, true_r in ratings[:k]
            if true_r > threshold and (score_threshold is None or score > score_threshold)
        )
        ls_recall.append(num_rel_top_k / num_rel if num_rel != 0 else 0)

        # ndcg: gain is the true rating, discounted by log2(rank + 1)
        dcg = sum(true_r / math.log2(rank + 2) for rank, (_, true_r) in enumerate(ratings[:k]))
        idcg = sum(true_r / math.log2(rank + 2) for rank, (_, true_r) in enumerate(ideal[:k]))
        ls_ndcg.append(dcg / idcg if idcg != 0 else 0)

    mean_recall = sum(ls_recall) / len(ls_recall) if ls_recall else 0
    mean_ndcg = sum(ls_ndcg) / len(ls_ndcg) if ls_ndcg else 0

    return [mean_recall, mean_ndcg]

In [18]:
def calc_all_fm_model(df_inter, fm_data, user_features, item_features, data_name,
                      rating_threshold=3.5):
    """Cross-validate the three LightFM variants on one dataset.

    For every fold produced by the module-level ``Kf`` splitter, fresh models
    from ``init_fm_mf()`` are fitted on the training rows whose rating exceeds
    ``rating_threshold``, asked to score every held-out (user, item) pair, and
    evaluated with ``calc_fm_evaluation``.

    Parameters
    ----------
    df_inter : pandas.DataFrame
        Interactions with ``user_id``, ``item_id`` and ``rating`` columns.
    fm_data
        Fitted dataset object providing ``mapping()`` and
        ``build_interactions()``; every id in ``df_inter`` must be known to it.
    user_features, item_features
        Feature matrices forwarded to ``fit`` / ``predict`` of the two
        feature-aware models.
    data_name : str
        Label stored in every result row.
    rating_threshold : float, default 3.5
        Training rows need a rating strictly above this value to count as a
        positive interaction; the same value is passed to the evaluation as
        its relevance threshold.

    Returns
    -------
    list of list
        One row ``[model, fold, data_name, recall, ndcg]`` per model and fold,
        in the order fm, fm_bpr, mf_bpr within each fold.
    """
    # The id -> internal-index mappings do not depend on the fold.
    map_user_id, _, map_item_id, _ = fm_data.mapping()

    ls_result = []
    for i, (train_idx, test_idx) in enumerate(Kf.split(df_inter)):
        print(f'\n***** Fold {i} *****')
        model_fm, model_fm_bpr, model_mf_bpr = init_fm_mf()

        df_train = df_inter.iloc[train_idx]
        df_test = df_inter.iloc[test_idx].reset_index(drop=True)

        # Keep only the positive interactions for training.
        df_train = df_train.loc[df_train['rating'] > rating_threshold]
        train_interactions, _ = fm_data.build_interactions(
            zip(df_train['user_id'], df_train['item_id'])
        )

        model_fm.fit(train_interactions,
                     user_features=user_features,
                     item_features=item_features,
                     epochs=EPOCHS, num_threads=THREADS, verbose=True)
        model_fm_bpr.fit(train_interactions,
                         user_features=user_features,
                         item_features=item_features,
                         epochs=EPOCHS, num_threads=THREADS, verbose=True)
        model_mf_bpr.fit(train_interactions,
                         epochs=EPOCHS, num_threads=THREADS, verbose=True)

        # Internal indices of the held-out pairs, built once per fold as int32
        # arrays (the array type `predict` is documented to take).
        test_users = np.array([map_user_id[uid] for uid in df_test['user_id']], dtype=np.int32)
        test_items = np.array([map_item_id[iid] for iid in df_test['item_id']], dtype=np.int32)

        pred_fm = model_fm.predict(
            test_users, test_items,
            user_features=user_features, item_features=item_features,
            num_threads=THREADS)
        pred_fm_bpr = model_fm_bpr.predict(
            test_users, test_items,
            user_features=user_features, item_features=item_features,
            num_threads=THREADS)
        pred_mf_bpr = model_mf_bpr.predict(
            test_users, test_items,
            num_threads=THREADS)

        for model_name, scores in (('fm', pred_fm),
                                   ('fm_bpr', pred_fm_bpr),
                                   ('mf_bpr', pred_mf_bpr)):
            metrics = calc_fm_evaluation(df_test, scores, threshold=rating_threshold)
            ls_result.append([model_name, i, data_name] + metrics)

    return ls_result

In [19]:
# Fresh accumulator for the FM/BPR rows (the CF/MF rows were already written to CSV above).
all_result = list()

### douban

In [20]:
# Item feature vocabulary: unique tokens over every column except the first.
ls_item_features_unique = list(np.unique(df_douban_item_features.iloc[:, 1:].to_numpy().ravel()))

# `Dataset` here is lightfm.data.Dataset (imported at the start of this section).
douban_data = Dataset()
# Register every user / item id that occurs in the interactions ...
douban_data.fit(df_douban_inter['user_id'], df_douban_inter['item_id'])
# ... then the ids and feature tokens from the user side information ...
douban_data.fit_partial(
    users=df_douban_user_features['user_id'],
    user_features=df_douban_user_features['location'],
)
# ... and from the item side information.
douban_data.fit_partial(
    items=df_douban_item_features['item_id'],
    item_features=ls_item_features_unique,
)

In [21]:
print('user features shape: ', douban_data.user_features_shape())
print('item features shape: ', douban_data.item_features_shape())

# One (id, [feature tokens]) pair per row; the tokens are all values after the
# first column of that row.
user_features = douban_data.build_user_features(
    (row['user_id'], list(row.iloc[1:])) for _, row in df_douban_user_features.iterrows()
)
item_features = douban_data.build_item_features(
    (row['item_id'], list(row.iloc[1:])) for _, row in df_douban_item_features.iterrows()
)

user features shape:  (11699, 12135)
item features shape:  (22347, 35034)


In [22]:
# Cross-validate the three LightFM variants on Douban and keep the rows.
ls_result = calc_all_fm_model(
    df_inter=df_douban_inter,
    fm_data=douban_data,
    user_features=user_features,
    item_features=item_features,
    data_name='douban',
)
all_result += ls_result


***** Fold 0 *****


Epoch: 100%|██████████| 30/30 [00:23<00:00,  1.29it/s]
Epoch: 100%|██████████| 30/30 [00:40<00:00,  1.36s/it]
Epoch: 100%|██████████| 30/30 [00:15<00:00,  1.88it/s]



***** Fold 1 *****


Epoch: 100%|██████████| 30/30 [00:38<00:00,  1.27s/it]
Epoch: 100%|██████████| 30/30 [00:41<00:00,  1.39s/it]
Epoch: 100%|██████████| 30/30 [01:16<00:00,  2.56s/it]



***** Fold 2 *****


Epoch: 100%|██████████| 30/30 [00:37<00:00,  1.27s/it]
Epoch: 100%|██████████| 30/30 [00:39<00:00,  1.32s/it]
Epoch: 100%|██████████| 30/30 [01:08<00:00,  2.28s/it]



***** Fold 3 *****


Epoch: 100%|██████████| 30/30 [00:30<00:00,  1.00s/it]
Epoch: 100%|██████████| 30/30 [00:42<00:00,  1.43s/it]
Epoch: 100%|██████████| 30/30 [00:51<00:00,  1.72s/it]



***** Fold 4 *****


Epoch: 100%|██████████| 30/30 [00:18<00:00,  1.62it/s]
Epoch: 100%|██████████| 30/30 [00:35<00:00,  1.18s/it]
Epoch: 100%|██████████| 30/30 [00:06<00:00,  4.88it/s]


<pre>
without features:
    recall:  0.04407316747683846
    precision:  0.11393753

with features:
    recall:  0.003990892983807165
    precision:  0.014177823
</pre>

### movie lens

In [23]:
# Feature vocabularies: unique tokens over every column except the first.
ls_user_features_unique = list(np.unique(df_movie_user_features.iloc[:, 1:].to_numpy().ravel()))
ls_item_features_unique = list(np.unique(df_movie_item_features.iloc[:, 1:].to_numpy().ravel()))

# `Dataset` here is lightfm.data.Dataset (imported at the start of this section).
movie_data = Dataset()
# Register every user / item id that occurs in the interactions ...
movie_data.fit(df_movie_inter['user_id'], df_movie_inter['item_id'])
# ... then the ids and feature tokens from the user side information ...
movie_data.fit_partial(
    users=df_movie_user_features['user_id'],
    user_features=ls_user_features_unique,
)
# ... and from the item side information.
movie_data.fit_partial(
    items=df_movie_item_features['item_id'],
    item_features=ls_item_features_unique,
)

In [24]:
print('user features shape: ', movie_data.user_features_shape())
print('item features shape: ', movie_data.item_features_shape())

# One (id, [feature tokens]) pair per row; the tokens are all values after the
# first column of that row.
user_features = movie_data.build_user_features(
    (row['user_id'], list(row.iloc[1:])) for _, row in df_movie_user_features.iterrows()
)
item_features = movie_data.build_item_features(
    (row['item_id'], list(row.iloc[1:])) for _, row in df_movie_item_features.iterrows()
)

user features shape:  (943, 972)
item features shape:  (1682, 1718)


In [25]:
# Cross-validate the three LightFM variants on the movie data and keep the rows.
ls_result = calc_all_fm_model(
    df_inter=df_movie_inter,
    fm_data=movie_data,
    user_features=user_features,
    item_features=item_features,
    data_name='movie_lens',
)
all_result += ls_result


***** Fold 0 *****


Epoch: 100%|██████████| 30/30 [00:47<00:00,  1.58s/it]
Epoch: 100%|██████████| 30/30 [01:49<00:00,  3.65s/it]
Epoch: 100%|██████████| 30/30 [00:07<00:00,  3.96it/s]



***** Fold 1 *****


Epoch: 100%|██████████| 30/30 [00:46<00:00,  1.55s/it]
Epoch: 100%|██████████| 30/30 [01:45<00:00,  3.52s/it]
Epoch: 100%|██████████| 30/30 [00:03<00:00,  9.15it/s]



***** Fold 2 *****


Epoch: 100%|██████████| 30/30 [00:46<00:00,  1.56s/it]
Epoch: 100%|██████████| 30/30 [01:41<00:00,  3.38s/it]
Epoch: 100%|██████████| 30/30 [00:03<00:00,  9.28it/s]



***** Fold 3 *****


Epoch: 100%|██████████| 30/30 [00:45<00:00,  1.51s/it]
Epoch: 100%|██████████| 30/30 [01:34<00:00,  3.16s/it]
Epoch: 100%|██████████| 30/30 [00:03<00:00,  9.46it/s]



***** Fold 4 *****


Epoch: 100%|██████████| 30/30 [00:46<00:00,  1.56s/it]
Epoch: 100%|██████████| 30/30 [01:42<00:00,  3.41s/it]
Epoch: 100%|██████████| 30/30 [00:01<00:00, 29.57it/s]


### yelp

In [26]:
# Feature vocabularies: unique tokens over every column except the first.
ls_user_features_unique = list(np.unique(df_yelp_user_features.iloc[:, 1:].to_numpy().ravel()))
ls_item_features_unique = list(np.unique(df_yelp_item_features.iloc[:, 1:].to_numpy().ravel()))

# `Dataset` here is lightfm.data.Dataset (imported at the start of this section).
yelp_data = Dataset()
# Register every user / item id that occurs in the interactions ...
yelp_data.fit(df_yelp_inter['user_id'], df_yelp_inter['item_id'])
# ... then the ids and feature tokens from the user side information ...
yelp_data.fit_partial(
    users=df_yelp_user_features['user_id'],
    user_features=ls_user_features_unique,
)
# ... and from the item side information.
yelp_data.fit_partial(
    items=df_yelp_item_features['item_id'],
    item_features=ls_item_features_unique,
)

In [27]:
print('user features shape: ', yelp_data.user_features_shape())
print('item features shape: ', yelp_data.item_features_shape())

# One (id, [feature tokens]) pair per row; the tokens are all values after the
# first column of that row.
user_features = yelp_data.build_user_features(
    (row['user_id'], list(row.iloc[1:])) for _, row in df_yelp_user_features.iterrows()
)
item_features = yelp_data.build_item_features(
    (row['item_id'], list(row.iloc[1:])) for _, row in df_yelp_item_features.iterrows()
)

user features shape:  (8533, 8555)
item features shape:  (14284, 15400)


In [28]:
# Cross-validate the three LightFM variants on Yelp and keep the rows.
ls_result = calc_all_fm_model(
    df_inter=df_yelp_inter,
    fm_data=yelp_data,
    user_features=user_features,
    item_features=item_features,
    data_name='yelp',
)
all_result += ls_result


***** Fold 0 *****


Epoch: 100%|██████████| 30/30 [16:19<00:00, 32.64s/it]
Epoch: 100%|██████████| 30/30 [35:11<00:00, 70.38s/it]
Epoch: 100%|██████████| 30/30 [00:16<00:00,  1.86it/s]



***** Fold 1 *****


Epoch: 100%|██████████| 30/30 [15:58<00:00, 31.93s/it]
Epoch: 100%|██████████| 30/30 [34:40<00:00, 69.34s/it]
Epoch: 100%|██████████| 30/30 [00:01<00:00, 20.97it/s]



***** Fold 2 *****


Epoch: 100%|██████████| 30/30 [15:43<00:00, 31.47s/it]
Epoch: 100%|██████████| 30/30 [31:59<00:00, 63.98s/it]
Epoch: 100%|██████████| 30/30 [00:06<00:00,  4.71it/s]



***** Fold 3 *****


Epoch: 100%|██████████| 30/30 [16:18<00:00, 32.62s/it]
Epoch: 100%|██████████| 30/30 [32:37<00:00, 65.24s/it]
Epoch: 100%|██████████| 30/30 [00:17<00:00,  1.75it/s]



***** Fold 4 *****


Epoch: 100%|██████████| 30/30 [16:40<00:00, 33.35s/it]
Epoch: 100%|██████████| 30/30 [33:57<00:00, 67.92s/it]
Epoch: 100%|██████████| 30/30 [00:15<00:00,  1.95it/s]


### output

In [29]:
# Persist the FM/BPR rows, one line per (model, fold, dataset), without the row index.
(
    pd.DataFrame(all_result, columns=['model', 'kfold', 'data', 'Recall', 'NDCG'])
    .to_csv('output/FM_result.csv', index=False)
)