In [1]:
from __future__ import (absolute_import, division, print_function, unicode_literals)
from collections import defaultdict

import os

import numpy as np
import pandas as pd

from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = Dataset.load_builtin('ml-1m')
kf = KFold(n_splits = 5)
sim_options = {'name' : 'cosine', 'user_based' : True}

## 1) Precision & Recall & F1-measure

In [3]:
# https://github.com/NicolasHug/Surprise/blob/master/examples/precision_recall_at_k.py
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

In [4]:
def get_P_R_F(data, algo):
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k = 5, threshold = 4)

        P = sum(prec for prec in precisions.values()) / len(precisions)
        R = sum(rec for rec in recalls.values()) / len(recalls)
        F1 = 2 * P * R / (P + R)

        print('precision : ', P)
        print('recall : ', R)
        print('F1 : '  , F1)

### CF with mean

In [5]:
algo = KNNWithMeans(k = 40, min_k = 1, simoptions = sim_options)
get_P_R_F(data, algo)

Computing the msd similarity matrix...
Done computing similarity matrix.
precision :  0.8960327977472392
recall :  0.25124918911282607
F1 :  0.392453671252401
Computing the msd similarity matrix...
Done computing similarity matrix.
precision :  0.892438808773982
recall :  0.24479873906868221
F1 :  0.38420802320194314
Computing the msd similarity matrix...
Done computing similarity matrix.
precision :  0.8941985745068938
recall :  0.252165878024387
F1 :  0.39339386033959955
Computing the msd similarity matrix...
Done computing similarity matrix.
precision :  0.8944859038142775
recall :  0.2527350191033974
F1 :  0.39411399752590437
Computing the msd similarity matrix...
Done computing similarity matrix.
precision :  0.8917784771685859
recall :  0.2506184731078807
F1 :  0.39127583497903196


### SVD

In [6]:
algo = SVD(n_factors = 100, n_epochs = 20, biased = False, lr_all = 0.005, reg_all = 0)
get_P_R_F(data, algo)

precision :  0.8585349685117774
recall :  0.2762021346480857
F1 :  0.41794560222390637
precision :  0.8670890618960975
recall :  0.27740442583355596
F1 :  0.4203332669703428
precision :  0.8594934261407702
recall :  0.27886434590833664
F1 :  0.4211014814118063
precision :  0.8621443173656113
recall :  0.27293350460906834
F1 :  0.4146113428734373
precision :  0.8586991331235265
recall :  0.27372730451453614
F1 :  0.41512524131655154


### PMF

In [7]:
algo = SVD(n_factors = 100, n_epochs = 20, biased = False, lr_all = 0.005, reg_all = 0.02)
get_P_R_F(data, algo)

precision :  0.8937248081276701
recall :  0.2660466028040788
F1 :  0.41003329932589
precision :  0.8975780171223524
recall :  0.26280896467346976
F1 :  0.4065739328245895
precision :  0.8976265386101576
recall :  0.2668757359215723
F1 :  0.41142855332017153
precision :  0.8976139188069722
recall :  0.26139909152551544
F1 :  0.40488840215778094
precision :  0.8958360927152422
recall :  0.2619849017997134
F1 :  0.40540900862997803


### PMF with bias

In [8]:
algo = SVD(n_factors = 100, n_epochs = 20, biased = True, lr_all = 0.005, reg_all = 0.02)
get_P_R_F(data, algo)

precision :  0.8857623937286223
recall :  0.2804385158505202
F1 :  0.4260018819280496
precision :  0.8856531344932472
recall :  0.2818663695289159
F1 :  0.42763454113016247
precision :  0.8873427152318012
recall :  0.28090913024244457
F1 :  0.4267276295404396
precision :  0.8814284136712632
recall :  0.28087169410561497
F1 :  0.42599719319340734
precision :  0.8825379729356667
recall :  0.2812655488762812
F1 :  0.42657978380309536


## 2) NDCG

In [9]:
df = pd.DataFrame(data.raw_ratings, columns = ['uid', 'iid', 'rate', 'timestamp'])
del df['timestamp']
user = list(set(df.uid))

In [14]:
def addRating(uid, pred_data):
    add = df[df.uid == uid]
    add['pred'] = 0.0
    for iid, i in zip(add.iid, add.index):
        add.set_value(i, 'pred', pred_data.iloc[int(uid) - 1][int(iid) + 1])
    
    return add

def DCG(data):
    pred_sort = list(data.sort_values(by = ['pred'], axis = 0, ascending = False).rate)
    dcg = pred_sort[0]
    for i in range(1, len(pred_sort)):
        dcg += pred_sort[i] / np.log2(i + 1)
        
    return dcg

def IDCG(data):
    rate_sort = list(data.sort_values(by = ['rate'], axis = 0, ascending = False).rate)
    idcg = rate_sort[0]
    for i in range(1, len(rate_sort)):
        idcg += rate_sort[i] / np.log2(i + 1)
        
    return idcg

In [15]:
def NDCG(uid, pred_data):
    table = addRating(uid, pred_data)
    dcg = DCG(table)
    idcg = IDCG(table)
    
    return dcg / idcg

In [16]:
# Assignment#4에서 구한 예측 레이팅 불러오기
pred_rating_mean = pd.read_csv('pred_rating_mean.csv')
pred_rating_svd = pd.read_csv('pred_rating_svd.csv')
pred_rating_pmf = pd.read_csv('pred_rating_pmf.csv')
pred_rating_pmf_bi = pd.read_csv('pred_rating_pmf_bi.csv')

In [34]:
def get_NDCG(user, pred_data):
    NDCG_ = {}
    for u in user:
        value = NDCG(u, pred_data)
        NDCG_[u] = value
    return NDCG_;

### CF with mean

In [35]:
NDCG_CF_mean = get_NDCG(user, pred_rating_mean)

In [36]:
NDCG_CF_mean

{'721': 0.98788082327378,
 '1090': 0.9984037682017489,
 '5538': 0.9972476773569118,
 '2284': 0.999022734331057,
 '3816': 0.9942839821263912,
 '1954': 0.9962492842930194,
 '4601': 0.9963842121431341,
 '4861': 0.9948855909279686,
 '644': 0.9958579876178277,
 '3839': 0.993911044292596,
 '205': 0.9958957942843545,
 '2989': 0.9987700404064923,
 '735': 0.9944376676286452,
 '3314': 0.9934390479690334,
 '1813': 0.9997346668918418,
 '5473': 0.9974750332947141,
 '5167': 0.9973214651266601,
 '5744': 0.9936331206628325,
 '3122': 0.9985375141393372,
 '2275': 0.9996052765458439,
 '2841': 0.9928659647644559,
 '3652': 0.9926628313277801,
 '132': 0.9983151249103542,
 '4103': 0.99467409009528,
 '4523': 0.9930669966621616,
 '4582': 0.9994049500636739,
 '3991': 0.9986594511025079,
 '3315': 0.9956356972718393,
 '2447': 0.9977144323321939,
 '2759': 1.0,
 '3071': 0.9982702073163513,
 '867': 0.9966589790010595,
 '5378': 0.9970707752185426,
 '872': 0.9944685833276257,
 '18': 0.9962962462666726,
 '3411': 0.9873

### SVD

In [37]:
NDCG_SVD = get_NDCG(user, pred_rating_svd)

In [38]:
NDCG_SVD

{'721': 0.9920344091292078,
 '1090': 0.9898794898642217,
 '5538': 0.9944398769573045,
 '2284': 0.9955541225976854,
 '3816': 0.9943183446105817,
 '1954': 0.9895776374993955,
 '4601': 0.9917389793516905,
 '4861': 0.9942553047310011,
 '644': 0.9956990266550093,
 '3839': 0.9893810436638313,
 '205': 0.9926904653914923,
 '2989': 0.9973776867859758,
 '735': 0.9903151008664763,
 '3314': 0.994051877774411,
 '1813': 0.9484514193555176,
 '5473': 0.9912771164502728,
 '5167': 0.9891909173385754,
 '5744': 0.9895820749494473,
 '3122': 0.9882007810319665,
 '2275': 0.9956666607344519,
 '2841': 0.9953077622258398,
 '3652': 0.9895100156496363,
 '132': 0.9945719443880252,
 '4103': 0.9955400270257256,
 '4523': 0.9947461056651404,
 '4582': 0.9751066215958287,
 '3991': 0.9981990548360082,
 '3315': 0.9963094789208288,
 '2447': 0.99530303798762,
 '2759': 0.9883265740244557,
 '3071': 0.9925225670630482,
 '867': 0.9934320494312547,
 '5378': 0.9949591364093333,
 '872': 0.9898403163613682,
 '18': 0.997437892312185

### PMF

In [39]:
NDCG_PMF = get_NDCG(user, pred_rating_pmf)

In [40]:
NDCG_PMF

{'721': 0.9889381354537843,
 '1090': 0.9881624166016875,
 '5538': 0.9879905246442954,
 '2284': 0.988723112018736,
 '3816': 0.9913279469051449,
 '1954': 0.9894290694603466,
 '4601': 0.9901368121338139,
 '4861': 0.9925043721308359,
 '644': 0.9873445447650102,
 '3839': 0.9618611953552733,
 '205': 0.9833095675295711,
 '2989': 0.9932794092797121,
 '735': 0.9883701011508575,
 '3314': 0.9895373721834392,
 '1813': 0.9793358411817835,
 '5473': 0.9880406490447186,
 '5167': 0.9926890186857571,
 '5744': 0.990104151388117,
 '3122': 0.9857823022123239,
 '2275': 0.9884247114097003,
 '2841': 0.9920263797330309,
 '3652': 0.984856911442783,
 '132': 0.9922351361199107,
 '4103': 0.9922080407113018,
 '4523': 0.9928607721516906,
 '4582': 0.9686496064369575,
 '3991': 0.9959278648388761,
 '3315': 0.9919398769949634,
 '2447': 0.9930306236227465,
 '2759': 0.9776062939443405,
 '3071': 0.9888426520086793,
 '867': 0.9838712422864756,
 '5378': 0.9921353295341612,
 '872': 0.9855311343947761,
 '18': 0.994942525294989

### PMF with bias

In [41]:
NDCG_PMF_bi = get_NDCG(user, pred_rating_pmf_bi)

In [42]:
NDCG_PMF_bi

{'721': 0.9886039755131475,
 '1090': 0.9816501218593241,
 '5538': 0.9893742467658652,
 '2284': 0.9918916379411644,
 '3816': 0.9915310668753669,
 '1954': 0.986391120115436,
 '4601': 0.9892478911562771,
 '4861': 0.9854239790390847,
 '644': 0.9928638389569627,
 '3839': 0.9562135305055363,
 '205': 0.9887204760809417,
 '2989': 0.9934256002312384,
 '735': 0.9918709532356923,
 '3314': 0.9895807439722151,
 '1813': 0.9893685018168475,
 '5473': 0.9927237974955426,
 '5167': 0.9946200497291724,
 '5744': 0.9896724720596128,
 '3122': 0.964058376047047,
 '2275': 0.9928121556168306,
 '2841': 0.9929220183039006,
 '3652': 0.9849478315312341,
 '132': 0.9952287881828087,
 '4103': 0.9897866035740708,
 '4523': 0.9930499408601294,
 '4582': 0.9792857353698097,
 '3991': 0.9939861201641836,
 '3315': 0.9933191833847601,
 '2447': 0.995960898193523,
 '2759': 0.9692752532427965,
 '3071': 0.9847290539174625,
 '867': 0.9709703898301466,
 '5378': 0.9931817597529662,
 '872': 0.9846338762764815,
 '18': 0.995749920947993