# 6조 (17011709 정선아, 17011741 문성용, 17011742 김소영)

In [1]:
from __future__ import (absolute_import, division, print_function, unicode_literals)
from collections import defaultdict

import os

import numpy as np
import pandas as pd

from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = Dataset.load_builtin('ml-1m')
kf = KFold(n_splits = 5)
sim_options = {'name' : 'cosine', 'user_based' : True}

## 1) Precision & Recall & F1-measure

In [3]:
# https://github.com/NicolasHug/Surprise/blob/master/examples/precision_recall_at_k.py에서 가져온 코드
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

In [4]:
def get_P_R_F(data, algo):
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k = 5, threshold = 4)

        P = sum(prec for prec in precisions.values()) / len(precisions)
        R = sum(rec for rec in recalls.values()) / len(recalls)
        F1 = 2 * P * R / (P + R)

        print('precision : ', P)
        print('recall : ', R)
        print('F1 : '  , F1)

### CF with mean

In [5]:
algo = KNNWithMeans(k = 40, min_k = 1, simoptions = sim_options)
get_P_R_F(data, algo)

Computing the msd similarity matrix...
Done computing similarity matrix.
precision :  0.8962039644415173
recall :  0.2490751859424374
F1 :  0.3898126828045115
Computing the msd similarity matrix...
Done computing similarity matrix.
precision :  0.8911543772438719
recall :  0.2487028423850319
F1 :  0.38887787489132974
Computing the msd similarity matrix...
Done computing similarity matrix.
precision :  0.8958947600905677
recall :  0.2494088494133466
F1 :  0.3901918748102418
Computing the msd similarity matrix...
Done computing similarity matrix.
precision :  0.89377037405383
recall :  0.25096818574626323
F1 :  0.39189372513007664
Computing the msd similarity matrix...
Done computing similarity matrix.
precision :  0.8880987899884119
recall :  0.2517205559522374
F1 :  0.3922599172448734


### SVD

In [6]:
algo = SVD(n_factors = 100, n_epochs = 20, biased = False, lr_all = 0.005, reg_all = 0)
get_P_R_F(data, algo)

precision :  0.8611101910124881
recall :  0.2771596537950983
F1 :  0.4193469650613405
precision :  0.8609160178896915
recall :  0.27210717707825505
F1 :  0.413515678001709
precision :  0.8611267683466096
recall :  0.27760110523686166
F1 :  0.4198540286711317
precision :  0.8629411764706003
recall :  0.2765924951788015
F1 :  0.4189135593458251
precision :  0.8649314765694208
recall :  0.2790423324148288
F1 :  0.4219545844589454


### PMF

In [7]:
algo = SVD(n_factors = 100, n_epochs = 20, biased = False, lr_all = 0.005, reg_all = 0.02)
get_P_R_F(data, algo)

precision :  0.895736049014749
recall :  0.26456336312922696
F1 :  0.40847894797351236
precision :  0.8949304174950438
recall :  0.2653282415822242
F1 :  0.40930582530837656
precision :  0.8973243870112788
recall :  0.2651972224976996
F1 :  0.40939958994027487
precision :  0.9012170889220201
recall :  0.2623175644341266
F1 :  0.4063567356770022
precision :  0.9001574150787187
recall :  0.264476511509726
F1 :  0.40883317498230615


### PMF with bias

In [8]:
algo = SVD(n_factors = 100, n_epochs = 20, biased = True, lr_all = 0.005, reg_all = 0.02)
get_P_R_F(data, algo)

precision :  0.8807811291570121
recall :  0.28089336116659297
F1 :  0.42594646586773366
precision :  0.8849373309038829
recall :  0.2790187150746062
F1 :  0.4242670122182881
precision :  0.886203181968856
recall :  0.28260905579790874
F1 :  0.42855308390653163
precision :  0.886988231394013
recall :  0.28514419886197023
F1 :  0.43155456177525325
precision :  0.8838940448569345
recall :  0.2849378859430448
F1 :  0.43095143776027


## 2) NDCG

In [3]:
df = pd.DataFrame(data.raw_ratings, columns = ['uid', 'iid', 'rate', 'timestamp'])
del df['timestamp']
user = list(set(df.uid))

In [4]:
def addRating(uid, pred_data): 
    add = df[df.uid == uid] # 해당 사용자 행만 추출
    add['pred'] = 0.0 # 데이터프레임에 예측 레이팅 열 추가
    for iid, i in zip(add.iid, add.index):
        add.set_value(i, 'pred', pred_data.iloc[int(uid) - 1][int(iid) - 1]) # uid, iid 1부터 시작하므로 -1
    
    return add

def DCG(data):
    pred_sort = list(data.sort_values(by = ['pred'], axis = 0, ascending = False).rate)
    # 예측 레이팅을 기준으로 내림차순으로 정렬한 실제 레이팅 리스트
    dcg = pred_sort[0] # 예측 레이팅이 가장 높은 아이템의 실제 레이팅 저장
    for i in range(1, len(pred_sort)): # 리스트 돌며
        dcg += pred_sort[i] / np.log2(i + 1) # 순서대로 가중치를 줄여가며 더해주기
        
    return dcg

def IDCG(data):
    rate_sort = list(data.sort_values(by = ['rate'], axis = 0, ascending = False).rate)
    # 실제 레이팅을 기준으로 내림차순으로 정렬한 실제 레이팅 리스트
    idcg = rate_sort[0] # 실제 레이팅이 가장 높은 아이템의 실제 레이팅 저장
    for i in range(1, len(rate_sort)): # 리스트 돌며
        idcg += rate_sort[i] / np.log2(i + 1) # 순서대로 가중치를 줄여가며 더해주기
        
    return idcg

In [5]:
def NDCG_oneUser(uid, pred_data): # 한 사용자의 NDCG
    table = addRating(uid, pred_data) # 예측 레이팅 열 추가한 데이터프레임
    dcg = DCG(table)
    idcg = IDCG(table)
    
    return dcg / idcg

In [6]:
def NDCG_byUser(user, pred_data): # 모든 사용자의(사용자 ID별) NDCG
    NDCG = {} # 딕셔너리 생성
    for u in user: # 각 유저를 돌며
        value = NDCG_oneUser(u, pred_data) # NDCG 값 구하고
        NDCG[u] = value # key로는 user_id, value로는 NDCG 값 저장
    return NDCG;

In [7]:
def NDCG(user, pred_data):
    NDCG = NDCG_byUser(user, pred_data)
    return sum(NDCG.values()) / len(NDCG)

In [8]:
# Assignment#4에서 구한 예측 레이팅 불러오기
pred_rating_cf_mean = pd.read_csv('pred_rating_cf_mean.csv')
pred_rating_svd = pd.read_csv('pred_rating_svd.csv')
pred_rating_pmf = pd.read_csv('pred_rating_pmf.csv')
pred_rating_pmf_bias = pd.read_csv('pred_rating_pmf_bias.csv')

### CF with mean

In [9]:
NDCG_CFwM = NDCG(user, pred_rating_cf_mean)

In [10]:
NDCG_CFwM

0.9964565651583643

### SVD

In [12]:
NDCG_SVD = NDCG(user, pred_rating_svd)

In [13]:
NDCG_SVD

0.992619673506265

### PMF

In [14]:
NDCG_PMF = NDCG(user, pred_rating_pmf)

In [15]:
NDCG_PMF

0.9879664227885725

### PMF with bias

In [16]:
NDCG_PMFwB = NDCG(user, pred_rating_pmf_bias)

In [17]:
NDCG_PMFwB

0.9877145321683876