# Module 5. Custom Metric 으로 성능 데이터 및 Cold Start 성능 체크 하기 

이번 모듈에서는 모듈1에서 테스트 용으로 분리했던 데이터를 가지고 Custom 지표를 통해 추가적인 성능을 평가해 보도록 합니다. 
또한 HRNN Coldstart 성능도 추가적으로 확인해 보도록 합니다. Coldstart 아이템은 신규로 등록된 아이템이기 때문에 성능을 예측하기가 어려운 부분이 있습니다. 

In [11]:
import pandas as pd, numpy as np
import io
import scipy.sparse as ss
import json
import time
import os
import boto3
from botocore.exceptions import ClientError
from metrics import mean_reciprocal_rank, ndcg_at_k, precision_at_k
!pip install tqdm
from tqdm import tqdm_notebook

[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [12]:
%store -r

In [13]:
# Configure the SDK to Personalize:
# `personalize_runtime` is the client used by the cells below to call
# get_recommendations; `personalize` is created here but is not called in the
# cells shown in this notebook.
personalize = boto3.client('personalize')
personalize_runtime = boto3.client('personalize-runtime')

In [14]:
# Read the hold-out (validation) interactions and the cold-start interactions.
# Neither filename variable is defined in this notebook; presumably both are
# restored by the `%store -r` cell -- confirm against the notebook that stored them.
# NOTE(review): `coldstart_interation_filename` (sic, "interation") has to keep
# this spelling for as long as the stored variable is named that way.
df_holdout = pd.read_csv(validation_interaction_filename)
df_coldstart=pd.read_csv(coldstart_interation_filename)

## Custom Metric으로 테스트 데이터 세트 평가하기

이번 파트에서는 앞장에 남겨두었던 데이터 세트를 활용하여 모델 성능을 평가 하도록 합니다.
테스트 데이터 셋에 있는 모든 고유한 사용자에 대해 테스트 데이터 세트 Interaction Ground Truth data와 Campaign에서 생성된 결과를 비교 하도록 합니다.


In [15]:
# Distinct users present in the hold-out interactions.
test_users = df_holdout['USER_ID'].unique()
# Preview the first rows of the hold-out interactions.
df_holdout.head()


Unnamed: 0,USER_ID,ITEM_ID,EVENT_VALUE,TIMESTAMP,EVENT_TYPE
0,1,48,5,978824351,RATING
1,1,2294,4,978824291,RATING
2,1,1907,4,978824330,RATING
3,2,2126,3,978300123,RATING
4,2,3257,3,978300073,RATING


In [16]:
# For each of the first 1000 hold-out users, flag every recommended item with
# 1 when it appears in that user's hold-out interactions and 0 otherwise.
relevance = []
for user_id in tqdm_notebook(test_users[:1000]):
    # Ground truth: the items this user interacted with in the hold-out set.
    true_items = set(df_holdout.loc[df_holdout['USER_ID'] == user_id, 'ITEM_ID'].values)
    # Recommendations from the HRNN campaign, kept in the order returned.
    rec_response = personalize_runtime.get_recommendations(
        campaignArn=hrnn_campaign_arn,
        userId=str(user_id),
    )
    rec_items = [int(rec['itemId']) for rec in rec_response['itemList']]
    relevance.append([1 if item in true_items else 0 for item in rec_items])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [17]:
# Report each ranking metric averaged over the per-user relevance lists.
print('mean_reciprocal_rank', np.mean([mean_reciprocal_rank(r) for r in relevance]))
for cutoff in (5, 10, 25):
    print('precision_at_{}'.format(cutoff),
          np.mean([precision_at_k(r, cutoff) for r in relevance]))
for cutoff in (5, 10, 25):
    print('normalized_discounted_cumulative_gain_at_{}'.format(cutoff),
          np.mean([ndcg_at_k(r, cutoff) for r in relevance]))

mean_reciprocal_rank 0.2959221616425931
precision_at_5 0.12619999999999998
precision_at_10 0.1019
precision_at_25 0.07532
normalized_discounted_cumulative_gain_at_5 0.23251565763932372
normalized_discounted_cumulative_gain_at_10 0.2881176642607127
normalized_discounted_cumulative_gain_at_25 0.3824062394227992


## Cold Start 성능 테스트 

이 부분에서는 새롭게 추가된 아이템(ColdStart)에 대한 추천 성능을 테스트해 보도록 합니다. 


In [24]:
# Accumulates one row of averaged metrics per evaluated solution.
metrics=[]

def build_metric_matrix(solution,relevance):
    """Append one row of averaged ranking metrics for `solution` to `metrics`.

    Args:
        solution: Label for the evaluated solution; stored as the first
            element of the row.
        relevance: List of per-user relevance lists (one 0/1 entry per
            recommended item). It is iterated once per metric, so it must
            be re-iterable (a list, not a generator).

    Returns:
        None. The row appended to the module-level `metrics` list is
        [solution, MRR, P@5, P@10, P@25, NDCG@5, NDCG@10, NDCG@25],
        each metric averaged over users with np.mean.
    """
    metrics.append([solution,
                np.mean([mean_reciprocal_rank(r) for r in relevance]),
                np.mean([precision_at_k(r, 5) for r in relevance]),
                np.mean([precision_at_k(r, 10) for r in relevance]),
                # P@25: this slot previously repeated P@10.
                np.mean([precision_at_k(r, 25) for r in relevance]),
                np.mean([ndcg_at_k(r, 5) for r in relevance]),
                np.mean([ndcg_at_k(r, 10) for r in relevance]),
                np.mean([ndcg_at_k(r, 25) for r in relevance])])


In [25]:
# Distinct users that appear in the cold-start interactions.
users = df_coldstart['USER_ID'].unique()
# Display how many distinct users there are.
users.shape

(5918,)

In [19]:
# For each of the first 1000 cold-start users, flag every recommended item with
# 1 when it appears in that user's cold-start interactions and 0 otherwise.
relevance = []
for user_id in tqdm_notebook(users[:1000]):
    # Ground truth: the cold-start items this user actually interacted with.
    true_items = set(df_coldstart.loc[df_coldstart['USER_ID'] == user_id, 'ITEM_ID'].values)
    # Recommendations from the HRNN cold-start campaign, in the order returned.
    rec_response = personalize_runtime.get_recommendations(
        campaignArn=hrnn_coldstart_campaign_arn,
        userId=str(user_id),
    )
    rec_items = [int(rec['itemId']) for rec in rec_response['itemList']]
    relevance.append([1 if item in true_items else 0 for item in rec_items])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [20]:
# Report each ranking metric averaged over the per-user relevance lists.
print('mean_reciprocal_rank', np.mean([mean_reciprocal_rank(r) for r in relevance]))
for cutoff in (5, 10, 25):
    print('precision_at_{}'.format(cutoff),
          np.mean([precision_at_k(r, cutoff) for r in relevance]))
for cutoff in (5, 10, 25):
    print('normalized_discounted_cumulative_gain_at_{}'.format(cutoff),
          np.mean([ndcg_at_k(r, cutoff) for r in relevance]))

mean_reciprocal_rank 0.08366653863728739
precision_at_5 0.026
precision_at_10 0.0248
precision_at_25 0.02172
normalized_discounted_cumulative_gain_at_5 0.06196028918264864
normalized_discounted_cumulative_gain_at_10 0.08900563769413401
normalized_discounted_cumulative_gain_at_25 0.13838265302454003


### A baseline

랜덤으로 추천하였을 경우 대비 Coldstart 성능이 얼마나 좋은 것인지 비교하여 보도록 합니다. 

In [21]:
# `rec_items` holds whatever the most recently executed recommendation cell
# left behind; its length is the number of items returned for one request.
len(rec_items)

25

In [22]:
# Random baseline: for the same 1000 users, score 25 randomly chosen cold items
# against each user's cold-start interactions.
relevance = []
for user_id in tqdm_notebook(users[:1000]):
    # Ground truth: the cold-start items this user actually interacted with.
    true_items = set(df_coldstart.loc[df_coldstart['USER_ID'] == user_id, 'ITEM_ID'].values)
    # First 25 entries of a fresh random shuffle of cold_items.
    rec_items = np.random.permutation(cold_items)[:25]
    relevance.append([1 if item in true_items else 0 for item in rec_items])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [23]:
# Report each ranking metric averaged over the per-user relevance lists.
print('mean_reciprocal_rank', np.mean([mean_reciprocal_rank(r) for r in relevance]))
for cutoff in (5, 10, 25):
    print('precision_at_{}'.format(cutoff),
          np.mean([precision_at_k(r, cutoff) for r in relevance]))
for cutoff in (5, 10, 25):
    print('normalized_discounted_cumulative_gain_at_{}'.format(cutoff),
          np.mean([ndcg_at_k(r, cutoff) for r in relevance]))

mean_reciprocal_rank 0.11524125219346244
precision_at_5 0.04359999999999999
precision_at_10 0.0416
precision_at_25 0.041800000000000004
normalized_discounted_cumulative_gain_at_5 0.07944511540606182
normalized_discounted_cumulative_gain_at_10 0.11220961212879242
normalized_discounted_cumulative_gain_at_25 0.19113729217309675


HRNN Cold Start 모델은 메타 데이터의 일부 정보를 활용하여 Interaction 정보가 없는 새로운 아이템에 대해서도 추천을 할 수 있습니다. 메타 데이터 정보가 장르밖에 없었음에도 랜덤 추천 대비 약 3~4배의 성능이 있었음을 확인할 수 있습니다. (단, 위에 저장된 실행 결과에서는 랜덤 베이스라인의 지표가 더 높게 나왔으므로, 노트북을 다시 실행하여 수치를 직접 확인하시기 바랍니다.) 메타 데이터 품질을 향상시키거나 Cold-start item 비율을 줄인다면 더 좋은 성능을 기대해 볼 수 있습니다. 


## A quick test

In [None]:
# we had saved all the data before deleting the cold items
# Load the full interaction log and order it by time; mergesort is stable, so
# rows sharing a TIMESTAMP keep their original file order.
df = (
    pd.read_csv(interaction_filename)
    .sort_values('TIMESTAMP', kind='mergesort')
    .copy()
)

In [None]:
# Load the movie metadata: '::'-separated fields, column names supplied here.
items_all = pd.read_csv(
    './ml-1m/movies.dat',
    sep='::',
    # A multi-character separator is only supported by the python parser;
    # naming it explicitly avoids the ParserWarning raised by the implicit
    # fallback from the C engine.
    engine='python',
    encoding='latin1',
    names=['ITEM_ID', '_TITLE', 'GENRE'],
)
# Only the genre is needed for the comparison below.
del items_all['_TITLE']

# Show the genres of the last five items in this user's interaction history.
user_id = users[1]
hist_items = df[df['USER_ID']==user_id]['ITEM_ID'].tail(5).values
items_all.set_index('ITEM_ID').loc[hist_items]

In [None]:
# Ask the cold-start campaign for recommendations for the selected user.
rec_response = personalize_runtime.get_recommendations(
    campaignArn=hrnn_coldstart_campaign_arn,
    userId=str(user_id),
)
rec_items = [int(rec['itemId']) for rec in rec_response['itemList']]

# Show the genres of the first five recommended items.
items_all.set_index('ITEM_ID').loc[rec_items[:5]]



In [None]:
## This code is for testing purposes only; it is disabled by being wrapped in a string literal.
# NOTE(review): if this snippet is re-enabled, the `if np.where(...)` check is
# always truthy because np.where returns a non-empty tuple, so the else branch
# can never run. Test membership instead, e.g. `rec_items[i] in set(cold_items)`.
'''
def is_cold_item(rec_items):
    count=0
    np_cold_items=np.array(cold_items)
    for i in range(len(rec_items)):
        if np.where(np_cold_items==rec_items[i]):
            count+=1
        else:
            print("Item_id {} is not Coldstart Item".format(rec_items[i]))
    print(count)
is_cold_item(rec_items)
'''

이 사용자는 액션|어드벤처|스릴러 아이템을 많이 선택하였고, 모델도 장르 정보로부터 이 사용자가 해당 장르를 많이 선택하였다는 것을 알았습니다. 그래서 콜드 아이템 중에서 액션|어드벤처|스릴러 아이템을 추천합니다.

## Another quick test

In [None]:
# Show the genres of the last ten items in a second user's interaction history.
user_id = users[2]
hist_items = df.loc[df['USER_ID'] == user_id, 'ITEM_ID'].tail(10).values
items_all.set_index('ITEM_ID').loc[hist_items]

In [None]:
# Ask the cold-start campaign for recommendations for the selected user.
rec_response = personalize_runtime.get_recommendations(
    campaignArn=hrnn_coldstart_campaign_arn,
    userId=str(user_id),
)
rec_items = [int(rec['itemId']) for rec in rec_response['itemList']]
# Show the genres of the first ten recommended items.
items_all.set_index('ITEM_ID').loc[rec_items[:10]]

다시 한번 테스트를 통해 해당 사용자는 Comedy|Action을 주로 보았고, Amazon Personalize 모델이 Comedy|Action 아이템을 추천하는 것을 볼 수 있습니다. 