In [9]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
import random

### 데이터 로드

In [2]:
filename="ml1m" # mlsmall ml1m lastfm abook

#train data load
train_data_df = pd.read_csv(
    './data/'+filename+'.train.rating', 
    sep='\t', header=None, names=['user', 'item'], 
    usecols=[0, 1], dtype={0: np.int32, 1: np.int32})

#test data load
#99개는 나중에 test과정에서 랜덤으로 뽑자
test_data_df = pd.read_csv(
    './data/'+filename+'.test.rating', 
    sep='\t', header=None, names=['user', 'item'], 
    usecols=[0, 1], dtype={0: np.int32, 1: np.int32})

#user, item num
num_users = train_data_df['user'].max() + 1
num_items = train_data_df['item'].max() + 1

print("n_user : {}, n_item : {}".format(num_users, num_items))
print("train : {}, test : {}".format(len(train_data_df), len(test_data_df)))

n_user : 6040, n_item : 3706
train : 994169, test : 6040


### 매트릭스 생성

In [8]:
u_lst = train_data_df['user'].tolist()
i_lst = train_data_df['item'].tolist()

# rating array 생성
mat = np.zeros((num_users, num_items))
for i in tqdm(range(len(train_data_df))):
    mat[u_lst[i], i_lst[i]] = 1

# user - item matrix    
user_item_df = pd.DataFrame(mat)

# item similarity matrix
item_sim = cosine_similarity(user_item_df.transpose(),user_item_df.transpose())
item_sim_df = pd.DataFrame(item_sim)

# interaction에 따른 score matrix
score = np.zeros((num_users, num_items))
for k in tqdm(range(len(train_data_df))):
    u,i=u_lst[k],i_lst[k]
    score[u]=np.array([x+y for x,y in zip(score[u],item_sim[i])])
score_df=pd.DataFrame(score)

#score_df -> dict로 변경
#score_dict = score_df.to_dict()
#그냥 to_dict로 변경하니까 item이 key값으로 나옴
score_dict = score_df.transpose().to_dict()

100%|█████████████████████████████████████████████████████████████████████| 994169/994169 [00:00<00:00, 1905981.40it/s]
100%|█████████████████████████████████████████████████████████████████████████| 994169/994169 [22:58<00:00, 721.05it/s]


### evaluate

In [15]:
def hit(gt_item, pred_items):
	if gt_item in pred_items:
		return 1
	return 0


def ndcg(gt_item, pred_items):
	if gt_item in pred_items:
		index = pred_items.index(gt_item)
		return np.reciprocal(np.log2(index+2))
	return 0

def evaluate(gt_item, full_pred_items, K):
    pred_items = full_pred_items[0:K]
    return hit(gt_item, pred_items), ndcg(gt_item, pred_items)

def user_test(test_user, K):
    # 강좌별 score
    pred = dict(sorted(score_dict[test_user].items(),key=(lambda x:x[1]), reverse=True))

    # test 100개 리스트 만들기
    asis = train_data_df[train_data_df['user']==test_user]['item'].tolist()
    gt = test_data_df[test_data_df['user']==test_user]['item'].tolist()
    
    full = set(range(0,num_items))
    test_cand_99 = random.sample(list(full-set(asis)-set(gt)),99)
    test_cand = gt.copy()
    test_cand.extend(test_cand_99)
    
    # 100개 score 다시 뽑아서 test_score에 저장
    test_score=dict()
    for item in test_cand:
        test_score.update({item:pred[item]})

    res = dict(sorted(test_score.items(),key=(lambda x:x[1]), reverse=True))

    return evaluate(gt[0], list(res.keys()), K)

In [18]:
fin_hr=[]
fin_ndcg=[]
for epoch in tqdm(range(100)):
    _hr=[]
    _ndcg=[]

    #for i in tqdm(range(num_users)):
    for i in range(num_users):
        temp1, temp2 = user_test(i,10)
        _hr.append(temp1)
        _ndcg.append(temp2)
    fin_hr.append(sum(_hr)/len(_hr))
    fin_ndcg.append(sum(_ndcg)/len(_ndcg))
    #print(epoch+1, sum(_hr)/len(_hr), sum(_ndcg)/len(_ndcg))

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [34:05<00:00, 20.46s/it]


In [19]:
print(fin_hr)
print(fin_ndcg)

[0.5245033112582781, 0.5178807947019868, 0.5230132450331125, 0.5188741721854304, 0.518046357615894, 0.5163907284768212, 0.518046357615894, 0.5254966887417218, 0.520364238410596, 0.5187086092715232, 0.5163907284768212, 0.5178807947019868, 0.5206953642384106, 0.5175496688741722, 0.5198675496688742, 0.5206953642384106, 0.5188741721854304, 0.5173841059602649, 0.5185430463576159, 0.5210264900662251, 0.5210264900662251, 0.5200331125827815, 0.5158940397350993, 0.5178807947019868, 0.5198675496688742, 0.5226821192052981, 0.5226821192052981, 0.5218543046357615, 0.5211920529801325, 0.5195364238410596, 0.5213576158940397, 0.5190397350993378, 0.5200331125827815, 0.5221854304635761, 0.5172185430463576, 0.525, 0.5236754966887417, 0.5205298013245033, 0.5218543046357615, 0.5220198675496689, 0.5210264900662251, 0.5147350993377483, 0.5195364238410596, 0.5233443708609271, 0.5177152317880794, 0.5183774834437086, 0.5177152317880794, 0.5211920529801325, 0.5208609271523179, 0.5158940397350993, 0.5180463576158