In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
import random

## 0. 데이터 로드

In [2]:
filename="lastfm" # mlsmall ml1m lastfm abook


def load_ratings(path):
    """Read the first two columns of a tab-separated rating file.

    Args:
        path: location of the rating file.

    Returns:
        DataFrame with int32 columns 'user' and 'item', one row per line.
    """
    return pd.read_csv(
        path,
        sep='\t', header=None, names=['user', 'item'],
        usecols=[0, 1], dtype={'user': np.int32, 'item': np.int32})


# Training interactions.
train_data_df = load_ratings('./data/'+filename+'.train.rating')

# Held-out interactions; the 99 extra candidates per user are sampled
# at random later, during evaluation.
test_data_df = load_ratings('./data/'+filename+'.test.rating')

# Ids are used directly as matrix indices, so each size is (largest id + 1).
# The test file is included so that an id appearing only there still gets a
# row/column instead of failing on lookup during evaluation.
num_users = max(train_data_df['user'].max(), test_data_df['user'].max()) + 1
num_items = max(train_data_df['item'].max(), test_data_df['item'].max()) + 1

print("n_user : {}, n_item : {}".format(num_users, num_items))
print("train : {}, test : {}".format(len(train_data_df), len(test_data_df)))

n_user : 23566, n_item : 48123
train : 3011197, test : 23566


In [3]:
# Keep a single row per distinct (user, item) pair.
train_data_df_drop = train_data_df.drop_duplicates().copy()

# Parallel lists: position k holds the user id and item id of interaction k.
u_lst, i_lst = (train_data_df_drop[column].tolist() for column in ('user', 'item'))
print(len(u_lst), len(i_lst))

1529532 1529532


## 새로운 데이터셋은 [1.매트릭스 생성] 으로
## Last-FM 3개로 진행은 [2.불러와서 시작] 으로

## => [1.] 혹은 [2.] 완료한 이후에 [3.] 실행

# 

## @@@@@@@@@@@@@@@@@@@@@@@@@@@
## @@@@@@@@@ 1. 매트릭스 생성 @@@@@@@@@
## @@@@@@@@@@@@@@@@@@@@@@@@@@@

In [None]:
# Binary interaction matrix: mat[user, item] is 1 when the pair occurs in the
# de-duplicated training data, 0 otherwise.
mat = np.zeros((num_users, num_items))
# A single fancy-indexed assignment sets every (user, item) cell at once,
# replacing a Python-level loop over all interactions.
mat[u_lst, i_lst] = 1

print(mat.shape)

# user - item matrix
user_item_df = pd.DataFrame(mat)
print(user_item_df.shape)

# item similarity matrix: cosine similarity between item columns. The matrix
# is transposed so that each row passed in is one item; with a single
# argument the similarities are computed between all pairs of those rows.
item_sim = cosine_similarity(mat.T)
item_sim_df = pd.DataFrame(item_sim)
print(item_sim_df.shape)

In [None]:
# Score matrix derived from the interactions: for every (user, item) pair the
# similarity row of that item is added onto the user's score row, so
# score[user, j] ends up as the sum of item_sim[i, j] over the user's items i.
score = np.zeros((num_users, num_items))
for u, i in tqdm(zip(u_lst, i_lst), total=len(u_lst)):
    # In-place vector add; same element-wise sums in the same order as adding
    # the two rows element by element in Python, without the per-element loop.
    score[u] += item_sim[i]

In [None]:
# Wrap the score array in a DataFrame (rows and columns keep their positional labels).
score_df = pd.DataFrame(data=score)

### 다음에 해야할게 to_dict() 인데, 메모리가 부족해서 분할작업 필요

```python
#score_df -> dict로 변경
#score_dict = score_df.to_dict()
#그냥 to_dict로 변경하니까 item이 key값으로 나옴
score_dict = score_df.transpose().to_dict()
```


#### 분할 작업을 위한 저장

In [None]:
# Persist the complete score table as a single pickle file.
score_df.to_pickle(path='D://score_df.pickle')

In [None]:
# Write the score table as three consecutive row chunks, one pickle each.
# Each entry is (target file, first row, one past the last row).
chunk_specs = (
    ('D://score_df_1.pickle', 0, 8000),
    ('D://score_df_2.pickle', 8000, 16000),
    ('D://score_df_3.pickle', 16000, 23566),
)
for target, row_from, row_to in chunk_specs:
    score_df[row_from:row_to].to_pickle(target)

# 

## @@@@@@@@@@@@@@@@@@@@@@@@@@@
## @@@@@@@@@ 2. 불러와서 시작 @@@@@@@@@
## @@@@@@@@@@@@@@@@@@@@@@@@@@@

In [4]:
file_num = 1 #1, 2, 3

# Row range (first row, one past the last row) held by each chunk file.
chunk_bounds = {1: (0, 8000), 2: (8000, 16000), 3: (16000, 23566)}

# Fail loudly on an unknown chunk number instead of silently continuing with
# undefined (or stale) st / en / score_df.
if file_num not in chunk_bounds:
    raise ValueError("file_num must be 1, 2 or 3, got {!r}".format(file_num))

st, en = chunk_bounds[file_num]
score_df = pd.read_pickle('D://score_df_'+str(file_num)+'.pickle')
print(len(score_df))

# Per-row item scores as a nested dict {row label: {column label: score}}.
# orient='index' yields the same mapping as transpose().to_dict() without
# building a transposed copy of the frame first.
score_dict = score_df.to_dict(orient='index')
print(len(score_dict))

8000
8000


## 3. Evaluation

In [5]:
def hit(gt_item, pred_items):
    """Return 1 when gt_item occurs in pred_items, otherwise 0."""
    return int(gt_item in pred_items)


def ndcg(gt_item, pred_items):
    """Return the discounted gain of gt_item within pred_items.

    The gain is 1 / log2(position + 2), where position is the 0-based index
    of gt_item in pred_items; 0 is returned when gt_item is absent.
    pred_items must support ``in`` and ``.index`` (e.g. a list).
    """
    if gt_item not in pred_items:
        return 0
    position = pred_items.index(gt_item)
    discount = np.log2(position + 2)
    return np.reciprocal(discount)

def evaluate(gt_item, full_pred_items, K):
    """Score gt_item against the first K entries of a ranked item list.

    Returns:
        Tuple (hit, ndcg) computed on full_pred_items[:K].
    """
    top_k = full_pred_items[:K]
    hit_score = hit(gt_item, top_k)
    ndcg_score = ndcg(gt_item, top_k)
    return hit_score, ndcg_score

def user_test(test_user, K):
    """Evaluate one user on a freshly sampled candidate list.

    The candidates are the user's held-out item(s) from ``test_data_df`` plus
    99 items drawn at random from those the user has in neither the training
    nor the test data. Candidates are ranked by the user's scores in
    ``score_dict`` (highest first) and the first held-out item is scored
    against the top K of that ranking.

    Args:
        test_user: user id; must be a key of ``score_dict``.
        K: cut-off applied to the ranked candidate list.

    Returns:
        Tuple (hit, ndcg) as produced by ``evaluate``.

    Raises:
        KeyError: if the user, or a candidate item, has no entry in ``score_dict``.
        IndexError: if the user has no row in ``test_data_df``.
        ValueError: if fewer than 99 unseen items are available to sample.
    """
    # Scores are only looked up by item id below, so the user's score mapping
    # is used as-is; sorting every item first would not affect the result.
    user_scores = score_dict[test_user]

    # Items the user already has in the training data, and the held-out item(s).
    asis = train_data_df.loc[train_data_df['user'] == test_user, 'item'].tolist()
    gt = test_data_df.loc[test_data_df['user'] == test_user, 'item'].tolist()

    # 99 random items the user has not interacted with, appended after the
    # held-out item(s) to form the candidate list.
    unseen = set(range(num_items)) - set(asis) - set(gt)
    test_cand_99 = random.sample(list(unseen), 99)
    test_cand = gt + test_cand_99

    # Re-collect the scores of just the candidates.
    test_score = {item: user_scores[item] for item in test_cand}

    # NOTE(review): sorted() is stable and the held-out item is inserted
    # first, so on an exact score tie it ranks ahead of the tied sampled
    # items. Confirm this tie-breaking is intended.
    ranked = sorted(test_score, key=test_score.get, reverse=True)

    return evaluate(gt[0], ranked, K)

In [7]:
# Per-cut-off result lists, one entry per (repetition, user).
hr_5, ndcg_5 = [], []
hr_10, ndcg_10 = [], []
hr_20, ndcg_20 = [], []

# Cut-off K -> (hit-rate list, NDCG list); lets one loop replace three
# copy-pasted call/append/print groups.
results_by_k = {
    5: (hr_5, ndcg_5),
    10: (hr_10, ndcg_10),
    20: (hr_20, ndcg_20),
}

# user_test samples random candidates; seeding here makes a re-run of this
# cell reproduce the same numbers.
random.seed(0)

for epoch in tqdm(range(5)): # number of repetitions
    for i in range(st,en): # only users in the loaded chunk are tested
        # NOTE: every user_test call draws its own random candidates, so the
        # three cut-offs are scored on different candidate lists.
        for k, (hr_k, ndcg_k) in results_by_k.items():
            hit_k, gain_k = user_test(i, k)
            hr_k.append(hit_k)
            ndcg_k.append(gain_k)

# Mean hit rate / mean NDCG per cut-off.
for k, (hr_k, ndcg_k) in results_by_k.items():
    print("K={:<2} : {:.4f} / {:.4f}".format(k, sum(hr_k)/len(hr_k), sum(ndcg_k)/len(ndcg_k)))

100%|█████████████████████████████████████████████████████████████████████████████████| 5/5 [1:18:11<00:00, 938.21s/it]

K=5  : 0.8328 / 0.7655
K=10 : 0.8819 / 0.7820
K=20 : 0.9336 / 0.7944



