In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
import random

filename="ml1m" # mlsmall ml1m lastfm abook

#train data load
train_data_df = pd.read_csv(
    './data/'+filename+'.train.rating', 
    sep='\t', header=None, names=['user', 'item'], 
    usecols=[0, 1], dtype={0: np.int32, 1: np.int32})

#test data load
#99개는 나중에 test과정에서 랜덤으로 뽑자
test_data_df = pd.read_csv(
    './data/'+filename+'.test.rating', 
    sep='\t', header=None, names=['user', 'item'], 
    usecols=[0, 1], dtype={0: np.int32, 1: np.int32})

#user, item num
num_users = train_data_df['user'].max() + 1
num_items = train_data_df['item'].max() + 1

print("n_user : {}, n_item : {}".format(num_users, num_items))
print("train : {}, test : {}".format(len(train_data_df), len(test_data_df)))

u_lst = train_data_df['user'].tolist()
i_lst = train_data_df['item'].tolist()

# rating array 생성
mat = np.zeros((num_users, num_items))
for i in tqdm(range(len(train_data_df))):
    mat[u_lst[i], i_lst[i]] = 1

# user - item matrix    
user_item_df = pd.DataFrame(mat)

 18%|████████████▍                                                        | 179667/994169 [00:00<00:00, 1784216.27it/s]

n_user : 6040, n_item : 3706
train : 994169, test : 6040


100%|█████████████████████████████████████████████████████████████████████| 994169/994169 [00:00<00:00, 1748922.30it/s]


# Implicit library import
#### https://github.com/benfred/implicit
#### https://implicit.readthedocs.io/en/latest/models.html

In [2]:
from scipy.sparse import csr_matrix
from implicit.bpr import BayesianPersonalizedRanking

In [3]:
model = BayesianPersonalizedRanking(factors=16, iterations=100)

In [4]:
# user-item matrix
user_items = csr_matrix(mat).tocsr()

#item-user matrix
item_users = csr_matrix(mat.T).tocsr()

print(mat.shape) #user-item
print(mat.T.shape) #item-user

(6040, 3706)
(3706, 6040)


In [5]:
# model fitting => item-user mat
model.fit(item_users)

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




## evaluation

In [6]:
def hit(gt_item, pred_items):
	if gt_item in pred_items:
		return 1
	return 0


def ndcg(gt_item, pred_items):
	if gt_item in pred_items:
		index = pred_items.index(gt_item)
		return np.reciprocal(np.log2(index+2))
	return 0

def evaluate(gt_item, full_pred_items, K):
    pred_items = full_pred_items[0:K]
    return hit(gt_item, pred_items), ndcg(gt_item, pred_items)

def user_test(test_user, K):
    # 강좌별 추천 score dict 형식으로 저장
    recommendations = model.recommend(test_user, user_items, N=num_items)
    pred = {x:y for x,y in recommendations}
    
    # test 100개 리스트 만들기
    asis = train_data_df[train_data_df['user']==test_user]['item'].tolist()
    gt = test_data_df[test_data_df['user']==test_user]['item'].tolist()

    full = set(range(0,num_items))
    test_cand_99 = random.sample(list(full-set(asis)-set(gt)),99)
    test_cand = gt.copy()
    test_cand.extend(test_cand_99)

    # 100개 score 다시 뽑아서 test_score에 저장
    test_score=dict()
    for item in test_cand:
        if item in pred.keys():
            test_score.update({item:pred[item]})
        else:
            test_score.update({item:0})
    res = dict(sorted(test_score.items(),key=(lambda x:x[1]), reverse=True))
    return evaluate(gt[0], list(res.keys()), K)    

In [7]:
fin_hr=[]
fin_ndcg=[]
for epoch in tqdm(range(10)):
    _hr=[]
    _ndcg=[]

    #for i in tqdm(range(num_users)):
    for i in range(num_users):
        temp1, temp2 = user_test(i,10)
        _hr.append(temp1)
        _ndcg.append(temp2)
    fin_hr.append(sum(_hr)/len(_hr))
    fin_ndcg.append(sum(_ndcg)/len(_ndcg))
    #print(epoch+1, sum(_hr)/len(_hr), sum(_ndcg)/len(_ndcg))

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:42<00:00, 28.27s/it]


In [8]:
print(fin_hr,'\n',fin_ndcg)

[0.5834437086092715, 0.5852649006622517, 0.5917218543046358, 0.5862582781456953, 0.5869205298013245, 0.5850993377483443, 0.5913907284768212, 0.5846026490066225, 0.5839403973509933, 0.5864238410596027] 
 [0.3421733024160763, 0.3417395809007765, 0.3436753100654177, 0.34319141780561596, 0.3417458940311683, 0.34208597815351455, 0.3413774322907072, 0.34089836511339494, 0.3403802409635014, 0.34360076263552874]
