In [1]:
# Mount Google Drive at /content/drive (Colab-specific) so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
print(os.getcwd())
# Switch the working directory to the NeuMF project folder; presumably this is
# what lets the relative imports and relative data paths below resolve.
os.chdir('/content/drive/MyDrive/015GithubRepos/Gitbook_recsys/NeuMF')
print(os.getcwd())

/content
/content/drive/MyDrive/015GithubRepos/Gitbook_recsys/NeuMF


In [3]:
from typing import Dict
import argparse

# Experiment configuration expressed as an argparse parser.
# Every numeric option declares its `type` so that a value supplied as text
# (e.g. '--layer_dim 64') is converted to a number instead of staying a string.
# The defaults, and therefore the values used in this notebook, are unchanged.
parser = argparse.ArgumentParser(description='NeuMF')

parser.add_argument('--ratings_path', type=str, default='../data/ml-1m/ratings.dat')
parser.add_argument('--layer_dim', type=int, default=256)
parser.add_argument('--emb_dim', type=int, default=256)
parser.add_argument('--neg_ratio', type=int, default=3)
parser.add_argument('--prep_data_dir', type=str, default='prep_data')
parser.add_argument('--prep_data_name', type=str, default='prep_data.pkl')
parser.add_argument('--test_ratio', type=float, default=0.1)
parser.add_argument('--batch_size', type=int, default=128)
parser.add_argument('--lr', type=float, default=0.0025)
parser.add_argument('--n_epoch', type=int, default=50)
parser.add_argument('--train_test_split_rs', type=int, default=1234)
parser.add_argument('--topk', type=int, default=25)
parser.add_argument('--model_save_path', type=str, default='result/neumf.pth')

# An explicit empty argv is parsed, so sys.argv is ignored and only the
# defaults declared above end up in `args`.
args = parser.parse_args([])

In [4]:
from torch.utils.data import Dataset, DataLoader, random_split

# from arguments import args
from dataset.prepdataset import ML1mDataset
from dataset.customdataset import CustomTrainDataset, CustomTestDataset
from model import NeuMF
from trainer import Trainer


# if __name__ == '__main__':
# preprocess
# ml1m_dataset = ML1mDataset(args)
# ml1m_dataset.preprocess()

# # dataset
# train_dataset = CustomTrainDataset(args, ml1m_dataset)
# test_data = ml1m_dataset.load_prep_data()['test']

# train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)

# model = NeuMF(args)
# trainer = Trainer(args, model, train_dataloader, test_data)
# trainer.train()

# print('Done.')

# aiground 데이터 돌리기

In [5]:
import os

In [6]:
os.getcwd()

'/content/drive/MyDrive/015GithubRepos/Gitbook_recsys/NeuMF'

In [7]:
data_path = '../../aiground/data'

In [8]:
import pandas as pd
# Read the history, profile and metadata CSV files from `data_path`, in that order.
history_df, profile_df, meta_df = (
    pd.read_csv(os.path.join(data_path, csv_name), encoding='utf-8')
    for csv_name in ('history_data.csv', 'profile_data.csv', 'meta_data.csv')
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [9]:
# Preprocessing (de-duplication).
# Note: which rows are dropped depends on the `subset` given to drop_duplicates.
# ex) 'profile_id', 'album_id'             : drops every repeated viewing of an album
#     'profile_id', 'album_id', 'log_time' : drops only viewings logged at the same time
data = (
    history_df[['profile_id', 'log_time', 'album_id']]
    .drop_duplicates(subset=['profile_id', 'album_id', 'log_time'])
    .sort_values(by=['profile_id', 'log_time'])
    .reset_index(drop=True)
    .assign(rating=1)  # implicit feedback: every logged interaction is a positive
)

# Ids are used directly as indices later on, so the table sizes are max id + 1.
args.n_users = data['profile_id'].max() + 1
args.n_items = data['album_id'].max() + 1

In [10]:
from sklearn.model_selection import train_test_split


In [11]:
# Split into training and validation sets.
# The validation size is passed as an absolute row count, not as a fraction.
train, valid = train_test_split(
    data,
    test_size=int(args.test_ratio * len(data)),
    random_state=args.train_test_split_rs,
)
print('학습 데이터 크기:', train.shape)
print('검증 데이터 크기:', valid.shape)

학습 데이터 크기: (809327, 4)
검증 데이터 크기: (89925, 4)


# negative sampling

In [12]:
import numpy as np
# Negative sampling: for every user in the training split, draw items the user
# has not interacted with, at most `args.neg_ratio` negatives per positive.
print("Negative sampling for train data".ljust(60, '-'))
pos_items = train.groupby('profile_id')['album_id'].agg(lambda x: set(x)).to_dict()

# The item universe is identical for every user, so it is built once here
# rather than once per loop iteration.
all_items = set(range(args.n_items))

neg_samples_user, neg_samples_item = [], []
for u in train['profile_id'].unique():
    u_pos_items = pos_items[u]  # set of items this user interacted with
    neg_items = list(all_items - u_pos_items)
    # Never ask for more negatives than exist; replace=False keeps them distinct.
    n_neg = min(len(u_pos_items) * args.neg_ratio, len(neg_items))
    neg_samples = np.random.choice(neg_items, n_neg, replace=False)
    neg_samples_user.extend([u] * len(neg_samples))
    neg_samples_item.extend(neg_samples)
print(f"train size after negative sampling: {len(train):,} --> {len(train)+len(neg_samples_user):,}")
print("".ljust(60, '-'))

Negative sampling for train data----------------------------
train size after negative sampling: 809,327 --> 1,879,679
------------------------------------------------------------


In [13]:
# Merge positives and sampled negatives into parallel lists: all positives come
# first (label 1), followed by all negatives (label 0).
# NOTE: `train` is rebound from a DataFrame to a dict here, and len(train) is
# read on the right-hand side before the rebinding takes effect. Re-running
# this cell after it has executed will therefore fail, because the dict has no
# 'profile_id' key.
train = {
    'user' : train['profile_id'].tolist() + neg_samples_user,
    'item': train['album_id'].tolist() + neg_samples_item,
    'label' : [1] * len(train) + [0] * len(neg_samples_user)
    }

In [14]:
from torch.utils.data import Dataset

class CustomTrainDataset(Dataset):
    """Map-style dataset over three parallel sequences: user, item and label.

    Args:
        args: Experiment configuration; stored on the instance unchanged.
        prep_data: Mapping with the keys 'user', 'item' and 'label'. Each value
            is an indexable sequence, and sample ``i`` is read from position
            ``i`` of all three.
    """

    def __init__(self, args, prep_data):
        banner = "CustomTrainDataset Init".ljust(60, '=')
        print(banner)
        self.args = args
        self.user, self.item, self.label = (
            prep_data[key] for key in ('user', 'item', 'label')
        )
        print("".ljust(60, '='))

    def __len__(self):
        # The dataset length is defined by the user sequence.
        return len(self.user)

    def __getitem__(self, i):
        # One training example: (user id, item id, label).
        return self.user[i], self.item[i], self.label[i]

In [15]:
# Wrap the user/item/label dict built above in the Dataset defined in this notebook.
train_dataset = CustomTrainDataset(args, train)



In [16]:
# test_data = ml1m_dataset.load_prep_data()['test']
# Ground truth for evaluation: dict mapping each profile_id in the validation
# split to the array of distinct album_ids it contains for that profile.
test_data = valid.groupby('profile_id')['album_id'].unique().to_dict()

In [None]:
# Batch the training examples in shuffled order.
train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)

# Build the model from `args` and hand it to the project's Trainer together
# with the loader and the validation ground truth.
model = NeuMF(args)
trainer = Trainer(args, model, train_dataloader, test_data)
trainer.train()

cuda
Trainer.train()---------------------------------------------
        ratings_path : ../data/ml-1m/ratings.dat
           layer_dim : 256
             emb_dim : 256
           neg_ratio : 3
       prep_data_dir : prep_data
      prep_data_name : prep_data.pkl
          test_ratio : 0.1
          batch_size : 128
                  lr : 0.0025
             n_epoch : 50
 train_test_split_rs : 1234
                topk : 25
     model_save_path : result/neumf.pth
             n_users : 33033
             n_items : 25917
NeuMF(
  (GMF_user): Embedding(33033, 256)
  (GMF_item): Embedding(25917, 256)
  (MLP_user): Embedding(33033, 256)
  (MLP_item): Embedding(25917, 256)
  (MLP_linear): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=64, bias=True)
  )
  (output_layer): Linear(in_features=320, out_features=1, bias=True)
)


In [None]:
def recallk(actual, predicted, k = 25):
    """Recall@k between the items a user actually consumed and a ranked prediction.

    Args:
        actual : items the user really interacted with (duplicates are ignored)
        predicted : ranked list of recommended items, best first
        k : how many of the top predictions to consider (ex: k=5 looks at the top 5 only)
    Returns:
        recall_k : |actual ∩ predicted[:k]| / min(k, |actual|).
            0.0 is returned when ``actual`` is empty or ``k`` is not positive,
            where the plain formula would divide by zero.
    """
    set_actual = set(actual)
    denominator = min(k, len(set_actual))
    if denominator <= 0:
        # Nothing could have been recalled; avoid ZeroDivisionError.
        return 0.0
    hits = len(set_actual & set(predicted[:k]))
    return hits / denominator

def unique(sequence):
    """Return the elements of ``sequence`` in first-seen order with repeats removed."""
    seen = set()
    ordered = []
    for element in sequence:
        if element in seen:
            continue
        seen.add(element)
        ordered.append(element)
    return ordered

def ndcgk(actual, predicted, k = 25):
    """NDCG@k with binary relevance.

    An item at 0-based position ``i`` of the de-duplicated top-k prediction
    contributes 1 / ln(i + 2) when it is in ``actual``. The ideal DCG assumes
    the first min(k, |actual|) positions are all hits.

    Args:
        actual : items the user really interacted with (duplicates are ignored)
        predicted : ranked list of recommended items, best first
        k : how many of the top predictions to consider
    Returns:
        ndcg_k : DCG / IDCG. 0.0 is returned when ``actual`` is empty or ``k``
            is not positive, where IDCG would be zero.
    """
    set_actual = set(actual)
    n_ideal = min(k, len(set_actual))
    if n_ideal <= 0:
        # IDCG would be 0; avoid ZeroDivisionError.
        return 0.0
    idcg = sum(1.0 / np.log(i + 2) for i in range(n_ideal))
    # dict.fromkeys removes repeated predictions while keeping first-seen order,
    # so a duplicated item cannot be counted as a hit twice.
    ranked = dict.fromkeys(predicted[:k])
    dcg = sum(1.0 / np.log(i + 2) for i, r in enumerate(ranked) if r in set_actual)
    return dcg / idcg

def evaluation(test_data, pred):
    """Score predictions with Recall@25, NDCG@25 and the combined competition metric.

    Args:
        test_data : dict mapping profile_id -> collection of album_ids the user actually watched
        pred : DataFrame with the columns 'profile_id' and 'predicted_list'
    Returns:
        rets : dict with the keys 'recall', 'ndcg' and 'score', where
            score = 0.75 * recall + 0.25 * ndcg (coverage is not computed here)
            ex) {'recall': 0.123024, 'ndcg': 0.056809, 'score': 0.106470}
    """
    # Materialise the dict view as a list of (profile_id, actual_list) pairs
    # before handing it to the DataFrame constructor.
    gt = pd.DataFrame(list(test_data.items()), columns=['profile_id', 'actual_list'])

    # Left join: one row per predicted user, with that user's ground truth attached.
    evaluated_data = pd.merge(pred, gt, how = 'left', on = 'profile_id')

    # recallk / ndcgk are called with their default k (25).
    evaluated_data['Recall@25'] = evaluated_data.apply(lambda x: recallk(x.actual_list, x.predicted_list), axis=1)
    evaluated_data['NDCG@25'] = evaluated_data.apply(lambda x: ndcgk(x.actual_list, x.predicted_list), axis=1)

    recall = evaluated_data['Recall@25'].mean()
    ndcg = evaluated_data['NDCG@25'].mean()
    # coverage = (evaluated_data['predicted_list'].apply(lambda x: x[:args.topk]).explode().nunique())/meta_df.index.nunique()

    score = 0.75*recall + 0.25*ndcg
    rets = {"recall" :recall,
            "ndcg" :ndcg,
            "score" :score}
    return rets

In [None]:
import torch

# Use the GPU when one is present and fall back to the CPU otherwise, so the
# inference cell does not fail on a CPU-only runtime.
# NOTE(review): this must match the device the model was placed on — confirm against Trainer.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
import numpy as np
import pandas as pd
import torch  # used below; no earlier cell visible here imports the `torch` name itself

# Rank every item for each validation user and keep the top-k as the prediction.
# def valid_epoch(cfg, model, data, mode='valid'):
pred_list = []
model.eval()

query_user_ids = list(test_data.keys())  # every user to run inference for
full_item_ids = np.arange(args.n_items)  # every candidate item id
# The item tensor is the same for all users, so it is built once outside the loop.
item_ids = torch.LongTensor(full_item_ids).to(device)

with torch.no_grad():
    for user_id in query_user_ids:
        # Pair this user with every item: [user_id, user_id, ..., user_id], length n_items.
        user_ids = torch.LongTensor(np.full(args.n_items, user_id)).to(device)

        eval_output = model.forward(user_ids, item_ids).detach().cpu().numpy()
        pred_u_score = eval_output.reshape(-1)

        # Sort by descending score and map the positions back to item ids.
        pred_u_idx = np.argsort(pred_u_score)[::-1]
        pred_u = full_item_ids[pred_u_idx]
        pred_list.append(list(pred_u[:args.topk]))

# One row per user: the user id and its ranked top-k item list.
pred = pd.DataFrame()
pred['profile_id'] = query_user_ids
pred['predicted_list'] = pred_list



SyntaxError: ignored

In [None]:
# Check model performance
# if mode == 'valid':
rets = evaluation(test_data, pred)


# return pred

In [None]:
rets

{'recall': 0.11451343234974021,
 'ndcg': 0.07380236883516468,
 'score': 0.10433566647109632}

# aiground 데이터 돌려보기 -> movielens 데이터 전처리가 이상한건가

In [None]:
# 패키지 로드
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os, random

from scipy import sparse
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

import plotnine
from plotnine import *

In [None]:
import torch
# 하이퍼파라미터 
class cfg:
    """Hyperparameter namespace; values are read and added as class attributes."""
    seed = 42
    top_k = 25
    neg_ratio = 100
    test_size = 0.2
    gpu_idx = 0
    # Use the selected GPU when CUDA is available, otherwise the CPU.
    device = torch.device(f"cuda:{gpu_idx}") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load data: history, profile and metadata CSV files, in that order.
data_path = '/content/drive/MyDrive/015GithubRepos/aiground/data'
history_df, profile_df, meta_df = (
    pd.read_csv(os.path.join(data_path, csv_name), encoding='utf-8')
    for csv_name in ('history_data.csv', 'profile_data.csv', 'meta_data.csv')
)

In [None]:
# Preprocessing (de-duplication).
# Note: which rows are dropped depends on the `subset` given to drop_duplicates.
# ex) 'profile_id', 'album_id'             : drops every repeated viewing of an album
#     'profile_id', 'album_id', 'log_time' : drops only viewings logged at the same time
data = (
    history_df[['profile_id', 'log_time', 'album_id']]
    .drop_duplicates(subset=['profile_id', 'album_id', 'log_time'])
    .sort_values(by=['profile_id', 'log_time'])
    .reset_index(drop=True)
    .assign(rating=1)  # implicit feedback: every logged interaction is a positive
)

# Ids are used directly as indices later on, so the table sizes are max id + 1.
cfg.n_users = data['profile_id'].max() + 1
cfg.n_items = data['album_id'].max() + 1

In [None]:
# Split into training and validation sets (validation fraction = cfg.test_size).
train, valid = train_test_split(
    data,
    random_state=cfg.seed,
    test_size=cfg.test_size,
)
print('학습 데이터 크기:', train.shape)
print('검증 데이터 크기:', valid.shape)

학습 데이터 크기: (719401, 4)
검증 데이터 크기: (179851, 4)


In [None]:
# Convert the interactions into a dense (n_users x n_items) matrix.
# The sparse matrix is built in a single vectorised call instead of assigning
# each (user, item) cell in a Python loop over every training row.
rows = train['profile_id'].to_numpy()
cols = train['album_id'].to_numpy()
matrix = sparse.csr_matrix(
    (train['rating'].to_numpy(dtype=float), (rows, cols)),
    shape=(cfg.n_users, cfg.n_items),
)
# The constructor sums repeated (user, item) pairs, whereas cell-by-cell
# assignment overwrote them. `rating` is always 1, so reset every stored value
# to 1 to keep the matrix binary.
matrix.data[:] = 1.0

# NOTE: `train` is rebound from a DataFrame to a dense float array here; at the
# shape shown below (33033 x 25917) that is roughly 6.8 GB of float64.
train = matrix.toarray()
print("train 형태: \n", train)

  0%|          | 0/719401 [00:00<?, ?it/s]

train 형태: 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
train.shape

(33033, 25917)

In [None]:
# Extract user feature information.
# Re-index only when needed: once 'profile_id' has become the index it is no
# longer a column, and calling set_index again would raise KeyError on re-run.
if profile_df.index.name != 'profile_id':
    profile_df = profile_df.set_index('profile_id')
# Nested dict: user_features['age'][profile_id] -> age value.
user_features = profile_df[['age']].to_dict()
print("user_id 3의 age 정보 :", user_features['age'][3])

user_id 3의 age 정보 : 5


In [None]:
# Extract item feature information.
# Re-index only when needed: once 'album_id' has become the index it is no
# longer a column, and calling set_index again would raise KeyError on re-run.
if meta_df.index.name != 'album_id':
    meta_df = meta_df.set_index('album_id')

# Convert the categorical genre column to integer codes.
le = LabelEncoder()
meta_df['genre_mid'] = le.fit_transform(meta_df['genre_mid'])
# Nested dict: item_features['genre_mid'][album_id] -> encoded genre.
item_features = meta_df[['genre_mid']].to_dict()
print("album_id 749의 genre_mid 정보 :", item_features['genre_mid'][749])

album_id 749의 genre_mid 정보 : 1


In [None]:
# Store properties of the extracted features on the config.
cfg.n_genres = meta_df['genre_mid'].nunique()  # number of distinct encoded genres
cfg.n_continuous_feats = 1  # presumably counts the single 'age' feature — TODO confirm

In [None]:
model

NameError: ignored

In [None]:
def make_UIdataset(train, neg_ratio):
    """Build, for every user, the arrays needed for training.

    Args:
        train : dense user-item interaction matrix; row index is the user id,
            column index is the item id, and values above 0.5 mark positives
        neg_ratio : number of negatives to sample per positive
            ex) 3 (3 negative labels for each positive label)
    Returns:
        UIdataset : dict mapping user id -> list of four arrays of equal length
            [0] item ids, positives first and sampled negatives after them
            [1] the user's 'age' feature, repeated once per item
            [2] the 'genre_mid' feature of each item
            [3] labels, 1.0 for the positives and 0.0 for the negatives
            ex) >>> UIdataset[3]
                [array([   16,    17,    18, ...,  9586, 18991,  9442]),
                 array([5, 5, 5, ..., 5, 5, 5]),
                 array([4, 4, 4, ..., 5, 1, 1]),
                 array([1., 1., 1., ..., 0., 0., 0.])]
    """
    UIdataset = {}
    for user_id, items_by_user in enumerate(train):
        # Positives are the items this user interacted with.
        pos_item_ids = np.where(items_by_user > 0.5)[0]

        # Random negative sampling among the remaining items, capped at the
        # number of candidates that actually exist.
        neg_items = np.where(items_by_user < 0.5)[0]
        n_neg = min(neg_ratio * len(pos_item_ids), len(neg_items))
        neg_item_ids = np.random.choice(neg_items, n_neg, replace=False)

        sample_items = np.concatenate([pos_item_ids, neg_item_ids])

        # Features are looked up in the notebook-level dictionaries.
        age_feat = np.array([user_features['age'][user_id] for _ in sample_items])
        genre_feat = np.array([item_features['genre_mid'][item_id] for item_id in sample_items])

        labels = np.concatenate([np.ones(len(pos_item_ids)), np.zeros(len(neg_item_ids))])

        UIdataset[user_id] = [sample_items, age_feat, genre_feat, labels]

    return UIdataset

In [None]:
# Build the per-user training arrays once; make_batchdata reads this global.
UIdataset = make_UIdataset(train, neg_ratio=cfg.neg_ratio)

In [None]:
def make_batchdata(user_indices, batch_idx, batch_size):
    """Flatten the samples of one batch of users into parallel lists.

    A batch is defined by users, not by samples: every sample stored in the
    global ``UIdataset`` for each selected user is included.

    Args:
        user_indices : index array of all users
            ex) array([ 3100,  1800, 30098, ...,  2177, 11749, 20962])
        batch_idx : which batch to build (0-based)
            ex) 0
        batch_size : number of users per batch
            ex) 256
    Returns:
        batch_user_ids : user id of every sample in the batch
            ex) [22194, 22194, 22194, 22194, 22194, ...]
        batch_item_ids : item id of every sample in the batch
            ex) [36, 407, 612, 801, 1404, ...]
        batch_feat0 : feature 0 of every sample (second array of the user's entry)
            ex) [6, 6, 6, 6, 6, ...]
        batch_feat1 : feature 1 of every sample (third array of the user's entry)
            ex) [4,  4,  4, 23,  4, ...]
        batch_labels : label of every sample
            ex) [1.0, 1.0, 1.0, 1.0, 1.0, ...]
    """
    start = batch_idx * batch_size
    batch_user_indices = user_indices[start : start + batch_size]

    batch_user_ids, batch_item_ids = [], []
    batch_feat0, batch_feat1, batch_labels = [], [], []
    for user_id in batch_user_indices:
        entry = UIdataset[user_id]
        item_ids = entry[0]
        # Repeat the user id once per item so all five lists stay aligned.
        batch_user_ids += np.full(len(item_ids), user_id).tolist()
        batch_item_ids += item_ids.tolist()
        batch_feat0 += entry[1].tolist()
        batch_feat1 += entry[2].tolist()
        batch_labels += entry[3].tolist()
    return batch_user_ids, batch_item_ids, batch_feat0, batch_feat1, batch_labels

def update_avg(curr_avg, val, idx):
    """Fold ``val`` into a running mean that currently covers ``idx`` values."""
    running_total = curr_avg * idx + val
    return running_total / (idx + 1)

In [None]:
# Hyperparameter settings
cfg.batch_size = 256  # number of users per batch (see make_batchdata)
cfg.emb_dim = 256
cfg.layer_dim = 256
cfg.dropout = 0.05
cfg.epochs = 25  # NOTE: the training loop in this notebook iterates range(4) and does not read this
cfg.learning_rate = 0.0025
cfg.reg_lambda = 0  # passed to Adam as weight_decay
cfg.check_epoch = 1

In [None]:
# Build the model from the cfg namespace and move it to the configured device.
model = NeuMF(cfg).to(cfg.device)

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.reg_lambda)
# NOTE(review): BCEWithLogitsLoss applies the sigmoid itself and expects raw
# logits — confirm that NeuMF.forward does not already end in a sigmoid.
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')

total_logs = defaultdict(list)
best_scores  = 0
for epoch in range(4):  # NOTE(review): fixed at 4 epochs; cfg.epochs is not used here
    cfg.epoch = epoch

    model.train()
    curr_loss_avg = 0.0

    # Shuffle users with a per-epoch seed so the batch order is reproducible.
    user_indices = np.arange(cfg.n_users)
    np.random.RandomState(cfg.epoch).shuffle(user_indices)
    # Ceiling division: no empty trailing batch when the user count is an
    # exact multiple of the batch size.
    batch_num = int(np.ceil(len(user_indices) / cfg.batch_size))
    bar = tqdm(range(batch_num), leave=False)
    for step, batch_idx in enumerate(bar):
        # The two feature lists are not consumed by this model.
        user_ids, item_ids, _feat0, _feat1, labels = make_batchdata(user_indices, batch_idx, cfg.batch_size)

        user_ids = torch.LongTensor(user_ids).to(cfg.device)
        item_ids = torch.LongTensor(item_ids).to(cfg.device)
        labels = torch.FloatTensor(labels).to(cfg.device).view(-1, 1)

        # Reset gradients
        optimizer.zero_grad()

        # Forward pass
        output = model.forward(user_ids, item_ids).view(-1, 1)
        loss = criterion(output, labels)

        # Stop before back-propagating so a NaN loss never reaches the weights.
        if torch.isnan(loss):
            print('Loss NAN. Train finish.')
            break

        # Back-propagation
        loss.backward()

        # Optimisation step
        optimizer.step()

        # Average the plain Python float, not the tensor: accumulating the
        # tensor would keep autograd history alive across the whole epoch.
        curr_loss_avg = update_avg(curr_loss_avg, loss.item(), step)

        msg = f"epoch: {cfg.epoch}, "
        msg += f"loss: {curr_loss_avg:.5f}, "
        msg += f"lr: {optimizer.param_groups[0]['lr']:.6f}"
        bar.set_description(msg)

NameError: ignored

In [None]:
def valid_epoch(cfg, model, data, mode='valid'):
    """Rank all items for every user in ``data`` and optionally score the ranking.

    Args:
        cfg : config object; ``n_items``, ``device`` and ``top_k`` are read
        model : object whose ``forward(user_ids, item_ids)`` returns one score per pair
        data : DataFrame with a 'profile_id' column; when mode == 'valid' it is
            also passed to ``evaluation`` as the ground truth
        mode : 'valid' to compute metrics as well; any other value returns
            the predictions only
    Returns:
        (rets, pred) when mode == 'valid', otherwise pred.
        pred is a DataFrame with the columns 'profile_id' and 'predicted_list'
        (the cfg.top_k highest-scoring item ids per user, best first).
    """
    pred_list = []
    model.eval()

    query_user_ids = data['profile_id'].unique()  # every user to run inference for
    full_item_ids = np.arange(cfg.n_items)        # every candidate item id
    # The item tensor is the same for all users, so it is built once. The user
    # and item feature tensors are no longer built: the forward call below
    # takes only the user and item ids.
    item_ids = torch.LongTensor(full_item_ids).to(cfg.device)

    with torch.no_grad():
        for user_id in query_user_ids:
            # Pair this user with every item: [user_id, ..., user_id], length n_items.
            user_ids = torch.LongTensor(np.full(cfg.n_items, user_id)).to(cfg.device)

            eval_output = model.forward(user_ids, item_ids).detach().cpu().numpy()
            pred_u_score = eval_output.reshape(-1)

            # Sort by descending score and map the positions back to item ids.
            pred_u_idx = np.argsort(pred_u_score)[::-1]
            pred_u = full_item_ids[pred_u_idx]
            pred_list.append(list(pred_u[:cfg.top_k]))

    pred = pd.DataFrame()
    pred['profile_id'] = query_user_ids
    pred['predicted_list'] = pred_list

    # Check model performance
    if mode == 'valid':
        rets = evaluation(data, pred)
        return rets, pred
    return pred

In [None]:
def recallk(actual, predicted, k = 25):
    """Recall@k between the items a user actually consumed and a ranked prediction.

    Args:
        actual : items the user really interacted with (duplicates are ignored)
        predicted : ranked list of recommended items, best first
        k : how many of the top predictions to consider (ex: k=5 looks at the top 5 only)
    Returns:
        recall_k : |actual ∩ predicted[:k]| / min(k, |actual|).
            0.0 is returned when ``actual`` is empty or ``k`` is not positive,
            where the plain formula would divide by zero.
    """
    set_actual = set(actual)
    denominator = min(k, len(set_actual))
    if denominator <= 0:
        # Nothing could have been recalled; avoid ZeroDivisionError.
        return 0.0
    hits = len(set_actual & set(predicted[:k]))
    return hits / denominator

def unique(sequence):
    """Return the elements of ``sequence`` in first-seen order with repeats removed."""
    seen = set()
    ordered = []
    for element in sequence:
        if element in seen:
            continue
        seen.add(element)
        ordered.append(element)
    return ordered

def ndcgk(actual, predicted, k = 25):
    """NDCG@k with binary relevance.

    An item at 0-based position ``i`` of the de-duplicated top-k prediction
    contributes 1 / ln(i + 2) when it is in ``actual``. The ideal DCG assumes
    the first min(k, |actual|) positions are all hits.

    Args:
        actual : items the user really interacted with (duplicates are ignored)
        predicted : ranked list of recommended items, best first
        k : how many of the top predictions to consider
    Returns:
        ndcg_k : DCG / IDCG. 0.0 is returned when ``actual`` is empty or ``k``
            is not positive, where IDCG would be zero.
    """
    set_actual = set(actual)
    n_ideal = min(k, len(set_actual))
    if n_ideal <= 0:
        # IDCG would be 0; avoid ZeroDivisionError.
        return 0.0
    idcg = sum(1.0 / np.log(i + 2) for i in range(n_ideal))
    # dict.fromkeys removes repeated predictions while keeping first-seen order,
    # so a duplicated item cannot be counted as a hit twice.
    ranked = dict.fromkeys(predicted[:k])
    dcg = sum(1.0 / np.log(i + 2) for i, r in enumerate(ranked) if r in set_actual)
    return dcg / idcg

def evaluation(gt, pred):
    """Compute recall, NDCG, coverage and the competition score for a set of predictions.

    Args:
        gt : ground-truth DataFrame with the columns 'profile_id' and 'album_id'
        pred : prediction DataFrame with the columns 'profile_id' and 'predicted_list'
    Returns:
        rets : dict with the keys 'recall', 'ndcg', 'coverage' and 'score',
            where score = 0.75 * recall + 0.25 * ndcg
            ex) {'recall': 0.123024, 'ndcg': 0.056809, 'coverage': 0.017455, 'score': 0.106470}
    """
    # One row per user holding the array of distinct albums actually watched.
    actual_per_user = (
        gt.groupby('profile_id')['album_id']
        .unique()
        .rename('actual_list')
        .reset_index()
    )

    # Left join keeps exactly the users that were predicted for.
    merged = pred.merge(actual_per_user, how='left', on='profile_id')

    # recallk / ndcgk are called with their default k (25).
    merged['Recall@25'] = merged.apply(lambda row: recallk(row.actual_list, row.predicted_list), axis=1)
    merged['NDCG@25'] = merged.apply(lambda row: ndcgk(row.actual_list, row.predicted_list), axis=1)

    recall = merged['Recall@25'].mean()
    ndcg = merged['NDCG@25'].mean()

    # Coverage: distinct recommended items over the number of distinct index
    # values of the global meta_df.
    recommended = merged['predicted_list'].apply(lambda x: x[:cfg.top_k]).explode()
    coverage = recommended.nunique() / meta_df.index.nunique()

    score = 0.75 * recall + 0.25 * ndcg
    return {
        "recall": recall,
        "ndcg": ndcg,
        "coverage": coverage,
        "score": score,
    }

In [None]:
# Score the model on the validation split; the prediction frame (second return value) is discarded.
valid_results, _ = valid_epoch(cfg, model, valid)

In [None]:
valid_results

{'recall': 0.4431989782384405,
 'ndcg': 0.32750094202036883,
 'coverage': 0.14161755485893418,
 'score': 0.41427446918392263}

# Movielens를 aiground 방식으로 전처리하여 돌리기

In [None]:
# Load the ratings file named in `args`. Column names are supplied explicitly,
# and fields are split on the two-character '::' separator, which is presumably
# why the python parser engine is requested.
ratings_path = args.ratings_path
ratings = pd.read_csv(ratings_path, names=['user', 'item', 'rating', 'timestamp'], sep='::', engine='python')

In [None]:
ratings

Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


NeuMF(
  (GMF_user): Embedding(33033, 256)
  (GMF_item): Embedding(25917, 256)
  (MLP_user): Embedding(33033, 256)
  (MLP_item): Embedding(25917, 256)
  (MLP_linear): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=64, bias=True)
  )
  (output_layer): Linear(in_features=320, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)