# Import

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import os, random

import torch
import torch.nn as nn
from torch.nn.init import normal_
import torch.nn.functional as F

# 하이퍼파라미터 설정

In [2]:
# Global hyperparameters / run configuration.
class cfg:
    """Namespace of run-wide settings; more attributes are attached to it later at module level."""
    seed = 42
    top_k = 25        # number of items kept per user when ranking
    neg_ratio = 100   # negative samples drawn per positive interaction
    gpu_idx = 0
    device = torch.device(f"cuda:{gpu_idx}" if torch.cuda.is_available() else "cpu")

In [3]:
# Check whether PyTorch can see a CUDA device (the value is shown as the cell result).
torch.cuda.is_available()

True

# 시드 고정

In [4]:
# 시드 고정 
def seed_everything(random_seed):
    """Seed the Python, NumPy and PyTorch RNGs and set the cuDNN determinism flags.

    Args:
        random_seed: seed value passed to every generator.
    """
    # Python / NumPy generators.
    random.seed(random_seed)
    np.random.seed(random_seed)

    # PyTorch generators: CPU, current CUDA device, all CUDA devices.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(random_seed)

    # cuDNN: deterministic mode on, benchmark mode off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
# Apply the configured seed before any data sampling or model initialisation happens.
seed_everything(cfg.seed)

# Data Load

In [5]:
# Directory layout: input data, saved model weights, submission output.
data_path, saved_path, output_path = '../data', './saved', './submission'

### 데이터 불러오기
- history_data : 시청 시작 데이터
- profile_data : 프로필 정보 
- meta_data : 콘텐츠 일반 메타 정보

In [6]:
# Viewing-start log.
history_df = pd.read_csv(os.path.join(data_path, 'history_data.csv'), encoding='utf-8')

## Drop duplicated (profile_id, album_id, log_time) events and order each profile's log by time ##
history_df = (
    history_df[['profile_id', 'log_time', 'album_id']]
    .drop_duplicates(subset=['profile_id', 'album_id', 'log_time'])
    .sort_values(by=['profile_id', 'log_time'])
    .reset_index(drop=True)
)
# Every remaining row counts as one implicit positive interaction.
history_df['rating'] = 1
history_df.head(3)

# Profile information and general content metadata.
profile_df, meta_df = (
    pd.read_csv(os.path.join(data_path, file_name), encoding='utf-8')
    for file_name in ('profile_data.csv', 'meta_data.csv')
)

  meta_df = pd.read_csv(os.path.join(data_path, 'meta_data.csv'), encoding='utf-8')


# 전체 데이터를 통하여 table 생성

In [7]:
## 데이터가 있는 행과 열을 기준으로 table 형성을 위하여 rating 기준으로 dataframe 생성 ##

In [8]:
# profile_id x album_id table of the 'rating' column; only used for its shape and axis labels.
ratings_matrix_df = pd.pivot_table(
    history_df,
    values='rating',
    index='profile_id',
    columns='album_id',
)
ratings_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,,,,,,,,...,,,,,,,,,,
5,1.0,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
16,,,,,,,,,,,...,,,,,,,,,,


In [9]:
## ratings_matrix_df 와 동일한 크기의 zero DataFrame 생성 ##

In [10]:
# All-zero float table with the same shape and axis labels as ratings_matrix_df.
ratings_total_matrix_df = pd.DataFrame(
    np.zeros(ratings_matrix_df.shape),
    index=ratings_matrix_df.index,
    columns=ratings_matrix_df.columns,
)
ratings_total_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
## zero DataFrame을 전체 데이터를 이용하여 (profile_id , album_id) 쌍의 개수를 세주기 ##

In [12]:
# Raw history rows: profile_id, log_time, album_id, rating (name kept for later cells).
total_data = history_df.values

# Add one to the (profile_id, album_id) cell for every history row.
# The original did this with one `.loc[...] += 1` per row; np.add.at performs the same
# accumulation (repeated pairs included) in a single vectorised call.
row_pos = ratings_total_matrix_df.index.get_indexer(history_df['profile_id'])
col_pos = ratings_total_matrix_df.columns.get_indexer(history_df['album_id'])
if (row_pos < 0).any() or (col_pos < 0).any():
    # get_indexer marks unknown labels with -1, which would silently hit the last row/column.
    raise KeyError('history_df contains profile_id/album_id values missing from ratings_total_matrix_df')

pair_counts = ratings_total_matrix_df.to_numpy(copy=True)
np.add.at(pair_counts, (row_pos, col_pos), 1)
ratings_total_matrix_df = pd.DataFrame(
    pair_counts,
    index=ratings_total_matrix_df.index,
    columns=ratings_total_matrix_df.columns,
)

ratings_total_matrix_df.head()

  0%|          | 0/899252 [00:00<?, ?it/s]

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 유저별로 최대 1 최소 0 으로 Minmaxscaler 적용

In [13]:
## MinMaxScaler scales column-wise, so transpose (users become columns), scale, then transpose back ##
scaler = MinMaxScaler()
ratings_minmax_matrix_t = scaler.fit_transform(ratings_total_matrix_df.T)
ratings_minmax_matrix_df_t = pd.DataFrame(
    ratings_minmax_matrix_t,
    index=ratings_total_matrix_df.columns,
    columns=ratings_total_matrix_df.index,
)
ratings_minmax_matrix_df = ratings_minmax_matrix_df_t.T
ratings_minmax_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# (profile_id와 index) 와 (album_id와 columns) 변환을 위한 dictionary 생성

In [14]:
# Row labels (profile_id) and column labels (album_id) of the scaled matrix, in positional order.
index_list = list(ratings_minmax_matrix_df.index)
column_list = list(ratings_minmax_matrix_df.columns)

print('index 의 전체 길이 :',len(index_list),'columns 의 전체 길이 :',len(column_list))

# "fake" ids are positions in the matrix, "real" ids are the original profile_id / album_id.
fake_to_real_user = dict(enumerate(index_list)) # index -> profile_id
real_to_fake_user = {real: fake for fake, real in fake_to_real_user.items()} # profile_id -> index

fake_to_real_album = dict(enumerate(column_list)) # column -> album_id
real_to_fake_album = {real: fake for fake, real in fake_to_real_album.items()} # album_id -> column

print('profile_id 33032의 index 번호 :',real_to_fake_user[33032] ,'index 번호 8310의 profile_id :',fake_to_real_user[8310])
print('album_id 25916의 column 번호 :',real_to_fake_album[25916] ,'column 번호 20694의 album_id :',fake_to_real_album[20694])

index 의 전체 길이 : 8311 columns 의 전체 길이 : 20695
profile_id 33032의 index 번호 : 8310 index 번호 8310의 profile_id : 33032
album_id 25916의 column 번호 : 20694 column 번호 20694의 album_id : 25916


# 필요한 정보 추출

In [15]:
# User-side features: {'age': {profile_id: age}} keyed by the real profile_id.
profile_df = profile_df.set_index(keys='profile_id')
user_features = profile_df.loc[:, ['age']].to_dict()
print("user_id 3의 age 정보 :", user_features['age'][3]) # real id

user_id 3의 age 정보 : 5


In [16]:
# Item-side features, keyed by the real album_id.
meta_df = meta_df.set_index(keys='album_id')

# Encode the categorical genre_mid column as integer class ids.
le = LabelEncoder()
genre_codes = le.fit_transform(meta_df['genre_mid'])
meta_df['genre_mid'] = genre_codes
item_features = meta_df.loc[:, ['genre_mid']].to_dict()
print("album_id 749의 genre_mid 정보 :", item_features['genre_mid'][749]) # real album_id

album_id 749의 genre_mid 정보 : 1


In [17]:
# Record the feature dimensions the model needs.
cfg.n_continuous_feats = 1  # a single continuous input (age)
cfg.n_genres = meta_df.loc[:, 'genre_mid'].nunique()  # number of distinct encoded genres

In [18]:
# Number of distinct users / items that appear in the viewing history.
cfg.n_users = history_df['profile_id'].nunique()
cfg.n_items = history_df['album_id'].nunique()

print(cfg.n_users,cfg.n_items)

8311 20695


# NeuMF 구현
## baseline 모델에서 이진분류가 아닌 회귀문제로 생각하기 위하여 모델 마지막 output에 sigmoid 추가

![](http://drive.google.com/uc?export=view&id=1tpajTLipLoFdvLICO-alAxeoKAE8-k61)

In [19]:
class NeuMF(nn.Module):
    """Neural Matrix Factorization model with a sigmoid output.

    Reference: https://arxiv.org/abs/1708.05031

    Example:
        model = NeuMF(cfg)
        output = model.forward(user_ids, item_ids, [feat0, feat1])
    """

    def __init__(self, cfg):
        """
        Args:
            cfg: configuration object; the attributes n_users, n_items, emb_dim,
                layer_dim, n_continuous_feats, n_genres and dropout are read from it.
        """
        super().__init__()
        # Sizes of the id vocabularies.
        self.n_users = cfg.n_users
        self.n_items = cfg.n_items
        self.n_genres = cfg.n_genres
        # Layer widths and regularisation.
        self.emb_dim = cfg.emb_dim
        self.layer_dim = cfg.layer_dim
        self.n_continuous_feats = cfg.n_continuous_feats
        self.dropout = cfg.dropout
        self.build_graph()

    def build_graph(self):
        """Create the embeddings, the MLP tower and the output layer, then initialise the weights."""
        emb_dim, hidden = self.emb_dim, self.layer_dim
        genre_dim = self.n_genres // 2

        # Matrix-factorisation branch embeddings.
        self.user_embedding_mf = nn.Embedding(num_embeddings=self.n_users, embedding_dim=emb_dim)
        self.item_embedding_mf = nn.Embedding(num_embeddings=self.n_items, embedding_dim=emb_dim)

        # MLP branch embeddings.
        self.user_embedding_mlp = nn.Embedding(num_embeddings=self.n_users, embedding_dim=emb_dim)
        self.item_embedding_mlp = nn.Embedding(num_embeddings=self.n_items, embedding_dim=emb_dim)

        # Attribute name (including its spelling) is kept as-is: it is part of the state_dict keys.
        self.genre_embeddig = nn.Embedding(num_embeddings=self.n_genres, embedding_dim=genre_dim)

        # MLP input = user emb + item emb + genre emb + continuous features.
        mlp_in_dim = 2 * emb_dim + genre_dim + self.n_continuous_feats
        self.mlp_layers = nn.Sequential(
            nn.Linear(mlp_in_dim, hidden),
            nn.ReLU(),
            nn.Dropout(p=self.dropout),
            nn.Linear(hidden, hidden // 2),
            nn.ReLU(),
            nn.Dropout(p=self.dropout),
        )
        # Final layer consumes the concatenated MLP output and MF product.
        self.affine_output = nn.Linear(hidden // 2 + emb_dim, 1)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Draw Embedding/Linear weights from N(0, 0.01) and zero the Linear biases."""
        if not isinstance(module, (nn.Embedding, nn.Linear)):
            return
        normal_(module.weight.data, mean=0.0, std=0.01)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.fill_(0.0)

    def forward(self, user_indices, item_indices, feats):
        """
        Args:
            user_indices: tensor of user indices
                ex) tensor([ 3100,  3100,  ..., 14195, 14195])
            item_indices: tensor of item indices
                ex) tensor([   50,    65,   ..., 14960, 11527])
            feats: feature list; feats[0] is concatenated to the MLP input as one extra
                column, feats[1] is looked up in the genre embedding.
        Returns:
            output: one sigmoid-squashed score per user-item pair, each in (0, 1).
        """
        continuous_feat, genre_ids = feats[0], feats[1]

        # MF branch: element-wise product of user and item embeddings.
        mf_output = self.user_embedding_mf(user_indices) * self.item_embedding_mf(item_indices)

        # MLP branch: concatenate id embeddings with the genre embedding and the continuous feature.
        mlp_input = torch.cat(
            (
                self.user_embedding_mlp(user_indices),
                self.item_embedding_mlp(item_indices),
                self.genre_embeddig(genre_ids),
                continuous_feat.unsqueeze(1),
            ),
            dim=-1,
        )
        mlp_output = self.mlp_layers(mlp_input)

        # Fuse both branches and squash to (0, 1).
        logits = self.affine_output(torch.cat([mlp_output, mf_output], dim=-1)).squeeze(-1)
        return torch.sigmoid(logits)

# 학습 및 추론에 필요한 데이터 셋 생성 코드 구현

In [20]:
def make_UIdataset(train, neg_ratio):
    """Build the per-user training samples (positives plus randomly sampled negatives).

    Reads the module-level lookups ``user_features``, ``item_features``,
    ``fake_to_real_user`` and ``fake_to_real_album``.

    Args:
        train: user-item interaction matrix; row position is the fake user id and
            column position is the fake item id. Entries > 0 are positives.
            ex)
                array([[0., 0., 0., ..., 0., 0., 0.],
                       ...,
                       [0., 0., 0., ..., 0., 0., 0.]])
        neg_ratio: number of negatives sampled per positive
            ex) 3 (3 negative labels per positive label)
    Returns:
        UIdataset: dict mapping fake user id to a list of four aligned arrays
            [item ids (positives first, then negatives), user age repeated per item,
             encoded genre per item, label per item (the matrix value)]
            ex) >>> UIdataset[3]
                [array([   16,    17,    18, ...,  9586, 18991,  9442]),
                 array([5, 5, 5, ..., 5, 5, 5]),
                 array([4, 4, 4, ..., 5, 1, 1]),
                 array([1., 1., 1., ..., 0., 0., 0.])]
    """
    UIdataset = {}
    for user_id, items_by_user in enumerate(tqdm(train)):
        # Positive samples: fake item ids with a non-zero interaction value.
        pos_item_ids = np.where(items_by_user > 0)[0]

        # Negative samples: random draw without replacement from non-interacted items,
        # capped at the number of candidates available.
        neg_candidates = np.where(items_by_user <= 0)[0]
        num_neg_samples = min(neg_ratio * len(pos_item_ids), len(neg_candidates))
        neg_item_ids = np.random.choice(neg_candidates, num_neg_samples, replace=False)

        # Build the item array once instead of re-concatenating it for every feature.
        item_ids = np.concatenate([pos_item_ids, neg_item_ids])

        # The age is constant per user, so look it up once and broadcast it.
        user_age = user_features['age'][fake_to_real_user[user_id]]
        age_feats = np.full(len(item_ids), user_age)

        # Encoded genre of every sampled item.
        genre_by_album = item_features['genre_mid']
        genre_feats = np.array([genre_by_album[fake_to_real_album[item_id]] for item_id in item_ids])

        # Labels are the matrix values themselves (scaled counts for positives, 0 for negatives).
        labels = items_by_user[item_ids]

        UIdataset[user_id] = [item_ids, age_feats, genre_feats, labels]

    return UIdataset

In [21]:
# Build the training samples from the per-user min-max scaled interaction matrix.
UIdataset = make_UIdataset(ratings_minmax_matrix_df.values, neg_ratio=cfg.neg_ratio)

  0%|          | 0/8311 [00:00<?, ?it/s]

In [22]:
# UIdataset [ 가짜 user _id ] = [[가짜 item id들],[나이정보],[장르정보],[label들 , label]]

In [23]:
def make_batchdata(user_indices, batch_idx, batch_size):
    """Flatten the samples of one batch of users into parallel lists.

    Reads the module-level ``UIdataset``.

    Args:
        user_indices: indices of all users
            ex) array([ 3100,  1800, 30098, ...,  2177, 11749, 20962])
        batch_idx: which batch to build
            ex) 0
        batch_size: number of users per batch
            ex) 256
    Returns:
        batch_user_ids: user index for every sample in the batch
            ex) [22194, 22194, 22194, 22194, 22194, ...]
        batch_item_ids: item index for every sample in the batch
            ex) [36, 407, 612, 801, 1404, ...]
        batch_feat0: feature 0 for every sample
            ex) [6, 6, 6, 6, 6, ...]
        batch_feat1: feature 1 for every sample
            ex) [4,  4,  4, 23,  4, ...]
        batch_labels: label for every sample
            ex) [1.0, 1.0, 1.0, 1.0, 1.0, ...]
    """
    start = batch_idx * batch_size
    selected_users = user_indices[start:start + batch_size]

    batch_user_ids, batch_item_ids = [], []
    batch_feat0, batch_feat1, batch_labels = [], [], []
    for user_id in selected_users:
        entry = UIdataset[user_id]
        item_ids, feat0, feat1, labels = entry[0], entry[1], entry[2], entry[3]
        # Repeat the user id once per sampled item so all lists stay aligned.
        batch_user_ids += np.full(len(item_ids), user_id).tolist()
        batch_item_ids += item_ids.tolist()
        batch_feat0 += feat0.tolist()
        batch_feat1 += feat1.tolist()
        batch_labels += labels.tolist()
    return batch_user_ids, batch_item_ids, batch_feat0, batch_feat1, batch_labels

def update_avg(curr_avg, val, idx):
    """Fold ``val`` into a running mean that currently covers ``idx`` values."""
    running_total = curr_avg * idx + val
    return running_total / (idx + 1)

# 학습 및 검증 코드 생성

In [24]:
# cfg.n_users,cfg.n_items -> 8311 20695 개수 (0~8310 ,0~20694) 가짜 user id ,가짜 item id
def train_epoch(cfg, model, optimizer, criterion):
    """Run one training epoch over all users, in user-level batches.

    Args:
        cfg: configuration; reads n_users, epoch (also used as the shuffle seed),
            batch_size and device.
        model: model called as ``model(user_ids, item_ids, [feat0, feat1])``.
        optimizer: optimizer stepping the model parameters.
        criterion: loss taking (output, labels), both shaped (-1, 1).
    Returns:
        dict with key 'losses': mean per-batch loss of the epoch, rounded to 5 decimals.
    """
    model.train()
    curr_loss_avg = 0.0

    user_indices = np.arange(cfg.n_users)
    # Deterministic, epoch-dependent shuffle of the users.
    np.random.RandomState(cfg.epoch).shuffle(user_indices)
    # Ceil division: avoids a trailing empty batch when n_users is a multiple of batch_size.
    batch_num = (len(user_indices) + cfg.batch_size - 1) // cfg.batch_size
    bar = tqdm(range(batch_num), leave=False)
    for step, batch_idx in enumerate(bar):
        user_ids, item_ids, feat0, feat1, labels = make_batchdata(user_indices, batch_idx, cfg.batch_size)
        # Each batch holds every sample of the selected users.
        user_ids = torch.LongTensor(user_ids).to(cfg.device)
        item_ids = torch.LongTensor(item_ids).to(cfg.device)
        feat0 = torch.FloatTensor(feat0).to(cfg.device)
        feat1 = torch.LongTensor(feat1).to(cfg.device)
        labels = torch.FloatTensor(labels).to(cfg.device).view(-1, 1)

        # Reset gradients.
        optimizer.zero_grad()

        # Forward pass (via __call__ so registered hooks run).
        output = model(user_ids, item_ids, [feat0, feat1]).view(-1, 1)
        loss = criterion(output, labels)

        # Stop before back-propagating a NaN so the weights are not corrupted.
        if torch.isnan(loss):
            print('Loss NAN. Train finish.')
            break

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Accumulate a plain float so no autograd history is kept across steps.
        curr_loss_avg = update_avg(curr_loss_avg, loss.item(), step)

        msg = f"epoch: {cfg.epoch}, "
        msg += f"loss: {curr_loss_avg:.5f}, "
        msg += f"lr: {optimizer.param_groups[0]['lr']:.6f}"
        bar.set_description(msg)
    rets = {'losses': np.around(curr_loss_avg, 5)}
    return rets

In [25]:
def recallk(actual, predicted, k = 25):
    """Recall@k between the items actually seen and the predicted ranking.

    Args:
        actual: items the user actually interacted with
        predicted: predicted items, best first
        k: how many top predictions to consider (ex: k=5 looks at the top 5 only)
    Returns:
        recall_k: hits in the top k divided by min(k, number of distinct actual items);
            0.0 when there are no actual items or k <= 0 (previously a ZeroDivisionError).
    """
    set_actual = set(actual)
    denominator = min(k, len(set_actual))
    if denominator <= 0:
        return 0.0
    hits = len(set_actual & set(predicted[:k]))
    return hits / denominator

def unique(sequence):
    """Return the distinct elements of ``sequence`` in first-seen order."""
    seen = set()
    ordered = []
    for element in sequence:
        if element in seen:
            continue
        seen.add(element)
        ordered.append(element)
    return ordered

def ndcgk(actual, predicted, k = 25):
    """NDCG@k with binary relevance between the actual items and the predicted ranking.

    Args:
        actual: items the user actually interacted with
        predicted: predicted items, best first
        k: how many top predictions to consider
    Returns:
        ndcg_k: DCG of the de-duplicated top-k predictions divided by the ideal DCG;
            0.0 when there are no actual items or k <= 0 (previously a ZeroDivisionError).
    """
    set_actual = set(actual)
    ideal_hits = min(k, len(set_actual))
    if ideal_hits <= 0:
        return 0.0
    idcg = sum(1.0 / np.log(rank + 2) for rank in range(ideal_hits))

    # De-duplicate the top-k while keeping rank order (dict keys preserve insertion order).
    top_k = list(dict.fromkeys(predicted[:k]))
    dcg = sum(1.0 / np.log(rank + 2) for rank, item in enumerate(top_k) if item in set_actual)
    return dcg / idcg

def evaluation(gt, pred):
    """Compute recall, ndcg, coverage and the combined score at ``cfg.top_k``.

    Reads the module-level ``cfg`` (top_k) and ``meta_df`` (catalogue size for coverage).

    Args:
        gt: ground-truth DataFrame with 'profile_id' and 'album_id' columns
        pred: prediction DataFrame with 'profile_id' and 'predicted_list' columns
    Returns:
        rets: recall, ndcg, coverage and score (0.75 * recall + 0.25 * ndcg)
            ex) {'recall': 0.123024, 'ndcg': 0.056809, 'coverage': 0.017455, 'score': 0.106470}
    """
    # Use the same cut-off for every metric (recall/ndcg previously used their default of 25).
    k = cfg.top_k
    recall_col, ndcg_col = f'Recall@{k}', f'NDCG@{k}'

    # One row per profile with the array of albums actually watched.
    actual = gt.groupby('profile_id')['album_id'].unique().to_frame().reset_index()
    actual.columns = ['profile_id', 'actual_list']

    evaluated_data = pd.merge(pred, actual, how = 'left', on = 'profile_id')

    evaluated_data[recall_col] = evaluated_data.apply(lambda x: recallk(x.actual_list, x.predicted_list, k), axis=1)
    evaluated_data[ndcg_col] = evaluated_data.apply(lambda x: ndcgk(x.actual_list, x.predicted_list, k), axis=1)

    recall = evaluated_data[recall_col].mean()
    ndcg = evaluated_data[ndcg_col].mean()
    # Share of the catalogue that appears in at least one user's top-k list.
    recommended = evaluated_data['predicted_list'].apply(lambda x: x[:k]).explode().nunique()
    coverage = recommended / meta_df.index.nunique()

    score = 0.75*recall + 0.25*ndcg
    rets = {"recall" :recall,
            "ndcg" :ndcg,
            "coverage" :coverage,
            "score" :score}
    return rets

## Score a prediction DataFrame against the ground truth ##
def df_to_score(actual_df,predict_df):
    """Rank every user's row of ``predict_df`` and print the resulting score and recall.

    Args:
        actual_df: ground-truth DataFrame with 'profile_id' and 'album_id' columns
        predict_df: score table indexed by profile_id with one column per album_id
    """
    query_user_ids = actual_df['profile_id'].unique()
    # Highest-scored albums first; cut at cfg.top_k (was a hard-coded 25) to match evaluation().
    pred_list = [
        list(predict_df.loc[user_id, :].sort_values(ascending=False).index.values[:cfg.top_k])
        for user_id in query_user_ids
    ]

    pred = pd.DataFrame()
    pred['profile_id'] = query_user_ids
    pred['predicted_list'] = pred_list
    rets = evaluation(actual_df, pred)

    print('score :',rets['score'],'recall :',rets['recall'])

In [26]:
def valid_epoch(cfg, model, data, mode='valid'):
    """Score every item for each user in ``data`` and keep the top ``cfg.top_k``.

    Reads the module-level id mappings and feature lookups.

    Args:
        cfg: configuration; reads n_items, top_k and device.
        model: trained model, called as ``model(user_ids, item_ids, [feat0, feat1])``.
        data: DataFrame with a 'profile_id' column (and 'album_id' when mode == 'valid').
        mode: 'valid' also evaluates the predictions against ``data``.
    Returns:
        (rets, pred) when mode == 'valid', otherwise pred, where pred has the columns
        'profile_id' and 'predicted_list' (real album ids, best first).
    """
    model.eval()

    # Only users present in `data` are scored; convert real ids to matrix positions.
    query_user_ids = [real_to_fake_user[real_id] for real_id in data['profile_id'].unique()]
    full_item_ids = np.arange(cfg.n_items, dtype=np.int64) # every fake item id
    full_item_ids_feat1 = [item_features['genre_mid'][fake_to_real_album[c]] for c in full_item_ids]

    # Item ids and genres are identical for every user: build these tensors once.
    item_ids = torch.LongTensor(full_item_ids).to(cfg.device)
    feat1 = torch.LongTensor(full_item_ids_feat1).to(cfg.device)

    pred_list = []
    with torch.no_grad():
        for user_id in query_user_ids:
            user_ids = torch.LongTensor(np.full(cfg.n_items, user_id)).to(cfg.device)

            feat0 = np.full(cfg.n_items, user_features['age'][fake_to_real_user[user_id]])
            feat0 = torch.FloatTensor(feat0).to(cfg.device)

            eval_output = model(user_ids, item_ids, [feat0, feat1]).detach().cpu().numpy()
            pred_u_score = eval_output.reshape(-1)

            # Descending score order, then map the best positions back to real album ids.
            pred_u_idx = np.argsort(pred_u_score)[::-1]
            pred_u = full_item_ids[pred_u_idx]
            pred_list.append([fake_to_real_album[item] for item in list(pred_u[:cfg.top_k])])

    pred = pd.DataFrame()
    pred['profile_id'] = [fake_to_real_user[fake_id] for fake_id in query_user_ids]
    pred['predicted_list'] = pred_list

    # Check model quality.
    if mode == 'valid':
        rets = evaluation(data, pred)
        return rets, pred
    return pred

In [27]:
def test_epoch(cfg, model, data, mode='test'):
    """Score every item for each user in ``data``; return top-k lists and the raw scores.

    Reads the module-level id mappings and feature lookups.

    Args:
        cfg: configuration; reads n_items, top_k and device.
        model: trained model, called as ``model(user_ids, item_ids, [feat0, feat1])``.
        data: DataFrame with a 'profile_id' column.
        mode: accepted for signature compatibility; not used by this function.
    Returns:
        pred: DataFrame with 'profile_id' and 'predicted_list' (real album ids, best first).
        actual_pred_matrix: list with one score array per user (over all fake item ids),
            in the same order as the rows of pred.
    """
    model.eval()

    # Only users present in `data` are scored; convert real ids to matrix positions.
    query_user_ids = [real_to_fake_user[real_id] for real_id in data['profile_id'].unique()]
    full_item_ids = np.arange(cfg.n_items, dtype=np.int64) # every fake item id
    full_item_ids_feat1 = [item_features['genre_mid'][fake_to_real_album[c]] for c in full_item_ids]

    # Item ids and genres are identical for every user: build these tensors once.
    item_ids = torch.LongTensor(full_item_ids).to(cfg.device)
    feat1 = torch.LongTensor(full_item_ids_feat1).to(cfg.device)

    pred_list = []
    actual_pred_matrix = []
    with torch.no_grad():
        for user_id in query_user_ids:
            user_ids = torch.LongTensor(np.full(cfg.n_items, user_id)).to(cfg.device)

            feat0 = np.full(cfg.n_items, user_features['age'][fake_to_real_user[user_id]])
            feat0 = torch.FloatTensor(feat0).to(cfg.device)

            eval_output = model(user_ids, item_ids, [feat0, feat1]).detach().cpu().numpy()
            pred_u_score = eval_output.reshape(-1)

            actual_pred_matrix.append(pred_u_score)
            # Descending score order, then map the best positions back to real album ids.
            pred_u_idx = np.argsort(pred_u_score)[::-1]
            pred_u = full_item_ids[pred_u_idx]
            pred_list.append([fake_to_real_album[item] for item in list(pred_u[:cfg.top_k])])

    pred = pd.DataFrame()
    pred['profile_id'] = [fake_to_real_user[fake_id] for fake_id in query_user_ids]
    pred['predicted_list'] = pred_list

    return pred,actual_pred_matrix

## 모델 학습

### 하이퍼파라미터 설정 & 최적화 기법 설정

In [28]:
# Training hyperparameters, attached to the shared cfg namespace.

# Model size / regularisation.
cfg.emb_dim = 256
cfg.layer_dim = 256
cfg.dropout = 0.05

# Optimisation.
cfg.learning_rate = 0.001
cfg.reg_lambda = 0
cfg.batch_size = 256

# Schedule.
cfg.epochs = 100
cfg.check_epoch = 1

In [29]:
# Loss (summed squared error), model, and optimizer.
criterion = torch.nn.MSELoss(reduction='sum')
model = NeuMF(cfg).to(cfg.device)
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=cfg.learning_rate,
    weight_decay=cfg.reg_lambda,
)

### 학습 진행

In [30]:
## Roughly 15 minutes of compute for 100 epochs ##
# NOTE(review): range(cfg.epochs+1) runs epochs 0..cfg.epochs inclusive, i.e. one more
# than cfg.epochs — confirm this is intended.
for epoch in range(cfg.epochs+1):
    cfg.epoch = epoch
    train_results = train_epoch(cfg, model, optimizer, criterion)
    print('epoch : ',epoch , 'loss : ',train_results['losses'] )

# Make sure the target directory exists; torch.save does not create it.
os.makedirs(saved_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(saved_path, 'newMF_count_total_model.pth'))

  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  0 loss :  137200.375


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  1 loss :  1374.21289


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  2 loss :  1351.53174


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  3 loss :  1336.34961


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  4 loss :  1309.43689


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  5 loss :  1278.82385


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  6 loss :  1249.85022


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  7 loss :  1220.15332


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  8 loss :  1191.87268


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  9 loss :  1160.8064


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  10 loss :  1130.32678


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  11 loss :  1100.13843


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  12 loss :  1070.91589


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  13 loss :  1041.95276


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  14 loss :  1006.82635


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  15 loss :  966.78607


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  16 loss :  922.70807


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  17 loss :  877.85596


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  18 loss :  833.78235


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  19 loss :  789.54608


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  20 loss :  743.95062


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  21 loss :  701.06628


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  22 loss :  658.98126


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  23 loss :  618.87494


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  24 loss :  582.97168


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  25 loss :  549.45538


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  26 loss :  514.61957


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  27 loss :  482.89148


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  28 loss :  454.69922


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  29 loss :  428.54333


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  30 loss :  404.1521


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  31 loss :  380.89777


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  32 loss :  360.31369


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  33 loss :  341.33817


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  34 loss :  323.7359


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  35 loss :  307.02396


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  36 loss :  292.02234


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  37 loss :  277.53864


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  38 loss :  265.0448


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  39 loss :  253.54013


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  40 loss :  242.05672


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  41 loss :  231.89917


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  42 loss :  222.52266


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  43 loss :  213.82475


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  44 loss :  205.15945


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  45 loss :  198.3054


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  46 loss :  191.213


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  47 loss :  185.72214


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  48 loss :  179.19632


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  49 loss :  173.33195


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  50 loss :  167.20464


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  51 loss :  162.18774


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  52 loss :  158.57112


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  53 loss :  153.32004


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  54 loss :  149.03009


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  55 loss :  145.42122


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  56 loss :  141.70454


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  57 loss :  138.46381


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  58 loss :  134.94905


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  59 loss :  131.93085


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  60 loss :  129.19196


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  61 loss :  126.45668


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  62 loss :  124.25211


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  63 loss :  121.29346


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  64 loss :  119.00863


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  65 loss :  117.17252


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  66 loss :  115.3406


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  67 loss :  112.69533


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  68 loss :  109.79904


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  69 loss :  107.92199


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  70 loss :  106.1062


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  71 loss :  104.62065


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  72 loss :  103.42524


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  73 loss :  101.08116


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  74 loss :  98.69005


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  75 loss :  97.1262


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  76 loss :  95.63674


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  77 loss :  94.00686


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  78 loss :  92.59722


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  79 loss :  90.88269


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  80 loss :  89.6589


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  81 loss :  88.37732


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  82 loss :  87.21542


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  83 loss :  86.54045


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  84 loss :  86.61912


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  85 loss :  86.69067


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  86 loss :  85.52463


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  87 loss :  84.04041


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  88 loss :  82.7324


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  89 loss :  81.98655


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  90 loss :  80.34297


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  91 loss :  79.30241


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  92 loss :  77.72986


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  93 loss :  75.9734


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  94 loss :  75.06636


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  95 loss :  75.70412


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  96 loss :  75.58871


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  97 loss :  75.29728


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  98 loss :  77.03479


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  99 loss :  75.89577


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  100 loss :  72.36798


# 모든 유저에 대해 추천 결과 생성

In [31]:
# map_location lets weights saved on a GPU be loaded on whatever device cfg selected (e.g. CPU-only hosts).
model.load_state_dict(torch.load(os.path.join(saved_path, 'newMF_count_total_model.pth'), map_location=cfg.device))

<All keys matched successfully>

In [None]:
## Inference takes roughly 20 seconds ##
# The sample submission supplies the profile_ids to score.
submission_path = os.path.join(data_path, 'sample_submission.csv')
submission = pd.read_csv(submission_path)
# `submission` is rebound to the prediction DataFrame returned by test_epoch;
# actual_pred_matrix holds one score array per scored user, in the same order.
submission,actual_pred_matrix = test_epoch(cfg, model, submission, mode='test')

In [33]:
# Wrap the raw prediction matrix in a DataFrame that carries the same row and
# column labels as ratings_minmax_matrix_df, then display it.
actual_pred_matrix_df = pd.DataFrame(
    actual_pred_matrix,
    index=ratings_minmax_matrix_df.index,
    columns=ratings_minmax_matrix_df.columns,
)
actual_pred_matrix_df

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.001349,0.001327,0.001918,0.002813,0.002913,0.001291,0.000850,0.004393,0.000228,0.000182,...,0.004228,0.005612,0.005112,0.005105,0.001500,0.003759,0.002811,0.003236,0.002968,0.003223
5,0.124824,0.006320,0.001318,0.002037,0.004726,0.007231,0.002051,0.002599,0.000075,0.014263,...,0.000448,0.000089,0.000112,0.000123,0.000240,0.000227,0.000123,0.000172,0.000147,0.000183
7,0.017060,0.029104,0.071857,0.028652,0.004407,0.015356,0.001762,0.000755,0.000195,0.026352,...,0.000095,0.000147,0.000130,0.000187,0.000151,0.000271,0.000224,0.000195,0.000188,0.000197
12,0.031263,0.005910,0.011938,0.001501,0.003208,0.002793,0.002285,0.001406,0.000917,0.001983,...,0.001196,0.001109,0.000651,0.000993,0.000658,0.001204,0.001028,0.001185,0.001037,0.001177
16,0.005391,0.001231,0.002027,0.000809,0.000988,0.001214,0.001212,0.000062,0.000069,0.000959,...,0.000028,0.000013,0.000011,0.000017,0.000036,0.000043,0.000028,0.000030,0.000034,0.000028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33022,0.455257,0.158891,0.232770,0.007374,0.016252,0.012754,0.012858,0.000507,0.000191,0.000861,...,0.000030,0.000077,0.000059,0.000090,0.000134,0.000168,0.000083,0.000097,0.000083,0.000101
33023,0.038592,0.003348,0.009305,0.001537,0.003749,0.003514,0.004144,0.000226,0.000089,0.000733,...,0.000098,0.000075,0.000074,0.000095,0.000108,0.000159,0.000089,0.000083,0.000088,0.000078
33026,0.000669,0.000547,0.000405,0.002999,0.002839,0.001169,0.000951,0.001287,0.001297,0.011769,...,0.000407,0.000612,0.000458,0.000483,0.000510,0.000371,0.000266,0.000309,0.000314,0.000373
33027,0.055477,0.005943,0.013829,0.002314,0.003579,0.002493,0.002554,0.000337,0.000040,0.000790,...,0.000140,0.000064,0.000069,0.000091,0.000028,0.000074,0.000062,0.000091,0.000075,0.000090


# 생성된 예측 행렬을 앙상블에 사용하기 위해 유저별로 최솟값 0, 최댓값 1이 되도록 MinMaxScaler 적용

In [35]:
# Rescale each user's predicted scores to the [0, 1] range so the matrix can
# be ensembled on a common scale.
# MinMaxScaler scales column-wise, so the frame is transposed first to put one
# user per column, and transposed back once the scaling is done.
scaler = MinMaxScaler()
actual_pred_matrix_t = scaler.fit_transform(actual_pred_matrix_df.transpose())
# Re-label the scaled values with the labels of the frame that was actually
# scaled, instead of borrowing them from a separate rating matrix, so the
# labels cannot get out of sync with the data.
actual_pred_matrix_df_t = pd.DataFrame(
    actual_pred_matrix_t,
    index=actual_pred_matrix_df.columns,
    columns=actual_pred_matrix_df.index,
)
actual_pred_matrix_df = actual_pred_matrix_df_t.transpose()
actual_pred_matrix_df

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.001344,0.001321,0.001921,0.002831,0.002932,0.001284,0.000837,0.004436,0.000205,0.000158,...,0.004268,0.005674,0.005166,0.005159,0.001496,0.003791,0.002829,0.003260,0.002987,0.003246
5,0.130343,0.006591,0.001367,0.002118,0.004926,0.007542,0.002133,0.002705,0.000070,0.014885,...,0.000458,0.000084,0.000108,0.000120,0.000242,0.000228,0.000119,0.000170,0.000145,0.000182
7,0.017054,0.029102,0.071872,0.028651,0.004396,0.015349,0.001750,0.000743,0.000183,0.026350,...,0.000083,0.000135,0.000118,0.000175,0.000139,0.000259,0.000212,0.000183,0.000177,0.000185
12,0.031902,0.006001,0.012159,0.001497,0.003241,0.002817,0.002297,0.001400,0.000900,0.001989,...,0.001185,0.001096,0.000628,0.000978,0.000636,0.001193,0.001014,0.001174,0.001023,0.001166
16,0.005409,0.001234,0.002033,0.000810,0.000990,0.001216,0.001215,0.000060,0.000068,0.000961,...,0.000026,0.000011,0.000009,0.000015,0.000034,0.000042,0.000027,0.000029,0.000033,0.000026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33022,0.455485,0.158969,0.232886,0.007375,0.016258,0.012759,0.012863,0.000505,0.000189,0.000860,...,0.000028,0.000075,0.000057,0.000088,0.000132,0.000166,0.000081,0.000095,0.000080,0.000099
33023,0.038763,0.003345,0.009332,0.001526,0.003748,0.003512,0.004145,0.000208,0.000070,0.000717,...,0.000080,0.000056,0.000055,0.000077,0.000089,0.000141,0.000070,0.000064,0.000069,0.000060
33026,0.000669,0.000546,0.000403,0.003013,0.002851,0.001171,0.000952,0.001291,0.001301,0.011835,...,0.000406,0.000611,0.000456,0.000481,0.000509,0.000369,0.000263,0.000306,0.000311,0.000371
33027,0.056008,0.005998,0.013959,0.002334,0.003611,0.002515,0.002576,0.000338,0.000038,0.000794,...,0.000139,0.000062,0.000067,0.000089,0.000026,0.000072,0.000060,0.000089,0.000073,0.000088


# 예측행렬 저장

In [36]:
# Persist the prediction matrix as raw values only (no header row and no
# index column). The target directory is created first so the write does not
# fail with an OSError when the directory is missing.
os.makedirs('./save_matrix_csv', exist_ok=True)
actual_pred_matrix_df.to_csv('./save_matrix_csv/total_newmf_count.csv',header=False,index=False)