# Import

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import os, random

import torch
import torch.nn as nn
from torch.nn.init import normal_
import torch.nn.functional as F

# 하이퍼파라미터 설정

In [2]:
# Hyperparameters
class cfg:
    """Global experiment settings; later cells attach further attributes to this class."""
    gpu_idx = 0  # CUDA device ordinal used when a GPU is available
    # Select the indexed GPU when CUDA is available, otherwise fall back to the CPU.
    device = torch.device(f"cuda:{gpu_idx}" if torch.cuda.is_available() else "cpu")
    top_k = 25  # number of items kept per user in a recommendation list
    seed = 42  # seed handed to seed_everything
    neg_ratio = 100  # negatives sampled per positive interaction

In [3]:
# Report whether PyTorch can see a CUDA device (decides which device cfg.device selects).
torch.cuda.is_available()

True

# 시드 고정

In [4]:
# Fix the random seeds
def seed_everything(random_seed):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        random_seed: seed value applied to every generator.
    """
    # Python / NumPy generators
    random.seed(random_seed)
    np.random.seed(random_seed)

    # PyTorch generators: CPU, the current CUDA device, and all CUDA devices
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(random_seed)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
# Apply the configured seed before any data sampling or model initialisation happens.
seed_everything(cfg.seed)

# Data Load

In [5]:
# Path configuration
data_path = '../data'  # directory the input CSV files are read from
saved_path = './saved'  # directory the trained model weights are written to / read from
output_path = './submission'  # not referenced by the cells visible in this notebook

### 데이터 불러오기
- history_data : 시청 시작 데이터
- profile_data : 프로필 정보 
- meta_data : 콘텐츠 일반 메타 정보

In [6]:
# Load the viewing-start log.
history_df = pd.read_csv(os.path.join(data_path, 'history_data.csv'), encoding='utf-8')

## Remove duplicated rows ##
# Keep only the three key columns, drop rows repeating the same (profile_id, album_id, log_time),
# then order each profile's events by log_time.
history_df = history_df[['profile_id', 'log_time', 'album_id']].drop_duplicates(subset=['profile_id', 'album_id', 'log_time']).sort_values(by = ['profile_id', 'log_time']).reset_index(drop = True)
# Every remaining interaction is treated as an implicit positive.
history_df['rating']=1
history_df.head(3)

# Load the profile table and the content metadata table.
profile_df = pd.read_csv(os.path.join(data_path, 'profile_data.csv'), encoding='utf-8')
meta_df = pd.read_csv(os.path.join(data_path, 'meta_data.csv'), encoding='utf-8')

  meta_df = pd.read_csv(os.path.join(data_path, 'meta_data.csv'), encoding='utf-8')


# 전체 데이터를 통하여 table 생성

In [7]:
# Pivot the log into a profile_id x album_id table of ratings; pairs with no interaction are NaN.
ratings_matrix_df = history_df.pivot_table('rating',index='profile_id',columns='album_id')
ratings_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,,,,,,,,...,,,,,,,,,,
5,1.0,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
16,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Replace missing interactions with 0 so the table becomes a dense 0/1 interaction matrix.
ratings_total_matrix_df = ratings_matrix_df.fillna(0)
ratings_total_matrix_df.head()

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# (profile_id와 index) 와 (album_id와 columns) 변환을 위한 dictionary 생성

In [9]:
# Row labels (profile_id) and column labels (album_id) of the interaction matrix, in matrix order.
index_list = list(ratings_total_matrix_df.index)
column_list = list(ratings_total_matrix_df.columns)

print('index 의 전체 길이 :',len(index_list),'columns 의 전체 길이 :',len(column_list))

# "real" = original id from the data, "fake" = zero-based position inside the matrix.
real_to_fake_user = {real:fake for fake,real in enumerate(index_list)} # profile_id -> row index
fake_to_real_user = {fake:real for fake,real in enumerate(index_list)} # row index -> profile_id

real_to_fake_album = {real:fake for fake,real in enumerate(column_list)} # album_id -> column index
fake_to_real_album = {fake:real for fake,real in enumerate(column_list)} # column index -> album_id

# Sanity check of both directions using the last profile / album of this dataset.
print('profile_id 33032의 index 번호 :',real_to_fake_user[33032] ,'index 번호 8310의 profile_id :',fake_to_real_user[8310])
print('album_id 25916의 column 번호 :',real_to_fake_album[25916] ,'column 번호 20694의 album_id :',fake_to_real_album[20694])

index 의 전체 길이 : 8311 columns 의 전체 길이 : 20695
profile_id 33032의 index 번호 : 8310 index 번호 8310의 profile_id : 33032
album_id 25916의 column 번호 : 20694 column 번호 20694의 album_id : 25916


# 필요한 정보 추출

In [10]:
# Number of distinct profiles / albums in the log; these size the embedding tables of the model.
cfg.n_users = history_df.profile_id.nunique()
cfg.n_items = history_df.album_id.nunique()

print(cfg.n_users,cfg.n_items)

8311 20695


In [11]:
# Extract the user feature information
profile_df = profile_df.set_index('profile_id')
# Nested dict of the form {'age': {profile_id: age}}, keyed by the real profile_id.
user_features = profile_df[['age']].to_dict()
print("user_id 3의 age 정보 :", user_features['age'][3]) # real (original) id

user_id 3의 age 정보 : 5


In [12]:
# Extract the item feature information
meta_df = meta_df.set_index('album_id')

# Convert the categorical genre column into integer codes
le = LabelEncoder()
meta_df['genre_mid'] = le.fit_transform(meta_df['genre_mid'])
# Nested dict of the form {'genre_mid': {album_id: genre code}}, keyed by the real album_id.
item_features = meta_df[['genre_mid']].to_dict()
print("album_id 749의 genre_mid 정보 :", item_features['genre_mid'][749]) # real (original) album_id

album_id 749의 genre_mid 정보 : 1


In [13]:
# Store the properties of the extracted features
cfg.n_genres = meta_df['genre_mid'].nunique()  # number of distinct genre codes; sizes the genre embedding
cfg.n_continuous_feats = 1  # one continuous feature is fed to the model

# NeuMF 구현

![](http://drive.google.com/uc?export=view&id=1tpajTLipLoFdvLICO-alAxeoKAE8-k61)

In [14]:
class NeuMF(nn.Module):
    """Neural Matrix Factorization model scoring (user, item) pairs.

    Reference: https://arxiv.org/abs/1708.05031

    Example:
        model = NeuMF(cfg)
        output = model.forward(user_ids, item_ids, [feat0, feat1])
    """

    def __init__(self, cfg):
        """Copy the network sizes from the config and build the layers.

        Args:
            cfg: config object providing n_users, n_items, emb_dim, layer_dim,
                n_continuous_feats, n_genres and dropout.
        """
        super().__init__()
        self.n_users = cfg.n_users
        self.n_items = cfg.n_items
        self.emb_dim = cfg.emb_dim
        self.layer_dim = cfg.layer_dim
        self.n_continuous_feats = cfg.n_continuous_feats
        self.n_genres = cfg.n_genres
        self.dropout = cfg.dropout
        self.build_graph()

    def build_graph(self):
        """Create the embeddings, the MLP tower and the output layer, then initialise them."""
        emb_dim = self.emb_dim
        hidden_dim = self.layer_dim
        genre_dim = self.n_genres // 2

        # Matrix-factorisation branch
        self.user_embedding_mf = nn.Embedding(self.n_users, emb_dim)
        self.item_embedding_mf = nn.Embedding(self.n_items, emb_dim)

        # MLP branch (separate user/item tables plus a genre table)
        self.user_embedding_mlp = nn.Embedding(self.n_users, emb_dim)
        self.item_embedding_mlp = nn.Embedding(self.n_items, emb_dim)
        # NOTE: the attribute name (including its spelling) is part of the state_dict keys.
        self.genre_embeddig = nn.Embedding(self.n_genres, genre_dim)

        # MLP input = user emb + item emb + genre emb + continuous features
        mlp_in_dim = 2 * emb_dim + genre_dim + self.n_continuous_feats
        tower = [
            nn.Linear(mlp_in_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=self.dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(p=self.dropout),
        ]
        self.mlp_layers = nn.Sequential(*tower)

        # Final layer consumes the concatenated MLP output and MF product
        self.affine_output = nn.Linear(hidden_dim // 2 + emb_dim, 1)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Draw Embedding/Linear weights from N(0, 0.01) and zero any Linear bias."""
        if not isinstance(module, (nn.Embedding, nn.Linear)):
            return
        normal_(module.weight.data, mean=0.0, std=0.01)
        bias = getattr(module, 'bias', None)  # nn.Embedding has no bias attribute
        if bias is not None:
            bias.data.fill_(0.0)

    def forward(self, user_indices, item_indices, feats):
        """Score each (user, item) pair.

        Args:
            user_indices: tensor of user indices, one per pair.
            item_indices: tensor of item indices, one per pair.
            feats: sequence whose element 0 is the continuous feature tensor
                (one value per pair) and element 1 the genre index tensor.
        Returns:
            output: one raw score per pair (no sigmoid is applied here).
        """
        continuous_feat, genre_ids = feats[0], feats[1]

        # MF branch: element-wise product of the two embeddings
        mf_vector = self.user_embedding_mf(user_indices) * self.item_embedding_mf(item_indices)

        # MLP branch: concatenate embeddings with the continuous feature as an extra column
        mlp_input = torch.cat(
            [
                self.user_embedding_mlp(user_indices),
                self.item_embedding_mlp(item_indices),
                self.genre_embeddig(genre_ids),
                continuous_feat.unsqueeze(1),
            ],
            dim=-1,
        )
        mlp_vector = self.mlp_layers(mlp_input)

        fused = torch.cat([mlp_vector, mf_vector], dim=-1)
        return self.affine_output(fused).squeeze(-1)

### 학습 및 추론 코드 구현

- 학습 : Negative sampling을 활용하여 Binary Classification 진행 
    - history 에 있는 album_id는 positive label로 그렇지 않은 album_id는 negative label로 활용  
    - 단, 이때 모든 album_id를 negative label로 활용하는 것이 아닌 일부만 사용 (neg_ratio 값에 따라서 개수 조정)
- 추론 : 일부 데이터에 대해 recall, ndcg, coverage 성능 확인

#### 학습 및 추론에 필요한 데이터 셋 생성 코드 구현

In [15]:
def make_UIdataset(train, neg_ratio):
    """Build the per-user training samples (positives plus random negatives).

    Args:
        train : user-item interaction matrix; row position is the internal
            user index, column position the internal item index.
            ex)
                array([[0., 0., 0., ..., 0., 0., 0.],
                        ...,
                        [0., 0., 0., ..., 0., 0., 0.]])
        neg_ratio : number of negatives sampled per positive.
            ex) 3 (3 negative labels for every positive label)
    Returns:
        UIdataset : dict mapping the internal user index to a 4-element list
            [item indices, age feature, genre feature, labels], where the
            positives come first and the sampled negatives after them.
            ex) >>> UIdataset[3]
                    [array([   16,    17,    18, ...,  9586, 18991,  9442]),
                    array([5, 5, 5, ..., 5, 5, 5]),
                    array([4, 4, 4, ..., 5, 1, 1]),
                    array([1., 1., 1., ..., 0., 0., 0.])]
    """
    UIdataset = {}
    age_by_profile = user_features['age']
    genre_by_album = item_features['genre_mid']
    for user_id, items_by_user in enumerate(tqdm(train)):
        # Positives: every internal item index the user interacted with.
        pos_item_ids = np.where(items_by_user > 0)[0]

        # Negatives: random draw without replacement from the non-interacted
        # items, capped at the number of such items.
        neg_items = np.where(items_by_user <= 0)[0]
        num_neg_samples = min(neg_ratio * len(pos_item_ids), len(neg_items))
        neg_item_ids = np.random.choice(neg_items, num_neg_samples, replace=False)

        # Build the sample array once and derive every feature from it.
        sample_item_ids = np.concatenate([pos_item_ids, neg_item_ids])
        n_samples = len(sample_item_ids)

        # The age is constant per user, so look it up once instead of per sample.
        # The lookup is skipped when there are no samples, matching the original
        # behaviour of never touching user_features in that case.
        if n_samples:
            age = age_by_profile[fake_to_real_user[user_id]]
            age_feats = np.array([age] * n_samples)
        else:
            age_feats = np.array([])

        # Genre code of every sampled item (internal index -> real album_id -> genre).
        genre_feats = np.array(
            [genre_by_album[fake_to_real_album[item_id]] for item_id in sample_item_ids]
        )

        # Labels follow the sample order: 1 for positives, 0 for negatives.
        labels = np.concatenate([np.ones(len(pos_item_ids)), np.zeros(len(neg_item_ids))])

        UIdataset[user_id] = [sample_item_ids, age_feats, genre_feats, labels]

    return UIdataset

In [16]:
# Build the training samples once for all users; negatives are drawn here and reused by every epoch.
UIdataset = make_UIdataset(ratings_total_matrix_df.values, neg_ratio=cfg.neg_ratio)

  0%|          | 0/8311 [00:00<?, ?it/s]

In [17]:
# UIdataset[internal user index] = [[internal item indices], [age feature], [genre feature], [labels]]

In [18]:
def make_batchdata(user_indices, batch_idx, batch_size):
    """Flatten the samples of one batch of users into parallel lists.

    Args:
        user_indices : indices of all users, in the order they should be visited.
            ex) array([ 3100,  1800, 30098, ...,  2177, 11749, 20962])
        batch_idx : which batch to build (0-based).
            ex) 0
        batch_size : number of users per batch.
            ex) 256
    Returns:
        batch_user_ids : user index repeated once per sample.
            ex) [22194, 22194, 22194, 22194, 22194, ...]
        batch_item_ids : item index of each sample.
            ex) [36, 407, 612, 801, 1404, ...]
        batch_feat0 : feature 0 of each sample.
            ex) [6, 6, 6, 6, 6, ...]
        batch_feat1 : feature 1 of each sample.
            ex) [4,  4,  4, 23,  4, ...]
        batch_labels : label of each sample.
            ex) [1.0, 1.0, 1.0, 1.0, 1.0, ...]
    """
    start = batch_idx * batch_size
    batch_user_indices = user_indices[start : start + batch_size]

    batch_user_ids, batch_item_ids = [], []
    batch_feat0, batch_feat1, batch_labels = [], [], []
    # Destination lists in the same order as the entries stored in UIdataset.
    sinks = (batch_item_ids, batch_feat0, batch_feat1, batch_labels)

    for user_id in batch_user_indices:
        entry = UIdataset[user_id]
        # One copy of the user index per sample of this user.
        batch_user_ids.extend(np.full(len(entry[0]), user_id).tolist())
        for sink, column in zip(sinks, entry):
            sink.extend(column.tolist())

    return batch_user_ids, batch_item_ids, batch_feat0, batch_feat1, batch_labels

def update_avg(curr_avg, val, idx):
    """Fold `val` into a running mean that already covers `idx` values.

    Args:
        curr_avg: mean of the first `idx` values.
        val: the next value.
        idx: number of values already averaged (0-based position of `val`).
    Returns:
        The mean of the first `idx + 1` values.
    """
    running_total = curr_avg * idx + val
    return running_total / (idx + 1)

#### 학습 및 검증 코드 생성

In [19]:
# cfg.n_users, cfg.n_items -> 8311 and 20695 entries (internal user ids 0~8310, internal item ids 0~20694)
def train_epoch(cfg, model, optimizer, criterion):
    """Run one training pass over all users, batching by user.

    Args:
        cfg: config providing n_users, batch_size, device and the current epoch.
        model: network called as model(user_ids, item_ids, [feat0, feat1]).
        optimizer: optimizer stepping the model parameters.
        criterion: loss taking (output, labels), both shaped (N, 1).
    Returns:
        rets: {'losses': mean batch loss of the epoch, rounded to 5 decimals}.
    """
    model.train()
    curr_loss_avg = 0.0

    user_indices = np.arange(cfg.n_users)
    # Seeding with the epoch number gives a different but reproducible user order per epoch.
    np.random.RandomState(cfg.epoch).shuffle(user_indices)
    # Ceiling division: avoids an empty trailing batch when n_users is a multiple of batch_size.
    batch_num = (len(user_indices) + cfg.batch_size - 1) // cfg.batch_size
    bar = tqdm(range(batch_num), leave=False)
    for step, batch_idx in enumerate(bar):
        user_ids, item_ids, feat0, feat1, labels = make_batchdata(user_indices, batch_idx, cfg.batch_size)
        # A batch contains every sample of the selected users.
        user_ids = torch.LongTensor(user_ids).to(cfg.device)
        item_ids = torch.LongTensor(item_ids).to(cfg.device)
        feat0 = torch.FloatTensor(feat0).to(cfg.device)
        feat1 = torch.LongTensor(feat1).to(cfg.device)
        labels = torch.FloatTensor(labels).to(cfg.device)
        labels = labels.view(-1, 1)

        # Reset gradients
        optimizer.zero_grad()

        # Forward pass
        output = model(user_ids, item_ids, [feat0, feat1])
        output = output.view(-1, 1)

        loss = criterion(output, labels)

        # Stop before back-propagating so a NaN loss never reaches the weights.
        if torch.isnan(loss):
            print('Loss NAN. Train finish.')
            break

        # Backward pass
        loss.backward()

        # Parameter update
        optimizer.step()

        # Accumulate a plain float: keeping the tensor would chain every batch's
        # autograd graph into the running average.
        curr_loss_avg = update_avg(curr_loss_avg, loss.item(), step)

        msg = f"epoch: {cfg.epoch}, "
        msg += f"loss: {curr_loss_avg:.5f}, "
        msg += f"lr: {optimizer.param_groups[0]['lr']:.6f}"
        bar.set_description(msg)
    # curr_loss_avg is always a float here, even if the loop stopped on the first batch.
    rets = {'losses': np.around(curr_loss_avg, 5)}
    return rets

In [20]:
def recallk(actual, predicted, k = 25):
    """Recall@k between the items actually consumed and the predicted list.

    Args:
        actual : items the user actually watched (duplicates are ignored).
        predicted : ranked list of predicted items.
        k : number of top predictions to consider (ex: k=5 looks at the top 5).
    Returns:
        recall_k : hits among the top-k predictions divided by
            min(k, number of distinct actual items); 0.0 when there is
            nothing to recall (no actual items or k <= 0).
    """
    set_actual = set(actual)
    # Guard against a zero denominator instead of raising ZeroDivisionError.
    if not set_actual or k <= 0:
        return 0.0
    hits = len(set_actual & set(predicted[:k]))
    recall_k = hits / min(k, len(set_actual))
    return recall_k

def unique(sequence):
    """Return the distinct elements of `sequence`, keeping first-occurrence order."""
    seen = set()
    ordered = []
    for element in sequence:
        if element in seen:
            continue
        seen.add(element)
        ordered.append(element)
    return ordered

def ndcgk(actual, predicted, k = 25):
    """NDCG@k between the items actually consumed and the predicted list.

    Args:
        actual : items the user actually watched (duplicates are ignored).
        predicted : ranked list of predicted items.
        k : number of top predictions to consider.
    Returns:
        ndcg_k : DCG of the de-duplicated top-k predictions divided by the
            ideal DCG for min(k, number of distinct actual items) hits;
            0.0 when the ideal DCG would be empty (no actual items or k <= 0).
    """
    set_actual = set(actual)
    n_ideal = min(k, len(set_actual))
    # Guard against a zero ideal DCG instead of raising ZeroDivisionError.
    if n_ideal <= 0:
        return 0.0
    # The logarithm base cancels in the DCG / IDCG ratio.
    idcg = sum(1.0 / np.log(rank + 2) for rank in range(n_ideal))
    # dict.fromkeys removes repeated predictions while preserving rank order.
    ranked = list(dict.fromkeys(predicted[:k]))
    dcg = sum(1.0 / np.log(rank + 2) for rank, item in enumerate(ranked) if item in set_actual)
    ndcg_k = dcg / idcg
    return ndcg_k

def evaluation(gt, pred):
    """Compute recall, NDCG, coverage and the combined score of the predictions.

    All three metrics use the same cut-off, cfg.top_k.

    Args:
        gt : ground-truth DataFrame with profile_id and album_id columns.
        pred : prediction DataFrame with profile_id and predicted_list columns.
    Returns:
        rets : recall, ndcg, coverage and the combined score.
            ex) {'recall': 0.123024, 'ndcg': 0.056809, 'coverage': 0.017455, 'score': 0.106470}
    """
    # One row per profile holding the distinct albums it actually watched.
    gt = gt.groupby('profile_id')['album_id'].unique().to_frame().reset_index()
    gt.columns = ['profile_id', 'actual_list']

    # NOTE(review): with the left merge, a profile in pred that is missing from gt
    # gets a NaN actual_list, which the metric functions cannot iterate - confirm
    # callers always pass matching profiles.
    evaluated_data = pd.merge(pred, gt, how = 'left', on = 'profile_id')

    k = cfg.top_k
    recall_col, ndcg_col = f'Recall@{k}', f'NDCG@{k}'
    evaluated_data[recall_col] = evaluated_data.apply(lambda x: recallk(x.actual_list, x.predicted_list, k=k), axis=1)
    evaluated_data[ndcg_col] = evaluated_data.apply(lambda x: ndcgk(x.actual_list, x.predicted_list, k=k), axis=1)

    recall = evaluated_data[recall_col].mean()
    ndcg = evaluated_data[ndcg_col].mean()
    # Coverage: share of the catalogue (distinct album ids in meta_df) recommended to at least one profile.
    coverage = (evaluated_data['predicted_list'].apply(lambda x: x[:k]).explode().nunique())/meta_df.index.nunique()

    score = 0.75*recall + 0.25*ndcg
    rets = {"recall" :recall, 
            "ndcg" :ndcg, 
            "coverage" :coverage, 
            "score" :score}
    return rets

In [21]:
def valid_epoch(cfg, model, data, mode='valid'):
    """Score every item for each profile in `data` and keep the top-k per profile.

    Args:
        cfg: config providing n_items, device and top_k.
        model: trained network called as model(user_ids, item_ids, [feat0, feat1]).
        data: DataFrame with a profile_id column (plus album_id when mode='valid').
        mode: 'valid' additionally evaluates the predictions against `data`.
    Returns:
        (rets, pred) when mode == 'valid', otherwise pred alone. pred has the
        columns profile_id and predicted_list (real album ids, best first).
    """
    pred_list = []
    model.eval()

    # Only the profiles present in `data` are queried, translated to internal indices.
    query_user_ids = data['profile_id'].unique()
    query_user_ids = [real_to_fake_user[real_id] for real_id in query_user_ids]
    full_item_ids = np.array([c for c in range(cfg.n_items)])  # every internal item index
    full_item_ids_feat1 = [item_features['genre_mid'][fake_to_real_album[c]] for c in full_item_ids]

    # The item and genre tensors are identical for every user: build and move them once.
    item_ids = torch.LongTensor(full_item_ids).to(cfg.device)
    feat1 = torch.LongTensor(full_item_ids_feat1).to(cfg.device)

    for user_id in query_user_ids:
        with torch.no_grad():
            user_ids = np.full(cfg.n_items, user_id)
            user_ids = torch.LongTensor(user_ids).to(cfg.device)

            feat0 = np.full(cfg.n_items, user_features['age'][fake_to_real_user[user_id]])
            feat0 = torch.FloatTensor(feat0).to(cfg.device)

            eval_output = model(user_ids, item_ids, [feat0, feat1]).detach().cpu().numpy()
            pred_u_score = eval_output.reshape(-1)

        # Rank all items by descending score and keep the top-k as real album ids.
        pred_u_idx = np.argsort(pred_u_score)[::-1]
        pred_u = full_item_ids[pred_u_idx]
        pred_list.append([fake_to_real_album[item] for item in list(pred_u[:cfg.top_k])])

    pred = pd.DataFrame()
    pred['profile_id'] = [fake_to_real_user[fake_id] for fake_id in query_user_ids]
    pred['predicted_list'] = pred_list

    # Check model performance
    if mode == 'valid':
        rets = evaluation(data, pred)
        return rets, pred
    return pred

In [22]:
def test_epoch(cfg, model, data, mode='test'):
    """Score every item for each profile in `data`; return top-k lists and raw scores.

    Args:
        cfg: config providing n_items, device and top_k.
        model: trained network called as model(user_ids, item_ids, [feat0, feat1]).
        data: DataFrame with a profile_id column.
        mode: accepted for signature parity with valid_epoch; not used here.
    Returns:
        pred: DataFrame with columns profile_id and predicted_list (real album
            ids, best first).
        actual_pred_matrix: list with one score array per queried profile, in
            the order of data['profile_id'].unique(); each array is indexed by
            internal item index.
    """
    pred_list = []
    model.eval()
    actual_pred_matrix = []

    # Only the profiles present in `data` are queried, translated to internal indices.
    query_user_ids = data['profile_id'].unique()
    query_user_ids = [real_to_fake_user[real_id] for real_id in query_user_ids]
    full_item_ids = np.array([c for c in range(cfg.n_items)])  # every internal item index
    full_item_ids_feat1 = [item_features['genre_mid'][fake_to_real_album[c]] for c in full_item_ids]

    # The item and genre tensors are identical for every user: build and move them once.
    item_ids = torch.LongTensor(full_item_ids).to(cfg.device)
    feat1 = torch.LongTensor(full_item_ids_feat1).to(cfg.device)

    for user_id in query_user_ids:
        with torch.no_grad():
            user_ids = np.full(cfg.n_items, user_id)
            user_ids = torch.LongTensor(user_ids).to(cfg.device)

            feat0 = np.full(cfg.n_items, user_features['age'][fake_to_real_user[user_id]])
            feat0 = torch.FloatTensor(feat0).to(cfg.device)

            eval_output = model(user_ids, item_ids, [feat0, feat1]).detach().cpu().numpy()
            pred_u_score = eval_output.reshape(-1)

        # Keep the full score row for later ensembling.
        actual_pred_matrix.append(pred_u_score)
        # Rank all items by descending score and keep the top-k as real album ids.
        pred_u_idx = np.argsort(pred_u_score)[::-1]
        pred_u = full_item_ids[pred_u_idx]
        pred_list.append([fake_to_real_album[item] for item in list(pred_u[:cfg.top_k])])

    pred = pd.DataFrame()
    pred['profile_id'] = [fake_to_real_user[fake_id] for fake_id in query_user_ids]
    pred['predicted_list'] = pred_list

    return pred,actual_pred_matrix

## 모델 학습

### 하이퍼파라미터 설정 & 최적화 기법 설정

In [23]:
# Hyperparameter settings
cfg.batch_size = 256  # number of users per training batch (each user contributes all of its samples)
cfg.emb_dim = 256  # size of the user/item embeddings
cfg.layer_dim = 256  # width of the first MLP layer
cfg.dropout = 0.05  # dropout probability inside the MLP
cfg.epochs = 100  # upper bound used by the training loop
cfg.learning_rate = 0.0025  # learning rate passed to the optimizer
cfg.reg_lambda = 0  # passed to the optimizer as weight_decay
cfg.check_epoch = 10  # not referenced by the cells visible in this notebook

In [27]:
# Create the model and set up the optimizer and loss function
model = NeuMF(cfg).to(cfg.device)
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.reg_lambda)
# The model outputs raw scores, so the loss applies the sigmoid itself; reduction='sum'
# adds the loss over every sample in the batch rather than averaging it.
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')

### 학습 진행

In [28]:
## Takes roughly 15 minutes for 100 epochs ##
# NOTE(review): range(cfg.epochs+1) runs cfg.epochs + 1 passes (epoch 0 .. cfg.epochs) - confirm this is intended.
for epoch in range(cfg.epochs+1):
    # train_epoch reads cfg.epoch to seed its user shuffle and to label the progress bar.
    cfg.epoch = epoch
    train_results = train_epoch(cfg, model, optimizer, criterion)
    
    print('epoch : ',epoch , 'loss : ',train_results['losses'])
    
# Create the target directory first so saving does not fail when it is missing.
os.makedirs(saved_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(saved_path, 'newMF_origin_total_model.pth'))

  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  0 loss :  247502.09375


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  1 loss :  55950.26562


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  2 loss :  48144.29297


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  3 loss :  45464.5


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  4 loss :  41144.40234


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  5 loss :  36074.76953


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  6 loss :  31331.07031


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  7 loss :  26918.3418


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  8 loss :  22788.71094


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  9 loss :  19044.88086


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  10 loss :  15909.12012


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  11 loss :  13376.9541


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  12 loss :  11157.44043


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  13 loss :  9246.5625


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  14 loss :  7707.13037


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  15 loss :  6460.07715


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  16 loss :  5426.98828


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  17 loss :  4569.93506


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  18 loss :  3859.79443


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  19 loss :  3266.35986


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  20 loss :  2773.95532


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  21 loss :  2365.06567


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  22 loss :  2026.76257


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  23 loss :  1740.76331


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  24 loss :  1505.76782


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  25 loss :  1302.74329


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  26 loss :  1135.09473


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  27 loss :  993.93396


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  28 loss :  874.26746


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  29 loss :  772.06226


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  30 loss :  682.95978


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  31 loss :  609.34924


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  32 loss :  545.06439


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  33 loss :  490.32132


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  34 loss :  442.74802


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  35 loss :  400.75018


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  36 loss :  364.17847


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  37 loss :  332.39127


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  38 loss :  304.91736


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  39 loss :  280.12967


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  40 loss :  258.10229


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  41 loss :  238.10765


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  42 loss :  220.50996


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  43 loss :  204.96404


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  44 loss :  190.60683


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  45 loss :  177.66745


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  46 loss :  165.98235


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  47 loss :  155.3031


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  48 loss :  145.62773


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  49 loss :  136.88495


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  50 loss :  128.28186


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  51 loss :  121.02666


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  52 loss :  113.95214


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  53 loss :  107.57006


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  54 loss :  101.6552


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  55 loss :  96.33409


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  56 loss :  91.18721


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  57 loss :  86.60499


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  58 loss :  82.30909


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  59 loss :  78.27184


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  60 loss :  74.46653


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  61 loss :  70.92751


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  62 loss :  67.71772


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  63 loss :  64.57409


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  64 loss :  61.64882


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  65 loss :  59.00109


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  66 loss :  56.2851


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  67 loss :  53.94971


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  68 loss :  51.63867


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  69 loss :  49.47998


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  70 loss :  47.40469


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  71 loss :  45.51229


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  72 loss :  43.72828


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  73 loss :  42.00875


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  74 loss :  40.25995


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  75 loss :  38.82824


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  76 loss :  37.34438


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  77 loss :  35.84118


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  78 loss :  34.59145


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  79 loss :  33.35343


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  80 loss :  32.09433


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  81 loss :  30.99184


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  82 loss :  29.8749


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  83 loss :  28.86609


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  84 loss :  27.86247


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  85 loss :  26.83871


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  86 loss :  25.94555


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  87 loss :  25.1182


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  88 loss :  24.26325


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  89 loss :  23.51083


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  90 loss :  22.70446


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  91 loss :  21.99752


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  92 loss :  21.26393


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  93 loss :  20.58402


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  94 loss :  19.85433


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  95 loss :  19.34329


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  96 loss :  18.71874


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  97 loss :  18.15154


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  98 loss :  17.61897


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  99 loss :  17.08278


  0%|          | 0/33 [00:00<?, ?it/s]

epoch :  100 loss :  16.5515


# 모든 유저에 대해 추천 결과 생성

In [29]:
# map_location remaps the stored tensors onto the configured device, so weights
# saved on a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load(os.path.join(saved_path, 'newMF_origin_total_model.pth'), map_location=cfg.device))

<All keys matched successfully>

In [30]:
## Inference takes roughly 20 seconds ##
submission_path = os.path.join(data_path, 'sample_submission.csv')
submission = pd.read_csv(submission_path)
# `submission` is rebound to the prediction frame (profile_id, predicted_list);
# actual_pred_matrix holds one full score row per profile, in the same profile order.
submission,actual_pred_matrix = test_epoch(cfg, model, submission, mode='test')

In [31]:
# The score rows are in the order of submission['profile_id'], which need not equal the
# row order of the interaction matrix. Label each row with the profile it was computed
# for, then reorder to the interaction-matrix order so labels and scores always match.
actual_pred_matrix_df = pd.DataFrame(
    actual_pred_matrix,
    index=submission['profile_id'].to_numpy(),
    columns=ratings_total_matrix_df.columns,
).reindex(ratings_total_matrix_df.index)
actual_pred_matrix_df

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,-8.216147,-11.385468,-20.795355,-8.808238,-16.491278,-14.042564,-13.589187,-13.808871,-22.191122,-13.723072,...,-24.464981,-21.886206,-22.561911,-21.587538,-23.117899,-20.794844,-21.649109,-21.912937,-21.289310,-21.719143
5,15.814044,-6.835226,-9.379964,-9.021684,-8.720962,-15.818329,-18.858906,-19.689102,-17.429787,-6.282855,...,-26.288965,-42.243095,-42.098293,-41.650913,-39.257999,-36.452744,-35.280590,-35.767071,-36.125332,-35.264637
7,3.864225,-13.592376,-15.465365,-9.660434,-18.986694,-5.722499,-8.807994,-17.487967,-18.638973,-1.396895,...,-36.664593,-45.005608,-45.184532,-44.776257,-40.166935,-39.867893,-39.630657,-39.373756,-39.971870,-39.709015
12,-0.364236,-13.461435,-8.622186,-17.336096,-16.698507,-27.975735,-21.944407,-19.389191,-12.534580,-16.233589,...,-21.837774,-33.063671,-33.553017,-32.774754,-28.510201,-24.995977,-24.972034,-24.647867,-25.422197,-24.780331
16,-7.991467,-17.406919,-15.900767,-17.251312,-22.459894,-22.983934,-24.214558,-25.441277,-22.622168,-23.771675,...,-32.480438,-42.784370,-43.027878,-42.603951,-41.833057,-36.969627,-36.532391,-36.143829,-36.977200,-36.341846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33022,-3.287551,-15.356943,-15.423113,-17.928839,-20.579710,-5.059880,-7.426228,-24.419537,-22.214911,-20.894054,...,-37.103016,-41.379688,-41.180687,-41.146816,-43.170166,-37.385082,-37.202812,-36.846600,-37.455948,-37.336067
33023,-9.965976,-19.649651,-18.150185,-22.134497,-21.910231,-14.703043,-18.418640,-28.799118,-24.305611,-18.740294,...,-35.044815,-42.960945,-42.860718,-42.659580,-46.228760,-36.370117,-36.049110,-35.850887,-36.184715,-36.240417
33026,-2.558714,-19.266476,-14.173717,-21.456175,-19.495352,-19.895397,-17.623510,-21.203070,-21.895287,-20.871992,...,-28.879463,-36.671608,-36.828751,-36.541828,-33.995373,-30.511559,-30.425438,-30.140110,-30.889544,-30.424730
33027,-4.591228,-16.724159,-15.886403,-18.843271,-16.148790,-20.192297,-21.687809,-28.617828,-27.112333,-23.545275,...,-35.534504,-45.813633,-45.764313,-45.603348,-43.448200,-39.785347,-39.325039,-39.216766,-39.972347,-39.091175


# 생성된 예측행렬을 ensemble을 시켜주기 위하여 유저별로 최대 1 최소 0 으로 Minmaxscaler 적용

In [32]:
# MinMaxScaler rescales each column independently, so the matrix is transposed first:
# every profile becomes a column and its scores are mapped onto [0, 1].
scaler = MinMaxScaler()
actual_pred_matrix_t = scaler.fit_transform(actual_pred_matrix_df.transpose())
# fit_transform returns a bare array; restore the labels (albums as rows, profiles as columns).
actual_pred_matrix_df_t = pd.DataFrame(actual_pred_matrix_t,index=ratings_total_matrix_df.columns,columns=ratings_total_matrix_df.index)
# Transpose back to the profile x album layout.
actual_pred_matrix_df = actual_pred_matrix_df_t.transpose()
actual_pred_matrix_df

album_id,0,1,2,3,4,5,6,7,8,9,...,25877,25893,25894,25895,25898,25912,25913,25914,25915,25916
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.514422,0.462599,0.308736,0.504740,0.379113,0.419152,0.426566,0.422974,0.285913,0.424377,...,0.248732,0.290899,0.279850,0.295782,0.270759,0.308744,0.294776,0.290462,0.300659,0.293630
5,0.736936,0.466031,0.435594,0.439879,0.443476,0.358585,0.322217,0.312287,0.339311,0.472638,...,0.233347,0.042521,0.044253,0.049604,0.078226,0.111779,0.125799,0.119980,0.115695,0.125990
7,0.791384,0.528978,0.500823,0.588083,0.447891,0.647277,0.600896,0.470420,0.453118,0.712300,...,0.182158,0.056776,0.054087,0.060224,0.129511,0.134006,0.137572,0.141434,0.132443,0.136394
12,0.768330,0.524932,0.614864,0.452925,0.464774,0.255198,0.367284,0.414770,0.542156,0.473414,...,0.369266,0.160644,0.151550,0.166013,0.245266,0.310574,0.311019,0.317043,0.302653,0.314582
16,0.556572,0.422839,0.444232,0.425050,0.351069,0.343626,0.326147,0.308723,0.348764,0.332437,...,0.208742,0.062389,0.058931,0.064952,0.075901,0.144979,0.151190,0.156709,0.144872,0.153896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33022,0.667528,0.484589,0.483586,0.445606,0.405426,0.640664,0.604797,0.347225,0.380641,0.400662,...,0.154978,0.090156,0.093172,0.093685,0.063017,0.150703,0.153466,0.158865,0.149629,0.151446
33023,0.579968,0.445349,0.466194,0.410806,0.413924,0.514115,0.462462,0.318157,0.380624,0.457991,...,0.231332,0.121286,0.122679,0.125475,0.075858,0.212909,0.217371,0.220127,0.215486,0.214712
33026,0.721268,0.406059,0.502139,0.364748,0.401741,0.394194,0.437055,0.369523,0.356463,0.375769,...,0.224700,0.077693,0.074728,0.080141,0.128183,0.193909,0.195533,0.200916,0.186777,0.195547
33027,0.675803,0.489118,0.502008,0.456512,0.497971,0.435755,0.412744,0.306114,0.329278,0.384163,...,0.199689,0.041527,0.042286,0.044763,0.077923,0.134283,0.141365,0.143031,0.131405,0.144964


# 예측행렬 저장

In [33]:
# Make sure the output directory exists; to_csv does not create missing parent directories.
os.makedirs('./save_matrix_csv', exist_ok=True)
# Written without header and index, so consumers must rely on the row/column order alone.
actual_pred_matrix_df.to_csv('./save_matrix_csv/total_newmf_origin.csv',header=False,index=False)