<a href="https://colab.research.google.com/github/hyunj941031/ds-sa-cp2/blob/main/models/basemodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install python-box

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pandas as pd
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from collections import defaultdict
from box import Box


import warnings

warnings.filterwarnings(action='ignore')

In [3]:
config = {
    'data_path' : '/content/drive/MyDrive/fashion_campus_dataset',
    'model_path' : './',
    'model' : 'baseline'
}

args = {
    "batch_size": 128,
    "epochs": 10,
    "num_factor": 32,
    "lr": 0.001,
    "num_layers": 3,
    "num_ng": 2,
    "out": True,
    "test_num_ng": 99,
    "top_k": 10,
}

config = Box(config)

In [4]:
class SplitData():
    def __init__(self, config):
        self.config = config
        self.df = pd.read_csv(os.path.join(self.config.data_path, 'user_item.csv'), index_col=0)
        self.df = self.delete_ones()

        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('itemId')
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder('userId')
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        self.df['item_idx'] = self.df['itemId'].apply(lambda x : self.item_encoder[x] + 1)
        self.df['user_idx'] = self.df['userId'].apply(lambda x : self.user_encoder[x])
        self.df = self.df.sort_values(['user_idx', 'timestamp'])
        self.user_train, self.user_valid = self.split_sequence_data()

    def generate_encoder_decoder(self, col:str) -> dict:
        encoder = {}
        decoder = {}
        ids = self.df[col].unique()

        for idx, _id in enumerate(ids):
            encoder[_id] = idx
            decoder[idx] = _id

        return encoder, decoder

    def delete_ones(self) -> dict:
        a = self.df.groupby('userId')['itemId'].size()
        for i in a.index:
            if a[i] <= 1:
                del(a[i])
        df_ = self.df.copy()
        df_ = df_[df_['userId'].isin(a.index)]
        
        return df_

    def split_sequence_data(self) -> dict:
        users = defaultdict(list)
        user_train = {}
        user_valid = {}
        group_df = self.df.groupby('user_idx')
        for user, item in group_df:
            users[user].extend(item['item_idx'].tolist())

        for user in users:
            user_train[user] = users[user][:-1]
            user_valid[user] = [users[user][-1]] # 마지막 아이템 예측

        return user_train, user_valid

    def get_train_valid_data(self):
        return self.user_train, self.user_valid

split_data = SplitData(config)
train_df, val_df = split_data.get_train_valid_data()

In [5]:
train_data = []

for i in range(len(train_df)):
    for j in range(len(train_df[i])):
        train_data.append([i,train_df[i][j]])

test_data = []

for i in range(len(val_df)):
    for j in range(len(val_df[i])):
        test_data.append([i,val_df[i][j]])

In [6]:
train_li = []
for i in range(len(train_df)):
    train_li.extend(train_df[i])

In [7]:
df = split_data.df
num_user = df['userId'].nunique()
num_item = df['itemId'].nunique()

In [10]:
train_count = [0]*(num_item+1)
for i in train_li:
    train_count[i] += 1
train_count.index(max(train_count))

30849

In [54]:
def hit(gt_item, pred_items):
    if gt_item in pred_items:
        return 1
    return 0


def ndcg(gt_item, pred_items):
    if gt_item in pred_items:
        index = pred_items.index(gt_item)
        return np.reciprocal(np.log2(index + 2))
    return 0


def metrics(test_loader, train_count, top_k):
    HR, NDCG = [], []

    for user, item in test_loader:
        user = user # .cuda()
        item = item # .cuda()

        recommends = [train_count.index(max(train_count))]
        # 정답값 선택
        gt_item = item[0].item()
        HR.append(hit(gt_item, recommends))
        NDCG.append(ndcg(gt_item, recommends))

    return np.mean(HR), np.mean(NDCG)

In [55]:
test_loader = data.DataLoader(test_data)

In [56]:
HR, NDCG = metrics(test_loader, train_count, args["top_k"])

In [59]:
# Hit rate @ 10, NDCG @ 10
HR, NDCG

(2.367928772702517e-05, 2.367928772702517e-05)