<a href="https://colab.research.google.com/github/hyunj941031/ds-sa-cp2/blob/main/models/MF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install python-box

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
!pip install tensorboardX

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.backends.cudnn as cudnn
from collections import defaultdict
from box import Box

import warnings
from tensorboardX import SummaryWriter

import time

warnings.filterwarnings(action='ignore')

In [20]:
config = {
    'data_path' : '/content/drive/MyDrive/fashion_campus_dataset',
    'model_path' : './',
    'model' : 'MF'
}

args = {
    "batch_size": 128,
    "epochs": 10,
    "num_factor": 32,
    "lr": 0.001,
    "num_layers": 3,
    "num_ng": 4,
    "out": True,
    "test_num_ng": 99,
    "top_k": 10,
}

config = Box(config)
# os.environ["CUDA_VISIBLE_DEVICES"] = args["gpu"]
# cudnn.benchmark = True

In [21]:
class SplitData():
    def __init__(self, config):
        self.config = config
        self.df = pd.read_csv(os.path.join(self.config.data_path, 'user_item.csv'), index_col=0)
        self.df = self.delete_ones()

        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('itemId')
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder('userId')
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        self.df['item_idx'] = self.df['itemId'].apply(lambda x : self.item_encoder[x] + 1)
        self.df['user_idx'] = self.df['userId'].apply(lambda x : self.user_encoder[x])
        self.df = self.df.sort_values(['user_idx', 'timestamp'])
        self.user_train, self.user_valid = self.split_sequence_data()

    def generate_encoder_decoder(self, col:str) -> dict:
        encoder = {}
        decoder = {}
        ids = self.df[col].unique()

        for idx, _id in enumerate(ids):
            encoder[_id] = idx
            decoder[idx] = _id

        return encoder, decoder

    def delete_ones(self) -> dict:
        a = self.df.groupby('userId')['itemId'].size()
        for i in a.index:
            if a[i] <= 1:
                del(a[i])
        df_ = self.df.copy()
        df_ = df_[df_['userId'].isin(a.index)]
        
        return df_

    def split_sequence_data(self) -> dict:
        users = defaultdict(list)
        user_train = {}
        user_valid = {}
        group_df = self.df.groupby('user_idx')
        for user, item in group_df:
            users[user].extend(item['item_idx'].tolist())

        for user in users:
            user_train[user] = users[user][:-1]
            user_valid[user] = [users[user][-1]] # 마지막 아이템 예측

        return user_train, user_valid

    def get_train_valid_data(self):
        return self.user_train, self.user_valid

split_data = SplitData(config)
train_df, val_df = split_data.get_train_valid_data()

In [22]:
len(train_df), len(val_df)

(42231, 42231)

In [23]:
df = split_data.df
num_user = df['userId'].nunique()
num_item = df['itemId'].nunique()

sparsity = 1 - len(df) / (num_user * num_item)

print(f'전체 User 수: {num_user}')
print(f'전체 Item 수: {num_item}')
print(f'행렬의 희소성: {sparsity:.4f}')

전체 User 수: 42231
전체 Item 수: 44446
행렬의 희소성: 0.9993


In [24]:
items = set()
for i in range(len(train_df)):
    for val in train_df[i]:
        items.add(val)
num_item = len(items) + 1
num_user = len(train_df) + 1

In [25]:
num_item, num_user

(44447, 42232)

In [26]:
train_mat = sp.dok_matrix((num_user, num_item), dtype=np.float32)
train_data = []

for i in range(len(train_df)):
    for j in range(len(train_df[i])):
        train_mat[i, train_df[i][j]] = 1.0
        train_data.append([i,train_df[i][j]])

In [27]:
test_data = []

for i in range(len(val_df)):
    for j in range(len(val_df[i])):
        test_data.append([i,val_df[i][j]])

In [28]:
class NCFData(data.Dataset):
    def __init__(self, features, num_item, train_mat=None, num_ng=0, is_training=None):
        super(NCFData, self).__init__()
        """ Note that the labels are only useful when training, we thus 
			add them in the ng_sample() function.
		"""
        self.features_ps = features
        self.num_item = num_item
        self.train_mat = train_mat
        self.num_ng = num_ng
        self.is_training = is_training
        self.labels = [0] * len(features)

    def set_ng_sample(self):
        assert self.is_training, "no need to sampling when testing"

        # negative sample 더하기
        self.features_ng = []
        for x in self.features_ps:
            # user
            u = x[0]
            for _ in range(self.num_ng):
                j = np.random.randint(self.num_item)
                # train set에 있는 경우 다시 뽑기
                while (u, j) in self.train_mat:
                    j = np.random.randint(self.num_item)
                self.features_ng.append([u, j])

        labels_ps = [1] * len(self.features_ps)
        labels_ng = [0] * len(self.features_ng)

        self.features_fill = self.features_ps + self.features_ng
        self.labels_fill = labels_ps + labels_ng

    def __len__(self):
        return (self.num_ng + 1) * len(self.labels)

    def __getitem__(self, idx):
        features = self.features_fill if self.is_training else self.features_ps
        labels = self.labels_fill if self.is_training else self.labels

        user = features[idx][0]
        item = features[idx][1]
        label = labels[idx]
        return user, item, label

In [29]:
def prepare_data(train_data, test_data, num_item, train_mat):

    # construct the train and test datasets
    # args = (features, num_item, train_mat=None, num_ng=0, is_training=None)
    train_dataset = NCFData(train_data, num_item, train_mat, args["num_ng"], True)
    test_dataset = NCFData(test_data, num_item, train_mat, 0, False)
    train_loader = data.DataLoader(
        train_dataset, batch_size=args["batch_size"], shuffle=True, num_workers=4
    )
    test_loader = data.DataLoader(
        test_dataset, batch_size=args["test_num_ng"] + 1, shuffle=False, num_workers=0
    )

    return train_loader, test_loader


train_loader, test_loader = prepare_data(train_data, test_data, num_item, train_mat)

In [41]:
class MF(nn.Module):
    def __init__(self, num_user, num_item, num_factor):
        super(MF, self).__init__()
        self.num_factor = num_factor

        self.embed_user = nn.Embedding(num_user, num_factor)
        self.embed_item = nn.Embedding(num_item, num_factor)
        predict_size = num_factor
        self.predict_layer = torch.ones(predict_size, 1) # .cuda()
        self._init_weight_()

    def _init_weight_(self):
        # weight 초기화
        nn.init.normal_(self.embed_user.weight, std=0.01)
        nn.init.normal_(self.embed_item.weight, std=0.01)

        # bias 초기화
        for m in self.modules():
            if isinstance(m, nn.Linear) and m.bias is not None:
                m.bias.data.zero_()

    def forward(self, user, item):
        embed_user = self.embed_user(user)
        embed_item = self.embed_item(item)
        output_GMF = embed_user * embed_item
        prediction = torch.matmul(output_GMF, self.predict_layer)
        return prediction.view(-1)

In [42]:
def create_model(num_user, num_item, args):
    model = MF(num_user, num_item, args["num_factor"])
    # model.cuda()
    loss_function = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=args["lr"])
    return model, loss_function, optimizer

model, loss_function, optimizer = create_model(num_user, num_item, args)

In [45]:
def hit(gt_item, pred_items):
    if gt_item in pred_items:
        return 1
    return 0


def ndcg(gt_item, pred_items):
    if gt_item in pred_items:
        index = pred_items.index(gt_item)
        return np.reciprocal(np.log2(index + 2))
    return 0


def metrics(model, test_loader, top_k):
    HR, NDCG = [], []

    for user, item, _ in test_loader:
        user = user # .cuda()
        item = item # .cuda()

        predictions = model(user, item)
        # 가장 높은 top_k개 선택
        _, indices = torch.topk(predictions, top_k)
        # 해당 상품 index 선택
        recommends = torch.take(item, indices).cpu().numpy().tolist()
        # 정답값 선택
        gt_item = item[0].item()
        HR.append(hit(gt_item, recommends))
        NDCG.append(ndcg(gt_item, recommends))

    return np.mean(HR), np.mean(NDCG)

In [46]:
count, best_hr = 0, 0
writer = SummaryWriter()  # for visualization
# 모델 파라미터 출력
for epoch in range(args["epochs"]):
    model.train()  # Enable dropout (if have).

    start_time = time.time()
    train_loader.dataset.set_ng_sample()

    for user, item, label in train_loader:
        user = user # .cuda()
        item = item # .cuda()
        label = label.float() # .cuda()

        # gradient 초기화
        model.zero_grad()
        prediction = model(user, item)
        loss = loss_function(prediction, label)
        loss.backward()
        optimizer.step()
        writer.add_scalar("data/loss", loss.item(), count)
        count += 1

    model.eval()
    HR, NDCG = metrics(model, test_loader, args["top_k"])

    elapsed_time = time.time() - start_time
    print(
        "The time elapse of epoch {:03d}".format(epoch)
        + " is: "
        + time.strftime("%H: %M: %S", time.gmtime(elapsed_time))
    )
    print("HR: {:.3f}\tNDCG: {:.3f}".format(np.mean(HR), np.mean(NDCG)))

    if HR > best_hr:
        best_hr, best_ndcg, best_epoch = HR, NDCG, epoch
        if args["out"]:
            if not os.path.exists(config["model_path"]):
                os.mkdir(config["model_path"])
            torch.save(
                model, "{}{}.pth".format(config["model_path"], config["model"])
            )

print(
    "End. Best epoch {:03d}: HR = {:.3f}, NDCG = {:.3f}".format(
        best_epoch, best_hr, best_ndcg
    )
)

The time elapse of epoch 000 is: 00: 13: 38
HR: 0.106	NDCG: 0.042


BoxKeyError: ignored