<a href="https://colab.research.google.com/github/hyunj941031/RecSys/blob/main/models/MF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; config['data_path'] further down points into this mount.
drive.mount('/content/drive')

In [None]:
# python-box provides the `Box` wrapper used for the config object.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install python-box

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# tensorboardX provides the SummaryWriter used to log the training loss.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install tensorboardX

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import math
import os
import random
import time
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd
import scipy.sparse as sp
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.backends.cudnn as cudnn

from box import Box
from tensorboardX import SummaryWriter

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# Print tensors using scientific notation.
torch.set_printoptions(sci_mode=True)

In [None]:
config = {
    'data_path' : '/content/drive/MyDrive/fashion_campus_dataset',

    'sequence_len' : 50,
    'mask_prob' : 0.3, # cloze task
    'random_seed' : 123,

    'num_layers' : 2, # number of blocks (encoder layers)
    'hidden_units' : 50, # embedding size
    'num_heads' : 2, # number of attention heads; hidden_units must be divisible by this value
    'dropout' : 0.15, # dropout rate

    'epoch' : 5,
    'patience' : 5,
    'batch_size' : 256,
    'lr' : 0.001,

    'num_epochs' : 10,
    'num_workers' : 4,
    'val_data' : 3, # number of trailing interactions per user held out for validation
    'delete_data' : True, # drop users that have no more than `val_data` interactions
    'sampling' : True, # keep only a random subset of users
    'sampling_frac' : 0.3,

    # Settings read by the MF cells below through `args[...]` / `config[...]`.
    # They were missing from this dictionary; the values are starting points
    # and should be tuned for the experiment at hand.
    'num_factor' : 32, # latent dimension of the MF embeddings
    'num_ng' : 4, # negatives sampled per positive during training
    'test_num_ng' : 99, # the evaluation batch size is test_num_ng + 1
    'top_k' : 10, # cut-off used for HR@k / NDCG@k
    'epochs' : 10, # number of epochs run by the MF training loop
    'out' : True, # save the best model to disk
    'model_path' : './models/',
    'model' : 'MF',
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'

config = Box(config)

# The MF cells refer to the same settings under the name `args`.
args = config

# Seed every RNG in use so that user sampling, negative sampling and
# weight initialisation are repeatable from a fresh kernel.
random.seed(config.random_seed)
np.random.seed(config.random_seed)
torch.manual_seed(config.random_seed)

In [None]:
class MakeSequenceDataSet():
    """Load the user-item interaction log and split it into per-user sequences.

    The constructor reads ``user_item.csv`` from ``config.data_path``, optionally
    filters and sub-samples users, assigns integer indices to users and items
    and finally splits every user's time-ordered item list into a training
    part and a validation part.

    Attributes set by the constructor:
        df: the (filtered) interaction frame with ``item_idx`` / ``user_idx`` added.
        item_encoder / item_decoder: raw item id <-> 0-based position.
        user_encoder / user_decoder: raw user id <-> 0-based position.
        num_item, num_user: number of distinct items / users.
        user_train, user_valid: ``{user_idx: [item_idx, ...]}`` dictionaries.
    """

    def __init__(self, config):
        """
        Args:
            config: mapping with attribute access (e.g. a ``Box``) providing
                ``data_path``, ``delete_data``, ``sampling``, ``val_data``,
                ``sampling_frac`` and ``random_seed``.
        """
        self.config = config
        self.df = pd.read_csv(os.path.join(self.config.data_path, 'user_item.csv'), index_col=0)
        if self.config['delete_data']:
            self.df = self.delete_ones()
        if self.config['sampling']:
            self.df = self.sampling_users()

        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('itemId')
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder('userId')
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        # Item indices are shifted by one, so they run from 1 to num_item;
        # user indices start at 0.
        self.df['item_idx'] = self.df['itemId'].apply(lambda x : self.item_encoder[x] + 1)
        self.df['user_idx'] = self.df['userId'].apply(lambda x : self.user_encoder[x])
        self.df = self.df.sort_values(['user_idx', 'timestamp'])
        self.user_train, self.user_valid = self.split_sequence_data()


    def delete_ones(self):
        """Return the rows of users that have more than ``val_data`` interactions."""
        counts = self.df.groupby('userId')['itemId'].size()
        kept_users = counts.index[counts > self.config['val_data']]
        # .copy() so that the columns added later do not write into a view.
        return self.df[self.df['userId'].isin(kept_users)].copy()


    def sampling_users(self):
        """Return the rows of a random ``sampling_frac`` share of the users.

        The draw is made over the sorted distinct user ids with
        ``random_seed`` as the random state.
        """
        # size() only serves to obtain one entry per user; unlike mean() on the
        # id column it does not require the ids to be numeric.
        sampled = self.df.groupby('userId').size().sample(
            frac=self.config['sampling_frac'],
            random_state=self.config['random_seed'],
        )
        return self.df[self.df['userId'].isin(sampled.index)].copy()


    def generate_encoder_decoder(self, col:str) -> tuple:
        '''
        Build the encoder and decoder for one id column.

        Args:
            col (str): name of the column to encode
        Return:
            tuple: (encoder, decoder) where encoder maps raw id -> position
                   (in order of first appearance) and decoder is its inverse
        '''
        encoder = {}
        decoder = {}
        ids = self.df[col].unique()

        for idx, _id in enumerate(ids):
            encoder[_id] = idx
            decoder[idx] = _id

        return encoder, decoder


    def split_sequence_data(self) -> tuple:
        """Split each user's item list into a training and a validation part.

        The last ``val_data`` items of every user become the validation
        targets, everything before them is training data.

        Return:
            tuple: (user_train, user_valid), both ``{user_idx: [item_idx, ...]}``
        """
        n_valid = self.config['val_data']
        user_train = {}
        user_valid = {}

        for user, group in self.df.groupby('user_idx'):
            items = group['item_idx'].tolist()
            # An explicit cut point keeps val_data == 0 meaningful
            # (items[:-0] would be empty and items[-0:] the whole list).
            cut = max(len(items) - n_valid, 0)
            user_train[user] = items[:cut]
            user_valid[user] = items[cut:]  # held-out items to predict

        return user_train, user_valid


    def get_train_valid_data(self):
        """Return the ``(user_train, user_valid)`` dictionaries built by the constructor."""
        return self.user_train, self.user_valid

In [None]:
class NCFData(Dataset):
    """Dataset of (user, item, label) triples with on-demand negative sampling.

    Labels are only meaningful for training, so they are produced together
    with the negatives in ``set_ng_sample``. In evaluation mode every label
    is 0 and only the given features are served.

    Args:
        features: list of ``[user, item]`` positive pairs.
        num_item: negatives are drawn uniformly from ``range(num_item)``.
        train_mat: container supporting ``(user, item) in train_mat``; pairs
            found in it are never used as negatives.
        num_ng: number of negatives to draw per positive pair.
        is_training: truthy to serve positives plus sampled negatives.
    """

    def __init__(self, features, num_item, train_mat=None, num_ng=0, is_training=None):
        super(NCFData, self).__init__()
        self.features_ps = features
        self.num_item = num_item
        self.train_mat = train_mat
        self.num_ng = num_ng
        self.is_training = is_training
        self.labels = [0] * len(features)
        # Filled by set_ng_sample(); None marks "not sampled yet".
        self.features_fill = None
        self.labels_fill = None

    def set_ng_sample(self):
        """Draw ``num_ng`` fresh negatives per positive and rebuild the training lists."""
        assert self.is_training, "no need to sampling when testing"

        # add the negative samples
        self.features_ng = []
        for x in self.features_ps:
            # user
            u = x[0]
            for _ in range(self.num_ng):
                j = np.random.randint(self.num_item)
                # draw again while the pair is part of the training set
                while (u, j) in self.train_mat:
                    j = np.random.randint(self.num_item)
                self.features_ng.append([u, j])

        labels_ps = [1] * len(self.features_ps)
        labels_ng = [0] * len(self.features_ng)

        # positives first, negatives afterwards
        self.features_fill = self.features_ps + self.features_ng
        self.labels_fill = labels_ps + labels_ng

    def __len__(self):
        return (self.num_ng + 1) * len(self.labels)

    def __getitem__(self, idx):
        """Return ``(user, item, label)`` for position ``idx``.

        Raises:
            RuntimeError: in training mode when ``set_ng_sample`` has not been called.
        """
        if self.is_training:
            if self.features_fill is None:
                raise RuntimeError("call set_ng_sample() before iterating a training NCFData")
            features, labels = self.features_fill, self.labels_fill
        else:
            features, labels = self.features_ps, self.labels

        user = features[idx][0]
        item = features[idx][1]
        label = labels[idx]
        return user, item, label

In [None]:
def prepare_data(train_data, test_data, num_item, train_mat,
                 num_ng=None, batch_size=None, test_num_ng=None, num_workers=4):
    """Build the training and evaluation data loaders.

    Args:
        train_data: list of ``[user, item]`` training pairs.
        test_data: list of ``[user, item]`` evaluation pairs.
        num_item: forwarded to ``NCFData`` for negative sampling.
        train_mat: forwarded to ``NCFData`` to reject already-seen pairs.
        num_ng: negatives per positive for training; ``args["num_ng"]`` when omitted.
        batch_size: training batch size; ``args["batch_size"]`` when omitted.
        test_num_ng: the evaluation batch size is ``test_num_ng + 1``;
            ``args["test_num_ng"]`` when omitted.
        num_workers: worker processes used by the training loader.

    Returns:
        tuple: ``(train_loader, test_loader)``
    """
    # Fall back to the notebook-level `args` mapping for anything not given.
    if num_ng is None:
        num_ng = args["num_ng"]
    if batch_size is None:
        batch_size = args["batch_size"]
    if test_num_ng is None:
        test_num_ng = args["test_num_ng"]

    # NCFData(features, num_item, train_mat=None, num_ng=0, is_training=None)
    train_dataset = NCFData(train_data, num_item, train_mat, num_ng, True)
    test_dataset = NCFData(test_data, num_item, train_mat, 0, False)

    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )
    # The evaluation order must stay fixed, hence shuffle=False.
    test_loader = DataLoader(
        test_dataset, batch_size=test_num_ng + 1, shuffle=False, num_workers=0
    )

    return train_loader, test_loader


# NOTE: train_data, test_data, num_item and train_mat are assigned by cells that
# appear further down in this notebook; those cells have to be run before this one.
train_loader, test_loader = prepare_data(train_data, test_data, num_item, train_mat)

In [None]:
class MF(nn.Module):
    """Matrix factorisation: the score is the dot product of user and item embeddings.

    Args:
        num_user: number of rows in the user embedding table.
        num_item: number of rows in the item embedding table.
        num_factor: embedding dimension.
    """

    def __init__(self, num_user, num_item, num_factor):
        super(MF, self).__init__()
        self.num_factor = num_factor

        self.embed_user = nn.Embedding(num_user, num_factor)
        self.embed_item = nn.Embedding(num_item, num_factor)
        predict_size = num_factor
        # Fixed all-ones projection (not trained). Registering it as a buffer
        # makes it follow the module through .to(device) / .cuda();
        # persistent=False keeps it out of state_dict.
        self.register_buffer(
            "predict_layer", torch.ones(predict_size, 1), persistent=False
        )
        self._init_weight_()

    def _init_weight_(self):
        """Initialise the embeddings from N(0, 0.01^2) and zero any Linear bias."""
        # weight initialisation
        nn.init.normal_(self.embed_user.weight, std=0.01)
        nn.init.normal_(self.embed_item.weight, std=0.01)

        # bias initialisation
        for m in self.modules():
            if isinstance(m, nn.Linear) and m.bias is not None:
                m.bias.data.zero_()

    def forward(self, user, item):
        """Return one raw score (logit) per (user, item) pair as a 1-D tensor."""
        embed_user = self.embed_user(user)
        embed_item = self.embed_item(item)
        output_GMF = embed_user * embed_item
        # Multiplying by the all-ones column sums the element-wise product.
        prediction = torch.matmul(output_GMF, self.predict_layer)
        return prediction.view(-1)

In [None]:
def create_model(num_user, num_item, args):
    """Create the MF model together with its loss and optimizer.

    Args:
        num_user: size of the user embedding table.
        num_item: size of the item embedding table.
        args: mapping providing ``"num_factor"`` and ``"lr"``.

    Returns:
        tuple: ``(model, loss_function, optimizer)``
    """
    model = MF(num_user, num_item, args["num_factor"])
    # model.cuda()
    # The model outputs raw scores, so the sigmoid is applied inside the loss.
    loss_function = nn.BCEWithLogitsLoss()
    # Referenced through `torch.optim` because no bare `optim` name is imported.
    optimizer = torch.optim.Adam(model.parameters(), lr=args["lr"])
    return model, loss_function, optimizer

# NOTE: num_user and num_item are assigned by a cell that appears further down
# in this notebook; that cell has to be run before this one.
model, loss_function, optimizer = create_model(num_user, num_item, args)

In [None]:
def hit(gt_item, pred_items):
    """Return 1 when ``gt_item`` is among ``pred_items``, otherwise 0."""
    return 1 if gt_item in pred_items else 0


def ndcg(gt_item, pred_items):
    """Return ``1 / log2(rank + 2)`` for the 0-based rank of ``gt_item``, or 0 if absent."""
    if gt_item not in pred_items:
        return 0
    rank = pred_items.index(gt_item)
    return np.reciprocal(np.log2(rank + 2))


def metrics(model, test_loader, top_k):
    """Compute the mean hit ratio and mean NDCG over all batches of ``test_loader``.

    Every batch is scored as one ranking problem: the first item of the
    batch is taken as the ground truth and is looked up among the ``top_k``
    highest-scored items of that batch.

    Args:
        model: callable returning one score per (user, item) pair.
        test_loader: iterable yielding ``(user, item, label)`` batches.
        top_k: number of highest-scored items kept per batch.

    Returns:
        tuple: ``(mean HR, mean NDCG)``
    """
    HR, NDCG = [], []

    # Evaluation needs no gradients; skipping them saves memory and time.
    with torch.no_grad():
        for user, item, _ in test_loader:
            predictions = model(user, item)
            # torch.topk raises when k exceeds the number of scores, which
            # happens for a batch (e.g. the last one) smaller than top_k.
            k = min(top_k, predictions.numel())
            # positions of the k highest scores
            _, indices = torch.topk(predictions, k)
            # item ids at those positions
            recommends = torch.take(item, indices).cpu().numpy().tolist()
            # ground truth: the first item of the batch
            gt_item = item[0].item()
            HR.append(hit(gt_item, recommends))
            NDCG.append(ndcg(gt_item, recommends))

    return np.mean(HR), np.mean(NDCG)

In [None]:
class NeuMF:
    """Keras model that merges a GMF branch and an MLP branch into a single score.

    Args:
        user_num: vocabulary size of the user embeddings.
        item_num: vocabulary size of the item embeddings.
        latent_features: embedding dimension of the GMF branch.

    The compiled model is available as ``self.model`` or via ``get_model()``.
    Its output layer is linear, i.e. it produces logits.
    """

    def __init__(self, user_num, item_num, latent_features=8):

        # Inputs: one integer id per example
        user = Input(shape=(1,), dtype='int32')
        item = Input(shape=(1,), dtype='int32')

        # GMF branch embeddings
        gmf_user_embedding = Embedding(user_num, latent_features, input_length=user.shape[1])(user)
        gmf_user_embedding = Flatten()(gmf_user_embedding)
        gmf_item_embedding = Embedding(item_num, latent_features, input_length=item.shape[1])(item)
        gmf_item_embedding = Flatten()(gmf_item_embedding)

        # MLP branch embeddings (separate tables, 32 dimensions each)
        mlp_user_embedding = Embedding(user_num, 32, input_length=user.shape[1])(user)
        mlp_user_embedding = Flatten()(mlp_user_embedding)
        mlp_item_embedding = Embedding(item_num, 32, input_length=item.shape[1])(item)
        mlp_item_embedding = Flatten()(mlp_item_embedding)

        # GMF: element-wise product of the user and item vectors
        gmf_mul =  Multiply()([gmf_user_embedding, gmf_item_embedding])

        # MLP: concatenated vectors pushed through a 64-32-16-8 tower
        mlp_concat = Concatenate()([mlp_user_embedding, mlp_item_embedding])
        mlp_dropout = Dropout(0.2)(mlp_concat)
        # Layer 1
        mlp_layer_1 = Dense(units=64, activation='relu', name='mlp_layer1')(mlp_dropout)
        mlp_dropout1 = Dropout(rate=0.2, name='dropout1')(mlp_layer_1)
        mlp_batch_norm1 = BatchNormalization(name='batch_norm1')(mlp_dropout1)
        # Layer 2
        mlp_layer_2 = Dense(units=32, activation='relu', name='mlp_layer2')(mlp_batch_norm1)
        mlp_dropout2 = Dropout(rate=0.2, name='dropout2')(mlp_layer_2)
        mlp_batch_norm2 = BatchNormalization(name='batch_norm2')(mlp_dropout2)
        # Layer 3
        mlp_layer_3 = Dense(units=16, activation='relu', name='mlp_layer3')(mlp_batch_norm2)
        # Layer 4
        mlp_layer_4 = Dense(units=8, activation='relu', name='mlp_layer4')(mlp_layer_3)

        # Merge the GMF and MLP representations
        merged_vector = tf.keras.layers.concatenate([gmf_mul, mlp_layer_4])

        # Output layer: a single linear unit (no activation), so it emits logits
        output_layer = Dense(1, kernel_initializer='lecun_uniform', name='output_layer')(merged_vector)

        # Model
        self.model = Model([user, item], output_layer)
        # The string loss 'binary_crossentropy' expects probabilities; because
        # the output above is a logit the loss must be told so explicitly.
        self.model.compile(
            optimizer='adam',
            loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        )

    def get_model(self):
        """Return the compiled Keras model."""
        return self.model


In [None]:
# Read and preprocess the interaction log according to `config`.
make_sequence_dataset = MakeSequenceDataSet(config=config)
# Per-user item lists: the training part and the held-out validation part.
user_train, user_valid = make_sequence_dataset.get_train_valid_data()

In [None]:
df = make_sequence_dataset.df
n_unique_users = df['userId'].nunique()
n_unique_items = df['itemId'].nunique()

# Share of (user, item) cells without an observed interaction.
sparsity = 1 - len(df) / (n_unique_users * n_unique_items)

print(f'전체 User 수: {n_unique_users}')
print(f'전체 Item 수: {n_unique_items}')
print(f'행렬의 희소성: {sparsity:.4f}')

# Sizes used for the interaction matrix and the embedding tables.
# user_idx is 0-based, but MakeSequenceDataSet builds item_idx as
# encoder position + 1, so item indices run from 1 to n_unique_items and
# the item axis needs one extra slot (index 0 stays unused).
num_user = n_unique_users
num_item = n_unique_items + 1

In [None]:
# Sparse user x item matrix marking every training interaction; it is used
# during negative sampling to reject pairs that are actual positives.
train_mat = sp.dok_matrix((num_user, num_item), dtype=np.float32)
# Flat list of [user_idx, item_idx] training pairs.
train_data = []

# `user_train` ({user_idx: [item_idx, ...]}) comes from MakeSequenceDataSet;
# the previously referenced `train_df` is not defined anywhere in this notebook.
for user, items in user_train.items():
    for item in items:
        train_mat[int(user), int(item)] = 1.0
        train_data.append([int(user), int(item)])

In [None]:
# Flat list of [user_idx, item_idx] pairs built from the held-out items.
# `user_valid` ({user_idx: [item_idx, ...]}) comes from MakeSequenceDataSet;
# the previously referenced `val_df` is not defined anywhere in this notebook.
test_data = [
    [int(user), int(item)]
    for user, items in user_valid.items()
    for item in items
]

In [None]:
# Train for args["epochs"] epochs, evaluate after each one and keep the model
# with the highest hit ratio.
count, best_hr = 0, 0
# Defined up front so the summary at the end works even if HR never exceeds 0.
best_ndcg, best_epoch = 0, 0
writer = SummaryWriter()  # for visualization
for epoch in range(args["epochs"]):
    model.train()  # Enable dropout (if have).

    start_time = time.time()
    # Draw a fresh set of negatives for this epoch.
    train_loader.dataset.set_ng_sample()

    for user, item, label in train_loader:
        user = user # .cuda()
        item = item # .cuda()
        label = label.float() # .cuda()

        # reset the gradients
        model.zero_grad()
        prediction = model(user, item)
        loss = loss_function(prediction, label)
        loss.backward()
        optimizer.step()
        writer.add_scalar("data/loss", loss.item(), count)
        count += 1

    model.eval()
    HR, NDCG = metrics(model, test_loader, args["top_k"])

    elapsed_time = time.time() - start_time
    print(
        "The time elapse of epoch {:03d}".format(epoch + 1)
        + " is: "
        + time.strftime("%H: %M: %S", time.gmtime(elapsed_time))
    )
    # metrics() already returns the means.
    print("HR: {:.3f}\tNDCG: {:.3f}".format(HR, NDCG))

    if HR > best_hr:
        # Stored 1-based so it matches the epoch number printed above.
        best_hr, best_ndcg, best_epoch = HR, NDCG, epoch + 1
        if args["out"]:
            # makedirs also creates missing parent directories; joining the
            # path keeps the file inside model_path even without a trailing slash.
            os.makedirs(config["model_path"], exist_ok=True)
            torch.save(
                model,
                os.path.join(config["model_path"], "{}.pth".format(config["model"])),
            )

# Flush pending events to disk.
writer.close()

print(
    "End. Best epoch {:03d}: HR = {:.3f}, NDCG = {:.3f}".format(
        best_epoch, best_hr, best_ndcg
    )
)

The time elapse of epoch 000 is: 00: 08: 27
HR: 0.123	NDCG: 0.060
The time elapse of epoch 001 is: 00: 07: 46
HR: 0.135	NDCG: 0.063
The time elapse of epoch 002 is: 00: 08: 31
HR: 0.132	NDCG: 0.067
The time elapse of epoch 003 is: 00: 08: 02
HR: 0.144	NDCG: 0.066
The time elapse of epoch 004 is: 00: 09: 12
HR: 0.139	NDCG: 0.067
The time elapse of epoch 005 is: 00: 08: 29
HR: 0.137	NDCG: 0.067
The time elapse of epoch 006 is: 00: 08: 53
HR: 0.116	NDCG: 0.063
The time elapse of epoch 007 is: 00: 08: 42
HR: 0.123	NDCG: 0.065
The time elapse of epoch 008 is: 00: 08: 53
HR: 0.113	NDCG: 0.063
The time elapse of epoch 009 is: 00: 08: 48
HR: 0.106	NDCG: 0.058
End. Best epoch 003: HR = 0.144, NDCG = 0.066
