In [None]:
%load_ext autoreload
%autoreload 2

# Mount Google Drive only when running inside Google Colab.
# Only the import is guarded: a missing `google.colab` package is what
# identifies a non-Colab environment. Errors raised by `drive.mount` itself
# (e.g. a failed authorization) are allowed to surface instead of being
# misreported as "Not Colab.".
try:
    from google.colab import drive
except ImportError:
    print('Not Colab.')
else:
    drive.mount('/content/drive')
    print('Google Drive is mounted successfully for Colab.')

Not Colab.


In [23]:
import os
import sys
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import csr_matrix
from tqdm import tqdm
import scipy.sparse as sp
from time import time
from pprint import pprint

In [None]:
# CAUTION - Set your options #
sys.dont_write_bytecode = True  # do not write __pycache__ / .pyc files

# Directory that contains the `yelp2018` raw data folder and the project modules.
# Set the LIGHTGCN_MODEL_DIR environment variable to override the default
# below without editing the notebook (the override is made absolute so later
# working-directory checks compare like with like).
#
# colab example:
#   '/content/drive/MyDrive/projects/기초추천시스템/model'
# local default:
_env_model_dir = os.environ.get("LIGHTGCN_MODEL_DIR")
path: str = (
    os.path.abspath(_env_model_dir)
    if _env_model_dir
    else "/Users/june/projects/기초추천시스템/diversity-enhanced-lightgcn/model"
)

In [None]:
# Your Environment Setting #
# Work from the project directory so the project-local modules
# (loss, data_utils, model, evaluator) can be imported by later cells.
os.chdir(path)
train_file = path + "/yelp2018/train.txt"
test_file = path + "/yelp2018/test.txt"
adj_mat_file = path + "/yelp2018/s_pre_adj_mat.npz"

# Compare by filesystem identity rather than by string: os.getcwd() returns a
# resolved path, so a plain `==` fails spuriously when `path` contains a
# symlink, a trailing slash, or a differently normalized Unicode form.
assert os.path.samefile(os.getcwd(), path), (
    f"working directory {os.getcwd()!r} is not {path!r}"
)

In [26]:
# Training Resource Check #
# Prefer CUDA when PyTorch reports it as available, otherwise fall back to CPU.
# (An "mps" branch existed here as commented-out code and stays disabled.)
if torch.cuda.is_available():
    device_str = "cuda"
else:
    device_str = "cpu"

print(f"Your Device: {device_str}")

# torch.device handle used by later cells when moving tensors and the model.
device = torch.device(device_str)

Your Device: cpu


In [None]:
# Important settings including loss function, training epochs, etc.
# Every tunable value of the run lives in this cell; later cells only read them.

# PARAMETERS

# Data
# Batch sizes handed to the train / test DataLoader respectively.
TRAIN_BATCH_SIZE = 1024
TEST_BATCH_SIZE = 2048
# Forwarded to PairwiseTrainData(do_neg_sampling=...); when True, train_loop
# also reads a negative-item tensor from each minibatch.
DO_NEG_SAMPLING = False

# Loss
from loss import loss_dict

# LOSS_FN is the key looked up in loss_dict to pick the loss implementation.
# Alternative kept for quick switching:
# LOSS_FN = "directau"
LOSS_FN = "deweighted_directau"
# Fail early on an unknown loss name.
assert LOSS_FN in loss_dict

# Training
# Number of train + evaluation passes executed by the main loop.
EPOCHS = 500

# Model
# Passed positionally to LightGCN(num_user, num_item, N_LAYERS, EMBEDDING_DIM, graph).
N_LAYERS = 3
EMBEDDING_DIM = 64

# Evaluation
# Passed to TopKEvaluator(TOP_K, METRICS, ...).
TOP_K = 20
METRICS = ["recall", "ndcg", "diversity"]

In [28]:
from datetime import datetime

# Preferences

# Timestamp of this run; it prefixes every artifact so runs never overwrite
# each other.
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")

# Root folder for all outputs.
RESULT_DIR = path + "/result"
# Folder that receives the best-model checkpoints.
BEST_MODEL_BASE_DIR = f"{RESULT_DIR}/{current_time}_best_model"
# Tab-separated text file that accumulates the metric history.
METRIC_RESULTS_FILE = f"{RESULT_DIR}/{current_time}_metric.tsv"

# Create the output folders up front (no error if they already exist).
for required_dir in (BEST_MODEL_BASE_DIR, os.path.dirname(METRIC_RESULTS_FILE)):
    os.makedirs(required_dir, exist_ok=True)

# **Part1.** Data Pipeline

**`class Yelp2018`**  
> **Initialize** `(train_file, test_file)`  

*.txt 확장자인 Yelp2018 raw 데이터를 학습이 용이하도록 전처리

\\

**`class AdjacencyMatrix`** \\
> **Initialize** `(train_user, train_item, num_user, num_item, device)`

전처리한 학습 데이터를 기반으로 Normalized Adjacency Matrix를 구축  
특히, `get_sparse_graph(adj_mat_file)`의 반환값은 Normalized Adjacency Matrix임에 유의하며  
이는 논문에서 $\tilde{\mathbf{A}} := \mathbf{D}^{-\frac{1}{2}} \mathbf{A} \mathbf{D}^{-\frac{1}{2}}$ 이다.

\\

**`class PairwiseTrainData(torch.utils.data.Dataset)`** \\
> **Initialize** `(train_user, train_item, num_user, num_item, do_neg_sampling)`  

전처리한 학습 데이터를 기반으로 BPR Loss 학습을 위한 Negative Sampling 과정을 구현

\\

**`class TestData(torch.utils.data.Dataset)`** \\
> **Initialize** `(train_user, train_item, test_user, test_item)`  

전처리한 학습 데이터와 테스트 데이터를 기반으로 구현  
특히, 추천시스템 Metric의 특성상 학습 데이터에서 이미 본 아이템은 랭킹에서 제외해야 한다.  
따라서 테스트 시에도 학습 데이터를 함께 사용해야 한다.

In [29]:
from data_utils import Yelp2018

# Parse the raw train/test text files once; the cells below work with the
# fields exposed by this object.
yelp2018 = Yelp2018(train_file, test_file)

# Dataset dimensions.
num_user, num_item = yelp2018.num_user, yelp2018.num_item

# Training split: user ids, item ids and the interaction count.
train_user, train_item, train_interaction = (
    yelp2018.train_user,
    yelp2018.train_item,
    yelp2018.train_interaction,
)

# Test split: same three fields.
test_user, test_item, test_interaction = (
    yelp2018.test_user,
    yelp2018.test_item,
    yelp2018.test_interaction,
)

In [None]:
# Yelp2018 Statistics Check #
total_interaction = train_interaction + test_interaction
# Fraction of all (user, item) pairs that have an observed interaction.
# This ratio is the *density* of the interaction matrix (sparsity would be
# 1 - density), so it is labeled accordingly.
density = total_interaction / (num_user * num_item)

print("Yelp2018")
print(
    f"""
#user = {num_user}
#item = {num_item}

#interactions
    (train) {train_interaction}
    (test)  {test_interaction}
    (total) {total_interaction}

Density = {density}
"""
)

Yelp2018

#user = 31668
#item = 38048

#interactions
    (train) 1237259
    (test)  324147
    (total) 1561406

Sparsity = 0.0012958757851778645



In [None]:
from data_utils import AdjacencyMatrix

# Build the adjacency helper from the training interactions only.
adjacency_matrix = AdjacencyMatrix(
    train_user,
    train_item,
    num_user,
    num_item,
    device,
)

# `graph` is the normalized adjacency matrix obtained via `adj_mat_file`.
graph = adjacency_matrix.get_sparse_graph(adj_mat_file)

loading adjacency matrix
successfully loaded...
don't split the matrix


In [None]:
from data_utils import PairwiseTrainData

# Training dataset; negative sampling is switched by DO_NEG_SAMPLING.
train_dataset = PairwiseTrainData(
    train_user,
    train_item,
    num_user,
    num_item,
    do_neg_sampling=DO_NEG_SAMPLING,
)

# Shuffled minibatches, loaded in the main process (num_workers=0).
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

In [None]:
# Per-item interaction counts ("degree"), indexed by item id.
#
# `minlength=num_item` guarantees one entry per item id. Without it,
# np.bincount stops at the largest id that actually occurs, so the vector is
# shorter than `num_item` whenever the highest item ids are missing from the
# counted split, and indexing it by item id would then go out of range.

# Degree over train + test, used by the top-k metric calculation.
train_test_item_degree = torch.tensor(
    np.bincount(np.concatenate([train_item, test_item]), minlength=num_item),
    dtype=torch.float32,
).to(device)

# Degree over the train set only, used by the loss function while training.
train_item_degree = torch.tensor(
    np.bincount(train_item, minlength=num_item), dtype=torch.float32
).to(device)

# **Part 2**. Training and Validation

In [None]:
# Initialize model

from model import LightGCN

# Construct LightGCN on top of the normalized adjacency `graph` and place it
# on the selected device in one step.
model = LightGCN(num_user, num_item, N_LAYERS, EMBEDDING_DIM, graph).to(device)

# Last expression of the cell: shows the module summary.
model

LightGCN(
  (user_embedding): Embedding(31668, 64)
  (item_embedding): Embedding(38048, 64)
  (f): Sigmoid()
)

In [None]:
def train_loop(train_dataloader, model, loss_fn, optimizer: torch.optim.Optimizer):
    """Run one training epoch and return the mean per-batch loss.

    Reads the notebook-level globals ``device`` and ``DO_NEG_SAMPLING``.

    Args:
        train_dataloader: DataLoader yielding minibatches where element 0 is
            the user tensor, element 1 the positive-item tensor and, when
            ``DO_NEG_SAMPLING`` is true, element 2 the negative-item tensor.
        model: Called as ``model(user, pos_item)`` (plus ``neg_items=...``
            when negative sampling is on). Its return value is unpacked as
            keyword arguments into ``loss_fn``.
        loss_fn: Called as ``loss_fn(**result, pos_item=pos_item)``; must
            return a scalar tensor supporting ``.backward()``.
        optimizer: Optimizer that updates the model parameters.

    Returns:
        float: Average of the per-batch loss values, or ``nan`` when the
        dataloader yields no batches.
    """
    model.train()

    size = len(train_dataloader.dataset)
    num_batches = len(train_dataloader)

    loss_sum = 0.0
    # Running count of processed samples. Tracking it directly keeps the
    # progress line correct for any dataloader batch size instead of relying
    # on a global batch-size constant.
    samples_seen = 0

    for batch_num, minibatch in enumerate(train_dataloader):
        optimizer.zero_grad()

        user: torch.Tensor = minibatch[0].to(device)
        pos_item: torch.Tensor = minibatch[1].to(device)

        if DO_NEG_SAMPLING:
            neg_item: torch.Tensor = minibatch[2].to(device)
            result = model(user, pos_item, neg_items=neg_item)
        else:
            result = model(user, pos_item)

        loss = loss_fn(**result, pos_item=pos_item)
        loss.backward()
        optimizer.step()

        # Read the scalar once; it is reused for the sum and the log line.
        batch_loss = loss.item()
        loss_sum += batch_loss
        samples_seen += len(minibatch[0])

        if batch_num % 100 == 0:
            print(f"loss: {batch_loss:>7f} [{samples_seen:>5d}/{size:>5d}]")

    avg_loss = loss_sum / num_batches if num_batches else float("nan")
    print(f"Train Avg loss: {avg_loss:>7f}")
    return avg_loss

In [36]:
# Best value observed so far for each metric, filled in across epochs.
best_metric = {}

# True until the header row of the metric file has been written.
write_header = True

In [None]:
from evaluator import TopKEvaluator
from data_utils import remove_padding


def test_loop(dataloader, model, loss_fn, evaluator: TopKEvaluator, epoch: int):
    """Evaluate the model, log the metrics and checkpoint on improvements.

    Reads the notebook-level globals ``device``, ``num_item``,
    ``METRIC_RESULTS_FILE`` and ``BEST_MODEL_BASE_DIR`` and updates the
    globals ``best_metric`` and ``write_header``.

    Args:
        dataloader: Yields minibatches whose elements 0, 1 and 2 are the user
            tensor, the padded per-user train items (history) and the padded
            per-user test items (label).
        model: Must provide ``get_users_rating_prediction(user)`` returning a
            ``(len(user), num_item)`` tensor, and ``state_dict()``.
        loss_fn: Unused; kept so the signature mirrors ``train_loop``.
        evaluator: ``evaluate(pred, history, label)`` must return a mapping
            from metric name to a numeric value for the batch.
        epoch: Epoch number written in the first column of the metric file.

    Side effects:
        * Appends one row to ``METRIC_RESULTS_FILE`` (preceded by a header row
          on the first call).
        * Saves ``model.state_dict()`` to
          ``{BEST_MODEL_BASE_DIR}/best_{metric}_model.pth`` whenever a metric
          exceeds its best value so far.
    """
    global best_metric, write_header

    model.eval()

    num_batches = len(dataloader)

    metric_sums = dict()

    with torch.no_grad():
        for minibatch in tqdm(dataloader):
            user: torch.Tensor = minibatch[0].to(device)
            # Items each user interacted with in the train set (padded).
            history_padded: torch.Tensor = minibatch[1].to(device)
            # Items each user interacted with in the test set (padded).
            label_padded: torch.Tensor = minibatch[2].to(device)

            history: list[torch.Tensor] = remove_padding(history_padded)
            label: list[torch.Tensor] = remove_padding(label_padded)

            pred: torch.Tensor = model.get_users_rating_prediction(user)
            assert pred.shape == (len(user), num_item)

            batch_result = evaluator.evaluate(pred, history, label)
            for metric in batch_result:
                metric_sums[metric] = metric_sums.get(metric, 0) + batch_result[metric]

    # Average of the per-batch values.
    # NOTE(review): if evaluator.evaluate returns per-batch means, a smaller
    # final batch is weighted the same as a full one -- confirm against the
    # evaluator implementation.
    metrics_result_dict = {
        metric: total / num_batches for metric, total in metric_sums.items()
    }
    metric_names = list(metrics_result_dict)

    # Save metrics to a text file. Fields are joined with tabs so rows carry
    # no trailing separator (which would parse as an extra empty column).
    with open(METRIC_RESULTS_FILE, "a") as f:
        if write_header:
            f.write("\t".join(["epoch", *metric_names]) + "\n")
            write_header = False

        row = [str(epoch)]
        row.extend(f"{metrics_result_dict[metric]:.4f}" for metric in metric_names)
        f.write("\t".join(row) + "\n")

    # Check and save the best models. The baseline for an unseen metric is
    # -inf, so the first evaluation always produces a checkpoint even when the
    # metric value is zero or negative.
    for metric in metric_names:
        value = metrics_result_dict[metric]
        if value > best_metric.get(metric, float("-inf")):
            best_metric[metric] = value
            print(f"Best {metric} model updated. Saving the model.")
            torch.save(
                model.state_dict(), f"{BEST_MODEL_BASE_DIR}/best_{metric}_model.pth"
            )

    print("Eval results: ")
    for metric in metric_names:
        print(f"{metric}: {metrics_result_dict[metric]:.4f}", end=" ")
    print("\n")

In [None]:
from loss import loss_dict

# Instantiate the loss selected by LOSS_FN with the train-set item degrees,
# then keep its bound `loss_fn` callable for the training loop.
criterion = loss_dict[LOSS_FN](item_degree=train_item_degree)
loss_fn = criterion.loss_fn

# Adam over all model parameters with learning rate 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
from data_utils import TestData, collate_fn

# The evaluation dataset needs both splits: train items act as the per-user
# history, test items as the labels.
test_dataset = TestData(train_user, train_item, test_user, test_item)

# Fixed order (no shuffling); batches are assembled by the project's
# collate_fn.
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
from evaluator import TopKEvaluator

# Top-k evaluator; item degrees over train + test are supplied for the
# metric computation.
evaluator = TopKEvaluator(
    TOP_K,
    METRICS,
    device=device,
    item_degree=train_test_item_degree,
)

# Main loop: one training pass followed by one evaluation pass per epoch.
for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}")
    print("-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn, evaluator, epoch)

Epoch 1
-------------------------------
loss: -3.474885 [ 1024/1237259]
loss: -3.479510 [103424/1237259]
loss: -3.482824 [205824/1237259]
loss: -3.484504 [308224/1237259]
loss: -3.482735 [410624/1237259]
loss: -3.482762 [513024/1237259]
loss: -3.488720 [615424/1237259]
loss: -3.491537 [717824/1237259]
loss: -3.486400 [820224/1237259]
loss: -3.488761 [922624/1237259]
loss: -3.493164 [1025024/1237259]
loss: -3.494300 [1127424/1237259]
loss: -3.497872 [1229824/1237259]
Train Avg loss: -3.486905


100%|██████████| 16/16 [00:07<00:00,  2.02it/s]


Best recall@20 model updated. Saving the model.
Best ndcg@20 model updated. Saving the model.
Best diversity model updated. Saving the model.
Eval results: 
recall@20: 0.0061 ndcg@20: 0.0058 diversity: 0.2922 

Epoch 2
-------------------------------
loss: -3.503369 [ 1024/1237259]
loss: -3.503481 [103424/1237259]
loss: -3.511107 [205824/1237259]
loss: -3.504750 [308224/1237259]
loss: -3.503380 [410624/1237259]
loss: -3.506535 [513024/1237259]
loss: -3.512193 [615424/1237259]
loss: -3.508759 [717824/1237259]
loss: -3.511808 [820224/1237259]
