In [1]:
%load_ext autoreload
%autoreload 2

try:
    from google.colab import drive
    drive.mount('/content/drive')
    print('Google Drive is mounted successfully for Colab.')
except:
    print('Not Colab.')

Not Colab.


In [None]:
import os
import sys
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import csr_matrix
from tqdm import tqdm
import scipy.sparse as sp
from time import time
from pprint import pprint

In [3]:
# CAUTION - Set your options #
sys.dont_write_bytecode = True  # pycahce option

# Put yelp2018 raw data directory path.

# colab
# path: str = '/content/drive/MyDrive/projects/기초추천시스템/model'
# local
path: str = (
    "/Users/june/projects/기초추천시스템/diversity-enhanced-lightgcn/model"
)

In [4]:
# Your Environment Setting #
os.chdir(path)
assert os.getcwd() == path

import config

train_file = config.train_file
test_file = config.test_file
adj_mat_file = config.adj_mat_file

In [5]:
from logger import console

# Training Resource Check #
device_str = (
    "cuda"
    if torch.cuda.is_available()
    # else "mps"
    # if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Your Device: {device_str}")

device = torch.device(device_str)

Your Device: cpu


In [6]:
# Important settings including loss function, training epochs, etc. -> set them in config.py

# PARAMETERS

# Data
TRAIN_BATCH_SIZE = config.train_batch_size
TEST_BATCH_SIZE = config.test_batch_size
DO_NEG_SAMPLING = config.do_neg_sampling

# Loss
from loss import loss_dict

# LOSS_FN = "directau"
LOSS_FN = config.loss_fn
assert LOSS_FN in loss_dict

reg_strength = config.reg_strength

# Training
EPOCHS = config.epochs

# Model
N_LAYERS = config.n_layers
EMBEDDING_DIM = config.embed_dim

# Evaluation
TOP_K = config.topk
METRICS = config.metrics

# Set random seed
from misc import set_seed

set_seed(config.seed)

In [7]:
from datetime import datetime

# Preferences

# Get the current date and time
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")

# File path to save the best models
BEST_MODEL_BASE_DIR = config.best_model_dir

# Text file to save the metric history
METRIC_RESULTS_FILE = config.metric_result_file

# Ensure the directory exists
os.makedirs(BEST_MODEL_BASE_DIR, exist_ok=True)
os.makedirs(os.path.dirname(METRIC_RESULTS_FILE), exist_ok=True)

# **Part1.** Data Pipeline

**`class Yelp2018`**  
> **Initialize** `(train_file, test_file)`  

*.txt 확장자인 Yelp2018 raw 데이터를 학습이 용이하도록 전처리

\\

**`class AdjacencyMatrix`** \\
> **Initialize** `(train_user, train_item, num_user, num_item, device)`

전처리한 학습 데이터를 기반으로 Normalized Adjacency Matrix를 구축  
특히, `get_sparse_graph(adj_mat_file)`의 반환값은 Normalized Adjacency Matrix임에 유의하며  
이는 논문에서 $\tilde{\mathbf{A}} := \mathbf{D}^{-\frac{1}{2}} \mathbf{A} \mathbf{D}^{-\frac{1}{2}}$ 이다.

\\

**`class PairwiseTrainData(torch.utils.dataset)`** \\
> **Initialize** `(train_user, train_item, num_user, num_item)`  

전처리한 학습 데이터를 기반으로 BPR Loss 학습을 위한 Negative Sampling 과정을 구현

\\

**`class TestData(torch.utils.dataset)`** \\
> **Initialize** `(train_user, train_item, test_user, test_item)`  

전처리한 학습 데이터와 테스트 데이터를 기반으로 구현  
특히, 추천시스템의 Metric의 특성 때문에 학습 데이터에서 본 아이템은 랭킹에서 제외해야한다.  
따라서 학습 데이터도 사용해야한다.

In [8]:
from data_utils import Yelp2018

yelp2018 = Yelp2018(train_file, test_file)

num_user = yelp2018.num_user
num_item = yelp2018.num_item

train_user = yelp2018.train_user
train_item = yelp2018.train_item
train_interaction = yelp2018.train_interaction

test_user = yelp2018.test_user
test_item = yelp2018.test_item
test_interaction = yelp2018.test_interaction

In [9]:
# Yelp2018 Statistics Check #
print("Yelp2018")
print(
    f"""
#user = {num_user}
#item = {num_item}

#interactions
    (train) {train_interaction}
    (test)  {test_interaction}
    (total) {train_interaction + test_interaction}

Sparsity = {(train_interaction + test_interaction) / (num_user * num_item)}
"""
)

Yelp2018

#user = 31668
#item = 38048

#interactions
    (train) 1237259
    (test)  324147
    (total) 1561406

Sparsity = 0.0012958757851778645



In [10]:
from data_utils import AdjacencyMatrix

adjacency_matrix = AdjacencyMatrix(train_user, train_item, num_user, num_item, device)
graph = adjacency_matrix.get_sparse_graph(
    adj_mat_file
)  # This is The Normalized Adjacency Matrix.

loading adjacency matrix
successfully loaded...
don't split the matrix


  return torch.sparse.FloatTensor(index, data, torch.Size(coo.shape))


In [11]:
from data_utils import PairwiseTrainData
import misc

train_dataset = PairwiseTrainData(
    train_user, train_item, num_user, num_item, do_neg_sampling=DO_NEG_SAMPLING
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    worker_init_fn=misc.seed_worker,
    generator=misc.get_generator(config.seed),
)

In [12]:
from data_utils import TestData, collate_fn

test_dataset = TestData(train_user, train_item, test_user, test_item)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
    worker_init_fn=misc.seed_worker,
    generator=misc.get_generator(config.seed),
)

In [13]:
train_test_user = np.concatenate([train_user, test_user])
train_test_item = np.concatenate([train_item, test_item])

# Get degree per item for use in the top-k metric calculation
train_test_item_degree = torch.tensor(
    np.bincount(train_test_item), dtype=torch.float32
).to(device)

# Get degree per item only in the train set for use in the loss function while training
train_item_degree = torch.tensor(np.bincount(train_item), dtype=torch.float32).to(
    device
)

del train_test_user, train_test_item

# **Part 2**. Training and Validation

In [14]:
# Initialize model

from model import LightGCN

model = LightGCN(num_user, num_item, N_LAYERS, EMBEDDING_DIM, graph)
if config.load_model:
    model.load_state_dict(torch.load(config.model_file, weights_only=True))
model.to(device)

LightGCN(
  (user_embedding): Embedding(31668, 64)
  (item_embedding): Embedding(38048, 64)
  (f): Sigmoid()
)

In [15]:
def train_loop(train_dataloader, model, loss_fn, optimizer: torch.optim.Optimizer):
    model.train()

    loss_sum = 0

    size = len(train_dataloader.dataset)
    num_batches = len(train_dataloader)

    for batch_num, minibatch in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()

        user: torch.Tensor = minibatch[0].to(device)
        pos_item: torch.Tensor = minibatch[1].to(device)
        if DO_NEG_SAMPLING:
            neg_item: torch.Tensor = minibatch[2].to(device)

        if DO_NEG_SAMPLING:
            result = model(user, pos_item, neg_items=neg_item)
        else:
            result = model(user, pos_item)

        loss = loss_fn(**result, pos_item=pos_item)

        loss_sum += loss.item()
        loss.backward()
        optimizer.step()

        if batch_num % 100 == 0:
            console(
                f"loss: {loss.item():>7f} [{TRAIN_BATCH_SIZE * batch_num + len(minibatch[0]):>5d}/{size:>5d}]"
            )

    avg_loss = loss_sum / num_batches
    console(f"Train Avg loss: {avg_loss:>7f}")

In [16]:
# A dictionary to store the best metric values along epochs
best_metric = dict()

# Boolean indicating whether to write the header in the metric file
write_header = True

In [17]:
from evaluator import TopKEvaluator
from data_utils import remove_padding


def test_loop(dataloader, model, loss_fn, evaluator: TopKEvaluator, epoch: int):
    global best_metric, write_header

    model.eval()

    num_batches = len(dataloader)

    metrics_result_dict = dict()

    with torch.no_grad():
        for minibatch in tqdm(dataloader):
            user: torch.Tensor = minibatch[0].to(device)
            history: torch.Tensor = minibatch[1].to(
                device
            )  # 각 유저 별 train 에서 존재하는 아이템
            label: torch.Tensor = minibatch[2].to(
                device
            )  # 각 유저 별 test  에서 존재하는 아이템

            history: list[torch.Tensor] = remove_padding(history)
            label: list[torch.Tensor] = remove_padding(label)

            pred: torch.Tensor = model.get_users_rating_prediction(user)
            assert pred.shape == (len(user), num_item)

            result_dict = evaluator.evaluate(pred, history, label)
            for metric in result_dict:
                if metric not in metrics_result_dict:
                    metrics_result_dict[metric] = 0
                metrics_result_dict[metric] += result_dict[metric]

    for metric in metrics_result_dict:
        metrics_result_dict[metric] /= num_batches

    # Save metrics to a text file
    with open(METRIC_RESULTS_FILE, "a") as f:
        if write_header:
            f.write("epoch\t")
            for metric in metrics_result_dict:
                f.write(f"{metric}\t")
            f.write("\n")
            write_header = False

        f.write(f"{epoch}\t")
        for metric in metrics_result_dict:
            f.write(f"{metrics_result_dict[metric]:.4f}\t")
        f.write("\n")

    # Check and save the best models
    for metric in metrics_result_dict:
        if metric not in best_metric:
            best_metric[metric] = 0
        if metrics_result_dict[metric] > best_metric[metric]:
            best_metric[metric] = metrics_result_dict[metric]
            console(f"Best {metric} model updated. Saving the model.")
            torch.save(
                model.state_dict(), f"{BEST_MODEL_BASE_DIR}/best_{metric}_model.pth"
            )

    console(f"Eval results: ")
    for metric in metrics_result_dict:
        console(f"{metric}: {metrics_result_dict[metric]:.4f}")
    console("\n")

In [18]:
from loss import loss_dict

loss_fn = loss_dict[LOSS_FN](
    item_degree=train_item_degree, reg_strength=reg_strength
).loss_fn
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [19]:
def simulated_train_or_test_loop(dataloader):
    """
    run the train dataloader for an epoch
    """
    for _ in dataloader:
        pass

In [20]:
from evaluator import TopKEvaluator

evaluator = TopKEvaluator(
    TOP_K, METRICS, device=device, item_degree=train_test_item_degree
)

if config.start_epoch != 1:
    # run the train dataloader and test dataloader for start_epoch - 1 times
    # to simulate the training process
    console(f"Simulating training and testing for {config.start_epoch - 1} epochs")
    for epoch in tqdm(range(1, config.start_epoch)):
        simulated_train_or_test_loop(train_dataloader)
        simulated_train_or_test_loop(test_dataloader)
    console("Simulation done")

for epoch in range(config.start_epoch, EPOCHS + 1):
    console(f"Epoch {epoch}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    if epoch % config.eval_every_n_epochs == 0:
        test_loop(test_dataloader, model, loss_fn, evaluator, epoch)

    if DO_NEG_SAMPLING:
        train_dataloader.dataset.sample_negs()

  2%|▏         | 11/605 [00:04<04:18,  2.29it/s]


KeyboardInterrupt: 