In [1]:
import sys
import os

# Make the project's source directory importable (path is relative to the
# notebook's working directory). Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
if "../source" not in sys.path:
    sys.path.append("../source")

In [2]:
import torch
import numpy as np

In [3]:
import easydict

# Experiment configuration for the notebook.
# argparse is awkward to use inside Jupyter, so the options are kept in an
# EasyDict instead; values are read as attributes (e.g. args.bert_max_len).
# The trailing "default:" comments record each option's default value
# (presumably the defaults of the project's argparse script — TODO confirm);
# where the value set here differs (e.g. num_workers, weight_decay), this
# notebook overrides that default.
args = easydict.EasyDict({
    ################
    # Dataset
    ################
    'dataset_code': 'ml-100k', # ml-100k, beauty, games
    'min_rating': 0,  # default: 0
    'min_uc': 5,  # default: 5
    'min_sc': 5,  # default: 5
    'seed': 42,  # default: 42

    ################
    # Dataloader
    ################
    'train_batch_size': 64,  # default: 64
    'val_batch_size': 64,  # default: 64
    'test_batch_size': 64,  # default: 64
    'num_workers': 0,  # default: 8
    'sliding_window_size': 1.0,  # default: 1.0
    'negative_sample_size': 10,  # default: 10

    ################
    # Trainer
    ################
    # optimization #
    'device': 'cuda',  # default: 'cuda'  # choices: ['cpu', 'cuda']
    'num_epochs': 500,  # default: 500
    'optimizer': 'AdamW',  # default: 'AdamW'  # choices: ['AdamW', 'Adam']
    'weight_decay': 0.01,  # default: None
    'adam_epsilon': 1e-9,  # default: 1e-9
    'momentum': None,  # default: None
    'lr': 0.001,  # default: 0.001
    'max_grad_norm': 5.0,  # default: 5.0
    'enable_lr_schedule': True,  # default: True
    'decay_step': 10000,  # default: 10000
    'gamma': 1,  # default: 1
    'enable_lr_warmup': True,  # default: True
    'warmup_steps': 100,  # default: 100

    # evaluation #
    'val_strategy': 'iteration',  # default: 'iteration'  # choices: ['epoch', 'iteration']
    'val_iterations': 500,  # default: 500  # only for iteration val_strategy
    'early_stopping': True,  # default: True
    'early_stopping_patience': 20,  # default: 20
    'metric_ks': [1, 5, 10, 20, 50],  # default: [1, 5, 10, 20, 50]
    'rerank_metric_ks': [1, 5, 10],  # default: [1, 5, 10]
    'best_metric': 'Recall@10',  # default: 'Recall@10'
    'rerank_best_metric': 'NDCG@10',  # default: 'NDCG@10'
    'use_wandb': False,  # default: False

    ################
    # Retriever Model
    ################
    'model_code': 'lru',  # default: None
    'bert_max_len': 50,  # default: 50
    'bert_hidden_units': 64,  # default: 64
    'bert_num_blocks': 2,  # default: 2
    'bert_num_heads': 2,  # default: 2
    'bert_head_size': 32,  # default: 32
    'bert_dropout': 0.2,  # default: 0.2
    'bert_attn_dropout': 0.2,  # default: 0.2
    'bert_mask_prob': 0.25,  # default: 0.25

    ################
    # LLM Model
    ################
    'llm_base_model': 'meta-llama/Llama-2-7b-hf',  # default: 'meta-llama/Llama-2-7b-hf'
    'llm_base_tokenizer': 'meta-llama/Llama-2-7b-hf',  # default: 'meta-llama/Llama-2-7b-hf'
    'llm_max_title_len': 32,  # default: 32
    'llm_max_text_len': 1536,  # default: 1536
    'llm_max_history': 20,  # default: 20
    'llm_train_on_inputs': False,  # default: False
    'llm_negative_sample_size': 19,  # default: 19  # 19 negative & 1 positive
    'llm_system_template': "Given user history in chronological order, recommend an item from the candidate pool with its index letter.",  # default: "Given user history in chronological order, recommend an item from the candidate pool with its index letter."
    'llm_input_template': 'User history: {}; \n Candidate pool: {}',  # default: 'User history: {}; \n Candidate pool: {}'
    'llm_load_in_4bit': True,  # default: True
    'llm_retrieved_path': None,  # default: None
    'llm_cache_dir': None,  # default: None

    ################
    # Lora
    ################
    'lora_r': 8,  # default: 8
    'lora_alpha': 32,  # default: 32
    'lora_dropout': 0.05,  # default: 0.05
    'lora_target_modules': ['q_proj', 'v_proj'],  # default: ['q_proj', 'v_proj']
    'lora_num_epochs': 1,  # default: 1
    'lora_val_iterations': 100,  # default: 100
    'lora_early_stopping_patience': 20,  # default: 20
    'lora_lr': 1e-4,  # default: 1e-4
    'lora_micro_batch_size': 16,  # default: 16
})


In [4]:
from data.datasets import dataset_factory

In [5]:
# Build the dataset object for this configuration via the project's factory
# (presumably selected by args.dataset_code — confirm in data.datasets).
dataset = dataset_factory(args)

In [6]:
# Load the splits and id maps ('train', 'val', 'test', 'umap', 'smap').
# The dataset object is rebuilt from args here rather than taken from the
# `dataset` name bound in the previous cell: this cell rebinds `dataset` to the
# loaded result, so calling `dataset.load_dataset()` would fail as soon as the
# cell is executed a second time. Rebuilding keeps the cell safe to re-run.
dataset = dataset_factory(args).load_dataset()

Already preprocessed. Skip preprocessing


In [7]:
# Unpack the loaded dataset into its splits and id maps.
train = dataset['train']
val = dataset['val']
test = dataset['test']
umap = dataset['umap']
smap = dataset['smap']

# Seed NumPy's global generator with the configured seed before handing it
# out as `rng`; without this, anything drawn through `rng` changes from run
# to run. `rng` stays the np.random module itself, so its API is unchanged.
np.random.seed(args.seed)
rng = np.random

user_count = len(umap)
item_count = len(smap)

# Same counts under the names used further down.
num_users = user_count
num_items = item_count
max_len = args.bert_max_len
sliding_size = args.sliding_window_size

print(f"user_count : {user_count}")
print(f"item_count : {item_count}")
print(f"num_users : {num_users}")
print(f"num_items : {num_items}")
print(f"max_len : {max_len}")
print(f"sliding_size : {sliding_size}")

user_count : 610
item_count : 3650
num_users : 610
num_items : 3650
max_len : 50
sliding_size : 1.0


## Train Loader

## Test Loader

Final form: Seq = train (N items) + val (1 item), truncated to the last **max_len** (50) items and left-padded with 0 when shorter / target = test (1 item)

In [8]:
# Aliases for the three splits, each indexed by user below.
u2seq = train
u2val = val
u2answer = test

# Keep, in sorted order, only the users that have a non-empty validation entry
# and a non-empty test entry.
# `max_len` and `rng` are simply reused from the earlier cell; the former
# `max_len = max_len` / `rng = rng` self-assignments were no-ops and are gone.
users = [u for u in sorted(u2seq.keys()) if len(u2val[u]) > 0 and len(u2answer[u]) > 0]

In [9]:
# Position in `users` of the user whose sequence is inspected below.
index = 0

In [10]:
user = users[index]
answer = u2answer[user]

# Join this user's train sequence with their validation sequence.
history = u2seq[user] + u2val[user]

# Keep only the last max_len entries of the joined sequence.
recent = history[-max_len:]

# Left-pad with zeros so the result is max_len long.
padding_len = max_len - len(recent)
seq = [0] * padding_len + recent

In [11]:
# Show the padded sequence as an int64 tensor. torch.tensor with an explicit
# dtype replaces the legacy type-specific torch.LongTensor constructor.
torch.tensor(seq, dtype=torch.long)

tensor([1538,  708,  857, 1496, 1863,  868,  966, 1687, 1689,  904,  924, 1597,
        1688, 1725, 1690,  798, 2100, 1104, 1989, 1113, 2060, 1500,  911,  700,
         875,  137, 1473,  916,  886,  855, 2256, 1900,  470,   47, 1511, 1968,
         429, 2830,  796,  873,  125, 2135,  590,  944, 1916,  131, 1024,  452,
        1610, 1304])

In [12]:
# Show the target as an int64 tensor. torch.tensor always treats its argument
# as data, whereas the legacy torch.LongTensor constructor would interpret a
# bare int as a size.
torch.tensor(answer, dtype=torch.long)

tensor([1615])

## Validation Loader

The preprocessing is almost identical to that of the Test Loader.