In [1]:
import sys
import os

sys.path.append("../source")

In [2]:
import torch
import numpy as np

In [3]:
import easydict

# jupyter didn't support argparse. so, I use 'easydict' module
args = easydict.EasyDict({
    ################
    # Dataset
    ################
    'dataset_code': 'ml-100k', # ml-100k, beauty, games
    'min_rating': 0,  # default: 0
    'min_uc': 5,  # default: 5
    'min_sc': 5,  # default: 5
    'seed': 42,  # default: 42

    ################
    # Dataloader
    ################
    'train_batch_size': 64,  # default: 64
    'val_batch_size': 64,  # default: 64
    'test_batch_size': 64,  # default: 64
    'num_workers': 0,  # default: 8
    'sliding_window_size': 1.0,  # default: 1.0
    'negative_sample_size': 10,  # default: 10

    ################
    # Trainer
    ################
    # optimization #
    'device': 'cuda',  # default: 'cuda'  # choices: ['cpu', 'cuda']
    'num_epochs': 500,  # default: 500
    'optimizer': 'AdamW',  # default: 'AdamW'  # choices: ['AdamW', 'Adam']
    'weight_decay': 0.01,  # default: None
    'adam_epsilon': 1e-9,  # default: 1e-9
    'momentum': None,  # default: None
    'lr': 0.001,  # default: 0.001
    'max_grad_norm': 5.0,  # default: 5.0
    'enable_lr_schedule': True,  # default: True
    'decay_step': 10000,  # default: 10000
    'gamma': 1,  # default: 1
    'enable_lr_warmup': True,  # default: True
    'warmup_steps': 100,  # default: 100

    # evaluation #
    'val_strategy': 'iteration',  # default: 'iteration'  # choices: ['epoch', 'iteration']
    'val_iterations': 500,  # default: 500  # only for iteration val_strategy
    'early_stopping': True,  # default: True
    'early_stopping_patience': 20,  # default: 20
    'metric_ks': [1, 5, 10, 20, 50],  # default: [1, 5, 10, 20, 50]
    'rerank_metric_ks': [1, 5, 10],  # default: [1, 5, 10]
    'best_metric': 'Recall@10',  # default: 'Recall@10'
    'rerank_best_metric': 'NDCG@10',  # default: 'NDCG@10'
    'use_wandb': False,  # default: False

    ################
    # Retriever Model
    ################
    'model_code': 'bert',  # default: None
    'bert_max_len': 100,  # default: 50
    'bert_hidden_units': 256,  # default: 64
    'bert_num_blocks': 2,  # default: 2
    'bert_num_heads': 4,  # default: 2
    'bert_head_size': 32,  # default: 32
    'bert_dropout': 0.1,  # default: 0.2
    'bert_mask_prob': 0.15,  # default: 0.25
    
    # bertrec
    'train_negative_sampler_code': 'random',
    'train_negative_sample_size': 0,
    'train_negative_sampling_seed': 0,
    'test_negative_sampler_code': 'random',
    'test_negative_sample_size': 100,
    'test_negative_sampling_seed': 98765,
    'model_init_seed': 0,
    'num_gpu': 1,
    'optimizer': 'Adam',
    'log_period_as_iter': 12800,
    
    ################
    # LLM Model
    ################
    'llm_base_model': 'meta-llama/Llama-2-7b-hf',  # default: 'meta-llama/Llama-2-7b-hf'
    'llm_base_tokenizer': 'meta-llama/Llama-2-7b-hf',  # default: 'meta-llama/Llama-2-7b-hf'
    'llm_max_title_len': 32,  # default: 32
    'llm_max_text_len': 1536,  # default: 1536
    'llm_max_history': 20,  # default: 20
    'llm_train_on_inputs': False,  # default: False
    'llm_negative_sample_size': 19,  # default: 19  # 19 negative & 1 positive
    'llm_system_template': "Given user history in chronological order, recommend an item from the candidate pool with its index letter.",  # default: "Given user history in chronological order, recommend an item from the candidate pool with its index letter."
    'llm_input_template': 'User history: {}; \n Candidate pool: {}',  # default: 'User history: {}; \n Candidate pool: {}'
    'llm_load_in_4bit': True,  # default: True
    'llm_retrieved_path': "/home/laststar/data/model/OpenLLM-Rec",  # default: None
    'llm_cache_dir': None,  # default: None

    ################
    # Lora
    ################
    'lora_r': 8,  # default: 8
    'lora_alpha': 32,  # default: 32
    'lora_dropout': 0.05,  # default: 0.05
    'lora_target_modules': ['q_proj', 'v_proj'],  # default: ['q_proj', 'v_proj']
    'lora_num_epochs': 1,  # default: 1
    'lora_val_iterations': 100,  # default: 100
    'lora_early_stopping_patience': 20,  # default: 20
    'lora_lr': 1e-4,  # default: 1e-4
    'lora_micro_batch_size': 16,  # default: 16

    #################
    # Custom
    #################
    'alpaca_file': "../source/data/dataloader/templates"
})

In [4]:
import data.datasets
import data.dataloader
from data.dataloader import *
from data.datasets import *

from data.dataloader.negative_samplers import negative_sampler_factory

In [5]:
from data.datasets import dataset_factory

dataset = dataset_factory(args)
save_folder = dataset._get_preprocessed_folder_path()

dataset = dataset.load_dataset()

Already preprocessed. Skip preprocessing


In [6]:
train = dataset['train']
val = dataset['val']
test = dataset['test']
umap = dataset['umap']
smap = dataset['smap']
rng = np.random

user_count = len(umap)
item_count = len(smap)

max_len = args.bert_max_len
mask_prob = args.bert_mask_prob
CLOZE_MASK_TOKEN = item_count + 1


In [7]:
code = args.train_negative_sampler_code
train_negative_sampler = negative_sampler_factory(code, train, val, test,
                                                          user_count, item_count,
                                                          args.train_negative_sample_size,
                                                          args.train_negative_sampling_seed,
                                                          save_folder)
code = args.test_negative_sampler_code
test_negative_sampler = negative_sampler_factory(code, train, val, test,
                                                         user_count, item_count,
                                                         args.test_negative_sample_size,
                                                         args.test_negative_sampling_seed,
                                                         save_folder)

In [8]:
train_negative_samples = train_negative_sampler.get_negative_samples()
test_negative_samples = test_negative_sampler.get_negative_samples()

Negatives samples exist. Loading.
Negatives samples exist. Loading.


In [9]:
train_negative_samples

{0: [],
 1: [],
 2: [],
 3: [],
 4: [],
 5: [],
 6: [],
 7: [],
 8: [],
 9: [],
 10: [],
 11: [],
 12: [],
 13: [],
 14: [],
 15: [],
 16: [],
 17: [],
 18: [],
 19: [],
 20: [],
 21: [],
 22: [],
 23: [],
 24: [],
 25: [],
 26: [],
 27: [],
 28: [],
 29: [],
 30: [],
 31: [],
 32: [],
 33: [],
 34: [],
 35: [],
 36: [],
 37: [],
 38: [],
 39: [],
 40: [],
 41: [],
 42: [],
 43: [],
 44: [],
 45: [],
 46: [],
 47: [],
 48: [],
 49: [],
 50: [],
 51: [],
 52: [],
 53: [],
 54: [],
 55: [],
 56: [],
 57: [],
 58: [],
 59: [],
 60: [],
 61: [],
 62: [],
 63: [],
 64: [],
 65: [],
 66: [],
 67: [],
 68: [],
 69: [],
 70: [],
 71: [],
 72: [],
 73: [],
 74: [],
 75: [],
 76: [],
 77: [],
 78: [],
 79: [],
 80: [],
 81: [],
 82: [],
 83: [],
 84: [],
 85: [],
 86: [],
 87: [],
 88: [],
 89: [],
 90: [],
 91: [],
 92: [],
 93: [],
 94: [],
 95: [],
 96: [],
 97: [],
 98: [],
 99: [],
 100: [],
 101: [],
 102: [],
 103: [],
 104: [],
 105: [],
 106: [],
 107: [],
 108: [],
 109: [],
 110: [],


# Train

In [10]:
u2seq = train
users = sorted(u2seq.keys()) # DICTIONARY 가 아닌 LIST
max_len = max_len
mask_prob = mask_prob
mask_token = CLOZE_MASK_TOKEN
num_items = item_count
rng = rng

In [11]:
users

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [12]:
index = 610

In [13]:
user = users[index]
seq = u2seq[user]
seq

IndexError: list index out of range