In [1]:
import sys
import os

sys.path.append("../source")

In [2]:
from huggingface_hub import notebook_login
# Show the Hugging Face login widget in the notebook.
# NOTE(review): presumably required because the meta-llama tokenizer loaded
# further down is a gated model — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import pickle
import torch
import numpy as np

from data.datasets import dataset_factory
from data.dataloader.utils import Prompter
from transformers import AutoTokenizer
from transformers.models.llama.tokenization_llama import DEFAULT_SYSTEM_PROMPT

# Functions

In [4]:
def worker_init_fn(worker_id):
    """Seed Python's ``random`` and NumPy's global RNG for one worker.

    The seed is the first word of NumPy's current global RNG state plus
    ``worker_id``, so workers that start from the same state still get
    different, reproducible seeds. Judging by its name this is meant to be
    passed as a DataLoader ``worker_init_fn`` hook.

    Args:
        worker_id: Integer id of the worker being initialised.
    """
    # Imported locally: the notebook's import cell never imports ``random``,
    # so the original body failed with NameError on first call.
    import random

    # Read the state word once and cast it to a plain int; ``random.seed``
    # rejects NumPy integer scalars on Python 3.11+.
    base_seed = int(np.random.get_state()[1][0])
    # ``np.random.seed`` only accepts values in [0, 2**32 - 1], so wrap around.
    seed = (base_seed + worker_id) % (2 ** 32)

    random.seed(seed)
    np.random.seed(seed)


# the following prompting is based on alpaca
def generate_and_tokenize_eval(args, data_point, tokenizer, prompter):
    """Tokenize an evaluation prompt and attach the answer as a class index.

    The prompt text is built from ``data_point["system"]`` and
    ``data_point["input"]`` only; the answer is not part of the text.

    Args:
        args: Settings object; ``llm_max_text_len`` caps the tokenized length.
        data_point: Mapping with ``"system"``, ``"input"`` and ``"output"``.
            ``"output"`` must be a single character.
        tokenizer: Callable tokenizer returning a mutable mapping.
        prompter: Object exposing ``generate_prompt(system, input)``.

    Returns:
        The tokenizer's result with an extra ``"labels"`` entry equal to
        ``ord(output) - ord('A')`` (so ``'A'`` maps to 0, ``'B'`` to 1, ...).
    """
    prompt_text = prompter.generate_prompt(data_point["system"], data_point["input"])
    encoding = tokenizer(
        prompt_text,
        truncation=True,
        max_length=args.llm_max_text_len,
        padding=False,
        return_tensors=None,
    )

    # Convert the answer letter into its zero-based position in the alphabet.
    answer_letter = data_point["output"]
    encoding["labels"] = ord(answer_letter) - ord('A')
    return encoding


def generate_and_tokenize_train(args, data_point, tokenizer, prompter):
    """Tokenize a full training prompt (system + input + answer) with labels.

    Args:
        args: Settings object; reads ``llm_max_text_len`` and
            ``llm_train_on_inputs``.
        data_point: Mapping with ``"system"``, ``"input"`` and ``"output"``.
        tokenizer: Callable tokenizer exposing ``eos_token_id`` and returning
            a mapping with list-valued ``"input_ids"`` / ``"attention_mask"``.
        prompter: Object exposing ``generate_prompt(system, input, output)``.

    Returns:
        The tokenizer's result, terminated by EOS, with a ``"labels"`` list.
        When ``args.llm_train_on_inputs`` is false every label except the
        last two positions is set to -100.
    """
    prompt_text = prompter.generate_prompt(
        data_point["system"], data_point["input"], data_point["output"]
    )
    encoded = tokenizer(
        prompt_text,
        truncation=True,
        max_length=args.llm_max_text_len,
        padding=False,
        return_tensors=None,
    )

    # Make sure the sequence ends with EOS, extending the attention mask to match.
    if encoded["input_ids"][-1] != tokenizer.eos_token_id:
        encoded["input_ids"].append(tokenizer.eos_token_id)
        encoded["attention_mask"].append(1)

    labels = encoded["input_ids"].copy()
    encoded["labels"] = labels

    if not args.llm_train_on_inputs:
        # Leave only the final two positions as training targets.
        n_masked = max(len(labels) - 2, 0)
        labels[:n_masked] = [-100] * n_masked

    return encoded


def seq_to_token_ids(args, seq, candidates, label, text_dict, tokenizer, prompter, eval=False):
    """Render a history and a candidate pool as a prompt and tokenize it.

    Args:
        args: Settings object; reads ``llm_max_title_len``,
            ``llm_system_template`` and ``llm_input_template``.
        seq: History item ids, rendered as ``(1) title``, ``(2) title``, ...
        candidates: Candidate item ids, rendered as ``(A) title``, ``(B) title``, ...
        label: The correct item id; must be present in ``candidates``.
        text_dict: Mapping from item id to its title text.
        tokenizer: Tokenizer used both for title truncation and for the prompt.
        prompter: Prompt builder forwarded to the tokenize helpers.
        eval: When true the evaluation helper is used, otherwise the training one.

    Returns:
        Whatever ``generate_and_tokenize_eval`` / ``generate_and_tokenize_train``
        returns for the assembled data point.
    """
    def clipped_title(item):
        # Cut the title to at most llm_max_title_len tokens, then turn it back into text.
        pieces = tokenizer.tokenize(text_dict[item])
        return tokenizer.convert_tokens_to_string(pieces[:args.llm_max_title_len])

    history_text = ' \n '.join(
        f"({position}) {clipped_title(item)}"
        for position, item in enumerate(seq, start=1)
    )
    pool_text = ' \n '.join(
        f"({chr(ord('A') + offset)}) {clipped_title(item)}"
        for offset, item in enumerate(candidates)
    )
    # Ranking only: the target is the letter of the correct candidate.
    answer_letter = chr(ord('A') + candidates.index(label))

    if args.llm_system_template is not None:
        system_text = args.llm_system_template
    else:
        system_text = DEFAULT_SYSTEM_PROMPT

    data_point = {
        'system': system_text,
        'input': args.llm_input_template.format(history_text, pool_text),
        'output': answer_letter,
    }

    build = generate_and_tokenize_eval if eval else generate_and_tokenize_train
    return build(args, data_point, tokenizer, prompter)

# Preprocessing

In [5]:
import easydict

# argparse is awkward to use inside a notebook, so the run configuration is kept
# in an EasyDict, which allows attribute-style access (args.seed, args.lr, ...).
# Each trailing "# default:" comment records the reference default for that key.
args = easydict.EasyDict({
    ################
    # Dataset
    ################
    'dataset_code': 'ml-100k', # ml-100k, beauty, games
    'min_rating': 0,  # default: 0
    'min_uc': 5,  # default: 5
    'min_sc': 5,  # default: 5
    'seed': 42,  # default: 42

    ################
    # Dataloader
    ################
    'train_batch_size': 64,  # default: 64
    'val_batch_size': 64,  # default: 64
    'test_batch_size': 64,  # default: 64
    'num_workers': 0,  # default: 8
    'sliding_window_size': 1.0,  # default: 1.0
    'negative_sample_size': 10,  # default: 10

    ################
    # Trainer
    ################
    # optimization #
    'device': 'cuda',  # default: 'cuda'  # choices: ['cpu', 'cuda']
    'num_epochs': 500,  # default: 500
    'optimizer': 'AdamW',  # default: 'AdamW'  # choices: ['AdamW', 'Adam']
    # NOTE(review): value differs from the stated default — confirm which is intended.
    'weight_decay': 0.01,  # default: None
    'adam_epsilon': 1e-9,  # default: 1e-9
    'momentum': None,  # default: None
    'lr': 0.001,  # default: 0.001
    'max_grad_norm': 5.0,  # default: 5.0
    'enable_lr_schedule': True,  # default: True
    'decay_step': 10000,  # default: 10000
    'gamma': 1,  # default: 1
    'enable_lr_warmup': True,  # default: True
    'warmup_steps': 100,  # default: 100

    # evaluation #
    'val_strategy': 'iteration',  # default: 'iteration'  # choices: ['epoch', 'iteration']
    'val_iterations': 500,  # default: 500  # only for iteration val_strategy
    'early_stopping': True,  # default: True
    'early_stopping_patience': 20,  # default: 20
    'metric_ks': [1, 5, 10, 20, 50],  # default: [1, 5, 10, 20, 50]
    'rerank_metric_ks': [1, 5, 10],  # default: [1, 5, 10]
    'best_metric': 'Recall@10',  # default: 'Recall@10'
    'rerank_best_metric': 'NDCG@10',  # default: 'NDCG@10'
    'use_wandb': False,  # default: False

    ################
    # Retriever Model
    ################
    'model_code': 'llm',  # default: None
    'bert_max_len': 50,  # default: 50
    'bert_hidden_units': 64,  # default: 64
    'bert_num_blocks': 2,  # default: 2
    'bert_num_heads': 2,  # default: 2
    'bert_head_size': 32,  # default: 32
    'bert_dropout': 0.2,  # default: 0.2
    'bert_attn_dropout': 0.2,  # default: 0.2
    'bert_mask_prob': 0.25,  # default: 0.25

    ################
    # LLM Model
    ################
    'llm_base_model': 'meta-llama/Llama-2-7b-hf',  # default: 'meta-llama/Llama-2-7b-hf'
    'llm_base_tokenizer': 'meta-llama/Llama-2-7b-hf',  # default: 'meta-llama/Llama-2-7b-hf'
    'llm_max_title_len': 32,  # default: 32
    'llm_max_text_len': 1536,  # default: 1536
    'llm_max_history': 20,  # default: 20
    'llm_train_on_inputs': False,  # default: False
    'llm_negative_sample_size': 19,  # default: 19  # 19 negative & 1 positive
    'llm_system_template': "Given user history in chronological order, recommend an item from the candidate pool with its index letter.",  # default: "Given user history in chronological order, recommend an item from the candidate pool with its index letter."
    'llm_input_template': 'User history: {}; \n Candidate pool: {}',  # default: 'User history: {}; \n Candidate pool: {}'
    'llm_load_in_4bit': True,  # default: True
    # NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
    'llm_retrieved_path': "/home/laststar/data/model/OpenLLM-Rec/retrieved.pkl",  # default: None
    'llm_cache_dir': None,  # default: None

    ################
    # Lora
    ################
    'lora_r': 8,  # default: 8
    'lora_alpha': 32,  # default: 32
    'lora_dropout': 0.05,  # default: 0.05
    'lora_target_modules': ['q_proj', 'v_proj'],  # default: ['q_proj', 'v_proj']
    'lora_num_epochs': 1,  # default: 1
    'lora_val_iterations': 100,  # default: 100
    'lora_early_stopping_patience': 20,  # default: 20
    'lora_lr': 1e-4,  # default: 1e-4
    'lora_micro_batch_size': 16,  # default: 16
})

In [6]:
# Build the dataset object selected by args and load it in one step, so the
# name `dataset` only ever refers to the loaded data dictionary.
dataset = dataset_factory(args).load_dataset()

Already preprocessed. Skip preprocessing


In [7]:
# Unpack the loaded dataset dictionary into the names used by the rest of the notebook.
train, val, test = dataset['train'], dataset['val'], dataset['test']
umap, smap = dataset['umap'], dataset['smap']
text_dict = dataset['meta']
user_count, item_count = len(umap), len(smap)

# NumPy's global random module serves as the notebook's RNG handle.
rng = np.random

args.num_items = item_count
max_len = args.llm_max_history

print(
    f"user_count : {user_count}",
    f"item_count : {item_count}",
    f"max_len : {max_len}",
    sep="\n",
)

user_count : 610
item_count : 3650
max_len : 20


In [8]:
# Load the tokenizer named by args.llm_base_tokenizer (cached under args.llm_cache_dir if set).
tokenizer = AutoTokenizer.from_pretrained(args.llm_base_tokenizer, cache_dir=args.llm_cache_dir)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token
# Pad and truncate on the left.
# NOTE(review): presumably so the end of the prompt (candidate pool and answer)
# survives truncation, matching the last-two-token labels in
# generate_and_tokenize_train — confirm.
tokenizer.padding_side = 'left'
tokenizer.truncation_side = 'left'
tokenizer.clean_up_tokenization_spaces = True

In [9]:
# Prompt builder backed by the template files shipped with the project source.
prompter = Prompter(dir_name="../source/data/dataloader/templates")

In [10]:
# Load the pickled retriever (LRU) output from args.llm_retrieved_path.
# NOTE: pickle can execute arbitrary code while loading; only point this path
# at a file you produced yourself or otherwise trust.
llm_retrieved_path = args.llm_retrieved_path
# Use a context manager so the file handle is closed once loading finishes.
with open(llm_retrieved_path, 'rb') as retrieved_fh:
    retrieved_file = pickle.load(retrieved_fh)

- val_users: 모델이 예측한 점수 기준 상위 `llm_negative_sample_size + 1`(= 20)개 항목 안에 정답 아이템이 포함된 유저의 ID(1부터 시작)를 수집
- val_candidates: 앞서 추출한 val_users의 각 유저에 대해 점수 상위 `llm_negative_sample_size + 1`(= 20)개 항목의 인덱스를 추출

In [11]:
# ************* Constructing Validation Subset *************

val_probs = retrieved_file['val_probs']
val_labels = retrieved_file['val_labels']
val_metrics = retrieved_file['val_metrics']

# torch.topk returns the k largest values of a tensor together with their
# indices; only the indices are needed here. They are computed once per user
# and reused for both the filter and the candidate lists below.
val_top_k = args.llm_negative_sample_size + 1
val_topk_items = [torch.topk(torch.tensor(p), val_top_k).indices for p in val_probs]

# p -> probs, l -> label. User ids are 1-based positions in val_probs.
# Keep a user only when its label is among its own top-k indices.
val_users = [u for u, (top, l) in enumerate(zip(val_topk_items, val_labels), start=1)
             if l in top]
val_candidates = [val_topk_items[u - 1].tolist() for u in val_users]


print(f"val_probs shape : {np.array(val_probs).shape} ")
print(f"val_labels shape : {np.array(val_labels).shape} ")
print(f"val_users shape : {np.array(val_users).shape} ")
print(f"val_candidates shape : {np.array(val_candidates).shape} ")

val_probs shape : (610, 3651) 
val_labels shape : (610,) 
val_users shape : (115,) 
val_candidates shape : (115, 20) 


- test_users: 모델이 예측한 점수 기준 상위 `llm_negative_sample_size + 1`(= 20)개 항목 안에 정답 아이템이 포함된 유저의 ID(1부터 시작)를 수집
- test_candidates: 앞서 추출한 test_users의 각 유저에 대해 점수 상위 `llm_negative_sample_size + 1`(= 20)개 항목의 인덱스를 추출
- non_test_users: test_users에 포함되지 않는 나머지 유저 (상위 20개 항목 안에 정답 아이템이 없는 유저)

In [12]:
# ************* Constructing Test Subset *************

test_probs = retrieved_file['test_probs']
test_labels = retrieved_file['test_labels']
test_metrics = retrieved_file['test_metrics']

# Same procedure as for the validation subset. The top-k indices are computed
# once per user and shared by all three lists below.
test_top_k = args.llm_negative_sample_size + 1
test_topk_items = [torch.topk(torch.tensor(p), test_top_k).indices for p in test_probs]
# True where the user's label is among its own top-k indices.
test_hit_flags = [l in top for top, l in zip(test_topk_items, test_labels)]

# User ids are 1-based positions in test_probs.
test_users = [u for u, hit in enumerate(test_hit_flags, start=1) if hit]
test_candidates = [test_topk_items[u - 1].tolist() for u in test_users]
# non_test_users is the complement of test_users.
non_test_users = [u for u, hit in enumerate(test_hit_flags, start=1) if not hit]

# Train Dataset Loader

In [14]:
# args, max_len, rng, text_dict, tokenizer and prompter are used as defined in
# earlier cells; the former `x = x` self-assignments were no-ops and are gone.
u2seq = train
num_items = args.num_items

# Expand every user's training sequence into all of its prefixes of length >= 2,
# visiting users in ascending key order.
all_seqs = []
for u in sorted(u2seq.keys()):
    seq = u2seq[u]
    for i in range(2, len(seq)+1):
        all_seqs.append(seq[:i])

In [15]:
# Position in all_seqs of the example sequence inspected in the following cells.
index = 39

- tokens: 특정 유저(u)의 구매 시퀀스에서 앞부분을 잘라낸 접두(prefix) 시퀀스
- answer: 특정 유저(u)가 다음에 구매할(예측해야 할) 아이템 ID (정답 레이블) — tokens의 마지막 원소
- original_seq: tokens에서 answer를 제외한 시퀀스 (입력 시퀀스)

In [16]:
# Split the selected prefix into the model input (everything but the last
# item) and the target (the last item).
tokens = all_seqs[index]
original_seq, answer = tokens[:-1], tokens[-1]

print(
    f"tokens : {tokens}",
    f"answer : {answer}",
    f"original_seq : {original_seq}",
    sep="\n",
)

tokens : [610, 859, 1309, 1682, 1796, 2202, 2219, 2281, 92, 364, 1808, 1882, 195, 777, 297, 185, 1049, 1714, 1546, 2030, 1054, 2040, 1081, 1166, 1667, 2136, 191, 274, 443, 766, 181, 412, 1914, 2312, 1186, 3, 1641, 1621, 788, 820, 902]
answer : 902
original_seq : [610, 859, 1309, 1682, 1796, 2202, 2219, 2281, 92, 364, 1808, 1882, 195, 777, 297, 185, 1049, 1714, 1546, 2030, 1054, 2040, 1081, 1166, 1667, 2136, 191, 274, 443, 766, 181, 412, 1914, 2312, 1186, 3, 1641, 1621, 788, 820]


In [None]:
# Keep only the most recent max_len items of the history.
seq = original_seq[-max_len:]

# The candidate list starts with the true next item.
cur_idx = 0
candidates = [answer]

# Draw random item ids in [1, num_items], five times as many as the number of negatives.
# NOTE(review): presumably oversampled so unsuitable draws can be skipped later — confirm.
samples = rng.randint(low=1, high=args.num_items + 1,
                      size=5 * args.llm_negative_sample_size)

In [None]:
# Display the sampled item ids drawn in the previous cell.
samples

# Test Dataset Loader

# Validation Dataset Loader