In [1]:
import os
# Pin the process to GPU index 4; this is set here so it is already in place
# when torch is imported in the next cell.
os.environ.update({"CUDA_VISIBLE_DEVICES": "4"})

In [2]:
import torch
from transformers import GPT2LMHeadModel, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

In [3]:
# Load the train/test splits. The files are tab-separated, so they are read
# with the 'csv' builder and an explicit tab delimiter.
data_files = {
    'train': "/home/sslunder24/project/moral/delphi/dataset/commonsense-norm-bank/freeform/train_freeform_ko.tsv",
    'test': "/home/sslunder24/project/moral/delphi/dataset/commonsense-norm-bank/freeform/test_freeform_ko.tsv",
}
dataset = load_dataset('csv', data_files=data_files, delimiter='\t')

In [4]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2', pad_token='<pad>')
# Disabled alternatives kept for reference:
#special_tokens_dict = {'additional_special_tokens': ['<usr>','<sys>']}
#num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
#tokenizer.pad_token_id = tokenizer.eos_token_id

# Load model.
# The dropout rates are passed to from_pretrained() as config overrides rather
# than assigned to model.config afterwards: the dropout layers are built from
# the config while the model is being constructed, so editing model.config on
# an already-loaded model does not change the rates that are actually applied.
model = GPT2LMHeadModel.from_pretrained(
    'skt/kogpt2-base-v2',
    attn_pdrop=0.05,   # Dropout rate for attention probabilities
    resid_pdrop=0.05,  # Dropout rate for hidden states
    embd_pdrop=0.05,   # Dropout rate for embeddings
)
# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

In [5]:
# Check number of trainable parameters
def check_trainable_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Args:
        model: Any object whose ``parameters()`` method yields items exposing
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Returns:
        str: A line of the form
        ``"trainable params: N || all params: M || trainable%: P"`` where the
        counts use thousands separators and ``P`` is rounded to 4 decimals.
        For a model without any parameters ``P`` is reported as ``0.0``.
    """
    total_params = 0
    trainable_params = 0
    # Single pass over the parameters instead of iterating them twice.
    for param in model.parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    # Guard against a parameter-free model, which would otherwise divide by zero.
    if total_params:
        trainable_percent = round((trainable_params / total_params) * 100, 4)
    else:
        trainable_percent = 0.0
    return (f"trainable params: {trainable_params:,} || all params: {total_params:,} || trainable%: {trainable_percent}")

In [6]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from copy import deepcopy

# LoRA hyperparameters (the trailing comments record alternative values).
LORA_R = 256  # 512
LORA_ALPHA = 512  # 1024
LORA_DROPOUT = 0.05

# r is the dimension of the low-rank matrices, lora_alpha the scaling factor
# for the weight matrices, and lora_dropout the dropout probability of the
# LoRA layers.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    # target_modules=["query_key_value"],
)

# Snapshot the plain model first so the baseline stays separate from the
# object that is handed to PEFT below.
model_without_lora = deepcopy(model)

# prepare_model_for_kbit_training is the PEFT utility for readying a model for
# k-bit (e.g. int8) training:
# <https://huggingface.co/docs/peft/task_guides/int8-asr>
# NOTE(review): no quantized loading is visible in this notebook - confirm
# this call is still wanted here.
# The prepared model is then wrapped with the LoRA adapters.
model_with_lora = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

print(check_trainable_parameters(model_without_lora))
model_with_lora.print_trainable_parameters()

trainable params: 125,164,800 || all params: 125,164,800 || trainable%: 100.0
trainable params: 9,437,184 || all params: 134,601,984 || trainable%: 7.0112




In [7]:
# Check number of trainable parameters.
# check_trainable_parameters is already defined in an earlier cell; it is
# reused here instead of being redefined, so that there is a single definition
# and no later copy silently shadows it.
print(check_trainable_parameters(model_without_lora))
model_with_lora.print_trainable_parameters()
# The total number of parameters in the model with LoRA applied is equal to the sum of the new LoRA parameters
# and the total number of parameters in the original model.

trainable params: 125,164,800 || all params: 125,164,800 || trainable%: 100.0
trainable params: 9,437,184 || all params: 134,601,984 || trainable%: 7.0112


In [8]:
def preprocess_function(examples):
    """Tokenize a batch of examples for causal language modelling.

    Each example becomes the single sequence
    ``"{input_sequence} <SEP> {text_label}"``, truncated or padded to exactly
    32 tokens by the module-level ``tokenizer``.

    Args:
        examples: A batch (mapping of column name to list) containing the
            ``'input_sequence'`` and ``'text_label'`` columns.

    Returns:
        The tokenizer output for the combined sequences, plus a ``"labels"``
        entry holding a copy of ``input_ids`` in which every padding position
        (``attention_mask == 0``) is replaced by ``-100``.
    """
    inputs = examples['input_sequence']
    targets = examples['text_label']

    # Combine input and output with a separator
    # NOTE(review): "<SEP>" is not registered as a special token anywhere in
    # view, so it is presumably split into ordinary sub-word pieces - confirm.
    combined_text = [f"{input_seq} <SEP> {text_label}" for input_seq, text_label in zip(inputs, targets)]

    # Tokenize combined text
    # NOTE(review): text_label sits at the end of the sequence, so truncation
    # at 32 tokens drops label tokens first for long inputs - confirm 32 is enough.
    model_inputs = tokenizer(combined_text, max_length=32, truncation=True, padding='max_length')

    # For causal LM the labels must line up token-for-token with input_ids.
    # Tokenizing the targets on their own (to a different length) yields
    # labels whose shape does not match the model's logits. Padding positions
    # are set to -100 so that the loss ignores them.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

In [9]:
# Tokenize every split, passing examples to preprocess_function in batches.
tokenized_datasets = dataset.map(function=preprocess_function, batched=True)

In [10]:
# Set training arguments
# Hyperparameters shared by both runs so the comparison is like-for-like.
common_training_kwargs = dict(
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    save_steps=5000,
    logging_steps=2500,
    save_total_limit=2,
)

# Each run gets its own output directory. With a shared directory the two
# trainers would write identically named checkpoint folders on top of each
# other, and save_total_limit would let one run delete the other's checkpoints.
training_args = TrainingArguments(output_dir='./results', **common_training_kwargs)
training_args_lora = TrainingArguments(output_dir='./results_lora', **common_training_kwargs)

# Data collator for language modeling (mlm=False selects causal LM batches).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize the trainer without LoRA
trainer = Trainer(
    model=model_without_lora,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
)

# Initialize the trainer with LoRA
trainer_lora = Trainer(
    model=model_with_lora,
    # tokenizer=tokenizer,
    args=training_args_lora,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [12]:
# Stop the training manually once the estimated time stabilizes.
# Compare estimated training time and memory usage between model_without_lora and model_with_lora.

# torch.cuda.memory_allocated() is a snapshot for the whole process, so on its
# own it also counts whatever else is already on the GPU (e.g. the other
# model). Record a baseline and reset the peak counter before training so the
# memory attributable to this run can be read off.
cuda_available = torch.cuda.is_available()
if cuda_available:
    torch.cuda.reset_peak_memory_stats()
baseline_memory = torch.cuda.memory_allocated() if cuda_available else 0

try:
    trainer_lora.train()
except KeyboardInterrupt:
    # Interrupting by hand is the expected way to end this cell.
    print("Training interrupted by user.")

current_memory = torch.cuda.memory_allocated() if cuda_available else 0
peak_memory = torch.cuda.max_memory_allocated() if cuda_available else 0
print(f"Current memory allocated: {current_memory / 1024**2:.2f} MB")
print(f"Peak memory allocated during run: {peak_memory / 1024**2:.2f} MB")
print(f"Peak increase over pre-run baseline: {(peak_memory - baseline_memory) / 1024**2:.2f} MB")

Step,Training Loss


Training interrupted by user.
Current memory allocated: 2581.21 MB


In [11]:
# Full fine-tuning run (no LoRA). Stop it manually once the time estimate
# stabilizes.

# torch.cuda.memory_allocated() is a snapshot for the whole process, so on its
# own it also counts whatever else is already on the GPU (e.g. the other
# model). Record a baseline and reset the peak counter before training so the
# memory attributable to this run can be read off.
cuda_available = torch.cuda.is_available()
if cuda_available:
    torch.cuda.reset_peak_memory_stats()
baseline_memory = torch.cuda.memory_allocated() if cuda_available else 0

try:
    trainer.train()
except KeyboardInterrupt:
    # Interrupting by hand is the expected way to end this cell.
    print("Training interrupted by user.")

current_memory = torch.cuda.memory_allocated() if cuda_available else 0
peak_memory = torch.cuda.max_memory_allocated() if cuda_available else 0
print(f"Current memory allocated: {current_memory / 1024**2:.2f} MB")
print(f"Peak memory allocated during run: {peak_memory / 1024**2:.2f} MB")
print(f"Peak increase over pre-run baseline: {(peak_memory - baseline_memory) / 1024**2:.2f} MB")

Step,Training Loss


Training interrupted by user.
Current memory allocated: 2471.71 MB
