In [18]:
import os
import torch
import transformers
import peft
import datasets
import evaluate
import time
import logging
import math
import argparse
import numpy as np
from clearml import Task

# Set up logging
datasets.logging.set_verbosity_warning()
logging.basicConfig(level=logging.WARNING, format='%(asctime)s %(message)s')

# Ensure CUDA is available
assert torch.cuda.is_available(), "CUDA is required for this script"

# Set device
device = torch.device('cuda:0')
task = Task.init(project_name="BlackBoxCoder", task_name="Train")

In [19]:


# Constants
BLOCK_SIZE = 512
STRIDE = 32

def tokenize_with_stride(texts, tokenizer, block_size=BLOCK_SIZE, stride=STRIDE):
    tokenized_data = tokenizer(
        texts,
        max_length=block_size,
        truncation=True,
        stride=stride,
        return_overflowing_tokens=True,
        padding="max_length",
        return_tensors="pt",
        add_special_tokens=False,
    )
    tokenized_data["labels"] = tokenized_data["input_ids"]
    return datasets.Dataset.from_dict(tokenized_data).train_test_split(test_size=0.1, shuffle=False)

def load_model_and_tokenizer(model_name, device, bnb_config, peft_config):
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModelForCausalLM.from_pretrained(model_name, device_map=device, quantization_config=bnb_config)
    model._hf_peft_config_loaded = True  # silence a warning from HF trainer
    model = peft.get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, tokenizer




In [20]:
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self
args = AttrDict({
    "model_name" : "Qwen/Qwen2.5-Coder-3B-Instruct",
    "dataset_path" : "hodza/BlackBox.Shkola.2014",
    "output_dir" : "./out",
    "epochs" : 7,
    "models_path" : "./models"
})

In [21]:

# Configuration for BitsAndBytes and PEFT
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
peft_config = peft.LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "mlp.down_proj",
        "self_attn.k_proj",
        "self_attn.o_proj",
        "mlp.up_proj",
        "self_attn.v_proj",
        "mlp.gate_proj",
        "self_attn.q_proj"
    ],
    lora_dropout=0.1,
    bias="none",
    task_type=peft.TaskType.CAUSAL_LM
)

# Load model and tokenizer
model, tokenizer = load_model_and_tokenizer(args.model_name, device, bnb_config, peft_config)

dataset_name = os.path.basename(args.dataset_path)
only_model_name = args.model_name.split("/")[-1].replace(':', "_")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 29,933,568 || all params: 3,115,872,256 || trainable%: 0.9607


In [22]:
# Prepare dataset
dataset = datasets.load_dataset(args.dataset_path)

2025-02-21 14:53:40,143 Attempting to acquire lock 139644579203472 on /hf/datasets/_hf_datasets_hodza___black_box.shkola.2014_default_0.0.0_c5fff690f82628e7be1e3a6e3330b4548c99c340.lock
2025-02-21 14:53:40,144 Lock 139644579203472 acquired on /hf/datasets/_hf_datasets_hodza___black_box.shkola.2014_default_0.0.0_c5fff690f82628e7be1e3a6e3330b4548c99c340.lock
2025-02-21 14:53:40,145 open file: /hf/datasets/hodza___black_box.shkola.2014/default/0.0.0/c5fff690f82628e7be1e3a6e3330b4548c99c340/dataset_info.json
2025-02-21 14:53:40,146 Attempting to release lock 139644579203472 on /hf/datasets/_hf_datasets_hodza___black_box.shkola.2014_default_0.0.0_c5fff690f82628e7be1e3a6e3330b4548c99c340.lock
2025-02-21 14:53:40,147 Lock 139644579203472 released on /hf/datasets/_hf_datasets_hodza___black_box.shkola.2014_default_0.0.0_c5fff690f82628e7be1e3a6e3330b4548c99c340.lock
2025-02-21 14:53:40,148 Attempting to acquire lock 139644577550288 on /hf/datasets/hodza___black_box.shkola.2014/default/0.0.0/c5ff

In [23]:
texts = dataset["train"]["texts"]

lm_datasets = tokenize_with_stride(texts, tokenizer)
# metric = evaluate.load("bleu")

# Training arguments
timestr = time.strftime("%Y%m%d-%H%M%S")
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
training_args = transformers.TrainingArguments(
    output_dir=f"{args.output_dir}/{dataset_name}_{only_model_name}/{timestr}/",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    bf16=torch.cuda.get_device_capability(torch.cuda.current_device())[0] >= 8,
    gradient_accumulation_steps=4,
    num_train_epochs=args.epochs,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=32,
    warmup_steps=100,
)

# Trainer
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
    # compute_metrics=lambda eval_pred: compute_metrics(eval_pred, metric),
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

final_path = os.path.join(args.models_path, f"{dataset_name}_{only_model_name}_{timestr}")
# Save the model
model.save_pretrained(final_path)
tokenizer.save_pretrained(final_path)


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 