In [1]:
import os
import torch
import transformers
import peft
import datasets
import evaluate
import time
import logging
import math
import argparse
import numpy as np
from clearml import Task

# Set up logging
datasets.logging.set_verbosity_warning()
logging.basicConfig(level=logging.WARNING, format='%(asctime)s %(message)s')

# Ensure CUDA is available
assert torch.cuda.is_available(), "CUDA is required for this script"

# Set device
device = torch.device('cuda:0')
task = Task.init(project_name="BlackBoxCoder", task_name="Train")

ClearML Task: created new task id=153a25c889fc4147b801becd4f2a1577
2025-03-05 09:38:05,911 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/7de6a9063e5c438ba607ed3b01246bf1/experiments/153a25c889fc4147b801becd4f2a1577/output/log
ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


In [2]:


# Constants
BLOCK_SIZE = 512
STRIDE = 32

def tokenize_with_stride(texts, tokenizer, block_size=BLOCK_SIZE, stride=STRIDE):
    tokenized_data = tokenizer(
        texts,
        max_length=block_size,
        truncation=True,
        stride=stride,
        return_overflowing_tokens=True,
        padding="max_length",
        return_tensors="pt",
        add_special_tokens=False,
    )
    tokenized_data["labels"] = tokenized_data["input_ids"]
    return datasets.Dataset.from_dict(tokenized_data).train_test_split(test_size=0.1, shuffle=False)

def load_model_and_tokenizer(model_name, device, bnb_config, peft_config):
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModelForCausalLM.from_pretrained(model_name, device_map=device, quantization_config=bnb_config)
    model._hf_peft_config_loaded = True  # silence a warning from HF trainer
    model = peft.get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, tokenizer




In [3]:
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self
args = AttrDict({
    "model_name" : "Qwen/Qwen2.5-Coder-3B-Instruct",
    "dataset_path" : "hodza/BlackBox.Shkola.2014",
    "output_dir" : "./out",
    "epochs" : 7,
    "models_path" : "./models"
})

In [4]:

# Configuration for BitsAndBytes and PEFT
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
peft_config = peft.LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "mlp.down_proj",
        "self_attn.k_proj",
        "self_attn.o_proj",
        "mlp.up_proj",
        "self_attn.v_proj",
        "mlp.gate_proj",
        "self_attn.q_proj"
    ],
    lora_dropout=0.1,
    bias="none",
    task_type=peft.TaskType.CAUSAL_LM
)

# Load model and tokenizer
model, tokenizer = load_model_and_tokenizer(args.model_name, device, bnb_config, peft_config)

dataset_name = os.path.basename(args.dataset_path)
only_model_name = args.model_name.split("/")[-1].replace(':', "_")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 29,933,568 || all params: 3,115,872,256 || trainable%: 0.9607


In [5]:
# Prepare dataset
dataset = datasets.load_dataset(args.dataset_path)

In [6]:
texts = dataset["train"]["texts"]

lm_datasets = tokenize_with_stride(texts, tokenizer)

In [7]:
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)
    
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100 in the preds as we can't decode them
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)

    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(pred.strip()) for pred in decoded_preds]

    decoded_labels = ["\n".join(label.strip()) for label in decoded_labels]
    # Extract the median scores
    resultb = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    resultr = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
    f1 = 2 * (resultb["bleu"] * resultr['rougeL']) / (resultb["bleu"] + resultr['rougeL'])
    return {
        # Bleu measures precision
        "bleu": resultb["bleu"], 
        # Rouge measures recall
        "rouge1": resultr["rouge1"],
        "rouge2": resultr["rouge2"],
        # valuates the match of the longest common subsequences (LCS).
        # Suitable for tasks where preserving the order and structure of the text is important.
        "rougeL": resultr["rougeL"],
        "f1" : f1}

In [8]:
# Training arguments
timestr = time.strftime("%Y%m%d-%H%M%S")
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
training_args = transformers.TrainingArguments(
    output_dir=f"{args.output_dir}/{dataset_name}_{only_model_name}/{timestr}/",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    bf16=torch.cuda.get_device_capability(torch.cuda.current_device())[0] >= 8,
    gradient_accumulation_steps=4,
    num_train_epochs=args.epochs,
    weight_decay=0.01, # regularisation
    logging_dir='./logs',
    logging_steps=32,
    warmup_steps=100,
)

# Trainer
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics = preprocess_logits_for_metrics,
)






No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate the model
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
eval_results

In [None]:
final_path = os.path.join(args.models_path, f"{dataset_name}_{only_model_name}_{timestr}")
# Save the model
model.save_pretrained(final_path)
tokenizer.save_pretrained(final_path)