## Supervised training
Refer to the reference training script:
https://github.com/jalkestrup/llm2vec-dtu/blob/main/experiments/run_supervised.py

In [5]:
import torch
import transformers
from tqdm import tqdm
import os
from accelerate import Accelerator, DistributedDataParallelKwargs

In [6]:
from huggingface_hub import notebook_login

# Handle Lightning AI Studio: when the working directory contains
# '/teamspace', switch into the project checkout (presumably so relative
# paths such as configs/ resolve -- confirm) and echo the new location.
running_in_studio = '/teamspace' in os.getcwd()
if running_in_studio:
    os.chdir('/teamspace/studios/this_studio/llm2vec-da')
    print(os.getcwd())

# Authenticate against the Hugging Face Hub through the notebook login widget.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
from transformers import HfArgumentParser
from llm2vec_da.arguments import EmbeddingModelArguments, DataTrainingArguments, TrainingArguments, CustomArguments

# Parse the four argument dataclasses from the JSON experiment config.
simcse_parser = HfArgumentParser(
    (EmbeddingModelArguments, DataTrainingArguments, TrainingArguments, CustomArguments)
)
parsed_args = simcse_parser.parse_json_file(
    "configs/supervised/MetaLlama3-swe-supervised-dk-curated.json"
)
model_args, data_args, training_args, custom_args = parsed_args

# Hand Accelerate a DDP kwargs handler only when the config requests
# unused-parameter detection; otherwise no handlers are passed.
kwargs = []
if training_args.ddp_find_unused_parameters:
    kwargs.append(
        DistributedDataParallelKwargs(
            dim=0,
            broadcast_buffers=True,
            bucket_cap_mb=25,
            find_unused_parameters=True,
            check_reduction=False,
            gradient_as_bucket_view=False,
        )
    )

accelerator = Accelerator(kwargs_handlers=kwargs)

# Seed the RNGs from the training config for reproducibility.
transformers.set_seed(training_args.seed)

## Load data

In [None]:
# NOTE(review): `load_dataset` was not imported in any visible cell. The path
# below follows the upstream llm2vec package layout (the same package the
# loss is imported from) -- confirm it matches the installed package.
from llm2vec.dataset.utils import load_dataset

# TODO: can also pass separator arg here
train_dataset = load_dataset(
    data_args.dataset_name,
    split="train",
    file_path=data_args.dataset_file_path,
    # Effective batch size = per-device batch size times the number of
    # processes Accelerate is running.
    effective_batch_size=training_args.per_device_train_batch_size
    * accelerator.num_processes,
)

# Materialize the dataset into a plain list: the trainer cell is given
# `train_examples`, which was never built. Index-based access is used because
# only __len__/__getitem__ are assumed of the dataset object. The progress bar
# is shown on the main process only.
train_examples = [
    train_dataset[i]
    for i in tqdm(
        range(len(train_dataset)),
        desc="Loading train examples...",
        disable=not accelerator.is_main_process,
    )
]

## Load model

In [None]:
from llm2vec_da import LLM2Vec

# Resolve the configured dtype: "auto" and None are passed through unchanged,
# any other value is looked up as an attribute on the torch module.
if model_args.torch_dtype in ["auto", None]:
    torch_dtype = model_args.torch_dtype
else:
    torch_dtype = getattr(torch, model_args.torch_dtype)

# Build the LLM2Vec wrapper from the base checkpoint and the adapter named in
# the config. NOTE(review): merge_peft=True presumably folds that adapter into
# the base weights before new LoRA layers are added -- confirm.
model = LLM2Vec.from_pretrained(
    base_model_name_or_path=model_args.model_name_or_path,
    enable_bidirectional=model_args.bidirectional,
    peft_model_name_or_path=model_args.peft_model_name_or_path,
    merge_peft=True,
    pooling_mode=model_args.pooling_mode,
    max_length=model_args.max_seq_length,
    torch_dtype=torch_dtype,
    attn_implementation=model_args.attn_implementation,
)

## Set up PEFT

In [None]:
from llm2vec_da.model import initialize_peft


# LoRA alpha is tied to the rank: alpha = 2 * r.
lora_rank = custom_args.lora_r
peft_model = initialize_peft(
    model.model,
    lora_r=lora_rank,
    lora_alpha=2 * lora_rank,
    lora_dropout=custom_args.lora_dropout,
)

# The wrapper is organised as LLM2VecModel.model -> HF model, so PEFT is
# applied to the inner model and the result is re-attached to the wrapper.
model.model = peft_model.model

## Training

In [1]:
from llm2vec.loss.utils import load_loss
# Build the training objective selected in the experiment config.
loss_name = custom_args.loss_class
loss_scale = custom_args.loss_scale
train_loss = load_loss(loss_name, scale=loss_scale)

# Bare expression so the notebook displays the constructed loss object.
train_loss

NameError: name 'custom_args' is not defined

In [None]:
from llm2vec_da.training import SupervisedDefaultCollator

# Keep a handle on the tokenizer carried by the LLM2Vec wrapper.
tokenizer = model.tokenizer
# The collator is constructed from the wrapper itself, not from the tokenizer.
data_collator = SupervisedDefaultCollator(model)

In [None]:
from llm2vec_da.training import SupervisedTrainer
from transformers import TrainerCallback


# Defined here because no visible cell defines or imports StopTrainingCallback,
# so the add_callback call below would otherwise raise NameError.
class StopTrainingCallback(TrainerCallback):
    """Ask the trainer to stop once `stop_after_n_steps` steps have completed."""

    def __init__(self, stop_after_n_steps: int):
        self.stop_after_n_steps = stop_after_n_steps

    def on_step_end(self, args, state, control, **kwargs):
        # Setting this flag makes the Trainer end its loop after this step.
        if state.global_step >= self.stop_after_n_steps:
            control.should_training_stop = True


# NOTE(review): `train_examples` and `validation_examples` must exist before
# this cell runs; no visible cell creates `validation_examples` -- build it
# (or drop `eval_dataset`) before running.
trainer = SupervisedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_examples,
    eval_dataset=validation_examples,
    data_collator=data_collator,
    tokenizer=model.tokenizer,
    loss_function=train_loss,
)

# Optional early stop, e.g. for short smoke-test runs.
if custom_args.stop_after_n_steps is not None:
    trainer.add_callback(StopTrainingCallback(custom_args.stop_after_n_steps))

In [None]:
# Start training with the trainer configured in the previous cell.
trainer.train()