## Supervised training
See the reference training script:
https://github.com/jalkestrup/llm2vec-dtu/blob/main/experiments/run_supervised.py

In [1]:
import torch
import transformers
from tqdm import tqdm
import os
from accelerate import Accelerator, DistributedDataParallelKwargs

In [2]:
from huggingface_hub import HfApi
from dotenv import load_dotenv
# Load variables from a .env file into the process environment
load_dotenv()

# Hub client authenticated with the HF_TOKEN environment variable
# (os.getenv returns None when the variable is not set)
api = HfApi(token=os.getenv("HF_TOKEN"))

# Alternatively, log in through the huggingface_hub notebook GUI
#notebook_login()

# Handle the Lightning AI studio path: when the working directory is under
# '/teamspace', switch to the project folder so the relative paths used later resolve
if '/teamspace' in os.getcwd():
    os.chdir('/teamspace/studios/this_studio/llm2vec-da')
    # NOTE(review): the studio location may have changed to the path below — confirm
    #os.chdir('/home/zeus/content/llm2vec-da')
    print(os.getcwd())

In [3]:
from transformers import HfArgumentParser, TrainingArguments
from llm2vec_da.arguments import EmbeddingModelArguments, DataTrainingArguments, CustomArguments

# Parse the model, data, training and custom argument groups from one JSON config file.
supervised_parser = HfArgumentParser(
    (EmbeddingModelArguments, DataTrainingArguments, TrainingArguments, CustomArguments)
)
model_args, data_args, training_args, custom_args = supervised_parser.parse_json_file(
    "configs/supervised/MetaLlama3-sheared.json"
)

# A DDP kwargs handler is only passed to Accelerate when unused-parameter
# detection is requested in the training arguments; otherwise no handlers are used.
kwargs = (
    [
        DistributedDataParallelKwargs(
            dim=0,
            broadcast_buffers=True,
            bucket_cap_mb=25,
            find_unused_parameters=True,
            check_reduction=False,
            gradient_as_bucket_view=False,
        )
    ]
    if training_args.ddp_find_unused_parameters
    else []
)

accelerator = Accelerator(kwargs_handlers=kwargs)

# Seed the run from the training arguments.
transformers.set_seed(training_args.seed)

In [4]:
from datasets import load_dataset
# Load only the first 10% of the train split of the dataset named in the config
dataset = load_dataset(data_args.dataset_name, split="train[:10%]")
#dataset = load_dataset("DDSC/nordic-embedding-training-data", split="train[:10%]")

# Optionally, save the dataset to a local folder
#dataset.save_to_disk("nordic-embedding-training-data")

# Optionally, load a dataset from a local folder instead
#from datasets import load_from_disk
#ds_transformed = load_from_disk("/teamspace/studios/this_studio/synthetic-supervised-dataset-2")

# Show the first record to check which columns are available
print(dataset[0])


Using the latest cached version of the dataset since DDSC/nordic-embedding-training-data couldn't be found on the Hugging Face Hub (offline mode is enabled).
Found the latest cached dataset configuration 'default' at /home/jeal/.cache/huggingface/datasets/DDSC___nordic-embedding-training-data/default/0.0.0/fba903a3f0369fa1a239aab9993c735b0f3d6e12 (last modified on Sun Apr 20 10:57:26 2025).


{'query': 'Hvad var de langsigtede konsekvenser for dansk økonomi af den danske forfatningslov af 1849?', 'positive': 'Den danske forfatningslov af 1849 markerede et paradigmeskift i dansk politisk og økonomisk struktur.  Loven etableret et konstitutionelt monarki, begrænsede kongelig magt og indførte en parlamentarisk form for styre. Dette havde vidtrækkende konsekvenser for den danske økonomi.  Den nye forfatning lagde grunden for en mere liberalistisk økonomisk orden, med fokus på fri handel, privat ejendomsret og entreprenørskab.  Den øgede politiske stabilitet og forudsigelighed tiltrak udenlandsk kapital og investeringer, der bidrog til økonomisk vækst.  Samtidig reducerede loven den statslige indblanding i økonomien, hvilket gav plads til privat initiativ og markedskræfter.  Introduktionen af en national valuta og en centralbank styrkede den økonomiske integration med andre europæiske lande.  Mens forfatningsloven af 1849 ikke direkte førte til økonomisk mirakel, lagde den grund

## Load data

In [5]:
from datasets import load_dataset
from dataclasses import dataclass
from typing import List, Union
import random
import logging
import torch

# Logger named after the current module; used by the dataset classes for progress messages
logger = logging.getLogger(__name__)

class Dataset(torch.utils.data.Dataset):
    """
    Base interface for the datasets defined in this notebook.

    None of the methods are implemented here: each one raises
    ``NotImplementedError`` and is meant to be overridden by a subclass.
    """

    def load_data(self, file_path: str = None):
        """Load the underlying data; must be provided by the subclass."""
        raise NotImplementedError

    def __getitem__(self, index):
        """Return the item stored at ``index``; must be provided by the subclass."""
        raise NotImplementedError

    def __len__(self):
        """Return the number of items; must be provided by the subclass."""
        raise NotImplementedError

@dataclass
class DataSample:
    """One raw training record: a query, its positive text and optional extras."""

    id_: int
    query: str
    positive: str
    # Optional fields default to None when the record has no negative / task.
    negative: Union[str, None] = None
    task_name: Union[str, None] = None


class TrainSample:
    """
    Structure for one input example with texts, the label and a unique id
    """

    def __init__(
        self, guid: str = "", texts: List[str] = None, label: Union[int, float] = 0
    ):
        """
        Creates one TrainSample with the given texts, guid and label

        :param guid
            id for the example
        :param texts
            the texts for the example; stored exactly as given (None by default)
        :param label
            the label for the example
        """
        self.guid = guid
        self.texts = texts
        self.label = label

    def __str__(self):
        # `texts` defaults to None, which str.join cannot iterate; fall back to
        # an empty sequence so printing a default-constructed sample works.
        return "<TrainSample> label: {}, texts: {}".format(
            str(self.label), "; ".join(self.texts or [])
        )


In [6]:
import logging
# Show INFO-level records so the batching messages logged by the dataset class are visible
logging.basicConfig(level=logging.INFO)
# Logger named after the current module, used by the dataset class defined below
logger = logging.getLogger(__name__)


class NordicE5Data(Dataset):
    """
    Dataset that converts rows of a Hugging Face dataset into E5-style samples.

    Every usable row becomes a DataSample holding a query, a positive example and
    an optional negative example. Samples are grouped by task, cut into full
    batches of ``effective_batch_size`` (incomplete trailing batches are dropped),
    the batches are shuffled, and the result is flattened. Consecutive slices of
    ``effective_batch_size`` items therefore belong to a single task, so the data
    must be consumed in order, unshuffled, with exactly that batch size.

    Args:
        hf_dataset: Sized iterable of mapping-like rows (local or remote dataset).
        instruction_column (str): [Optional] Column name for instructions. Defaults to
            'instruction'. A non-empty instruction is prepended to the query. Pass None
            to disable; rows without an instruction get no instruction prefix.
        query_column (str): Column name for queries. Defaults to 'query'
        pos_column (str): Column name for positive examples. Defaults to 'positive'
        neg_column (str): [Optional] Column name for negative examples. Defaults to
            'negative'. Missing, None, empty, "none" or "null" values mean "no negative".
        task_column (str): [Optional] Column name for task labels. Task is used to group
            the data by task during batching; rows without a task share the None group.
        split (str): Stored on the instance but not otherwise used by this class.
            Defaults to "train"
        effective_batch_size (int): Size of batches to create, accounting for parallel processes.
        separator (str): Separator string between text segments. Defaults to "!@#$%^&*()"

    Raises:
        ValueError: If ``effective_batch_size`` is smaller than 1.
    """

    # Lower-cased, stripped negative values that are treated as "no negative".
    _EMPTY_NEGATIVES = frozenset({"", "none", "null"})

    def __init__(
        self,
        hf_dataset,
        instruction_column = 'instruction',
        query_column = 'query',
        pos_column = 'positive',
        neg_column = 'negative',
        task_column = 'task',
        split: str = "train",
        effective_batch_size: int = 32,
        separator: str = "!@#$%^&*()", #Note default of LLM2Vec is !@#$%^&*() , changing this would also have to be changed in the llm2vec lib when encoding/decoding
    ):
        # A zero step makes range() fail with an unrelated message and a negative
        # step silently yields an empty dataset; reject both up front.
        if effective_batch_size < 1:
            raise ValueError(
                f"effective_batch_size must be >= 1, got {effective_batch_size}"
            )
        self.instruction_column = instruction_column
        self.query_column = query_column
        self.pos_column = pos_column
        self.neg_column = neg_column
        self.task_column = task_column
        self.split = split
        self.effective_batch_size = effective_batch_size
        self.separator = separator
        self.data = []
        self.load_data(hf_dataset)

    def __len__(self):
        """Number of samples kept after dropping incomplete batches."""
        return len(self.data)

    @staticmethod
    def _column_value(row, column):
        """Return ``row[column]``, or None when the column is disabled or absent."""
        if column and column in row:
            return row[column]
        return None

    def _build_sample(self, idx, row):
        """Convert one row into a DataSample; return None if query or positive is missing."""
        query_raw = self._column_value(row, self.query_column)
        pos_raw = self._column_value(row, self.pos_column)
        if query_raw is None or pos_raw is None:
            return None

        # Prepend the instruction only when the row actually has one. The query
        # always carries the separator, exactly like the positive/negative texts,
        # so the text to embed sits after the separator in every case.
        instruction = self._column_value(row, self.instruction_column)
        if instruction:
            query = f"{instruction}; {self.separator}{query_raw}"
        else:
            query = f"{self.separator}{query_raw}"

        pos = f"{self.separator}{pos_raw}"

        # Placeholder strings such as "none"/"null" count as a missing negative.
        neg_raw = self._column_value(row, self.neg_column)
        if neg_raw is None or str(neg_raw).strip().lower() in self._EMPTY_NEGATIVES:
            neg = None
        else:
            neg = f"{self.separator}{neg_raw}"

        # Falsy task values (missing column, None, "") all map to the None group.
        task = self._column_value(row, self.task_column) or None

        return DataSample(
            id_=idx,
            query=query,
            positive=pos,
            negative=neg,
            task_name=task,
        )

    def load_data(self, hf_dataset):
        """
        Build ``self.data`` from ``hf_dataset``.

        Rows without a query or positive are skipped with a warning. The remaining
        samples are grouped by task, split into full batches, shuffled at batch
        level (using the global ``random`` state) and flattened.
        """
        # 1) Convert the hf dataset to a list of DataSamples
        all_samples = []
        for idx, row in tqdm(enumerate(hf_dataset), total=len(hf_dataset), desc='Loading dataset'):
            sample = self._build_sample(idx, row)
            if sample is None:
                logger.warning(f"No query or positive example found for example {idx}, skipping")
                continue
            all_samples.append(sample)

        # 2) Group samples by task
        task_samples = {}
        for sample in tqdm(all_samples, total=len(all_samples), desc='Grouping data by task'):
            task_samples.setdefault(sample.task_name, []).append(sample)

        logger.info(f"Batching data for effective batch size = {self.effective_batch_size} ...")
        batched_data = []

        # 3) Keep only full batches for each task
        for task, samples in tqdm(task_samples.items(), total=len(task_samples), desc='Batching data'):
            for start in range(0, len(samples), self.effective_batch_size):
                batch = samples[start : start + self.effective_batch_size]
                if len(batch) == self.effective_batch_size:
                    batched_data.append(batch)
                else:
                    logger.info(f"Skipping partial batch of {len(batch)} samples for task {task}")

        # 4) Shuffle the batches to mix tasks during training
        random.shuffle(batched_data)

        # 5) Flatten while maintaining batch boundaries
        self.data = [sample for batch in batched_data for sample in batch]
        logger.info(f"Loaded and batched {len(self.data)} samples from {len(task_samples)} tasks")

    def __getitem__(self, index):
        """Return a TrainSample with [query, positive] plus the negative when present."""
        sample = self.data[index]
        texts = [sample.query, sample.positive]
        if sample.negative is not None:
            texts.append(sample.negative)
        return TrainSample(texts=texts, label=1.0)
        
def custom_dataset(hf_dataset, effective_batch_size):
    """
    Wrap ``hf_dataset`` in the dataset class registered for its dataset name.

    The name is read from ``hf_dataset.info.dataset_name``.

    Args:
        hf_dataset: Loaded Hugging Face dataset.
        effective_batch_size: Forwarded to the dataset class.

    Returns:
        An instance of the registered dataset class.

    Raises:
        ValueError: If no class is registered for the dataset name.
    """
    registry = {"nordic-embedding-training-data": NordicE5Data}

    name = hf_dataset.info.dataset_name
    dataset_cls = registry.get(name)
    if dataset_cls is None:
        raise ValueError(f"Dataset {name} not found in dataset_map")

    return dataset_cls(hf_dataset, effective_batch_size=effective_batch_size)


In [23]:
# To train on the full split instead of the subset loaded earlier:
# dataset = load_dataset(
#     "DDSC/nordic-embedding-training-data",
#     split="train",
#     columns=['query', 'positive', 'negative', 'instruction', 'task']
# )

# One batch spans all processes, so scale the per-device size by the process count.
effective_batch_size = (
    training_args.per_device_train_batch_size * accelerator.num_processes
)
train_dataset = custom_dataset(dataset, effective_batch_size=effective_batch_size)

Loading dataset: 100%|██████████| 96825/96825 [00:16<00:00, 5890.64it/s]
Grouping data by task: 100%|██████████| 96825/96825 [00:00<00:00, 4120218.79it/s]
INFO:__main__:Batching data for effective batch size = 32 ...
Batching data:   0%|          | 0/2 [00:00<?, ?it/s]INFO:__main__:Skipping partial batch of 15 samples for task retrieval
INFO:__main__:Skipping partial batch of 10 samples for task classification
Batching data: 100%|██████████| 2/2 [00:00<00:00, 327.62it/s]
INFO:__main__:Loaded and batched 96800 samples from 2 tasks


In [8]:
# Inspect the texts of the first batched sample: query, positive and (when present) negative
train_dataset[0].texts

['Locate news articles reporting on political developments in a specific region.; !@#$%^&*()Hvilke indflydelsesrige faktorer bestemmer fremtidens finansielle regulering i Sydøstasien i lyset af voksende middelklasse og stigende digitalisering?',
 '!@#$%^&*()Den globale økonomiske vækst har kørt ind i en periode med usikkerhed. Inflation, stigende renter og geopolitisk spænding skaber udfordringer for centralbanker og regeringer verden over. Men Sydøstasien, med sin dynamiske befolkning og voksende middelklasse, udviser stadig robust vækst. Digitaliseringen er et afgørende kendetegn ved denne vækst, der fører til nye muligheder og komplekse udfordringer for regionens finansielle sektor. Regeringssamarbejdet er afgørende for at navigere i dette komplekse landskab og skabe et stabilt og inkluderende finansiel fremtid.',
 '!@#$%^&*()Sydøstasien er et hotspot for teknologisk innovation, især inden for fintech. Nye betalingsløsninger og digitale bankplattforme bryder traditionel banksektor o

## Tokenizer test

In [9]:
from transformers import (
    LlamaConfig,
    MistralConfig,
    GemmaConfig,
    Qwen2Config,
)

def prepare_for_tokenization(model, text, pooling_mode="mean"):
    """
    Wrap ``text`` in the prompt markup expected by specific checkpoints.

    The checkpoint is identified through ``model.config._name_or_path``. Known
    instruct/chat checkpoints get their user-turn markup around the stripped
    text. When ``pooling_mode`` is ``"eos_token"`` an end-of-sequence marker is
    appended as well, chosen by checkpoint name or config class. Text for
    checkpoints that match nothing is returned unchanged.

    The Llama-3-8B-Instruct branch returns immediately, so no further marker is
    appended for that checkpoint, whatever the pooling mode.

    Args:
        model: Object exposing ``config._name_or_path``.
        text: Input string.
        pooling_mode: Pooling strategy name; only ``"eos_token"`` changes the result.

    Returns:
        The prepared string.
    """
    checkpoint = model.config._name_or_path

    if checkpoint == "meta-llama/Meta-Llama-3-8B-Instruct":
        return (
            "<|start_header_id|>user<|end_header_id|>\n\n" + text.strip() + "<|eot_id|>"
        )

    # The name groups below are mutually exclusive, so at most one wrapper applies.
    if checkpoint in (
        "mistralai/Mistral-7B-Instruct-v0.2",
        "meta-llama/Llama-2-7b-chat-hf",
    ):
        text = "[INST] " + text.strip() + " [/INST]"
    elif checkpoint in ("google/gemma-2-9b-it",):
        text = "<bos><start_of_turn>user\n" + text.strip() + "<end_of_turn>"
    elif checkpoint in (
        "Qwen/Qwen2-1.5B-Instruct",
        "Qwen/Qwen2-7B-Instruct",
    ):
        text = "<|im_start|>user\n" + text.strip() + "<|im_end|>"

    if pooling_mode != "eos_token":
        return text

    # EOS pooling: append the end marker matching the checkpoint / model family.
    config = model.config
    if checkpoint == "meta-llama/Meta-Llama-3-8B":
        return text.strip() + "<|end_of_text|>"
    if isinstance(config, LlamaConfig) or isinstance(config, MistralConfig):
        return text.strip() + " </s>"
    if isinstance(config, GemmaConfig):
        return text.strip() + "<eos>"
    if isinstance(config, Qwen2Config):
        return text.strip() + "<|endoftext|>"
    return text

class MixedNegCollator:
    """
    Collate examples whose negative text may be missing.

    Each example must expose ``texts`` ([query, positive] plus an optional
    negative) and ``label``. The wrapped model must provide ``tokenize`` and a
    ``pooling_mode`` attribute (an LLM2Vec-style object).
    """

    def __init__(self, model):
        self.model = model

    def _prep(self, txt):
        """Apply the checkpoint-specific prompt markup to one text."""
        return prepare_for_tokenization(
            self.model, txt, pooling_mode=self.model.pooling_mode
        )

    def __call__(self, batch):
        """
        Tokenize queries, positives and the available negatives separately.

        Returns:
            ``((query_features, positive_features, negative_features), labels)``
            where ``negative_features`` is None when no example has a negative,
            and otherwise covers only the examples that do.
        """
        queries = [self._prep(ex.texts[0]) for ex in batch]
        positives = [self._prep(ex.texts[1]) for ex in batch]
        # Only examples with a non-empty third text contribute a negative.
        negatives = [
            self._prep(ex.texts[2])
            for ex in batch
            if len(ex.texts) > 2 and ex.texts[2]
        ]
        labels = [ex.label for ex in batch]

        features = (
            self.model.tokenize(queries),                          # size B
            self.model.tokenize(positives),                        # size B
            self.model.tokenize(negatives) if negatives else None, # size <= B or None
        )
        return features, torch.tensor(labels)

In [17]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel
from transformers import LlamaConfig

class TinyLLM2Vec(nn.Module):
    """
    Small stand-in for LLM2Vec exposing the parts of its API used in this notebook:
      - .tokenize(list[str]) -> padded, truncated batch encoding of tensors
      - .encode(features)    -> Tensor (batch, D)
      - .pooling_mode        -> str ("cls" or "mean" are supported by encode)
      - .config              -> backbone config with a faked ``_name_or_path``
    """

    def __init__(self, model_name="prajjwal1/bert-tiny", pooling_mode="cls"):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

        # The prompt-preparation function reads `config._name_or_path`; reuse the
        # backbone's config object and overwrite the name to mimic Llama-3-8B.
        self.config = self.model.config
        self.config._name_or_path = "meta-llama/Meta-Llama-3-8B"

        # Read by the collator when preparing texts and by encode() for pooling.
        self.pooling_mode = pooling_mode

    @torch.no_grad()
    def tokenize(self, texts):
        """Tokenize ``texts`` into PyTorch tensors, padded and truncated to 512 tokens."""
        tokenizer_options = dict(
            padding=True, truncation=True, return_tensors="pt", max_length=512
        )
        return self.tokenizer(texts, **tokenizer_options)

    @torch.no_grad()
    def encode(self, features):
        """
        Run the backbone without gradients and pool its last hidden state.

        Raises:
            ValueError: If ``pooling_mode`` is neither "cls" nor "mean".
        """
        hidden = self.model(**features).last_hidden_state    # (B, L, H)
        mode = self.pooling_mode
        if mode == "cls":
            # First token of every sequence
            return hidden[:, 0]                               # (B, H)
        if mode == "mean":
            # Average over the positions selected by the attention mask
            weights = features["attention_mask"].unsqueeze(-1)
            return (hidden * weights).sum(1) / weights.sum(1) # (B, H)
        raise ValueError("Unknown pooling mode")

# Tiny stand-in model with mean pooling, used for the collator/loss smoke test in the next cells
model = TinyLLM2Vec(pooling_mode="mean")      # instead of Llama‑8B

### Test of prepare_for_tokenization

In [18]:
# Set _name_or_path to define tokenizer model behavior
model.config._name_or_path =  "meta-llama/Meta-Llama-3-8B"

# Inspect the first query before and after prompt preparation.
# `train_dataset` is the dataset built earlier in this notebook; the name
# `nordic_e5_dataset` used here before is never defined and fails on a fresh kernel.
sample_query = train_dataset[0].texts[0]
print(f'Input query: {sample_query}')
print(f'Output query: {prepare_for_tokenization(model, sample_query, pooling_mode="eos_token")}')

Input query: Locate news articles reporting on political developments in a specific region.; !@#$%^&*()Hvilke indflydelsesrige faktorer bestemmer fremtidens finansielle regulering i Sydøstasien i lyset af voksende middelklasse og stigende digitalisering?
Output query: Locate news articles reporting on political developments in a specific region.; !@#$%^&*()Hvilke indflydelsesrige faktorer bestemmer fremtidens finansielle regulering i Sydøstasien i lyset af voksende middelklasse og stigende digitalisering?<|end_of_text|>


In [19]:
from llm2vec_da.loss import HardNegativeNLLLoss
from torch.utils.data import DataLoader

collator = MixedNegCollator(model)           # the new collator

# `train_dataset` (the dataset built earlier; `nordic_e5_dataset` is never defined)
# already lays its samples out in single-task batches of `effective_batch_size`.
# The loader must therefore use exactly that batch size and must not shuffle,
# otherwise the pre-built batch boundaries are lost.
loader   = DataLoader(
               dataset=train_dataset,
               batch_size=train_dataset.effective_batch_size,
               shuffle=False, # DO NOT SHUFFLE, batching is done in the dataset class
               collate_fn=collator
           )

loss_fn  = HardNegativeNLLLoss(scale=20.0)   # unchanged

In [20]:
# Fetch one collated batch and report the tensor shapes of each part.
batch = next(iter(loader))
# Bind the labels to a real name: `_` is IPython's last-output variable and a
# throwaway by convention, but the value is read below.
(q_feat, p_feat, n_feat), labels = batch
print(
    f'Length of batch: {len(batch)}\n'
    f'# of q_feat: {q_feat["input_ids"].shape}\n'
    f'# of p_feat: {p_feat["input_ids"].shape}\n'
    f'# of n_feat: {n_feat["input_ids"].shape if n_feat else None}\n'
    f'# of labels: {len(labels)}'
)

Length of batch: 2
# of q_feat: torch.Size([32, 98])
# of p_feat: torch.Size([32, 512])
# of n_feat: torch.Size([32, 358])
# of labels: 32


In [22]:
# Push one collated batch through the stand-in encoder and the loss.
batch = next(iter(loader))
(q_feat, p_feat, n_feat), _ = batch

q_reps, p_reps = model.encode(q_feat), model.encode(p_feat)   # each (B, D)
# Negatives are optional: the collator yields None when the batch has none.
n_reps = None if not n_feat else model.encode(n_feat)

loss = loss_fn(q_reps, p_reps, n_reps)
print("forward OK, loss =", loss.item())

forward OK, loss = 3.9798991680145264


## Load model

In [None]:
from llm2vec_da.training import SupervisedTrainer



In [None]:
from llm2vec_da import LLM2Vec

# Resolve the configured dtype: "auto" and None are passed through unchanged,
# any other value is looked up as an attribute of the torch module.
if model_args.torch_dtype in ("auto", None):
    torch_dtype = model_args.torch_dtype
else:
    torch_dtype = getattr(torch, model_args.torch_dtype)

# Build the LLM2Vec model from the base checkpoint and PEFT weights named in the
# config, requesting that the PEFT weights be merged (merge_peft=True).
model = LLM2Vec.from_pretrained(
    base_model_name_or_path=model_args.model_name_or_path,
    enable_bidirectional=model_args.bidirectional,
    peft_model_name_or_path=model_args.peft_model_name_or_path,
    merge_peft=True,
    pooling_mode=model_args.pooling_mode,
    max_length=model_args.max_seq_length,
    torch_dtype=torch_dtype,
    attn_implementation=model_args.attn_implementation,
)

## Set up PEFT

In [None]:
from llm2vec_da.model import initialize_peft


# Apply PEFT to the inner model using the LoRA settings from custom_args;
# lora_alpha is set to twice lora_r.
peft_model = initialize_peft(
    model.model,
    lora_r=custom_args.lora_r,
    lora_alpha=2 * custom_args.lora_r,
    lora_dropout=custom_args.lora_dropout,
)

# model organization is LLM2VecModel.model -> HF Model, we have to apply PEFT to the inner model
# NOTE(review): this stores peft_model.model rather than peft_model itself — confirm that is intended
model.model = peft_model.model

## Training

In [24]:
# NOTE(review): imported from `llm2vec`, while other cells import from `llm2vec_da` — confirm which package is intended
from llm2vec.loss.utils import load_loss
# Build the loss class named in the config with the configured scale
train_loss = load_loss(custom_args.loss_class, scale=custom_args.loss_scale)
# Display the loss object as the cell output
train_loss

<llm2vec.loss.HardNegativeNLLLoss.HardNegativeNLLLoss at 0x7f5876180910>

In [25]:
#from llm2vec_da.training import SupervisedDefaultCollator

tokenizer = model.tokenizer
data_collator = MixedNegCollator(model)           # the new collator

# Load train examples into memory; the progress bar is shown on the main process only
example_indices = tqdm(
    range(len(train_dataset)),
    desc="Loading train examples...",
    disable=not accelerator.is_main_process,
)
train_examples = [train_dataset[i] for i in example_indices]

Loading train examples...: 100%|██████████| 96800/96800 [00:06<00:00, 14761.81it/s]


In [None]:
from llm2vec_da.training import SupervisedTrainer, StopTrainingCallback

# Hand the in-memory examples, the collator and the configured loss to the trainer
trainer = SupervisedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_examples,
    data_collator=data_collator,
    tokenizer=model.tokenizer,
    loss_function=train_loss,
)

# Register the stop callback only when a step limit is configured
# (presumably it halts training after that many steps — confirm in llm2vec_da.training)
if custom_args.stop_after_n_steps is not None:
    trainer.add_callback(StopTrainingCallback(custom_args.stop_after_n_steps))

In [None]:
# Start the supervised training run with the trainer configured above
trainer.train()