## Supervised training
Refer to 
https://github.com/jalkestrup/llm2vec-dtu/blob/main/experiments/run_supervised.py

In [2]:
#Load in the dataset from the folder /teamspace/studios/this_studio/synthetic-supervised-dataset-synthetic-from-retrieval-tasks-danish
from datasets import load_from_disk
ds_transformed = load_from_disk("/teamspace/studios/this_studio/synthetic-supervised-dataset-2")
#Print the first example
print(ds_transformed)
#Num of samples
print(len(ds_transformed))


Dataset({
    features: ['query', 'positive', 'negative', 'language', 'task', 'instruction', 'prompt', 'response'],
    num_rows: 968249
})
968249


In [None]:
import torch
import transformers
from tqdm import tqdm
import os
from accelerate import Accelerator, DistributedDataParallelKwargs

from huggingface_hub import HfApi
from dotenv import load_dotenv
load_dotenv()
api = HfApi(token=os.getenv("HF_TOKEN"))

#from huggingface_hub import notebook_login
#notebook_login()

# Handle lighting AI studio
if '/teamspace' in os.getcwd():
    os.chdir('/teamspace/studios/this_studio/llm2vec-da')
    # Hmm lighting AI studio changed to the below ..?
    #os.chdir('/home/zeus/content/llm2vec-da')
    print(os.getcwd())

/teamspace/studios/this_studio/llm2vec-da


In [13]:
from transformers import HfArgumentParser, TrainingArguments
from llm2vec_da.arguments import EmbeddingModelArguments, DataTrainingArguments, CustomArguments

simcse_parser = HfArgumentParser(
        (EmbeddingModelArguments, DataTrainingArguments, TrainingArguments, CustomArguments)
    )

model_args, data_args, training_args, custom_args = simcse_parser.parse_json_file(
        "configs/supervised/MetaLlama3-sheared.json"
    )

if training_args.ddp_find_unused_parameters:
    kwargs = [
        DistributedDataParallelKwargs(
            dim=0,
            broadcast_buffers=True,
            bucket_cap_mb=25,
            find_unused_parameters=True,
            check_reduction=False,
            gradient_as_bucket_view=False,
        )
    ]
else:
    kwargs = []

accelerator = Accelerator(kwargs_handlers=kwargs)
transformers.set_seed(training_args.seed)

## Load data

In [19]:
from datasets import load_dataset
from dataclasses import dataclass
from typing import List, Union
import random
import logging
import torch

logger = logging.getLogger(__name__)

class Dataset(torch.utils.data.Dataset):
    """
    Abstract class for datasets
    """
    
    def load_data(self, file_path: str = None):
        raise NotImplementedError()

    def __getitem__(self, index):
        raise NotImplementedError()

    def __len__(self):
        raise NotImplementedError()

@dataclass
class DataSample:
    id_: int
    query: str
    positive: str
    negative: str = None
    task_name: str = None


class TrainSample:
    """
    Structure for one input example with texts, the label and a unique id
    """

    def __init__(
        self, guid: str = "", texts: List[str] = None, label: Union[int, float] = 0
    ):
        """
        Creates one TrainSample with the given texts, guid and label


        :param guid
            id for the example
        :param texts
            the texts for the example.
        :param label
            the label for the example
        """
        self.guid = guid
        self.texts = texts
        self.label = label

    def __str__(self):
        return "<TrainSample> label: {}, texts: {}".format(
            str(self.label), "; ".join(self.texts)
        )


class NordicE5Data(Dataset):
    def __init__(
        self,
        dataset_name: str = "DDSC/nordic-embedding-training-data",
        split: str = "train",
        effective_batch_size: int = 32,
        separator: str = "!@#$%^&*()",
    ):
        self.dataset_name = dataset_name
        self.split = split
        self.effective_batch_size = effective_batch_size
        self.separator = separator

        self.data = []
        self.load_data()  # you might or might not need extra args here

    def __len__(self):
        return len(self.data)

    def load_data(self):
        logger.info("Loading dataset from {}".format(self.dataset_name))

        # 1) Load the HF dataset (pick the split you actually need)
        dataset = load_dataset(
            self.dataset_name,
            split=self.split,
        )

        # 2) Convert it to a list of DataSamples
        all_samples = []
        # Add tqdm progress bar add hint of 'Loading dataset'
        for idx, row in tqdm(enumerate(dataset), total=len(dataset), desc='Loading dataset'):
            # The HF dataset has columns: query, positive, negative, instruction
            instruction = row["instruction"]
            query =  f"{instruction}; {self.separator}{row['query']}"
            pos   =  f"{self.separator}{row['positive']}"
            neg_raw = row["negative"]
            if neg_raw is None or neg_raw.strip().lower() in {"", "none", "null"}:
                neg = None
            else:
                neg   =  f"{self.separator}{row['negative']}"

            task  =  row["task"]

            all_samples.append(
                DataSample(
                    id_=idx,
                    query=query,
                    positive=pos,
                    negative=neg,
                    task_name=task
                )
            )

        # 3) Shuffle or batch your data if you want
        random.shuffle(all_samples)

        # 4) Optionally chunk into batches
        logger.info(f"Batching data for effective batch size = {self.effective_batch_size} ...")
        batched_idx = []
        final_idx_order = []

        # We'll walk in steps of self.effective_batch_size
        # and discard the last partial batch
        # Add tqdm progress bar add hint of 'Batching data'
        for i in tqdm(range(0, len(all_samples), self.effective_batch_size), total=len(all_samples)//self.effective_batch_size, desc='Batching data'):
            chunk = all_samples[i : i + self.effective_batch_size]
            if len(chunk) == self.effective_batch_size:
                batched_idx.append(chunk)
            else:
                logger.info("Skipping partial batch of size %d", len(chunk))

        # Shuffle the chunk order
        random.shuffle(batched_idx)

        # Flatten the chunked list back into a single list
        final_data = []
        for chunk in batched_idx:
            final_data.extend(chunk)

        self.data = final_data
        logger.info(f"Loaded and batched {len(self.data)} samples.")

    def __getitem__(self, index):
        sample = self.data[index]
        texts = [sample.query, sample.positive]
        if sample.negative is not None:          
            texts.append(sample.negative)
        return TrainSample(texts=texts, label=1.0)


In [17]:
dataset = load_dataset(
    "DDSC/nordic-embedding-training-data",
    split="train",
    columns=['query', 'positive', 'negative', 'instruction', 'task']
)

# check if the negative attribute is not none
if dataset[100000]['negative'] is not None:
    print(dataset[100000])

In [20]:
nordic_e5_data = NordicE5Data(
    dataset_name="DDSC/nordic-embedding-training-data",
    split="train",
    effective_batch_size=training_args.per_device_train_batch_size * accelerator.num_processes,
)

Loading dataset: 100%|██████████| 968249/968249 [02:12<00:00, 7323.16it/s]
Batching data: 30258it [00:00, 230415.93it/s]                           


In [None]:
nordic_e5_data[0].texts

['Match a chemical formula to its name.; !@#$%^&*()Vad är formeln för koksalt?',
 '!@#$%^&*()NaCl är kemiska beteckningen för natriumklorid.']

## Model

In [31]:
from llm2vec_da import LLM2Vec

torch_dtype = (
    model_args.torch_dtype
    if model_args.torch_dtype in ["auto", None]
    else getattr(torch, model_args.torch_dtype)
)

model = LLM2Vec.from_pretrained(
    base_model_name_or_path=model_args.model_name_or_path,
    enable_bidirectional=model_args.bidirectional,
    peft_model_name_or_path=model_args.peft_model_name_or_path,
    merge_peft=True,
    pooling_mode=model_args.pooling_mode,
    max_length=data_args.max_seq_length,
    torch_dtype=torch_dtype,
    attn_implementation=model_args.attn_implementation,
)

ValueError: Can't find 'adapter_config.json' at 'jealk/llm2vec-da-mntp-sheared'

In [38]:
from transformers import (
    LlamaConfig,
    MistralConfig,
    GemmaConfig,
    Qwen2Config,
)


def prepare_for_tokenization(model, text, pooling_mode="mean"):
    if model.config._name_or_path == "meta-llama/Meta-Llama-3-8B-Instruct":
        text = (
            "<|start_header_id|>user<|end_header_id|>\n\n" + text.strip() + "<|eot_id|>"
        )
        return text
    if model.config._name_or_path in [
        "mistralai/Mistral-7B-Instruct-v0.2",
        "meta-llama/Llama-2-7b-chat-hf",
    ]:
        text = "[INST] " + text.strip() + " [/INST]"
    if model.config._name_or_path in [
        "google/gemma-2-9b-it",
    ]:
        text = "<bos><start_of_turn>user\n" + text.strip() + "<end_of_turn>"
    if model.config._name_or_path in [
        "Qwen/Qwen2-1.5B-Instruct",
        "Qwen/Qwen2-7B-Instruct",
    ]:
        text = "<|im_start|>user\n" + text.strip() + "<|im_end|>"
    if pooling_mode == "eos_token":
        if model.config._name_or_path == "meta-llama/Meta-Llama-3-8B":
            text = text.strip() + "<|end_of_text|>"
        elif isinstance(model.config, LlamaConfig) or isinstance(
            model.config, MistralConfig
        ):
            text = text.strip() + " </s>"
        elif isinstance(model.config, GemmaConfig):
            text = text.strip() + "<eos>"
        elif isinstance(model.config, Qwen2Config):
            text = text.strip() + "<|endoftext|>"
    return text

class MixedNegCollator:
    def __init__(self, model: LLM2Vec):
        self.model = model

    def _prep(self, txt):
        return prepare_for_tokenization(self.model, txt,
                                        pooling_mode=self.model.pooling_mode)

    def __call__(self, batch):
        q_texts, p_texts, n_texts, labels = [], [], [], []

        for ex in batch:
            q_texts.append(self._prep(ex.texts[0]))
            p_texts.append(self._prep(ex.texts[1]))

            if len(ex.texts) > 2 and ex.texts[2]:
                n_texts.append(self._prep(ex.texts[2]))

            labels.append(ex.label)

        sent_feat_q = self.model.tokenize(q_texts)          # size B
        sent_feat_p = self.model.tokenize(p_texts)          # size B
        sent_feat_n = (
            self.model.tokenize(n_texts) if n_texts else None
        )                                                   # size ≤ B or None

        return (sent_feat_q, sent_feat_p, sent_feat_n), torch.tensor(labels)

In [43]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel

class TinyLLM2Vec(nn.Module):
    """
    Drop‑in replacement for LLM2Vec that is tiny but respects the API:
      - .tokenize(list[str]) -> dict[str, Tensor] batch encoding
      - .encode(features)    -> Tensor (batch, D)
      - .pooling_mode attr   -> str
    """
    def __init__(self, model_name="prajjwal1/bert-tiny", pooling_mode="cls"):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model     = AutoModel.from_pretrained(model_name)
        self.config = self.model.config          # forward attr used by prep‑fn
        self.pooling_mode = pooling_mode   # value read by prepare_for_tokenization

    @torch.no_grad()
    def tokenize(self, texts):
        return self.tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt"
        )

    @torch.no_grad()
    def encode(self, features):
        out = self.model(**features).last_hidden_state   # (B, L, H)
        if self.pooling_mode == "cls":
            return out[:, 0]                             # (B, H)
        elif self.pooling_mode == "mean":
            mask = features["attention_mask"].unsqueeze(-1)
            return (out * mask).sum(1) / mask.sum(1)     # (B, H)
        else:
            raise ValueError("Unknown pooling mode")

model = TinyLLM2Vec(pooling_mode="mean")      # instead of Llama‑8B

In [56]:
from llm2vec_da.loss import HardNegativeNLLLoss

collator = MixedNegCollator(model)           # the new collator

loader   = torch.utils.data.DataLoader(
               nordic_e5_data,
               batch_size=8,                 # any small number
               shuffle=True,
               collate_fn=collator
           )

loss_fn  = HardNegativeNLLLoss(scale=20.0)   # unchanged

TypeError: DataLoader.__init__() got an unexpected keyword argument 'max_seq_length'

In [55]:
batch = next(iter(loader))
(q_feat, p_feat, n_feat), _ = batch
print(f'Length of batch: {len(batch)}\n# of q_feat: {q_feat["input_ids"].shape}\n# of p_feat: {p_feat["input_ids"].shape}\n# of n_feat: {n_feat["input_ids"].shape if n_feat else None}\n# of labels: {len(_)}')

Length of batch: 2
# of q_feat: torch.Size([8, 201])
# of p_feat: torch.Size([8, 404])
# of n_feat: torch.Size([6, 288])
# of labels: 8


In [54]:
batch[1]

tensor([1., 1., 1., 1., 1., 1., 1., 1.])

In [47]:
batch = next(iter(loader))
(q_feat, p_feat, n_feat), _ = batch

q_reps = model.encode(q_feat)                # (B, D)
p_reps = model.encode(p_feat)                # (B, D)
n_reps = model.encode(n_feat) if n_feat else None

loss = loss_fn(q_reps, p_reps, n_reps)
print("forward OK, loss =", loss.item())

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


forward OK, loss = 1.78125


## Load model

In [None]:
from llm2vec_da import LLM2Vec

torch_dtype = (
    model_args.torch_dtype
    if model_args.torch_dtype in ["auto", None]
    else getattr(torch, model_args.torch_dtype)
)

model = LLM2Vec.from_pretrained(
    base_model_name_or_path=model_args.model_name_or_path,
    enable_bidirectional=model_args.bidirectional,
    peft_model_name_or_path=model_args.peft_model_name_or_path,
    merge_peft=True,
    pooling_mode=model_args.pooling_mode,
    max_length=model_args.max_seq_length,
    torch_dtype=torch_dtype,
    attn_implementation=model_args.attn_implementation,
)

## Set up PEFT

In [None]:
from llm2vec_da.model import initialize_peft


peft_model = initialize_peft(
    model.model,
    lora_r=custom_args.lora_r,
    lora_alpha=2 * custom_args.lora_r,
    lora_dropout=custom_args.lora_dropout,
)

# model organization is LLM2VecModel.model -> HF Model, we have to apply PEFT to the inner model
model.model = peft_model.model

## Training

In [1]:
from llm2vec.loss.utils import load_loss
train_loss = load_loss(custom_args.loss_class, scale=custom_args.loss_scale)
train_loss

NameError: name 'custom_args' is not defined

In [None]:
from llm2vec_da.training import SupervisedDefaultCollator

tokenizer = model.tokenizer
data_collator = SupervisedDefaultCollator(model)

In [None]:
from llm2vec_da.training import SupervisedTrainer
trainer = SupervisedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_examples,
    eval_dataset=validation_examples,
    data_collator=data_collator,
    tokenizer=model.tokenizer,
    loss_function=train_loss,
)

if custom_args.stop_after_n_steps is not None:
    trainer.add_callback(StopTrainingCallback(custom_args.stop_after_n_steps))

In [None]:
trainer.train()