In [None]:
#|default_exp 4_supervised_training

## Supervised training
For reference, see the supervised training script at
https://github.com/jalkestrup/llm2vec-dtu/blob/main/experiments/run_supervised.py

In [4]:
#|export
from datasets import load_dataset
import transformers
from transformers import HfArgumentParser, TrainingArguments
from tqdm import tqdm
from accelerate import Accelerator, DistributedDataParallelKwargs
from huggingface_hub import HfApi
from dotenv import load_dotenv
import os
from dataclasses import dataclass
from typing import List, Union
import torch

from llm2vec_da import LLM2Vec
from llm2vec_da.model import initialize_peft
from llm2vec_da.data_utils import custom_dataset
from llm2vec_da.loss.utils import load_loss
from llm2vec_da.training import MixedNegCollator, SupervisedTrainer, StopTrainingCallback


from llm2vec_da.arguments import EmbeddingModelArguments, DataTrainingArguments, CustomArguments


In [5]:
#|export
# Pull variables from a local .env file into the process environment, then
# build the Hugging Face Hub client with the token found there (None if unset).
load_dotenv()

hf_token = os.environ.get("HF_TOKEN")
api = HfApi(token=hf_token)

/teamspace/studios/this_studio/llm2vec-da


In [None]:
# Alternatively, log in interactively with the huggingface_hub widget
#notebook_login()

# When running on Lightning AI Studio, switch to the repository root so that
# relative paths (such as the configs/ directory used later) resolve.
# The studio has apparently exposed the repo under more than one location, so
# every known location is tried and the first one that exists is used. This
# avoids a FileNotFoundError from a hard-coded path that is no longer valid.
LIGHTNING_MARKERS = ('/teamspace', '/home/zeus/content')
LIGHTNING_REPO_DIRS = (
    '/teamspace/studios/this_studio/llm2vec-da',
    '/home/zeus/content/llm2vec-da',
)

if any(marker in os.getcwd() for marker in LIGHTNING_MARKERS):
    repo_dir = next((path for path in LIGHTNING_REPO_DIRS if os.path.isdir(path)), None)
    if repo_dir is None:
        # Best effort: keep the current directory rather than crash the notebook.
        print(
            f"Lightning AI Studio detected, but none of {LIGHTNING_REPO_DIRS} exist; "
            f"staying in {os.getcwd()}"
        )
    else:
        os.chdir(repo_dir)
        print(os.getcwd())

In [6]:
#|export
# Parse the four argument groups (model, data, HF training, custom) from the
# JSON experiment config.
supervised_parser = HfArgumentParser(
    (EmbeddingModelArguments, DataTrainingArguments, TrainingArguments, CustomArguments)
)
model_args, data_args, training_args, custom_args = supervised_parser.parse_json_file(
    "configs/supervised/MetaLlama3-sheared.json"
)

# A DDP kwargs handler is only passed to Accelerate when the training arguments
# request unused-parameter detection; otherwise the handler list stays empty.
kwargs = []
if training_args.ddp_find_unused_parameters:
    ddp_kwargs = DistributedDataParallelKwargs(
        dim=0,
        broadcast_buffers=True,
        bucket_cap_mb=25,
        find_unused_parameters=True,
        check_reduction=False,
        gradient_as_bucket_view=False,
    )
    kwargs.append(ddp_kwargs)

accelerator = Accelerator(kwargs_handlers=kwargs)
transformers.set_seed(training_args.seed)

# IMPORTANT: gradient checkpointing has to run in non-reentrant mode here.
# Leaving this out causes an obscure "no grad" error during training that is
# very hard to trace back to this setting.
if training_args.gradient_checkpointing:
    training_args.gradient_checkpointing_kwargs = {"use_reentrant": False}

## Load data

In [9]:
#|export

# Load the first 10% of the train split of the configured dataset.
dataset = load_dataset(
    data_args.dataset_name,
    split="train[:10%]",
)
# Column selection is currently disabled:
# columns=['query', 'positive', 'negative', 'instruction', 'task']

{'query': 'Hvad var de langsigtede konsekvenser for dansk økonomi af den danske forfatningslov af 1849?', 'positive': 'Den danske forfatningslov af 1849 markerede et paradigmeskift i dansk politisk og økonomisk struktur.  Loven etableret et konstitutionelt monarki, begrænsede kongelig magt og indførte en parlamentarisk form for styre. Dette havde vidtrækkende konsekvenser for den danske økonomi.  Den nye forfatning lagde grunden for en mere liberalistisk økonomisk orden, med fokus på fri handel, privat ejendomsret og entreprenørskab.  Den øgede politiske stabilitet og forudsigelighed tiltrak udenlandsk kapital og investeringer, der bidrog til økonomisk vækst.  Samtidig reducerede loven den statslige indblanding i økonomien, hvilket gav plads til privat initiativ og markedskræfter.  Introduktionen af en national valuta og en centralbank styrkede den økonomiske integration med andre europæiske lande.  Mens forfatningsloven af 1849 ikke direkte førte til økonomisk mirakel, lagde den grund

In [2]:
# Sanity check: show the first raw example of the loaded dataset.
first_example = dataset[0]
print(first_example)

# Optional: persist the dataset to a local directory
#dataset.save_to_disk("nordic-embedding-training-data")

# Optional: reload a previously saved dataset from disk
#from datasets import load_from_disk
#ds_transformed = load_from_disk("/teamspace/studios/this_studio/synthetic-supervised-dataset-2")


NameError: name 'dataset' is not defined

In [None]:
#|export

# Hold out 5% of the examples for validation; the fixed seed keeps the split
# reproducible across runs.
split_dataset = dataset.train_test_split(seed=42, test_size=0.05)

# The 'test' part of the split serves as the validation set.
train_dataset, valid_dataset = split_dataset["train"], split_dataset["test"]

In [10]:
#|export

# Effective (global) batch size: per-device train batch size times the number
# of processes reported by Accelerate. Both datasets are batched with it.
# NOTE(review): the validation set is batched with the *train* batch size too;
# confirm this is intended if per_device_eval_batch_size differs.
effective_batch_size = (
    training_args.per_device_train_batch_size * accelerator.num_processes
)

train_dataset_e5 = custom_dataset(train_dataset, effective_batch_size=effective_batch_size)
valid_dataset_e5 = custom_dataset(valid_dataset, effective_batch_size=effective_batch_size)

Loading dataset: 100%|██████████| 91983/91983 [00:11<00:00, 7891.17it/s]
Grouping data by task: 100%|██████████| 91983/91983 [00:00<00:00, 2248842.45it/s]
INFO:__main__:Batching data for effective batch size = 32 ...
Batching data:   0%|          | 0/2 [00:00<?, ?it/s]INFO:__main__:Skipping partial batch of 21 samples for task retrieval
INFO:__main__:Skipping partial batch of 26 samples for task classification
Batching data: 100%|██████████| 2/2 [00:00<00:00, 382.88it/s]
INFO:__main__:Loaded and batched 91936 samples from 2 tasks
Loading dataset: 100%|██████████| 4842/4842 [00:00<00:00, 7915.46it/s]
Grouping data by task: 100%|██████████| 4842/4842 [00:00<00:00, 2702078.23it/s]
INFO:__main__:Batching data for effective batch size = 32 ...
Batching data:   0%|          | 0/2 [00:00<?, ?it/s]INFO:__main__:Skipping partial batch of 26 samples for task retrieval
INFO:__main__:Skipping partial batch of 16 samples for task classification
Batching data: 100%|██████████| 2/2 [00:00<00:00, 2810

In [11]:
# Display the `texts` of the first batched training sample for manual inspection.
train_dataset_e5[0].texts

['Locate job postings requiring a specific skill set and industry experience.; !@#$%^&*()Jeg søger en stilling som postdoc forsker indenfor kvantitativ genetik med fokus på genomisk selektion hos hvede, hvor jeg kan kombinere mineOoh ekspertise i statistisk modellering med brugen af moderne højtydende computer clusters.',
 '!@#$%^&*()Vi søger en motiveret og dygtig postdoc forsker til at slutte sig til vores hold, der arbejder med at forstå de genetiske mekanismer bag udvikling af acidotolerante hvedesorter. Projektet involverer anvendelsen af genomisk selektionsanalyse, kvantitativ genetik og avanceret statistisk modellering. Du vil have adgang til state-of-the-art computerclusters og arbejde med store datasæt af sekvensdata. Kendskab til programmeringssprog som R eller Python er et krav. En afhandling indenfor et relevant felt, f.eks. genetik, bioinformatik eller matematik, forventes.',
 '!@#$%^&*()Vores team af forskere er specialiseret i anvendelse af maskinlæring til at analysere 

## Tokenizer test

In [14]:
# import torch
# from torch import nn
# from transformers import AutoTokenizer, AutoModel
# from transformers import LlamaConfig


# class TinyLLM2Vec(nn.Module):
#     """
#     Drop‑in replacement for LLM2Vec that is tiny but respects the API:
#       - .tokenize(list[str]) -> dict[str, Tensor] batch encoding
#       - .encode(features)    -> Tensor (batch, D)
#       - .pooling_mode attr   -> str
#     """
#     def __init__(self, model_name="prajjwal1/bert-tiny", pooling_mode="cls"):
#         super().__init__()
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
#         self.model     = AutoModel.from_pretrained(model_name)
#         self.config = self.model.config          # forward attr used by prep‑fn
#         #self.config = LlamaConfig()
#         self.config._name_or_path = "meta-llama/Meta-Llama-3-8B" # To fake the config
        
#         self.pooling_mode = pooling_mode   # value read by prepare_for_tokenization

#     @torch.no_grad()
#     def tokenize(self, texts):
#         return self.tokenizer(
#             texts, padding=True, truncation=True, return_tensors="pt", max_length=512
#         )

#     @torch.no_grad()
#     def encode(self, features):
#         out = self.model(**features).last_hidden_state   # (B, L, H)
#         if self.pooling_mode == "cls":
#             return out[:, 0]                             # (B, H)
#         elif self.pooling_mode == "mean":
#             mask = features["attention_mask"].unsqueeze(-1)
#             return (out * mask).sum(1) / mask.sum(1)     # (B, H)
#         else:
#             raise ValueError("Unknown pooling mode")

# model_tiny = TinyLLM2Vec(pooling_mode="mean")      # instead of Llama‑8B

### Test of prepare_for_tokenization

In [15]:
# # Set _name_or_path to define tokenizer model behavior
# model_tiny.config._name_or_path =  "meta-llama/Meta-Llama-3-8B"

# # Inspect the input query and the output query before and after
# print(f'Input query: {train_dataset_e5[0].texts[0]}')
# print(f'Output query: {prepare_for_tokenization(model_tiny, train_dataset_e5[0].texts[0], pooling_mode="eos_token")}')

In [16]:
# from llm2vec_da.loss import HardNegativeNLLLoss
# from torch.utils.data import DataLoader

# collator = MixedNegCollator(model_tiny)           # the new collator

# loader   = DataLoader(
#                dataset=train_dataset_e5,
#                batch_size=32,                 
#                shuffle=False, # DO NOT SHUFFLE, batching is done in the dataset class
#                collate_fn=collator
#            )

# loss_fn  = HardNegativeNLLLoss(scale=20.0)   # unchanged

In [17]:
# batch = next(iter(loader))
# (q_feat, p_feat, n_feat), _ = batch

# q_reps = model_tiny.encode(q_feat)                # (B, D)
# p_reps = model_tiny.encode(p_feat)                # (B, D)
# n_reps = model_tiny.encode(n_feat) if n_feat else None

# loss = loss_fn(q_reps, p_reps, n_reps)
# print("forward OK, loss =", loss.item())

## Load model

In [18]:
#|export
# Resolve the dtype from the config: "auto" and None are passed through as-is,
# any other value is treated as the name of a torch dtype attribute.
if model_args.torch_dtype in ("auto", None):
    torch_dtype = model_args.torch_dtype
else:
    torch_dtype = getattr(torch, model_args.torch_dtype)

# To disable gradient checkpointing for debugging, uncomment:
#training_args.gradient_checkpointing = False
model = LLM2Vec.from_pretrained(
    base_model_name_or_path=model_args.model_name_or_path,
    enable_bidirectional=model_args.bidirectional,
    peft_model_name_or_path="McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp",
    merge_peft=True,
    pooling_mode=model_args.pooling_mode,
    max_length=data_args.max_seq_length,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=model_args.low_cpu_mem_usage,
    # NOTE: switch back to flash attention when running on an A100 GPU.
    attn_implementation="sdpa",
)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


## Set up PEFT

In [19]:
#|export
# LoRA alpha is tied to the rank: alpha = 2 * r.
lora_rank = custom_args.lora_r
peft_model = initialize_peft(
    model.model,
    lora_r=lora_rank,
    lora_alpha=2 * lora_rank,
    lora_dropout=custom_args.lora_dropout,
)

# The wrapper is organised as LLM2Vec.model -> HF model, so PEFT is applied to
# the inner model and the result is put back in the wrapper.
model.model = peft_model.model

INFO:peft.tuners.tuners_utils:Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!


Model's Lora trainable parameters:
trainable params: 14,991,360 || all params: 1,294,878,720 || trainable%: 1.1577


In [20]:
# List the first ten parameters that still require gradients after PEFT setup.
trainable_names = [
    name for name, param in model.named_parameters() if param.requires_grad
]
print("after-peft trainable:", trainable_names[:10])

after-peft trainable: ['model.layers.0.self_attn.q_proj.lora_A.default.weight', 'model.layers.0.self_attn.q_proj.lora_B.default.weight', 'model.layers.0.self_attn.k_proj.lora_A.default.weight', 'model.layers.0.self_attn.k_proj.lora_B.default.weight', 'model.layers.0.self_attn.v_proj.lora_A.default.weight', 'model.layers.0.self_attn.v_proj.lora_B.default.weight', 'model.layers.0.self_attn.o_proj.lora_A.default.weight', 'model.layers.0.self_attn.o_proj.lora_B.default.weight', 'model.layers.0.mlp.gate_proj.lora_A.default.weight', 'model.layers.0.mlp.gate_proj.lora_B.default.weight']


In [21]:
# Smoke test: one forward pass on a dummy sentence, then report whether the
# resulting representations are attached to the autograd graph.
probe_inputs = model.tokenize(["just a test"])
model_device = next(model.parameters()).device
# Move every tensor of the tokenized batch to the device the model lives on.
probe_inputs = {
    name: tensor.to(model_device) for name, tensor in probe_inputs.items()
}

with torch.set_grad_enabled(True):
    reps = model(probe_inputs)
print("requires_grad:", reps.requires_grad, reps.grad_fn)

requires_grad: True <StackBackward0 object at 0x7f481be80c70>


## Training

In [22]:
#|export

tokenizer = model.tokenizer
data_collator = MixedNegCollator(model)


def _load_examples(batched_dataset, desc):
    """Index every item of ``batched_dataset`` in order and return them as a list.

    A tqdm progress bar labelled ``desc`` is shown on the main process only.
    """
    progress = tqdm(
        range(len(batched_dataset)),
        desc=desc,
        disable=not accelerator.is_main_process,
    )
    return [batched_dataset[index] for index in progress]


# Materialise both datasets in memory before handing them to the trainer.
train_examples = _load_examples(train_dataset_e5, "Loading train examples...")
valid_examples = _load_examples(valid_dataset_e5, "Loading valid examples...")

Loading train examples...: 100%|██████████| 91936/91936 [00:00<00:00, 139876.47it/s]
Loading valid examples...: 100%|██████████| 4800/4800 [00:00<00:00, 442106.79it/s]


In [30]:
#|export
# Build the loss selected in the custom arguments, with its configured scale.
train_loss = load_loss(custom_args.loss_class, scale=custom_args.loss_scale)

trainer = SupervisedTrainer(
    model=model,
    args=training_args,
    tokenizer=model.tokenizer,
    data_collator=data_collator,
    train_dataset=train_examples,
    eval_dataset=valid_examples,
    loss_function=train_loss,
)

# Optional early stop: only registered when a step limit is configured.
stop_after = custom_args.stop_after_n_steps
if stop_after is not None:
    trainer.add_callback(StopTrainingCallback(stop_after))

  super().__init__(*args, **kwargs)


In [25]:
# os.environ["WANDB_DISABLED"] = "false"

In [31]:
# Interactive run only: this cell deliberately carries no nbdev export
# directive. The exported script already starts training through `main()`,
# so exporting this call as well would make the script train twice.
trainer.train()

***** Running training *****
  Num examples = 90,656
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 8,499
  Number of trainable parameters = 14,991,360
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Could not log the number of model parameters in Weights & Biases due to an AttributeError.


Step,Training Loss
1,0.9727
5,5.7549
10,0.9945
15,1.0188
20,1.093
25,0.9734
30,5.3203
35,0.7828
40,0.5684
45,0.543


INFO:__main__:Saving model checkpoint to output/mntp-supervised/Meta-Llama-3-sheared/checkpoint-50
Configuration saved in output/mntp-supervised/Meta-Llama-3-sheared/checkpoint-50/config.json
Model weights saved in output/mntp-supervised/Meta-Llama-3-sheared/checkpoint-50/model.safetensors
tokenizer config file saved in output/mntp-supervised/Meta-Llama-3-sheared/checkpoint-50/tokenizer_config.json
Special tokens file saved in output/mntp-supervised/Meta-Llama-3-sheared/checkpoint-50/special_tokens_map.json
INFO:__main__:Saving model checkpoint to output/mntp-supervised/Meta-Llama-3-sheared/checkpoint-100
Configuration saved in output/mntp-supervised/Meta-Llama-3-sheared/checkpoint-100/config.json
Model weights saved in output/mntp-supervised/Meta-Llama-3-sheared/checkpoint-100/model.safetensors
tokenizer config file saved in output/mntp-supervised/Meta-Llama-3-sheared/checkpoint-100/tokenizer_config.json
Special tokens file saved in output/mntp-supervised/Meta-Llama-3-sheared/checkpoi

KeyboardInterrupt: 

In [None]:
#|export

def main() -> None:
    """Run supervised training using the module-level ``trainer``."""
    trainer.train()

# Script entry point: start training when the exported file is run directly.
# NOTE: inside a Jupyter kernel `__name__` is also "__main__", so executing
# this cell in the notebook starts training as well.
if __name__ == "__main__":
    main() 

In [2]:
# Export the export-marked cells of this notebook to a Python module, using
# '.' (the current directory) as the library path.
from nbdev.export import nb_export
nb_export('4_supervised_training_to_py.ipynb', '.')