In [1]:
import datasets
import transformers
import os
import torch

In [2]:
# Lightning AI Studio: when the working directory is under /teamspace, switch
# into the project checkout so the relative paths used later in this notebook
# (configs/..., data/...) resolve.
if os.getcwd().find('/teamspace') != -1:
    os.chdir('/teamspace/studios/this_studio/llm2vec-da')
    print(os.getcwd())

/teamspace/studios/this_studio/llm2vec-da


In [3]:
from llm2vec_da.arguments import ModelArguments, DataTrainingArguments, CustomArguments
from transformers import HfArgumentParser, TrainingArguments

# One parser covers all four argument groups; the order of the dataclasses
# here determines the order of the objects returned by parse_json_file.
parser = HfArgumentParser(
    (
        ModelArguments,
        DataTrainingArguments,
        TrainingArguments,
        CustomArguments,
    )
)

# Every setting for this run is read from a single JSON config file.
model_args, data_args, training_args, custom_args = parser.parse_json_file(
    "configs/mntp/MetaLlama3-sheared.json"
)



In [4]:
# Keyword arguments forwarded to AutoConfig.from_pretrained further down.
config_kwargs = dict(
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    token=model_args.token,
    trust_remote_code=model_args.trust_remote_code,
)

# When gradient checkpointing is enabled, request the non-reentrant variant.
if training_args.gradient_checkpointing:
    training_args.gradient_checkpointing_kwargs = dict(use_reentrant=False)

# Set seed before initializing model.
transformers.set_seed(training_args.seed)

In [5]:
from transformers import AutoConfig
from llm2vec_da.model import get_model_class

# Load the configuration of the checkpoint named in the model arguments.
config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)

# Verify that LLM2Vec maps this configuration to the expected model class.
model_class = get_model_class(config)
print(f'Model class detected by LLM2Vec clas:\n{model_class}')

Model class detected by LLM2Vec clas:
<class 'llm2vec_da.model_modifications.bidirectional_llama.LlamaBiForMNTP'>


## ! REMEMBER TO CHANGE ATTN_IMPLEMENTATION BACK TO FLASH !

In [6]:
# Resolve the configured dtype: "auto" and None are passed through unchanged,
# any other value is looked up by name on the torch module.
if model_args.torch_dtype in ["auto", None]:
    torch_dtype = model_args.torch_dtype
else:
    torch_dtype = getattr(torch, model_args.torch_dtype)

model = model_class.from_pretrained(
    model_args.model_name_or_path,
    config=config,
    torch_dtype=torch_dtype,
    device_map="auto",
    # SDPA is hard-coded here instead of model_args.attn_implementation.
    # OBS: SET BACK TO FLASH ATTENTION WHEN RUNNING ON A100 GPU!!
    attn_implementation="sdpa",
    low_cpu_mem_usage=model_args.low_cpu_mem_usage,
    from_tf=".ckpt" in model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    token=model_args.token,
    trust_remote_code=model_args.trust_remote_code,
)

Inspecting model to see the modified Attention layers

In [7]:
# Display the module tree of the loaded model.
model

LlamaBiForMNTP(
  (model): LlamaBiModel(
    (embed_tokens): Embedding(32000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-23): 24 x ModifiedLlamaDecoderLayer(
        (self_attn): ModifiedLlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=

In [8]:
# Show the self-attention module of the first decoder layer.
model.model.layers[0].self_attn

ModifiedLlamaSdpaAttention(
  (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
  (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
  (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
  (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
  (rotary_emb): LlamaRotaryEmbedding()
)

## Set up PEFT

In [9]:
from peft import LoraConfig, get_peft_model
from typing import List, Optional

def initialize_peft(
    model,
    lora_r: int = 8,
    lora_alpha: int = 16,
    lora_dropout: float = 0.05,
    lora_modules: Optional[List[str]] = None,
):
    """Wrap ``model`` with LoRA adapters and print its trainable-parameter summary.

    Args:
        model: Model to adapt; its ``config`` class name is inspected when
            ``lora_modules`` is not given.
        lora_r: LoRA rank, passed to ``LoraConfig`` as ``r``.
        lora_alpha: Passed to ``LoraConfig`` as ``lora_alpha``.
        lora_dropout: Passed to ``LoraConfig`` as ``lora_dropout``.
        lora_modules: Names of the modules to adapt. When ``None``, the
            attention and MLP projection layers are used for ``LlamaConfig``
            and ``MistralConfig`` models.

    Returns:
        The object returned by ``get_peft_model``.

    Raises:
        ValueError: If ``lora_modules`` is ``None`` and the model's config
            class is neither ``LlamaConfig`` nor ``MistralConfig``.
    """
    if lora_modules is None:
        # Default targets are only known for the Llama/Mistral layer naming.
        if model.config.__class__.__name__ not in ("LlamaConfig", "MistralConfig"):
            raise ValueError("lora_modules must be specified for this model.")
        lora_modules = [
            "q_proj",
            "v_proj",
            "k_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ]

    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=lora_modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type=None,
    )
    wrapped_model = get_peft_model(model, lora_config)

    print("Model's Lora trainable parameters:")
    wrapped_model.print_trainable_parameters()
    return wrapped_model

# The helper defined above is similar to the one importable below; it is
# copied into the notebook for readability.
#from llm2vec_da.model import initialize_peft

# Apply LoRA to the inner model (model.model) only; alpha is set to twice the rank.
peft_model = initialize_peft(
    model.model,
    lora_r=custom_args.lora_r,
    lora_alpha=2 * custom_args.lora_r,
    lora_dropout=custom_args.lora_dropout,
)

# Put the LoRA-injected inner model back on the MNTP wrapper.
# NOTE(review): this cell mutates `model`; re-running it would presumably apply
# LoRA to already-adapted layers. Confirm, or restart the kernel before re-running.
model.model = peft_model.model

Model's Lora trainable parameters:
trainable params: 14,991,360 || all params: 1,294,878,720 || trainable%: 1.1577


In [10]:
# Display the inner model after LoRA injection (the projections are now lora.Linear).
peft_model.model

LlamaBiModel(
  (embed_tokens): Embedding(32000, 2048, padding_idx=0)
  (layers): ModuleList(
    (0-23): 24 x ModifiedLlamaDecoderLayer(
      (self_attn): ModifiedLlamaSdpaAttention(
        (q_proj): lora.Linear(
          (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.05, inplace=False)
          )
          (lora_A): ModuleDict(
            (default): Linear(in_features=2048, out_features=16, bias=False)
          )
          (lora_B): ModuleDict(
            (default): Linear(in_features=16, out_features=2048, bias=False)
          )
          (lora_embedding_A): ParameterDict()
          (lora_embedding_B): ParameterDict()
          (lora_magnitude_vector): ModuleDict()
        )
        (k_proj): lora.Linear(
          (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.05, inplace=False)
     

## Set up data collation

In [11]:
from transformers import AutoTokenizer
# Keyword arguments for loading the tokenizer. Unlike the model config,
# cache_dir is deliberately not forwarded here (it was commented out).
tokenizer_kwargs = dict(
    use_fast=model_args.use_fast_tokenizer,
    revision=model_args.model_revision,
    token=model_args.token,
    trust_remote_code=model_args.trust_remote_code,
)

# The tokenizer is loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [12]:
# If the tokenizer defines no mask token, choose one according to
# custom_args.mask_token_type ("blank", "eos" or "mask").
if tokenizer.mask_token is None:
    # Reject unknown settings up front.
    if custom_args.mask_token_type not in ("blank", "eos", "mask"):
        raise ValueError(
            f"mask_token_type {custom_args.mask_token_type} is not supported."
        )

    if custom_args.mask_token_type == "blank":
        print("Setting mask token to _")
        tokenizer.mask_token = "_"
    elif custom_args.mask_token_type == "eos":
        print("Setting mask token to eos")
        tokenizer.mask_token = tokenizer.eos_token
    else:
        # "mask": register a new <mask> token with the tokenizer.
        # NOTE(review): this grows the tokenizer vocabulary; confirm the model's
        # embeddings are resized elsewhere if this option is used.
        print("Setting mask token to <mask>")
        tokenizer.add_tokens(["<mask>"])
        tokenizer.mask_token = "<mask>"

# Fall back to the EOS token for padding when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Setting mask token to _


In [13]:
from llm2vec_da.training import DataCollatorForLanguageModelingWithFullMasking

# Build the MNTP data collator; the masking probability comes from the data arguments.
data_collator = DataCollatorForLanguageModelingWithFullMasking(
    tokenizer=tokenizer,
    mlm_probability=data_args.mlm_probability
)

**Verifying that the data collator works**


As seen below, parts of the input are now masked with the mask token `_` (vocab id 29918)

In [14]:
# Vocabulary id of '_' (the mask token when mask_token_type is "blank").
data_collator.tokenizer.vocab['_']

29918

In [15]:
# Smoke-test the collator on a single random (1, 10) tensor of token ids in [0, 10).
data_collator( (torch.randint(0, 10, (1, 10)), ))

{'input_ids': tensor([[[    2, 29918,     0, 29918,     4,     1,     6, 29918,     6,     7]]]),
 'labels': tensor([[[-100,    5, -100,    3, -100, -100, -100,    9, -100, -100]]])}

### Loading dataset
#### **REMEMBER TO CHANGE TO CORRECT DATASET**

In [16]:
# Load the pre-tokenized dataset from disk. It is model specific, because it
# was produced with this model's tokenizer.
tokenized_datasets = datasets.load_from_disk("data/mntp_wiki_dk_512_sheared")

In [17]:
# Training split, optionally truncated to the first max_train_samples rows.
train_dataset = tokenized_datasets["train"]
if data_args.max_train_samples is not None:
    max_train_samples = min(data_args.max_train_samples, len(train_dataset))
    train_dataset = train_dataset.select(range(0, max_train_samples))

In [18]:
# Validation split, optionally truncated to the first max_eval_samples rows.
eval_dataset = tokenized_datasets["validation"]
if data_args.max_eval_samples is not None:
    max_eval_samples = min(data_args.max_eval_samples, len(eval_dataset))
    eval_dataset = eval_dataset.select(range(0, max_eval_samples))

## Set up trainer

In [19]:
from transformers import is_torch_tpu_available
from llm2vec_da.training import MNTPTrainer, StopTrainingCallback
from llm2vec_da.metrics import MetricEvaluator, preprocess_logits_for_metrics

In [20]:
# Metric evaluator, constructed with the configured cache directory; it is
# passed to the trainer as `compute_metrics`.
evaluator = MetricEvaluator(model_args.cache_dir)

In [21]:
import os
import wandb

# Weights & Biases is configured through environment variables.
# Project name and model-logging mode are always exported.
os.environ.update(
    {
        "WANDB_PROJECT": custom_args.wandb_project,
        "WANDB_LOG_MODEL": custom_args.wandb_log_model,
    }
)

# Run group and watch mode are only exported when they are set in the config.
if custom_args.wandb_run_group:
    os.environ["WANDB_RUN_GROUP"] = custom_args.wandb_run_group
if custom_args.wandb_watch:
    os.environ["WANDB_WATCH"] = custom_args.wandb_watch

In [22]:

# Metric computation is only wired in when evaluation is enabled and we are
# not running on TPU.
use_eval_metrics = training_args.do_eval and not is_torch_tpu_available()

trainer = MNTPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=evaluator if use_eval_metrics else None,
    preprocess_logits_for_metrics=(
        preprocess_logits_for_metrics if use_eval_metrics else None
    ),
)

# Turn off `use_cache` on the model config for training.
model.config.use_cache = False

# Register the callback configured with custom_args.stop_after_n_steps
# (presumably it halts training after that many steps — confirm in llm2vec_da.training).
trainer.add_callback(StopTrainingCallback(custom_args.stop_after_n_steps))

#trainer.callback_handler.remove_callback(transformers.integrations.integration_utils.WandbCallback)



## Train
Bear in mind that ~50GB of GPU memory is required to run the cell below. Training was run on an A100 GPU with 80GB of memory.

In [None]:
# Run training and keep the result object; its `.metrics` are saved in a later cell.
train_result = trainer.train() 

# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

***** Running training *****
  Num examples = 140,252
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 13,149
  Number of trainable parameters = 80,527,360
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mjalkestrup[0m ([33mjealk[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss


## Save model

In [None]:
# Save the trained model under a name derived from the configured checkpoint.
# Use the parsed `model_args` instance: the `ModelArguments` class itself does
# not carry the value loaded from the JSON config (at best its dataclass default).
model.save_pretrained(model_args.model_name_or_path + "_mntp_trained")

In [None]:
trainer.save_model()  # Saves the tokenizer too for easy upload
metrics = train_result.metrics

# Record how many training samples were used: the configured cap if one is
# set, but never more than the dataset actually contains.
if data_args.max_train_samples is not None:
    max_train_samples = data_args.max_train_samples
else:
    max_train_samples = len(train_dataset)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))

# Log and persist the training metrics, then save the trainer state.
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

ModelArguments(model_name_or_path='AI-Sweden-Models/Llama-3-8B-instruct', model_type=None, config_overrides=None, config_name=None, tokenizer_name=None, cache_dir=None, use_fast_tokenizer=True, model_revision='main', token=None, use_auth_token=None, trust_remote_code=False, torch_dtype='bfloat16', attn_implementation='flash_attention_2', low_cpu_mem_usage=False)