This notebook fine-tunes Mistral 7B with LoRA.

First, we need all these dependencies:

Install all the necessary packages.

In [31]:
!pip install --upgrade -q -U git+https://github.com/huggingface/transformers
!pip install --upgrade -q -U git+https://github.com/huggingface/peft
!pip install --upgrade -q -U git+https://github.com/huggingface/accelerate
!pip install --upgrade -q -U datasets
!pip install --upgrade -q -U trl
!pip install --upgrade -q -U bitsandbytes
!pip install --upgrade -q -U accelerate 

In [1]:
# Show which PyTorch build the kernel is using.
import torch
print(torch.__version__)

2.2.2+cu121


In [33]:
!pip uninstall torch -y
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu115

In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig
)
from trl import SFTTrainer
from huggingface_hub import login


  from .autonotebook import tqdm as notebook_tqdm


In [35]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from inside the notebook.
notebook_login()

Define helpers that load the 4-bit quantized model and initialize the tokenizer with padding configured

In [36]:
def load_quantized_model(model_name: str):
    """
    Load a causal language model with 4-bit quantized weights.

    :param model_name: Name or path of the model to be loaded.
    :return: Loaded quantized model.
    """
    # 4-bit NF4 quantization with double quantization; compute dtype is float16.
    quantization_settings = dict(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
    )

    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(**quantization_settings),
        device_map="auto",
    )

def initialize_tokenizer(model_name: str):
    """
    Create the tokenizer for ``model_name`` and reuse its EOS token for padding.

    The tokenizer is requested with right-side padding, BOS/EOS insertion
    (``add_bos_token`` / ``add_eos_token``) and ``use_fast=False``.

    :param model_name: Name or path of the model for tokenizer initialization.
    :return: Initialized tokenizer.
    """
    tokenizer_options = {
        "padding_side": "right",
        "add_eos_token": True,
        "add_bos_token": True,
        "use_fast": False,
    }
    tok = AutoTokenizer.from_pretrained(model_name, **tokenizer_options)

    # Padding reuses the EOS token.
    # NOTE(review): if the training collator masks pad tokens in the labels, the
    # appended EOS would be masked as well — confirm this is intended.
    tok.pad_token = tok.eos_token
    return tok

In [37]:
# Base checkpoint used for both the quantized model and its tokenizer.
model_name = "mistralai/Mistral-7B-v0.1"
model = load_quantized_model(model_name)

# Tokenizer (padding configured inside initialize_tokenizer)
tokenizer = initialize_tokenizer(model_name)

Loading checkpoint shards: 100%|██████████████████████████████████████| 2/2 [00:04<00:00,  2.36s/it]


In [38]:
# Load the training and evaluation sentence pairs from local JSON files.
# split='train' is requested for the test file as well — presumably because a
# single data file is exposed under the default 'train' split; confirm against
# the datasets documentation.
train_dataset = load_dataset('json', data_files='combined_train.json', split='train')
eval_dataset = load_dataset('json', data_files='combined_test.json', split='train')

In [39]:
def create_prompt(source_lang, target_lang, dataset, one_shot=True,
                  source_key="hutsul", target_key="ukrainian"):
    """
    Build one two-line prompt string per sentence pair.

    Each prompt has the form "<source_lang>: <source sentence>", a newline,
    then "<target_lang>: <target sentence>".

    :param source_lang: Label written in front of every source sentence.
    :param target_lang: Label written in front of every target sentence.
    :param dataset: Iterable of mapping-like records, one sentence pair each.
    :param one_shot: When False, no prompts are built and an empty list is returned.
    :param source_key: Record key holding the source sentence.
    :param target_key: Record key holding the target sentence.
    :return: List of prompt strings in dataset order.
    """
    if not one_shot:
        return []

    # Plain concatenation keeps the original failure mode: a non-string
    # sentence (e.g. None) raises instead of being silently formatted.
    return [
        source_lang + ": " + example[source_key]
        + "\n"
        + target_lang + ": " + example[target_key]
        for example in dataset
    ]

In [40]:
# Turn both raw splits into "Hutsul: ... / Ukrainian: ..." prompt strings.
train_prompts = create_prompt("Hutsul", "Ukrainian", train_dataset)
eval_prompts = create_prompt("Hutsul", "Ukrainian", eval_dataset)

In [41]:
# Inspect the first training prompt to check the formatting.
train_prompts[0]

'Hutsul: Всий пропій був на конех, буйних, золотогривих та груднистих.\nUkrainian: Весь пропій був на конях — буйних, золотогривих і грудастих.'

In [42]:
from datasets import Dataset, DatasetDict

# Wrap each list of prompt strings in a one-column ("text") Dataset and group
# the two results under the "train" and "validation" split names.
dataset = DatasetDict({
    split_name: Dataset.from_dict({"text": prompts})
    for split_name, prompts in (("train", train_prompts), ("validation", eval_prompts))
})

In [43]:
# Display the splits, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 13073
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3269
    })
})

In [44]:
from pprint import pprint

# Spot-check a single training record by index.
dataset['train'][2444]

{'text': 'Hutsul: Уводно визерав.\nUkrainian: Завжди визирав.'}

In [45]:
def get_max_token_length(dataset, tokenizer):
    """
    Return the largest number of token ids produced for any record's "text" entry.

    :param dataset: Iterable of records exposing a "text" entry.
    :param tokenizer: Callable mapping a string to a mapping with an "input_ids" sequence.
    :return: Maximum token count over the dataset, or 0 when the dataset is empty.
    """
    # The prompts are ordinary inputs, so the tokenizer is called directly; the
    # deprecated `as_target_tokenizer()` context is not required for this.
    return max(
        (len(tokenizer(example["text"])["input_ids"]) for example in dataset),
        default=0,
    )

In [46]:
# Longest tokenized prompt in each split; displayed as a (train, validation) pair.
max_token_length = get_max_token_length(dataset["train"], tokenizer)
max_token_length_val = get_max_token_length(dataset["validation"], tokenizer)
max_token_length, max_token_length_val

(919, 525)

In [47]:
#!pip install -q wandb -U
import wandb, os
# Authenticate with Weights & Biases before any run is started.
wandb.login()

True

## Set up LoRA

In [48]:
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing, then let PEFT prepare the quantized model for
# k-bit training. `model` is rebound to the prepared model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [49]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Prints the trainable element count, the total element count and the
    trainable share as a percentage. Returns nothing.
    """
    # One pass over the parameters: (element count, requires_grad) per tensor.
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [50]:
# Dump the module tree of the prepared model.
# NOTE(review): the recorded output already lists lora.Linear4bit layers, so it
# was presumably captured after adapters had been injected in an earlier pass —
# re-run from a fresh kernel to confirm.
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-12): 13 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 4096 (cuda:0)])
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(

In [51]:
# Adapter configuration: rank 64, alpha 32, 5% dropout, no bias terms, applied to
# the attention projections (q/k/v/o) and the MLP projections (gate/up/down).
peft_config = LoraConfig(
        lora_alpha=32,
        lora_dropout=0.05,
        r=64,
        bias="none",
        use_dora=True, # DoRA is enabled here; set to False (or remove) for plain LoRA
        task_type="CAUSAL_LM",
        target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
)

# Wrap the model with the adapters and report how much of it is trainable.
# `model` is rebound, so re-running this cell wraps an already wrapped model.
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

trainable params: 169148416 || all params: 3921219584 || trainable%: 4.31366855072812


In [52]:
# Dump the module tree again to see the adapter layers inside the PEFT wrapper.
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-12): 13 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ParameterDict(  (default): Parameter containi

In [53]:
# Mark the model as model-parallel when several GPUs are visible.
# NOTE(review): presumably this keeps the Trainer from wrapping the model in
# DataParallel when device_map="auto" spreads it across GPUs — confirm.
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

In [54]:
from datetime import datetime

# Run name is "<base model>-<project>"; checkpoints go to a directory of the
# same name under the current working directory.
project = "hutsul-finetune-combined"
base_model_name = "mistral"
run_name = f"{base_model_name}-{project}"
output_dir = f"./{run_name}"

In [55]:
import warnings
# Suppress all Python warnings for the rest of the session (this also hides
# deprecation notices from the libraries used below).
warnings.filterwarnings('ignore')

This first training run fine-tunes the LoRA adapters (with DoRA enabled via `use_dora=True`) for 3 epochs.

In [56]:
# Upper bound on tokens per example; above the longest tokenized prompt measured earlier.
max_seq_length = 1024

# Effective batch size per device is 8 * 6 = 48 (batch size x gradient accumulation).
# Evaluation runs once per epoch; checkpoints are written every 100 steps.
training_arguments = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        do_eval=True,
        optim="adamw_torch",
        per_device_train_batch_size=8,
        gradient_checkpointing=True,
        gradient_accumulation_steps=6,
        per_device_eval_batch_size=8,
        logging_steps=25,
        logging_dir="./logs",
        learning_rate=2e-5,
        weight_decay=0.001,
        # eval_steps=150, # comment it if we want to evaluate at the end of each epoch
        save_strategy='steps',
        num_train_epochs=3,
        # Linear warmup over the first 3% of training steps. `warmup_steps` takes an
        # absolute step count, so a fraction has to be passed as `warmup_ratio`.
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        report_to="wandb",
        save_steps=100,
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
)

# Supervised fine-tuning on the "text" column, one example per sequence (no packing).
trainer = SFTTrainer(
        model=model,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        packing=False,
        args=training_arguments,
)

Map: 100%|███████████████████████████████████████████| 13073/13073 [00:01<00:00, 8052.22 examples/s]
Map: 100%|█████████████████████████████████████████████| 3269/3269 [00:00<00:00, 8085.37 examples/s]


In [58]:
# Continue training from the step-800 checkpoint in the run's output directory
# rather than starting from step 0.
trainer.train(resume_from_checkpoint="mistral-hutsul-finetune-combined/checkpoint-800")

Epoch,Training Loss,Validation Loss
2,1.3234,1.452098


TrainOutput(global_step=816, training_loss=0.026099607056262446, metrics={'train_runtime': 2459.9612, 'train_samples_per_second': 15.943, 'train_steps_per_second': 0.332, 'total_flos': 3.010260368545628e+17, 'train_loss': 0.026099607056262446, 'epoch': 3.0})

In [59]:
# Close the Weights & Biases run.
wandb.finish()

# Save the trained model through the trainer (no explicit path is given).
trainer.save_model()

0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/loss,1.4521
eval/runtime,1467.3954
eval/samples_per_second,2.228
eval/steps_per_second,0.279
total_flos,3.010260368545628e+17
train/epoch,3.0
train/global_step,816.0
train_loss,0.0261
train_runtime,2459.9612
train_samples_per_second,15.943
