# Fine tuning LLMs on a budget

In [18]:
!pip install pandas datasets openai pydantic transformers trl huggingface_hub peft



In [19]:
from peft import LoraConfig, EvaConfig
from huggingface_hub import login
from trl import SFTConfig, SFTTrainer, setup_chat_format
from transformers import AutoModelForCausalLM, AutoTokenizer, EarlyStoppingCallback

import datasets
import torch

## Utils

Functions that will work on our dataset, determine available HW and demonstrate change in the model.

We don't need to pay them much attention.

In [20]:
# Determine available HW
device = (
      "cuda"
      if torch.cuda.is_available()
      else "mps"
      if torch.backends.mps.is_available()
      else "cpu"
  )
device

'cuda'

In [21]:
MODEL_DEMO_TEMPLATE = """
===================================
Query:\n{question_content}\n
Expected response:\n{answer_content}\n
Actual response:\n{decoded_response}\n
"""

def demo_model(
    model,
    tokenizer,
    samples: list[dict],
    device: str,
) -> None:
    """Demonstrate model behavior. Each sample is a list of message dictionaries,
    alternating conversation roles.
    """

    for sample in samples:
        messages = sample["messages"]
        question = messages[0]
        answer = messages[1]
        formatted_question = tokenizer.apply_chat_template([question], tokenize=False)
        tokenized_message = tokenizer(formatted_question, return_tensors="pt").to(
            device
        )

        question_len = tokenized_message["input_ids"].shape[1]

        # Reponse should not be much longer than what we expect,
        # if it is something has gone wrong.
        # We allow for pessimistic assumption that 1 char -> 1 token.
        max_response_len = len(answer["content"]) * 2

        outputs = model.generate(
            **tokenized_message,
            max_new_tokens=max_response_len,
        )

        # Remove tokens corresponding to question message
        decoded_response = tokenizer.decode(
            outputs[0][question_len:], skip_special_tokens=True
        )

        print(
            MODEL_DEMO_TEMPLATE.format(
                question_content=question["content"],
                decoded_response=decoded_response,
                answer_content=answer["content"],
            )
        )


In [22]:
def create_dataset_split(
    dataset: datasets.Dataset, n_demo_samples: int = 2
) -> tuple[datasets.Dataset, datasets.Dataset, list[dict]]:
    """Create test/train/demo split for the dataset."""

    list_dataset = dataset.to_list()
    dataset_split = len(list_dataset) // 5

    demo, list_dataset = list_dataset[:n_demo_samples], list_dataset[n_demo_samples:]

    train, test = list_dataset[dataset_split:], list_dataset[:dataset_split]
    train = datasets.Dataset.from_list(train)
    test = datasets.Dataset.from_list(test)

    return train, test, demo

## Login to Hugging Face

Some models and datasets require login to Hugging Face.
If you do have a Hugging Face account, enter your token here.

In [23]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Model and tokenizer

We'll use small 135 million parameter model from Hugging Face.
The tokenizer already has a chat template, but in cases when it is absent, we can initialize a default.


In [24]:
base_model = "HuggingFaceTB/SmolLM2-135M-Instruct"
model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=base_model
    ).to(device)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=base_model)

# Replace chat template with a default.
# This will break your model, if applied without care!
if (not hasattr(tokenizer, "chat_template")
    or tokenizer.chat_template is None
):
    model, tokenizer = setup_chat_format(model, tokenizer)


## Dataset

For dataset we will use https://huggingface.co/datasets/fedora-copr/packaging-qna which was generated using the routine in our script using Fedora packaging guidelines as a source.

Not available features. We'll be using `messages`, as they are already preformatted to work with trainer.

When working wiht other datasets, this step has to be implemented separately. But it is simple enough transformation.

In [25]:
dataset = datasets.load_dataset("fedora-copr/packaging-qna")
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'question_topic', 'source', 'document_topic', 'messages'],
        num_rows: 81508
    })
})


Since the dataset has over 80 thousand elements, it would take too long to train on all of it. We'll instead use just 2000 elements. Our model won't be able to learn as much. But it's not going to take as much time.

In [26]:
dataset = datasets.Dataset.from_dict(dataset["train"][:2000])
dataset

Dataset({
    features: ['question', 'answer', 'question_topic', 'source', 'document_topic', 'messages'],
    num_rows: 2000
})

Splitting dataset into training and evaluation subset is a necessity. Otherwise we wouldn't be able to estimate how well will our model perform with data it hasn't seen yet.

We'll also choose couple of samples for demonstrating difference between model behavior before, and after training.

In [27]:
train, eval, demo_samples = create_dataset_split(dataset)

Each sample is a short conversation between model, in an `assistant` role, and `user`. In chatbot applications, these conversations can last much longer.

For our purposes however, a single turn is sufficient.

In [28]:
demo_samples

[{'question': 'Where can large data files from the package be externally placed?',
  'answer': 'If the package contains excessively large data files, they may be placed in a separate `-data` subpackage, as per normal Fedora guidelines.',
  'question_topic': 'Packaging libraries',
  'source': './packaging-committee/guidelines/modules/ROOT/pages/OCaml.adoc',
  'document_topic': 'OCaml Packaging Guidelines',
  'messages': [{'content': 'Where can large data files from the package be externally placed?',
    'role': 'user'},
   {'content': 'If the package contains excessively large data files, they may be placed in a separate `-data` subpackage, as per normal Fedora guidelines.',
    'role': 'assistant'}]},
 {'question': 'What types of conflicts are absolutely unacceptable and under what conditions?',
  'answer': 'Keep in mind that implicit conflicts are NEVER acceptable.',
  'question_topic': 'Implicit Conflicts',
  'source': './packaging-committee/guidelines/modules/ROOT/pages/Conflicts.a

Calling the `demo_model` function, we'll submit a small sample of questions to LLM before fine tuning and compare the output with responses in our dataset.


In [29]:
demo_model(model, tokenizer, demo_samples, device=device)


Query:
Where can large data files from the package be externally placed?

Expected response:
If the package contains excessively large data files, they may be placed in a separate `-data` subpackage, as per normal Fedora guidelines.

Actual response:
assistant
SmolLM can be externally placed in various ways, depending on the specific requirements of your project. Here are some common methods:

1. **Externalized Files**: You can upload your large data files to a cloud storage service like Google Drive, Dropbox, or OneDrive. These services allow you to upload your files to a centralized location, making it easy to access and share them with others.

2. **Cloud Storage Services**: You can also use cloud storage services like Google Drive, Dropbox, or OneDrive to externalize your large data files. These services provide a centralized location for storing and sharing your files, making it easy to access and share them.

3. **Data Repositories**: You can also use data repositories like GitH

## Fine tuning

Instead of changing weights of the entire model, we will save up on memory, compute and time by using LoRa. This way we can train a much smaller model, and modify original models outputs with it.

Eventually, we'll merge this model with the base, creating fine tuned version of the base model for our purposes.

In [30]:
max_steps = 1000
output_dir = "./sft_output"
finetune_name = "FineTunedModel"
rank = 32


LoRa is now a well developed methodology, with many options and augmentations.

We'll use some of the more common ones. There are however many others that can be employed to great effect.

This code uses DoRa, RSLoRa and EVA weight initialization.

DoRA (Weight-Decomposed Low-Rank Adaptation) uses decomposition of base weights into magnitude and direction, and
uses LoRa to update their directions. This approach has been show to consistently outperform base LoRa.

https://huggingface.co/papers/2402.09353 	arXiv:2402.09353


RSLoRA (rank-stabilized LoRA) modifies the scaling factor of LoRa weights to square root of the rank. This is especially useful as rank of adapters increases.

https://huggingface.co/papers/2312.03732 arXiv:2312.03732


EVA (Explained Variance Adaptation) initializes weights of LoRa adapters based on training data, using singular value decomposition of activation vectors. This saves considerable time, as weights start in a state closer to our goal.

https://huggingface.co/papers/2410.07170 arXiv:2410.07170


To make further savings, we'll only train weights of linear modules from the base model. This cuts down the number of parameters in need of adjustment to small fraction of the original.

In [31]:
lora_config = LoraConfig(
      r=rank,
      target_modules="all-linear",  # Fine tune only linear modules
      lora_alpha=rank,  # Set to rank for rslora
      bias="none",
      use_rslora=True,
      init_lora_weights="eva",  # Initialize weights base on data arxiv:2410.07170
      eva_config=EvaConfig(),
      use_dora=True,
  )

In [32]:
sft_config = SFTConfig(
    output_dir=output_dir,
    max_steps=max_steps,  # Adjust based on dataset size and desired training duration
    per_device_train_batch_size=4,  # Set according to your device memory capacity
    learning_rate=1e-5,  # Common starting point for fine-tuning
    logging_steps=10,  # Frequency of logging training metrics
    save_steps=100,  # Frequency of saving model checkpoints
    eval_strategy="steps",  # Evaluate the model at regular intervals
    eval_steps=100,  # Frequency of evaluation
    use_mps_device=(device == "mps"),  # Use MPS for mixed precision training
    hub_model_id=finetune_name,  # Set a unique name for your model,
    metric_for_best_model="eval_loss",  # How we determine our improvement (used by callback)
    load_best_model_at_end=True,  # Return the last best checkpoint
    bf16=(device == "cuda"),  # Use bf16 only if you have CUDA device
    report_to="none",
)

In [33]:
# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train,
    processing_class=tokenizer,
    eval_dataset=eval,
    callbacks=[
        EarlyStoppingCallback(early_stopping_threshold=0.01)
    ],  # Stop if we don't improve by at least X
    peft_config=lora_config,
)

# Train the model
trainer.train()

# Save the model
trainer.save_model(f"./{finetune_name}")



Tokenizing train dataset:   0%|          | 0/1598 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1598 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
100,2.1695,2.167062,2.217872,30690.0,0.632381
200,1.9268,2.014471,2.046737,62778.0,0.641535
300,1.9896,1.946958,1.956198,94773.0,0.648312
400,1.7961,1.908914,1.920975,126381.0,0.653616
500,1.7624,1.880907,1.889728,157601.0,0.657177
600,1.8415,1.864224,1.864713,189247.0,0.660754
700,1.8379,1.852224,1.837811,221024.0,0.662814
800,1.8857,1.843835,1.844848,252762.0,0.664634


In [34]:
demo_model(model, tokenizer, demo_samples, device=device)


Query:
Where can large data files from the package be externally placed?

Expected response:
If the package contains excessively large data files, they may be placed in a separate `-data` subpackage, as per normal Fedora guidelines.

Actual response:
assistant
Large data files from the package can be externally placed in the `+/usr/lib/python2.7/dist-packages/+` directory.



Query:
What types of conflicts are absolutely unacceptable and under what conditions?

Expected response:
Keep in mind that implicit conflicts are NEVER acceptable.

Actual response:
assistant
*If a package is not compiled with the `++` compiler flags,
*it must be compiled with the `++` compiler flags.


