# Fine Tune the Minimal-Edit LLM

## Imports

Import all relevant packages

In [1]:
from prompts import minimal_prompt as prompt
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
    Trainer,
    DataCollatorForSeq2Seq,
)
from datasets import load_dataset, load_from_disk
import torch
import bitsandbytes as bnb
from os import path
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model
)



## Model Setup

Load the model and tokenizer, then move the model onto the GPU.

Throw an error message if GPU is not accessible.

In [2]:
base_model_name = "LumiOpen/Viking-7B"
device = "cuda"
if not torch.cuda.is_available():
    raise RuntimeError("GPU is not available for training!")

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name, quantization_config=nf4_config
)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
base_dataset_dir = "datasets"
minimal_dataset_path = path.join(base_dataset_dir, "minimal", "")
minimal_dataset = load_from_disk(minimal_dataset_path)
print(minimal_dataset)

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 402
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 50
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 50
    })
})


## Process Input and Output

Load the input and output into a dictionary with the following structure:

- "input": [PROMPT] + input_text
- "output": output_text


In [4]:
def preprocess_function(examples):
    inputs = [prompt + example for example in examples["source"]]
    targets = [example for example in examples["target"]]
    return tokenizer(inputs, text_target=targets, max_length=4096, padding="max_length")

In [5]:
tokenized_minimal_dataset = minimal_dataset.map(preprocess_function, batched=True)

In [6]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    # target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

training_arguments = TrainingArguments(
    output_dir="tmp_model",
    num_train_epochs=1,
    optim="adamw_bnb_8bit",
    learning_rate=5e-5,
    bf16=True,
    logging_steps=1,
    per_device_train_batch_size=4,
)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_minimal_dataset["train"],
    eval_dataset=tokenized_minimal_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)
model.config.use_cache = False

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [7]:
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
1,6.3364
2,6.4867
3,4.7889
4,3.6123
5,2.4576
6,3.0646
7,0.9601
8,2.1108
9,1.1551
10,1.2586


TrainOutput(global_step=101, training_loss=0.9117143683799422, metrics={'train_runtime': 382.7579, 'train_samples_per_second': 1.05, 'train_steps_per_second': 0.264, 'total_flos': 6.932814413325926e+16, 'train_loss': 0.9117143683799422, 'epoch': 1.0})