# Fine-Tuning Mistral

In [None]:
!pip install -q -U transformers bitsandbytes peft datasets accelerate trl

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/168.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m92.2/168.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from peft import get_peft_model
from transformers import TrainingArguments
from peft import prepare_model_for_kbit_training
from trl import SFTTrainer
from copy import deepcopy



# Loading the Model

In [None]:
def load_checkpoint(path: str):
  """Load a causal language model in 4-bit NF4 precision with its tokenizer.

  Args:
    path: Model identifier or checkpoint directory; passed unchanged to both
      `from_pretrained` calls.

  Returns:
    A `(model, tokenizer)` tuple.
  """
  # 4-bit NF4 weights with bfloat16 compute, intended to make the model fit
  # into the GPU's memory.
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_use_double_quant=False,
      bnb_4bit_compute_dtype=torch.bfloat16
  )

  # Quantization is requested only through `quantization_config`. Passing
  # `load_in_4bit=True` here as well is redundant, and recent transformers
  # releases reject the combination with a ValueError.
  model = AutoModelForCausalLM.from_pretrained(
    path,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
  )

  tokenizer = AutoTokenizer.from_pretrained(path)

  return model, tokenizer



# Identifier of the base checkpoint that gets fine-tuned.
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
# `model` and `tokenizer` are module-level names reused by the later cells.
model, tokenizer = load_checkpoint(base_model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Loading the Dataset

In [None]:
from datasets import load_dataset

dataset_name = "databricks/databricks-dolly-15k"

# Both subsets are sliced out of the dataset's "train" split:
# rows 0-799 are used for training and rows 800-999 for evaluation.
train_dataset_raw = load_dataset(dataset_name, split="train[0:800]")
eval_dataset_raw = load_dataset(dataset_name, split="train[800:1000]")

# Dataset Preprocessing

Inspect the raw samples, then convert each one into the chat prompt format used for fine-tuning.

In [None]:
train_dataset_raw

Dataset({
    features: ['instruction', 'context', 'response', 'category'],
    num_rows: 800
})

In [None]:
train_dataset_raw.to_pandas()

Unnamed: 0,instruction,context,response,category
0,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,closed_qa
1,Which is a species of fish? Tope or Rope,,Tope,classification
2,Why can camels survive for long without water?,,Camels use the fat in their humps to keep them...,open_qa
3,"Alice's parents have three daughters: Amy, Jes...",,The name of the third daughter is Alice,open_qa
4,When was Tomoaki Komorida born?,Komorida was born in Kumamoto Prefecture on Ju...,"Tomoaki Komorida was born on July 10,1981.",closed_qa
...,...,...,...,...
795,Who is the founder of the Communist Party?,,Lenin,open_qa
796,What is gardening?,Gardening is the practice of growing and culti...,Gardening is laying out and caring for a plot ...,information_extraction
797,What are your thoughts of Michael Jackson as a...,,Michael Jackson is acclaimed as the greatest p...,creative_writing
798,What is the largest pollutant?,,Carbon dioxide (CO2) - a greenhouse gas emitte...,general_qa


In [None]:
train_dataset_raw.to_pandas().dtypes

instruction    object
context        object
response       object
category       object
dtype: object

In [None]:
train_dataset_raw.to_pandas().value_counts("category")

category
open_qa                   202
general_qa                132
classification            111
brainstorming              95
closed_qa                  90
information_extraction     68
summarization              63
creative_writing           39
dtype: int64

In [None]:
def generate_prompt(sample, tokenizer):
  """Render one dataset sample as a single chat-formatted training string.

  Args:
    sample: Mapping with the keys "instruction", "context" and "response".
    tokenizer: Object providing `apply_chat_template(messages, tokenize=False)`.

  Returns:
    A dict `{"text": <rendered conversation>}`.
  """
  # A non-empty context is placed in front of the instruction, separated by one space.
  context = sample["context"]
  context_prefix = context + " " if context else ""

  user_turn = {"role": "user", "content": context_prefix + sample["instruction"]}
  assistant_turn = {"role": "assistant", "content": sample["response"]}

  # The chat template wraps each turn with the special tokens for its role
  # (user or assistant); `tokenize=False` returns the result as plain text.
  rendered = tokenizer.apply_chat_template([user_turn, assistant_turn], tokenize=False)
  return {"text": rendered}

In [None]:
# Render every raw sample into a single "text" field and drop the original
# columns so that only the formatted prompt remains.
generated_train_dataset = train_dataset_raw.map(
    lambda x: generate_prompt(x, tokenizer), remove_columns=list(train_dataset_raw.features))
# The evaluation split removes its own columns rather than borrowing the
# training split's schema.
generated_val_dataset = eval_dataset_raw.map(
    lambda x: generate_prompt(x, tokenizer), remove_columns=list(eval_dataset_raw.features))

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
generated_train_dataset

Dataset({
    features: ['text'],
    num_rows: 800
})

# LoRA Configuration

In [None]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of `model`.

    Args:
        model: Object exposing `named_parameters()`, whose items provide
            `numel()` and `requires_grad` (for example a `torch.nn.Module`).

    Prints a single line with the trainable count, the total count and the
    trainable percentage. A model without parameters reports 0.0 percent
    instead of raising `ZeroDivisionError`.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        # bitsandbytes stores two 4-bit weights per byte of storage, so
        # `numel()` of a `Params4bit` tensor under-reports the real weight
        # count; scale it back up (same correction PEFT applies).
        if type(param).__name__ == "Params4bit":
            bytes_per_element = param.element_size() if hasattr(param, "element_size") else 1
            count *= 2 * bytes_per_element
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the division for a model that has no parameters at all.
    percentage = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )

In [None]:
def convert_to_lora_pft_mut(model):
    """Prepare `model` for k-bit training, apply LoRA to it and return the config.

    The function works through its side effects on `model` (hence the `_mut`
    suffix): the wrapper returned by `get_peft_model` is not handed back, and
    callers keep using the object they passed in.

    Args:
        model: The quantized causal LM to adapt.

    Returns:
        The `LoraConfig` that was applied.
    """
    model.gradient_checkpointing_enable()
    prepared_model = prepare_model_for_kbit_training(model)

    # Attention projections, MLP projections and the output head.
    adapted_layers = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "lm_head",
    ]
    lora_config = LoraConfig(
        task_type="CAUSAL_LM",
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        target_modules=adapted_layers,
    )

    # Called for its effect on the model only; the return value is discarded.
    get_peft_model(prepared_model, lora_config)
    return lora_config

# Only the LoRA config is returned; `model` is the object that was adapted.
lora_config = convert_to_lora_pft_mut(model)
print_trainable_parameters(model)

trainable params: 21260288 || all params: 3773331456 || trainable%: 0.5634354746703705


# Model Training

In [None]:
def define_trainer(model, tokenizer, lora_config, train_dataset, eval_dataset):
  """Build an `SFTTrainer` for supervised fine-tuning on the "text" column.

  Args:
    model: Causal LM to fine-tune; its `config.use_cache` is set to False.
    tokenizer: Tokenizer for `model`; its pad token is set to its EOS token.
    lora_config: PEFT configuration forwarded to the trainer as `peft_config`.
    train_dataset: Dataset with a "text" column used for training.
    eval_dataset: Dataset with a "text" column used for evaluation.

  Returns:
    The configured `SFTTrainer`; call `.train()` on it to start training.
  """
  # Configure model and tokenizer before the trainer is constructed, so the
  # trainer never observes a tokenizer without a pad token.
  model.config.use_cache = False  # necessary for training
  tokenizer.pad_token = tokenizer.eos_token

  training_arguments = TrainingArguments(
      output_dir="./results",
      num_train_epochs=1,
      per_device_train_batch_size=4,
      gradient_accumulation_steps=1,
      optim="paged_adamw_32bit",
      save_strategy="steps",
      save_steps=25,
      logging_steps=25,
      learning_rate=2e-4,
      weight_decay=0.001,
      max_steps=250,  # a positive max_steps overrides num_train_epochs
      evaluation_strategy="steps",
      eval_steps=25,
      do_eval=True,
      report_to="none",
  )

  # SFTTrainer performs supervised fine-tuning (SFT), not reinforcement
  # learning: the model is trained directly on the formatted text.
  # https://pypi.org/project/trl/
  trainer = SFTTrainer(
      model=model,
      tokenizer=tokenizer,
      args=training_arguments,
      train_dataset=train_dataset,
      eval_dataset=eval_dataset,
      peft_config=lora_config,
      dataset_text_field="text", # SFTTrainer assumes instruction and response in the same string.
  )

  return trainer

# Build the trainer from the prompt-formatted train and validation datasets.
trainer = define_trainer(model, tokenizer, lora_config, generated_train_dataset, generated_val_dataset)



Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]



In [None]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()



Step,Training Loss,Validation Loss


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
25,1.5088,1.486149
50,1.4986,1.426235
75,1.5251,1.411534
100,1.4342,1.40745
125,1.381,1.404542
150,1.4161,1.399403
175,1.3888,1.397642
200,1.3531,1.395627
225,1.1745,1.406148
250,1.1085,1.409241




TrainOutput(global_step=250, training_loss=1.3788602905273437, metrics={'train_runtime': 5034.6364, 'train_samples_per_second': 0.199, 'train_steps_per_second': 0.05, 'total_flos': 1.8007699582058496e+16, 'train_loss': 1.3788602905273437, 'epoch': 1.25})

In [None]:
# Format the question with the same chat template that produced the training
# text, so the model sees the prompt layout it was fine-tuned on.
demo_messages = [{"role": "user", "content": "When did Virgin Australia start operating?"}]
demo_prompt = tokenizer.apply_chat_template(demo_messages, tokenize=False, add_generation_prompt=True)

# The chat template inserts the special tokens itself, so the tokenizer must
# not add its own (previously an EOS token ended up right after the question).
# The tensors are moved to the device the model lives on.
inputs = tokenizer(demo_prompt, return_tensors="pt", add_special_tokens=False).to(model.device)

model.eval()  # training leaves the model in train mode; turn dropout off for inference
outputs = model.generate(
    **inputs,
    max_new_tokens=64,  # the default length budget cut the answer off after a few tokens
    use_cache=True,  # the cache was disabled for training; re-enable it for generation
    pad_token_id=tokenizer.eos_token_id,
)

# Decode only the newly generated tokens (`padding_side` is not a decode option).
prompt_length = inputs["input_ids"].shape[-1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
text

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


'<s> When did Virgin Australia start operating?</s> 2000\n    None\n    ['

In [None]:
# Repository name to use when publishing the fine-tuned model.
my_finetuned_model = "mistral-7b-dolly"

# The upload is left disabled; uncomment the next line to push the trained model.
#trainer.model.push_to_hub(my_finetuned_model)