## Fine Tuning an LLM using mini-platypus Dataset

In [46]:
# Install required packages
!pip install -q accelerate peft bitsandbytes transformers trl datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [47]:
# Imports
import os
import torch
import warnings
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    TrainingArguments, 
    pipeline
)
from peft import LoraConfig
from trl import SFTTrainer
warnings.filterwarnings('ignore')
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache()

In [48]:
# Using a small model suitable for Kaggle's Free GPU
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
dataset_name = "mlabonne/mini-platypus"
new_model = "tinyllama-mini-platypus"

In [49]:
# LoRA and quantization settings
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

In [50]:
# Training configuration
output_dir = "./results"
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "constant"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 25
logging_steps = 25

In [51]:
# Load dataset
dataset_name = "mlabonne/mini-platypus"
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.map(lambda x: {"text": x["instruction"][:256]})  
dataset = dataset.select(range(10)) 

In [52]:
# Setup quantization config
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
   load_in_4bit=use_4bit,
   bnb_4bit_quant_type=bnb_4bit_quant_type,
   bnb_4bit_compute_dtype=compute_dtype,
   bnb_4bit_use_double_quant=use_nested_quant,
)

In [53]:
# GPU check
if compute_dtype == torch.float16 and use_4bit:
   major, _ = torch.cuda.get_device_capability()
   if major >= 8:
       print("=" * 80)
       print("Your GPU supports bfloat16: accelerate training with bf16=True")
       print("=" * 80)

In [54]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
   model_name,
   quantization_config=bnb_config,
   
   device_map={"": 0},
   
)
model.config.use_cache = False
model.config.pretraining_tp = 1


In [55]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


In [56]:
# PEFT config
peft_config = LoraConfig(
   lora_alpha=lora_alpha,
   lora_dropout=lora_dropout,
   r=lora_r,
   bias="none",
   task_type="CAUSAL_LM",
)

In [57]:
# Training arguments
training_arguments = TrainingArguments(
   output_dir=output_dir,
   num_train_epochs=num_train_epochs,
   per_device_train_batch_size=per_device_train_batch_size,
   gradient_accumulation_steps=gradient_accumulation_steps,
   optim=optim,
   save_steps=save_steps,
   logging_steps=logging_steps,
   learning_rate=learning_rate,
   weight_decay=weight_decay,
   fp16=fp16,
   bf16=bf16,
   max_grad_norm=max_grad_norm,
   max_steps=max_steps,
   warmup_ratio=warmup_ratio,
   group_by_length=group_by_length,
   lr_scheduler_type=lr_scheduler_type,
   report_to="none"
)

In [58]:
# Create trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    args=training_arguments,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [59]:
# Before training
prompt = "What is a large language model?"
instruction = f"### Instruction:\n{prompt}\n\n### Response:\n"

pretrain_pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=128
)

pretrain_result = pretrain_pipe(instruction)
print("Before fine-tuning:", pretrain_result[0]['generated_text'][len(instruction):])

Device set to use cuda:0


Before fine-tuning: A large language model is a neural network that has been trained on a large corpus of text data. It is capable of generating human-like text based on the training data.


In [60]:
# Train model
trainer.train()

Step,Training Loss


TrainOutput(global_step=10, training_loss=2.270804595947266, metrics={'train_runtime': 2.5267, 'train_samples_per_second': 3.958, 'train_steps_per_second': 3.958, 'total_flos': 5578677080064.0, 'train_loss': 2.270804595947266})

In [61]:
# Save model
trainer.model.save_pretrained(new_model)

In [62]:
# After training
prompt = "What is a large language model?"
instruction = f"### Instruction:\n{prompt}\n\n### Response:\n"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=128)
result = pipe(instruction)
print(result[0]['generated_text'][len(instruction):])

Device set to use cuda:0


A large language model is a type of artificial intelligence model used for natural language processing (NLP) in speech recognition, text classification, and language understanding. It is a complex computer model that has been trained using billions or trillions of examples of natural language data. Large language models are capable of processing large amounts of data and making accurate predictions in real-time. They are essential for advancing NLP technologies and enabling machines to understand natural language without the need for manually labelled training data.


In [None]:
from IPython.display import display, Markdown
import pandas as pd

def clean(text, max_len=300):
    return text.replace("|", "¦").replace("\n", " ").strip()[:max_len] + "..."

prompts = [
    "What is a large language model?",
    "Explain what machine learning is.",
    "What's the capital of Germany? What is there to see in the capital of Germany?",
    "How do airplanes fly?",
    "Explain photosynthesis in a simple way",
    "Describe a sustainable city of the future, including transport, energy, and social systems.",
    "Generate a short dialog between a doctor and a patient concerned about climate change."
]

results = []
for prompt in prompts:
    instruction = f"### Instruction:\n{prompt}\n\n### Response:\n"
    before = pretrain_pipe(instruction, max_length=10000)[0]['generated_text'][len(instruction):].strip()
    after = pipe(instruction, max_length=10000)[0]['generated_text'][len(instruction):].strip()
    results.append({
        "Prompt": prompt,
        "Before Fine-Tuning": clean(before),
        "After Fine-Tuning": clean(after)
    })

df = pd.DataFrame(results)


In [None]:
# Display the comparison table
def make_comparison_view(df_subset):
    md = "## Prompt vs Output Comparison\n\n\n"
    for row in df_subset.itertuples():
        md += f"<details>\n<summary><strong>{row.Prompt}</strong></summary>\n\n"
        md += f"**Before Fine-Tuning:**\n\n```\n{row._2.strip()}\n```\n\n"
        md += f"**After Fine-Tuning:**\n\n```\n{row._3.strip()}\n```\n"
        md += "</details>\n\n"
    return md

display(Markdown(make_comparison_view(df)))