In [None]:
%%capture
%pip install accelerate peft bitsandbytes trl peft transformers==4.38.2

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
# Base checkpoint to fine-tune. Alternatives listed by the original author:
#   'NousResearch/Llama-2-7b-hf', 'NousResearch/Llama-2-7b-chat-hf'
model_name = 'NousResearch/Meta-Llama-3-8B'

# Instruction dataset (JSON file) used for fine-tuning.
dataset_name = "final_data.json"

# Name under which the fine-tuned model is saved.
new_model = 'your-new-model-name'

In [None]:
# Toggle for 4-bit quantization of the base model.
use_4bit = True

In [None]:
# QLoRA quantization settings used when the base model is loaded.
compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    # Load the base model weights in 4-bit precision.
    load_in_4bit=True,
    # Quantization type (fp4 or nf4).
    bnb_4bit_quant_type='nf4',
    # Nested (double) quantization is turned off.
    bnb_4bit_use_double_quant=False,
    # Dtype used for computation on top of the 4-bit weights.
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
# Check GPU compatibility with bfloat16.
# The check is informational only, so it is skipped when no CUDA device is
# available: torch.cuda.get_device_capability() raises in that case and would
# otherwise abort the cell before anything else runs.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    # This check treats compute capability major version >= 8 as bf16-capable.
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model.
# `bnb_config` always requests 4-bit loading, so the `use_4bit` switch is honored
# here: the quantization config is only passed when the flag is enabled. With the
# default `use_4bit = True` this loads the model exactly as before.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config if use_4bit else None,
    device_map='auto',
)

# Turn off the generation cache on the model config for training.
model.config.use_cache = False
# NOTE(review): presumably selects the standard (non tensor-parallel) projection path — confirm.
model.config.pretraining_tp = 1

# Tokenizer matching the base checkpoint.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to run — confirm it is required.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Right-side padding: per the original author, this fixes a weird overflow issue with fp16 training.
tokenizer.padding_side = "right"
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def generate_and_tokenize_prompt(prompt):
    """Tokenize the ``'text'`` entry of a single dataset record.

    Args:
        prompt: Mapping with a ``'text'`` key holding the string to tokenize.

    Returns:
        The result of calling the module-level ``tokenizer`` on that text.
    """
    text = prompt['text']
    encoded = tokenizer(text)
    return encoded

In [None]:
# Read the instruction dataset from the JSON file named by `dataset_name`.
data = load_dataset(path="json", data_files=dataset_name)

In [None]:
# Tokenize every record of the 'train' split with generate_and_tokenize_prompt.
dataset = data['train'].map(function=generate_and_tokenize_prompt)

In [None]:
# LoRA adapter configuration passed to the trainer.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Adapter rank and scaling factor.
    r=64,
    lora_alpha=64,
    # Dropout applied inside the LoRA layers.
    lora_dropout=0.1,
    # No bias parameters are trained.
    bias="none",
    # Attention projections that receive LoRA adapters.
    target_modules=["q_proj", "v_proj", "k_proj"],
    # Modules listed here are trained and saved in full alongside the adapters.
    modules_to_save=["embed_tokens", "lm_head"],
)

In [None]:
# Set training parameters.
# bf16 mixed precision is enabled only when the current GPU supports it.
# Hard-coding bf16=True makes TrainingArguments fail on GPUs without bfloat16
# support. On such GPUs both fp16 and bf16 stay off.
# is_available() is checked first so the bf16 query is never made without a CUDA device.
bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_arguments = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8, # Batch size per GPU for training
    gradient_accumulation_steps=1, # Number of update steps to accumulate the gradients for
    optim='paged_adamw_32bit', # Optimizer to use
    save_steps=10000, # Save checkpoint every X updates steps
    logging_steps=250, # Log every X updates steps
    learning_rate=2e-4, # Initial learning rate (AdamW optimizer)
    weight_decay=0.001, # Weight decay to apply to all layers except bias/LayerNorm weights
    fp16=False, # fp16 mixed precision stays disabled
    bf16=bf16_supported, # bf16 mixed precision, only on GPUs that support it
    max_grad_norm=0.3, # Maximum gradient norm (gradient clipping)
    max_steps=-1, # Number of training steps (overrides num_train_epochs when positive)
    # NOTE(review): the plain "constant" schedule applies no warmup, so warmup_ratio
    # likely has no effect; "constant_with_warmup" would honor it — confirm intent.
    warmup_ratio=0.05, # Ratio of steps for a linear warmup (from 0 to learning rate)
    group_by_length=True, # Group sequences into batches with same length
    lr_scheduler_type="constant", # Learning rate schedule (constant a bit better than cosine)
)

# Build the supervised fine-tuning trainer from the pieces prepared above.
trainer = SFTTrainer(
    # Model, tokenizer and adapter configuration.
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    # Optimization settings.
    args=training_arguments,
    # Training data and how it is consumed.
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    packing=False,
)

# Run fine-tuning.
trainer.train()

# Write the trained model to the directory named by `new_model`.
trainer.model.save_pretrained(save_directory=new_model)