In [None]:
import os
import sys

import matplotlib.pyplot as plt
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM

# Make the parent directory importable before pulling in the local helper.
sys.path.append(os.path.abspath('..'))
from utils.login_huggingface import login_huggingface

# Authenticate with the Hugging Face Hub before downloading the checkpoint.
login_huggingface()
model_id = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Total number of parameters in the loaded model.
print(model.num_parameters())

message = "Hello, how are you?"
tokens = tokenizer.encode(message, return_tensors="pt")
print(tokens.shape)

# This forward pass is for inspection only, so disable autograd: without it
# every intermediate activation is kept alive for a backward pass that never runs.
with torch.no_grad():
    outputs = model(tokens, output_hidden_states=True)

# Print the index and shape of each hidden-state tensor the model returned.
for idx, h in enumerate(outputs.hidden_states):
    print(idx, h.shape)

In [None]:
Step 1: Environment Setup
Install dependencies in a virtual environment (e.g., via venv or Conda). Use bitsandbytes for 4-bit quantization to fit the model in memory.
# Shell commands: run in a terminal (inside a notebook cell, prefix each command with `!` or `%`).
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118  # Adjust for your CUDA
pip install -U transformers datasets accelerate peft trl bitsandbytes wandb  # Core libs
pip install xformers  # For efficient attention
# Optional for faster training on Colab:
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
Log in to Hugging Face for gated models:
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub.
login()  # Paste your token
Step 2: Data Preparation
Load and format your small dataset into chat/instruction pairs. Use a chat template for Llama 3's format.
from datasets import load_dataset
import torch

# Load small dataset (e.g., 1,000 samples)
dataset = load_dataset("ruslanmv/ai-medical-chatbot", split="all")  # Or your dataset
dataset = dataset.shuffle(seed=42).select(range(1000))  # Subsample
# Seed the split as well; otherwise the train/eval membership changes on every
# run and eval losses are not comparable between runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)  # 800 train, 200 eval

# Load tokenizer early for formatting
from transformers import AutoTokenizer
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"  # Or Llama-3-8B
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Reuse EOS as the padding token

# Render each record as a single chat-formatted training string.
def format_chat(example):
    """Add a ``"text"`` field holding the record rendered through the chat template.

    Reads the ``"Patient"`` and ``"Doctor"`` fields (adapt to your dataset's
    column names), treats them as one user turn followed by one assistant
    turn, and stores the untokenized template output under ``"text"``.
    The record is updated in place and returned.
    """
    user_turn = {"role": "user", "content": example["Patient"]}  # Adapt to your fields
    assistant_turn = {"role": "assistant", "content": example["Doctor"]}
    example["text"] = tokenizer.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )
    return example

# Apply the formatter to every split, using 4 worker processes.
dataset = dataset.map(format_chat, num_proc=4)
Tip: For tiny datasets (<500 samples), shuffle before making the 80/20 train/eval split so that both splits are representative. Keep sequences at or below 512 tokens to avoid out-of-memory (OOM) errors.
Step 3: Load Model with Quantization (QLoRA Setup)
Load the base model in 4-bit for memory savings (~4–6 GB VRAM usage).
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Quantization config: 4-bit NF4 weights, fp16 compute, double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load quantized model.
# - trust_remote_code is intentionally not set: Llama is implemented inside
#   transformers, so there is no reason to allow execution of repository code.
# - "sdpa" uses PyTorch's built-in attention kernel. "flash_attention_2" needs
#   the separate flash-attn package, which the install step never installs.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # Auto-distribute if multi-GPU
    attn_implementation="sdpa" if torch.cuda.is_available() else "eager",
)
model = prepare_model_for_kbit_training(model)  # Prepare the quantized model for training

# No setup_chat_format() call here: the training text was already rendered with
# the tokenizer's own chat template, and setup_chat_format would replace that
# template with ChatML and add new special tokens, so the training data and
# the tokenizer would no longer agree.
# NOTE(review): recent TRL releases also appear to reject setup_chat_format on a
# tokenizer that already has a chat template; confirm against the installed version.
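Tip: 4-bit quantization is primarily a memory saving; any regularizing effect on small datasets is incidental, so still monitor the eval loss for overfitting. Prefer the nf4 quantization type, which generally preserves accuracy better than plain fp4.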
Step 4: Configure LoRA
Apply LoRA to the key projection modules (attention and MLP layers), training well under 1% of the parameters (about 0.5% with the settings below).
# LoRA adapter configuration for causal-LM fine-tuning.
peft_config = LoraConfig(
    r=16,  # Rank: Higher for more capacity, but 8–32 for small data
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Adapt every attention and MLP projection.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)
model = get_peft_model(model, peft_config)
# Verify: with r=16 on all seven projections of Llama-3-8B this reports
# roughly 42M trainable params (~0.5% of the base model).
model.print_trainable_parameters()
Tip: For very small datasets, lower the rank to r=8 to reduce the number of trainable parameters and the risk of overfitting.
Step 5: Training
Use TRL's SFTTrainer for supervised fine-tuning. Set small batch sizes and epochs.
from trl import SFTTrainer
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./llama-3-finetuned",
    num_train_epochs=1,  # 1–2 for small data
    per_device_train_batch_size=2,  # Adjust based on VRAM
    gradient_accumulation_steps=4,  # Effective batch=8
    learning_rate=2e-4,
    warmup_steps=10,
    logging_steps=10,
    eval_steps=50,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    # NOTE(review): requires a transformers release that knows the new name; confirm.
    eval_strategy="steps",
    # load_best_model_at_end requires the save and eval strategies to match
    # (and save_steps to be a multiple of eval_steps); "epoch" vs "steps"
    # raises a ValueError when the arguments are constructed.
    save_strategy="steps",
    save_steps=50,
    load_best_model_at_end=True,
    fp16=True,  # Or bf16 for Ampere+ GPUs
    optim="paged_adamw_8bit",  # Memory-efficient optimizer
    report_to="wandb",  # Optional logging
    dataloader_num_workers=4,
)

# `model` has already been wrapped by get_peft_model(), so peft_config is not
# passed again here; the trainer receives the LoRA-wrapped model directly.
# NOTE(review): newer TRL releases move dataset_text_field / max_seq_length /
# packing into SFTConfig and rename `tokenizer`; confirm against the installed TRL.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    max_seq_length=512,
    args=args,
    packing=False,  # No packing for small data
)

trainer.train()
Tip: Train for 1 epoch and use a cosine scheduler (lr_scheduler_type="cosine") for smooth convergence. Add early stopping (e.g., transformers' EarlyStoppingCallback) if the validation loss plateaus.
Step 6: Evaluation
Test on held-out data; compute metrics like perplexity or accuracy.
import numpy as np

# NOTE(review): newer `datasets` releases no longer ship `load_metric` (metrics
# moved to the separate `evaluate` package). Guard the import so the loss and
# perplexity evaluation below still runs when it is unavailable.
try:
    from datasets import load_metric
except ImportError:
    load_metric = None

# Simple perplexity eval (adapt for your task)
eval_results = trainer.evaluate()
print(eval_results)  # e.g., {'eval_loss': 0.45}
# Perplexity is the exponential of the mean cross-entropy loss.
print("perplexity:", float(np.exp(eval_results["eval_loss"])))

# For classification/QA: Use ROUGE/BLEU
rouge = load_metric("rouge") if load_metric is not None else None
# Generate predictions and score...
Tip: On small datasets, use k-fold cross-validation to estimate performance reliably.
Step 7: Save, Merge, and Local Inference
Save the LoRA adapter (a small file compared with the base model: tens to a few hundred MB, depending on rank and target modules).
# Write only the LoRA adapter weights and config, not the base model.
trainer.model.save_pretrained("./llama-3-lora-adapter")
# The repo id below is a placeholder; replace it with your own before running.
trainer.model.push_to_hub("your-username/llama-3-small-finetune")  # Optional
Merge adapter with base for full model:
from peft import PeftModel

# Reload the base model unquantized (fp16) so the adapter can be folded into
# its weights.
base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
merged_model = PeftModel.from_pretrained(base_model, "./llama-3-lora-adapter")
merged_model = merged_model.merge_and_unload()
merged_model.save_pretrained("./llama-3-merged")
# Save the tokenizer next to the weights so the merged directory is
# self-contained; the GGUF conversion reads tokenizer files from this directory.
tokenizer.save_pretrained("./llama-3-merged")
For local inference (e.g., on CPU/laptop), quantize to GGUF:

Clone llama.cpp: git clone https://github.com/ggerganov/llama.cpp
Convert: python convert_hf_to_gguf.py ./llama-3-merged --outfile llama-3-small.gguf --outtype f16
Quantize: ./llama-quantize llama-3-small.gguf llama-3-small-q4.gguf Q4_K_M (~5 GB file)
Run: ./llama-cli -m llama-3-small-q4.gguf -p "Your prompt here" --chat-template llama3 (matching the Llama 3 chat format used for the training text; or use Ollama/Jan.ai for GUI).

Example Inference:
from transformers import pipeline

pipe = pipeline("text-generation", model="./llama-3-merged", tokenizer=tokenizer)
# Pass the prompt as chat messages so the pipeline applies the tokenizer's chat
# template, the same format the training text was rendered in, instead of an
# ad-hoc "User: ... Assistant:" string the model was never trained on.
messages = [{"role": "user", "content": "How to treat a headache?"}]
output = pipe(messages, max_new_tokens=100)
# With chat input, "generated_text" is the message list including the reply;
# the last entry is the newly generated assistant turn.
print(output[0]["generated_text"][-1]["content"])