In [None]:
!pip install -q transformers accelerate bitsandbytes peft datasets trl
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
# STEP 2: Import Libraries
# ============================================================
import os
import re

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from trl import SFTTrainer

In [None]:
# Report whether CUDA is usable, which device is at index 0, and the CUDA
# version this torch build targets.
cuda_ready = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ready else 'None'

print(f"GPU Available: {cuda_ready}")
print(f"GPU Name: {gpu_name}")
print(f"CUDA Version: {torch.version.cuda}")


In [None]:
# STEP 3: Configuration
# ============================================================
# Everything a reader might want to tune lives in this cell.

# --- Model / data / output locations ---
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
DATASET_NAME = "mlabonne/guanaco-llama2-1k"  # Small open-source dataset
OUTPUT_DIR = "./phi3-finetuned"

# --- Sequence and batch shape ---
MAX_SEQ_LENGTH = 512             # tokenizer truncation length
BATCH_SIZE = 4                   # per-device batch size (train and eval)
GRADIENT_ACCUMULATION_STEPS = 4  # batches accumulated per optimizer step

# --- Optimisation schedule ---
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
WARMUP_STEPS = 10

In [None]:
# STEP 4: Load Dataset
# ============================================================
# Fetch the "train" split, show its size and first record, then carve out a
# 10% hold-out set with a fixed seed so the split is reproducible.
print("\nüì¶ Loading dataset...")
dataset = load_dataset(DATASET_NAME, split="train")
num_examples = len(dataset)
first_example = dataset[0]
print(f"Dataset size: {num_examples} examples")
print(f"Sample data: {first_example}")

# 90/10 train/eval split
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

In [None]:
# STEP 5: Configure 4-bit Quantization for P100 GPU
# ============================================================
# Quantization settings handed to from_pretrained() when the base model is loaded.
print("\n‚öôÔ∏è Configuring quantization...")
bnb_config = BitsAndBytesConfig(
    # Weight storage: 4-bit, NF4 quantization type, with double quantization enabled
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    # Compute dtype: float16
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# STEP 6: Load Model and Tokenizer
# ============================================================
print("\nü§ñ Loading Phi-3 model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="right"
)
# Pad with <unk> instead of EOS. The causal-LM data collator masks every
# pad-token position out of the loss, so if padding shares its id with EOS the
# genuine end-of-sequence tokens are masked too and the model is never trained
# to stop. Fall back to EOS only when the tokenizer defines no <unk> token.
tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token

# Load the base model quantized according to bnb_config; device_map="auto"
# lets the library decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False  # generation KV cache is not needed while training
model.config.pretraining_tp = 1

print("Model loaded successfully!")
print(f"Model parameters: {model.num_parameters() / 1e9:.2f}B")

In [None]:
# STEP 7: Configure LoRA
# ============================================================
print("\nüîß Configuring LoRA...")
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Phi-3 fuses the query/key/value projections into a single `qkv_proj`
    # linear layer. The Llama-style names q_proj/k_proj/v_proj do not exist in
    # this architecture, so listing them would leave only `o_proj` adapted.
    target_modules=["qkv_proj", "o_proj"]
)

# Wrap the base model with LoRA adapters and report how many parameters will
# actually receive gradients.
model = get_peft_model(model, peft_config)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
all_params = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable_params:,} ({100 * trainable_params / all_params:.2f}%)")


In [None]:
# STEP 8: Format Dataset
# ============================================================
# One Llama-2 style turn: "[INST] prompt [/INST] response". The response runs
# until the closing "</s>", the start of the next turn, or the end of the text.
_LLAMA2_TURN = re.compile(
    r"\[INST\]\s*(.*?)\s*\[/INST\]\s*(.*?)\s*(?=</s>|(?:<s>\s*)?\[INST\]|\Z)",
    re.DOTALL,
)


def format_instruction(example):
    """Convert one dataset record into Phi-3 chat format.

    The record's ``text`` field is expected to hold a complete conversation in
    the Llama-2 template (``<s>[INST] prompt [/INST] response </s>``, possibly
    repeated for several turns) -- NOTE(review): inferred from the dataset
    name; confirm against the sample printed in STEP 4. Wrapping that whole
    string as a single user message would leave the assistant turn empty, so
    each turn is split into its prompt and response and re-emitted as
    ``<|user|>`` / ``<|assistant|>`` messages terminated by ``<|end|>``.

    Args:
        example: Mapping with a ``"text"`` string.

    Returns:
        ``{"formatted_text": <str>}``. When no ``[INST] ... [/INST]`` turn is
        found, the entire text is used as a single user message followed by an
        open assistant header.
    """
    turns = _LLAMA2_TURN.findall(example["text"])
    if not turns:
        # Not in the Llama-2 template: treat the whole text as the user prompt.
        text = f"<|user|>\n{example['text']}<|end|>\n<|assistant|>\n"
        return {"formatted_text": text}

    pieces = []
    for prompt, response in turns:
        pieces.append(f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n")
        if response:
            # A turn without a response keeps the assistant header open.
            pieces.append(f"{response}<|end|>\n")
    return {"formatted_text": "".join(pieces)}

# Run format_instruction over both splits (train first, then eval) and rebind
# each name to the mapped dataset.
train_dataset, eval_dataset = (
    split.map(format_instruction) for split in (train_dataset, eval_dataset)
)


In [None]:
# STEP 9: Training Arguments
# ============================================================
# Same settings as before, grouped by concern.
print("\nüìù Setting up training arguments...")
training_args = TrainingArguments(
    # --- Output / reporting ---
    output_dir=OUTPUT_DIR,
    report_to="none",
    logging_steps=10,
    # --- Batch shape and duration ---
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    group_by_length=True,
    # --- Optimiser and schedule ---
    optim="paged_adamw_8bit",
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_steps=WARMUP_STEPS,
    max_grad_norm=0.3,
    fp16=True,
    # --- Evaluation cadence ---
    eval_strategy="steps",  # Changed from evaluation_strategy
    eval_steps=50,
    # --- Checkpointing ---
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
)

In [None]:
# STEP 10: Initialize Trainer
# ============================================================
print("\nüéØ Initializing trainer...")

# Data collator for dynamic padding. mlm=False selects the causal
# (next-token) language-modeling objective instead of masked-LM.
# DataCollatorForLanguageModeling is imported in the notebook's import cell.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of formatted prompts, truncating to MAX_SEQ_LENGTH.

    No padding is applied here. Padding every row to the maximum length would
    make all sequences equally long, which wastes compute on pad tokens and
    makes ``group_by_length=True`` meaningless. The data collator pads each
    batch to its own longest sequence instead. Plain Python lists are returned
    (no ``return_tensors``) because the result is stored by ``Dataset.map``.

    Args:
        examples: Batch mapping with a ``"formatted_text"`` list of strings.

    Returns:
        The tokenizer's output for the batch (unpadded, truncated).
    """
    return tokenizer(
        examples["formatted_text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )

# Apply tokenization
def _tokenize_split(split):
    """Tokenize one split in batches, dropping all of its pre-existing columns."""
    return split.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names
    )


tokenized_train = _tokenize_split(train_dataset)
tokenized_eval = _tokenize_split(eval_dataset)

# Use standard Trainer instead of SFTTrainer
# (Trainer is imported in the notebook's import cell.)
# NOTE(review): newer transformers releases may prefer a different keyword
# than `tokenizer=` here -- confirm against the installed version.
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
)


In [None]:
# STEP 11: Train the Model
# ============================================================
# Runs fine-tuning with the `trainer` object built in STEP 10.
print("\nüöÄ Starting training...")
# Kept as the cell's final bare expression so the notebook displays whatever
# train() returns.
trainer.train()


In [None]:
# STEP 12: Save Fine-tuned Model
# ============================================================
# Write the trained model first, then the tokenizer, into OUTPUT_DIR.
print("\nüíæ Saving fine-tuned model...")
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")

In [None]:
# STEP 13: Test the Fine-tuned Model
# ============================================================
print("\nüß™ Testing the fine-tuned model...")

# Load the fine-tuned model from what STEP 12 wrote to OUTPUT_DIR.
# NOTE(review): `trainer.model` is the PEFT-wrapped model, so OUTPUT_DIR
# presumably holds adapter weights only; this load then depends on
# transformers resolving the base model from the adapter config -- confirm.
model = AutoModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)
model.eval()

# Test inference
test_prompt = "<|user|>\nWhat is machine learning?<|end|>\n<|assistant|>\n"
# Send the inputs to the device the model actually lives on. The model is
# placed with device_map="auto", so a hard-coded "cuda" can mismatch that
# placement and fails outright on a machine without a GPU.
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )

# outputs[0] is decoded in full, so the printed text includes the prompt.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\n" + "="*50)
print("Test Output:")
print("="*50)
print(response)
print("="*50)


In [None]:
# STEP 14: Upload to Kaggle Output (Optional)
# ============================================================
# Closing summary: where the model was written and how to load it again.
closing_messages = (
    "\n‚úÖ Training completed successfully!",
    f"Fine-tuned model available at: {OUTPUT_DIR}",
    "\nTo use this model later:",
    f"model = AutoModelForCausalLM.from_pretrained('{OUTPUT_DIR}')",
    f"tokenizer = AutoTokenizer.from_pretrained('{OUTPUT_DIR}')",
)
for message in closing_messages:
    print(message)

# Memory cleanup
torch.cuda.empty_cache()
print("\nüéâ All done!")