In [1]:
pip  install datasets



In [2]:
pip install trl



In [3]:
pip install --upgrade git+https://github.com/huggingface/trl.git


Collecting git+https://github.com/huggingface/trl.git
  Cloning https://github.com/huggingface/trl.git to /tmp/pip-req-build-rv5ij7s9
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/trl.git /tmp/pip-req-build-rv5ij7s9
  Resolved https://github.com/huggingface/trl.git to commit 89556c8cbf1a816539167a46cdf285419e057fec
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Fine-tuning Phi-3 Mini on SQuAD for QA (Updated for Consistency with Summarization Script, W&B removed)
import os
import json
import torch
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, TrainingArguments, set_seed
)
from peft import LoraConfig, TaskType
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

from datetime import datetime
import logging
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_DISABLED"] = "true"

# Logging setup
log_dir = "sbatch_logs"
os.makedirs(log_dir, exist_ok=True)
log_filename = os.path.join(log_dir, f"training_phi3_squad_{datetime.now():%Y%m%d_%H%M%S}.log")
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
file_handler = logging.FileHandler(log_filename)
file_handler.setFormatter(formatter)
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.addHandler(console_handler)

# Constants and seed
set_seed(1234)
PATH = "aish/.cache/huggingface"
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL_NAME = "Phi-3-mini-finetuned-SQuAD"

# Load and format SQuAD dataset
dataset = load_dataset("squad")

def format_qa(example):
    return {
        "instruction": "Answer the question based on the context",
        "input": f"Context: {example['context']}\nQuestion: {example['question']}",
        "output": example['answers']['text'][0] if example['answers']['text'] else "No answer"
    }

train_dataset = dataset['train'].map(format_qa)
eval_dataset = dataset['validation'].map(format_qa)

train_dataset = train_dataset.shuffle(seed=42).select(range(10000))
eval_dataset = eval_dataset.shuffle(seed=42).select(range(2000))

# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=PATH + "/models", trust_remote_code=False, use_fast=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = "left"

# Format messages for chat model input
def create_message_column(row):
    return {
        "messages": [
            {"role": "user", "content": f"{row['instruction']}\nInput: {row['input']}"},
            {"role": "assistant", "content": row['output']}
        ]
    }

def format_dataset_chatml(row):
    return {"text": tokenizer.apply_chat_template(row["messages"], add_generation_prompt=False, tokenize=False)}

# Apply formatting
dataset = {
    "train": pd.DataFrame(train_dataset.map(create_message_column).map(format_dataset_chatml)),
    "test": pd.DataFrame(eval_dataset.map(create_message_column).map(format_dataset_chatml))
}

from datasets import Dataset as HFDataset
train_dataset = HFDataset.from_pandas(dataset["train"])
eval_dataset = HFDataset.from_pandas(dataset["test"])

# Model loading
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
attn_implementation = 'flash_attention_2' if torch.cuda.is_bf16_supported() else 'sdpa'

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    cache_dir=PATH + "/models",
    torch_dtype=compute_dtype,
    trust_remote_code=False,
    device_map="auto",
    attn_implementation=attn_implementation
)

# LoRA config
peft_config = LoraConfig(
    r=16, lora_alpha=16, lora_dropout=0.05, task_type=TaskType.CAUSAL_LM,
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj']
)

# Training arguments (W&B removed)
training_args = TrainingArguments(
    output_dir=f"./{NEW_MODEL_NAME}",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    seed=42,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    # report_to="none"  # Optional: if older transformers complains, just leave this out
)

# Data collator
collator = DataCollatorForCompletionOnlyLM(
    response_template="<|assistant|>",
    tokenizer=tokenizer
)

# Trainer setup (minimal version for compatibility)
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collator,
    peft_config=peft_config
)

logger.info(f"Train size: {len(train_dataset)}, Eval size: {len(eval_dataset)}")
trainer.train()
trainer.save_model(f"./{NEW_MODEL_NAME}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
2025-04-24 20:05:20,353 - DEBUG - Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /datasets/squad/resolve/main/README.md HTTP/1.1" 307 0
2025-04-24 20:05:20,408 - DEBUG - https://huggingface.co:443 "HEAD /datasets/squad/resolve/main/README.md HTTP/1.1" 307 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /datasets/rajpurkar/squad/resolve/main/README.md HTTP/1.1" 200 0
2025-04-24 20:05:20,438 - DEBUG - https://huggingfa

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /microsoft/Phi-3-mini-4k-instruct/resolve/main/generation_config.json HTTP/1.1" 200 0
2025-04-24 20:06:08,576 - DEBUG - https://huggingface.co:443 "HEAD /microsoft/Phi-3-mini-4k-instruct/resolve/main/generation_config.json HTTP/1.1" 200 0
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /microsoft/Phi-3-mini-4k-instruct/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2025-04-24 20:06:08,824 - DEBUG - https://huggingface.co:443 "HEAD /microsoft/Phi-3-mini-4k-instruct/resolve/main/tokenizer_config.json HTTP/1.1" 200 0


Converting train dataset to ChatML:   0%|          | 0/10000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/2000 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
INFO:root:Train size: 10000, Eval size: 2000
2025-04-24 20:06:32,111 - INFO - Train size: 10000, Eval size: 2000
