In [1]:
!pip install transformers peft bitsandbytes datasets trl accelerate torch flash-attn

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.24.0-py3-none-any.whl (423 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.8.3-cp312-cp312-linux_

In [2]:
import torch
import os
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig # No need for get_peft_model explicitly with SFTTrainer
from trl import SFTTrainer

In [3]:
# --- 1. Configuration ---

base_model_name = "microsoft/phi-3-mini-4k-instruct"
dataset_path = "/content/drive/MyDrive/train_json_extraction.jsonl"

# Ensure Drive is mounted first:
# from google.colab import drive
# drive.mount('/content/drive')
output_dir = "/content/drive/MyDrive/my-phi3-json-extraction-adapter"
print(f"JSON extraction adapter will be saved to: {output_dir}")

JSON extraction adapter will be saved to: /content/drive/MyDrive/my-phi3-json-extraction-adapter


In [4]:
# --- 2. Load Model, Tokenizer, and Quantization (QLoRA) ---
print("Loading base model and tokenizer (4-bit)...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    # attn_implementation="flash_attention_2" # Optional: Use Flash Attention if available/compatible for potential speedup
)
model.config.use_cache = False # Disable cache for training
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
# Important: Set padding side based on SFTTrainer/model expectations. 'right' is common.
tokenizer.padding_side = "right"

Loading base model and tokenizer (4-bit)...


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

In [5]:
# --- 3. Configure LoRA ---
print("Configuring LoRA...")
peft_config = LoraConfig(
    r=16,                # Rank of LoRA matrices (16 or 32 is common)
    lora_alpha=32,       # Alpha scaling factor (often 2*r)
    lora_dropout=0.05,   # Dropout probability for LoRA layers
    bias="none",
    task_type="CAUSAL_LM",
    # Target modules can vary slightly per model. These are common for Phi-3.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

Configuring LoRA...


In [6]:
# --- 4. Load Your Formatted Training Data ---
print(f"Loading dataset from: {dataset_path}")
try:
    # Ensure it loads correctly, verify path if error occurs
    dataset = load_dataset("json", data_files=dataset_path, split="train")
    print(f"Dataset loaded with {len(dataset)} examples.")
    # You can uncomment below to double-check the first formatted example
    # print("First example format check:\n", dataset[0]['text'][:500] + "...") # Print first 500 chars
except Exception as e:
    print(f"Error loading dataset: {e}")
    print(f"Please ensure '{dataset_path}' exists and is correctly formatted.")
    raise

Loading dataset from: /content/drive/MyDrive/train_json_extraction.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Dataset loaded with 400 examples.


In [7]:
# --- 5. Set Up Training Arguments ---
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,       # Start with 1 due to potentially long sequences (schema+text)
    gradient_accumulation_steps=8,       # Increase accumulation to simulate larger batch size (1*8=8)
    # num_train_epochs=3,                  # Train for a few epochs (adjust based on results)
    num_train_epochs=1,                  # Train for a few epochs (adjust based on results)
    learning_rate=2e-4,                  # Standard learning rate for QLoRA
    logging_steps=10,                    # Log more frequently for smaller datasets
    optim="paged_adamw_8bit",
    save_strategy="epoch",               # Save at the end of each epoch
    # save_steps=50,                     # Or save every N steps if preferred
    # evaluation_strategy="no",
    # push_to_hub=output_dir.startswith("YourUsername/"),
    fp16=False,
    bf16=torch.cuda.is_bf16_supported(), # Use bfloat16 if available
    max_steps=-1,                        # Train for num_train_epochs unless max_steps is set
    warmup_ratio=0.03,
    group_by_length=True,                # Group sequences of similar length for efficiency
    lr_scheduler_type="constant",
    report_to="tensorboard",
    gradient_checkpointing=True,         # Crucial for saving memory
    gradient_checkpointing_kwargs={'use_reentrant':False},
)

Setting up training arguments...


In [8]:
# --- 6. Initialize the SFTTrainer ---
print("Initializing SFTTrainer...")
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    # dataset_text_field="text",
    # max_seq_length=1024,                 # Adjust based on average length in your data vs GPU memory. May need smaller (e.g., 512) or larger (e.g., 2048)
    # tokenizer=tokenizer,
    # args=training_args,
    # packing=False,                       # Set packing=False for instruction tuning datasets
)

Initializing SFTTrainer...


Adding EOS to train dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9057 > 4096). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

In [9]:
# --- 7. Start Fine-Tuning ---
print("\nStarting fine-tuning for JSON extraction...")
train_result = trainer.train()
print("Fine-tuning finished.")


Starting fine-tuning for JSON extraction...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjeffpachecoo[0m ([33mjeffpachecoo-unisatc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Step,Training Loss
10,0.5966


KeyboardInterrupt: 

In [None]:
# --- 8. Save the Final Adapter ---
print(f"Saving the final JSON extraction adapter model to {output_dir}")
trainer.save_model(output_dir) # Saves only the adapter

In [None]:
# Optional: Log and save metrics
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

print("\nJSON extraction adapter saved successfully!")
print(f"The 'adapter_path' for your next inference script is: '{output_dir}'")