In [1]:
#!/usr/bin/env python3
# finetune_mistral_lora.py
#
# Full training script to fine‑tune Mistral-7B Instruct with proper freezing of base params and LoRA adapters.
# Optimized for an NVIDIA A6000.

import os
import gc
import logging
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# --------------------
# 1. CONFIGURATION
# --------------------
BASE_MODEL      = "mistralai/Mistral-7B-Instruct-v0.3"
DATA_PATH       = "cleaned_data/all_subjects_cleaned.jsonl"
OUTPUT_DIR      = "mistral7b_lora_finetuned"

# LoRA / training hyperparameters
EPOCHS          = 3
LEARNING_RATE   = 2e-4
LORA_R          = 8
LORA_ALPHA      = 16
LORA_DROPOUT    = 0.05
MAX_LENGTH      = 512
BATCH_SIZE      = 16
MICRO_BATCH     = 4
WARMUP_STEPS    = 100
SAVE_STEPS      = 200
LOGGING_STEPS   = 20
SEED            = 42

# --------------------
# 2. SETUP LOGGING & DEVICE
# --------------------
logging.basicConfig(
    format="%(asctime)s — %(levelname)s — %(name)s —   %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO
)
logger = logging.getLogger(__name__)
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {DEVICE}")
torch.manual_seed(SEED)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# --------------------
# 3. LOAD & SPLIT DATASET
# --------------------
logger.info(f"Loading dataset from {DATA_PATH}")
ds_full = load_dataset("json", data_files=DATA_PATH, split="train")
split = ds_full.train_test_split(test_size=0.1, seed=SEED)
train_ds, eval_ds = split["train"], split["test"]
logger.info(f"Dataset split: Train={len(train_ds)} Eval={len(eval_ds)}")

# --------------------
# 4. TOKENIZER & MODEL
# --------------------
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map={"": DEVICE},
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True
)
# Prepare for k-bit training
model = prepare_model_for_kbit_training(model)

# --------------------
# 5. FREEZE BASE PARAMETERS
# --------------------
for name, param in model.named_parameters():
    if 'lora_' not in name:
        param.requires_grad = False
logger.info("Froze all base model parameters, only LoRA will be trainable.")

# --------------------
# 6. APPLY LoRA
# --------------------
target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=target_modules,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
    modules_to_save=["lm_head"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --------------------
# 7. PREPROCESSING
# --------------------
def preprocess_fn(example):
    text = f"<s>[INST] {example['prompt'].strip()} [/INST] {example['response'].strip()} </s>"
    tok = tokenizer(text, truncation=True, max_length=MAX_LENGTH, padding="max_length")
    tok["labels"] = tok["input_ids"].copy()
    return tok

train_ds = train_ds.map(preprocess_fn, remove_columns=train_ds.column_names)
eval_ds  = eval_ds.map(preprocess_fn, remove_columns=eval_ds.column_names)

# --------------------
# 8. DATA COLLATOR
# --------------------
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --------------------
# 9. TRAINING ARGUMENTS
# --------------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=MICRO_BATCH,
    gradient_accumulation_steps=BATCH_SIZE//MICRO_BATCH,
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,
    logging_steps=LOGGING_STEPS,
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=SAVE_STEPS,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    fp16=True,
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    torch_compile=True,
    report_to="none",
    remove_unused_columns=False
)

# --------------------
# 10. INITIALIZE TRAINER
# --------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator
)

# --------------------
# 11. TRAINING LOOP
# --------------------
def free_memory():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

if __name__ == "__main__":
    free_memory()
    trainer.train()
    logger.info("Saving LoRA adapters and tokenizer...")
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    free_memory()


  from .autonotebook import tqdm as notebook_tqdm
2025-05-04 21:53:28 — INFO — __main__ —   Using device: cuda:0
2025-05-04 21:53:28 — INFO — __main__ —   Loading dataset from cleaned_data/all_subjects_cleaned.jsonl
2025-05-04 21:53:30 — INFO — __main__ —   Dataset split: Train=2149 Eval=239
Loading checkpoint shards: 100%|██████████| 3/3 [00:17<00:00,  5.85s/it]
2025-05-04 21:53:50 — INFO — __main__ —   Froze all base model parameters, only LoRA will be trainable.


trainable params: 155,189,248 || all params: 7,403,212,800 || trainable%: 2.0962


Map: 100%|██████████| 239/239 [00:00<00:00, 1310.33 examples/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss


2025-05-04 22:37:44 — INFO — __main__ —   Saving LoRA adapters and tokenizer...
