In [None]:
# Cell 1: Environment, imports, paths
import os, time, math
import sys
from pathlib import Path
from dotenv import load_dotenv
import torch

load_dotenv()

In [None]:

# Hugging Face access token, read from the process environment (load_dotenv()
# in the first cell has already merged .env into it).
# HUGGINGFACE_HUB_TOKEN is the name this notebook originally read; HF_TOKEN is
# accepted as a fallback so a .env that only defines HF_TOKEN also works.
HF_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN") or os.getenv("HF_TOKEN")
assert HF_TOKEN, "‚ùå Missing Hugging Face token! Please check your .env file."

# Report CUDA availability and, for every visible device, its name and total
# memory in decimal gigabytes (bytes / 1e9, rounded to one decimal place).
print("‚úÖ Env ready. CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        total_gb = round(torch.cuda.get_device_properties(i).total_memory / 1e9, 1)
        print(f"  GPU {i}:", torch.cuda.get_device_name(i),
              "| Memory total (GB):", total_gb)
else:
    print("‚ö†Ô∏è No GPUs detected - training will be very slow or fail.")

In [None]:
# PATHS
# DATA_DIR defaults to the original absolute location; set the FINETUNE_DATA_DIR
# environment variable (e.g. in .env) to run the notebook on another machine
# without editing this cell.
DATA_DIR = Path(os.getenv("FINETUNE_DATA_DIR",
                          "/data/home/anjeshnarwal/LLM_price_predictor/data/finetune"))
TRAIN_PATH = DATA_DIR / "train.jsonl"
VAL_PATH   = DATA_DIR / "val.jsonl"

# Adapter / checkpoint output directory, relative to the current working
# directory; created eagerly (including parents) so later saves cannot fail on
# a missing folder.
OUTPUT_DIR = Path("../src/models/llama31_8b_qlora_full")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Quick reproducibility: seed torch's RNG.
seed = 42
torch.manual_seed(seed)


In [None]:
# Cell 2: Load dataset (expects train.jsonl / val.jsonl in DATA_DIR; exposed as "train" / "validation" splits)
from datasets import load_dataset

# Load the JSONL splits once per kernel session; if `raw_datasets` already
# exists in the notebook namespace it is reused instead of re-read from disk.
if "raw_datasets" in globals():
    print("‚úÖ Reusing existing raw_datasets")
else:
    assert TRAIN_PATH.exists() and VAL_PATH.exists(), f"Train/Val not found at {TRAIN_PATH} / {VAL_PATH}"
    print("üîÅ Loading JSONL dataset from disk (this may take a minute)...")
    raw_datasets = load_dataset("json", data_files={"train": str(TRAIN_PATH), "validation": str(VAL_PATH)})

print("DatasetDict:", raw_datasets)
print("Train rows:", len(raw_datasets["train"]), "Validation rows:", len(raw_datasets["validation"]))

# Quick sample check.
# A reused `raw_datasets` may already have been pre-tokenized by the tokenizer
# cell, which drops the "prompt"/"response" columns; in that case the text
# preview is skipped instead of raising KeyError. The preview is also clamped
# to the split size so tiny datasets do not break `select`.
_preview_split = raw_datasets["train"]
if {"prompt", "response"} <= set(_preview_split.column_names):
    _n_preview = min(3, len(_preview_split))
    for i, ex in enumerate(_preview_split.select(range(0, _n_preview))):
        print(f"\nSAMPLE {i} prompt (trunc):", ex["prompt"][:200].replace("\n"," "), " -> response:", ex["response"])
else:
    print("Sample preview skipped: train split has columns", _preview_split.column_names)


In [None]:
# Cell 3: Tokenizer and optional pre-tokenization (recommended)
from transformers import AutoTokenizer

MODEL_ID = "meta-llama/Meta-Llama-3.1-8B"  # or your chosen base

# `token=` is the current keyword for passing the Hub credential (the model
# loading cell already uses it); `use_auth_token=` is its deprecated spelling.
# NOTE(review): use_fast=False requests a slow tokenizer; confirm this is what
# actually gets loaded for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False, trust_remote_code=True, token=HF_TOKEN)

# Ensure a pad token exists: reuse EOS when available, otherwise register a new
# "[PAD]" special token (which grows the vocabulary).
if tokenizer.pad_token is None:
    if tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token
    else:
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
print("Tokenizer loaded. vocab size:", len(tokenizer))

# ---- Pre-tokenize (batched) ----
DO_PRETOKENIZE = True  # set False to skip and rely on collator on-the-fly tokenization

if DO_PRETOKENIZE:
    def tokenize_batch(examples):
        """Tokenize a batch of prompt/response pairs into `input_ids`.

        Each prompt is joined to the string form of its response with a single
        space, then tokenized with truncation to 512 tokens, no padding and no
        special tokens added.

        Args:
            examples: batched mapping with "prompt" and "response" columns.

        Returns:
            dict with a single "input_ids" key (one id list per example).
        """
        # NOTE(review): no BOS/EOS is added here - presumably the collator
        # handles sequence boundaries and label masking; verify in src/collator.
        texts = [p + " " + str(r) for p, r in zip(examples["prompt"], examples["response"])]
        out = tokenizer(texts, truncation=True, padding=False, max_length=512, add_special_tokens=False)
        return {"input_ids": out["input_ids"]}

    # Tokenize each split with multiple worker processes. The original text
    # columns are removed, leaving only `input_ids`, so a split that is already
    # tokenized (cell re-run, or `raw_datasets` reused from an earlier run) is
    # skipped rather than failing on the missing "prompt" column.
    for split_name, num_proc in (("train", 8), ("validation", 4)):
        split = raw_datasets[split_name]
        if "input_ids" in split.column_names and "prompt" not in split.column_names:
            print(f"{split_name} split is already pre-tokenized; skipping.")
            continue
        print(f"üîÅ Pre-tokenizing {split_name} split (num_proc={num_proc})...")
        raw_datasets[split_name] = split.map(
            tokenize_batch,
            batched=True,
            batch_size=1024,
            remove_columns=split.column_names,
            num_proc=num_proc,
        )
    print("‚úÖ Pre-tokenization complete.")
else:
    print("‚ö†Ô∏è Skipping pre-tokenization; collator will tokenise on the fly.")


In [None]:
# ============================================
# Cell 4: Data collator (pre-tokenized mode only)
# ============================================
import os
import sys
import torch

# --- Add project root to sys.path (for both .py & Jupyter modes) ---
# The project root is taken to be the parent of this file's directory, or of
# the current working directory when __file__ is undefined (interactive use).
try:
    # __file__ is defined when run as a .py script
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # Fallback for interactive or Jupyter mode
    current_dir = os.getcwd()

project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"üß© Added project root to sys.path: {project_root}")

# --- Import the project collator (requires project_root on sys.path) ---
from src.collator import DataCollatorForPricePrediction

# --- Instantiate collator ---
collator = DataCollatorForPricePrediction(tokenizer=tokenizer, max_length=512)

# Tokenizer sanity check
print("üß† Collator initialized successfully:")
print(f"   ‚Ä¢ Tokenizer pad_token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
print(f"   ‚Ä¢ Collator max_length: {collator.max_length}")

# Simple smoke test on the first four training examples.
# The collator is handed a plain list of example dicts, which is what a
# DataLoader passes to `collate_fn` during training, so the test exercises the
# same input type the Trainer will use.
if len(raw_datasets["train"]) >= 4:
    sample = raw_datasets["train"].select(range(4))
    batch = collator(list(sample))
    print("‚úÖ Collator test successful.")
    for k, v in batch.items():
        print(f"{k}: {tuple(v.shape)} dtype={v.dtype}")
else:
    print("‚ö†Ô∏è Not enough samples for collator quick test.")


In [None]:
# ============================================
# Cell 5: Load model in 4-bit and prepare for k-bit training
# ============================================
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Single source of truth for the half-precision dtype used in this cell.
# Previously the weights were loaded as bfloat16 while the 4-bit compute dtype
# and the later downcast used float16; using one dtype keeps the load
# precision, the bitsandbytes compute precision and the fp16 mixed-precision
# training configured in the TrainingArguments cell consistent.
COMPUTE_DTYPE = torch.float16

# --- Memory-efficient quantization config (4-bit NF4 with double quantization) ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=COMPUTE_DTYPE,
)

print("üîß Loading model in 4-bit (this can take some time)...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",                    # let the loader place layers on the available devices
    trust_remote_code=True,
    quantization_config=bnb_config,
    token=HF_TOKEN,                       # replaces deprecated use_auth_token
    torch_dtype=COMPUTE_DTYPE,            # same dtype as the 4-bit compute path
)
print("‚úÖ Model loaded in 4-bit.")

# --- Resize embeddings if the tokenizer has more entries than the embedding matrix ---
if model.get_input_embeddings().weight.shape[0] < len(tokenizer):
    model.resize_token_embeddings(len(tokenizer))
    print("Resized model embeddings.")

# --- Prepare for QLoRA fine-tuning ---
print("‚öôÔ∏è Preparing model for k-bit training with gradient checkpointing...")
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# --- Convert leftover FP32 tensors to half precision to reclaim VRAM ---
# NOTE(review): this casts every float32 parameter, which presumably includes
# anything prepare_model_for_kbit_training upcast for stability (e.g. norm
# weights) - confirm the memory saving is worth that trade-off.
for p in model.parameters():
    if p.dtype == torch.float32:
        p.data = p.data.to(COMPUTE_DTYPE)
print("‚úÖ Model prepared and memory optimized (FP32 ‚Üí FP16 where safe).")

# --- LoRA target modules: attention and MLP projection layers ---
target_modules = ['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj']

# --- LoRA configuration ---
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# --- Apply PEFT (runs after the downcast above, so adapters are not touched by it) ---
model = get_peft_model(model, lora_config)

# --- Print summary of trainable parameters ---
# `trainable` is also checked by the sanity assert in the TrainingArguments cell.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.6f}%)")


In [None]:
# ============================================
# Cell 6: Optimized TrainingArguments and Trainer initialization (plain Python)
# ============================================
from transformers import TrainingArguments, Trainer

# Hardware-informed params (2x A6000)
PER_DEVICE_TRAIN_BATCH = 12           # lower to 8 on OOM
GRADIENT_ACCUM_STEPS = 2              # micro-batches accumulated per optimizer step
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4

# Checkpoint/eval frequency (in optimizer steps)
EVAL_STEPS = 30000
SAVE_STEPS = 30000
LOGGING_STEPS = 1000

# NOTE(review): `evaluation_strategy` (below) and Trainer's `tokenizer=` have
# newer spellings in recent transformers releases (`eval_strategy`,
# `processing_class`); left unchanged because the installed version is not
# visible from this notebook.
training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,
    per_device_eval_batch_size=PER_DEVICE_TRAIN_BATCH,
    gradient_accumulation_steps=GRADIENT_ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    fp16=True,                        # fp16 mixed precision
    bf16=False,                       # bf16 training explicitly disabled
    optim="adamw_bnb_8bit",           # bitsandbytes 8-bit AdamW
    warmup_ratio=0.03,
    logging_steps=LOGGING_STEPS,
    evaluation_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=3,
    dataloader_num_workers=8,         # parallel dataloading
    group_by_length=True,
    gradient_checkpointing=True,      # trade compute for memory
    remove_unused_columns=False,      # keep dataset columns intact for the custom collator
    torch_compile=False,
    report_to="none"                  # no external experiment logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=raw_datasets["train"],
    eval_dataset=raw_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=collator
)

# --- Expected steps and sanity checks ---
# The model-loading cell uses device_map="auto", which spreads one copy of the
# model over the visible GPUs rather than replicating it per GPU, so the raw
# CUDA device count is not a data-parallel multiplier. The effective batch is
# therefore taken from the Trainer's own arguments (read after Trainer
# construction, which may adjust the GPU count it uses for batching).
num_train = len(trainer.train_dataset)
gpus = torch.cuda.device_count() or 1
effective_batch = (
    trainer.args.train_batch_size
    * trainer.args.world_size
    * trainer.args.gradient_accumulation_steps
)
steps_per_epoch = math.ceil(num_train / effective_batch)
total_steps = steps_per_epoch * NUM_EPOCHS
print(f"Num train examples: {num_train:,}; GPUs: {gpus}")
print(f"Effective batch size per optimizer step: {effective_batch}")
print(f"Steps/epoch ‚âà {steps_per_epoch:,}; Total steps ‚âà {total_steps:,}")

assert num_train > 1000, "Train dataset suspiciously small."
assert trainable > 0, "Trainable params = 0. Check LoRA target_modules or get_peft_model."


In [None]:
# # Cell 7: Measure dataloader throughput (pre-tokenized smoke test)
# import time
# from torch.utils.data import DataLoader

# # small slice of dataset for timing
# sample_ds = raw_datasets["train"].select(range(0, 512))

# # plain dataloader that just pads via collator (pretokenized)
# dl = DataLoader(sample_ds, batch_size=training_args.per_device_train_batch_size,
#                 shuffle=False, collate_fn=collator, num_workers=4)

# n_batches = 10
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# t0 = time.time()
# for i, batch in enumerate(dl):
#     if i >= n_batches:
#         break
#     batch = {k: v.to(device) for k, v in batch.items() if torch.is_tensor(v)}
#     with torch.no_grad():
#         outputs = trainer.model(**batch)
# t1 = time.time()

# avg_batch_time = (t1 - t0) / n_batches
# it_per_sec = 1.0 / avg_batch_time
# print(f"‚ö° Avg batch time (data+forward): {avg_batch_time:.3f}s  =>  {it_per_sec:.2f} it/s")

# effective_step_time = avg_batch_time * training_args.gradient_accumulation_steps
# print(f"‚è±Ô∏è  Est. optimizer step time (grad_accum={training_args.gradient_accumulation_steps}): "
#       f"{effective_step_time:.3f}s  =>  {1.0/effective_step_time:.2f} steps/s")
# print("Use these numbers to estimate total ETA: total_steps * step_time")


In [None]:
# ============================================
# Cell 8: Start training (Plain Python version)
# ============================================
import time, sys, os

print("üöÄ Starting full QLoRA fine-tuning (plain Python, no accelerate)...")

# --- Optional resume support from CLI ---
# Looks for `--resume_from_checkpoint <path>` in sys.argv. The path is only
# used when it exists on disk; otherwise training really does start fresh
# (previously the missing path was still forwarded to trainer.train() despite
# the "Starting fresh" message).
resume_from_checkpoint = None
if "--resume_from_checkpoint" in sys.argv:
    idx = sys.argv.index("--resume_from_checkpoint")
    if idx + 1 < len(sys.argv):
        requested_checkpoint = sys.argv[idx + 1]
        if os.path.exists(requested_checkpoint):
            resume_from_checkpoint = requested_checkpoint
            print(f"üîÅ Resuming training from checkpoint: {resume_from_checkpoint}")
        else:
            print(f"‚ö†Ô∏è  Checkpoint path not found: {requested_checkpoint}. Starting fresh.")

# --- Training start ---
start_time = time.time()
try:
    train_result = trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    trainer.save_model(str(OUTPUT_DIR))      # saves adapters and config
    trainer.save_state()
    print(f"‚úÖ Model + training state saved to: {OUTPUT_DIR}")
except Exception as e:
    # Surface the failure in the cell output, then re-raise so the traceback
    # is preserved and the duration report below is not printed.
    print(f"‚ùå Training crashed due to error: {e}")
    raise

# --- Duration report (reached only when training and saving succeeded) ---
elapsed = time.time() - start_time
hours = int(elapsed // 3600)
minutes = int((elapsed % 3600) // 60)
seconds = int(elapsed % 60)
print(f"‚úÖ Training completed in {hours}h {minutes}m {seconds}s")
print("Train result summary:", train_result)
