In [None]:
# === Colab: install required libraries (run once) ===
!pip install -q transformers datasets accelerate sentencepiece openpyxl wandb
!pip install -U transformers datasets accelerate


Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-4.4.1 pya

In [None]:

# === Full optimized training script for Colab ===
import os
import random
from pathlib import Path
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    pipeline,
)

# ===== USER CONFIG =====
# Everything a user is expected to tune lives in this block; later cells only read these names.
DRIVE_DATA_FOLDER = "/content/veda_files"   # folder scanned for data files (.csv/.xlsx/.xls); one model is trained per file
OUTPUT_BASE = "/content/veda_finetuned_models"   # parent folder; each trained model is saved in its own sub-folder
os.makedirs(OUTPUT_BASE, exist_ok=True)   # create the output folder up front so later saves cannot fail on a missing parent

MODEL_NAME = "t5-small"     # checkpoint name passed to from_pretrained; safe default for Colab, change only if your GPU can handle it
EPOCHS = 3                  # number of training epochs per file
BATCH_SIZE = 4              # desired effective batch size; the training cell maps it to per-device batch size and gradient accumulation
LEARNING_RATE = 2e-4        # learning rate handed to TrainingArguments
MAX_SOURCE_LENGTH = 256     # inputs are truncated to this many tokens
MAX_TARGET_LENGTH = 256     # targets are truncated to this many tokens; also used as max_length for the sample generations
SEED = 42                   # shared seed for the RNGs below and for dataset shuffling/splitting
USE_FP16 = True             # only takes effect when CUDA is available; set False if fp16 is unsupported or causes issues
AUTO_RETRY_ON_OOM = True    # allow one automatic retry when training hits a CUDA out-of-memory error
# =======================

# --- Seed Python's and PyTorch's random number generators ---
random.seed(SEED)
torch.manual_seed(SEED)

# Train on the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)



Device: cpu


In [None]:
# --- Memory fragmentation mitigation ---
# Allocator option for PyTorch's CUDA caching allocator.
# NOTE(review): this should be set before any CUDA memory is allocated to take effect - confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# --- helper: find files in data folder ---
DATA_FILE_SUFFIXES = {".csv", ".xlsx", ".xls"}
# Excel lock files ("~$book.xlsx") and macOS sidecar files ("._book.csv") carry a
# data suffix but are not readable tables, so they are skipped.
IGNORED_NAME_PREFIXES = ("~$", "._")

data_path = Path(DRIVE_DATA_FOLDER)
if not data_path.is_dir():
    # also covers the case where the path exists but is a regular file
    raise FileNotFoundError(f"Data folder not found: {DRIVE_DATA_FOLDER}")
files = sorted(
    p for p in data_path.iterdir()
    if p.is_file()                                   # a directory named "x.csv" is not a data file
    and p.suffix.lower() in DATA_FILE_SUFFIXES
    and not p.name.startswith(IGNORED_NAME_PREFIXES)
)
if len(files) == 0:
    raise FileNotFoundError(f"No CSV/XLSX files found in {DRIVE_DATA_FOLDER}")
print(f"Found {len(files)} files to process.")

# --- tokenizer (shared) ---
# One tokenizer instance is reused for every file because all runs start from MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
print("Loaded tokenizer:", MODEL_NAME)



Found 1 files to process.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Loaded tokenizer: t5-small


In [None]:
# --- helper: autodetect columns ---
def detect_input_target_columns(df: pd.DataFrame):
    """Guess which columns of ``df`` hold the model input and the training target.

    Detection runs in two stages:

    1. Column labels are compared case-insensitively (ignoring surrounding
       whitespace) against well-known names such as "input"/"prompt" and
       "target"/"answer". Earlier names in each candidate list win.
    2. Any role still unresolved falls back to position: the first text column
       for the input and the second text column for the target. If the frame
       has no text columns at all, all columns are used instead. A column
       already assigned to the other role is skipped whenever another column
       is available, so the two roles only coincide when there is no choice.

    Args:
        df: Frame whose columns should be inspected. Labels need not be strings.

    Returns:
        Tuple ``(input_col, target_col)`` of original column labels.

    Raises:
        IndexError: If ``df`` has no columns.
    """
    input_candidates = ["input", "prompt", "text", "source", "question", "context", "sentence"]
    target_candidates = ["target", "label", "completion", "output", "answer", "summary"]

    columns = list(df.columns)
    # str() keeps non-string labels (e.g. integer headers) from breaking the match
    normalized = [str(c).strip().lower() for c in columns]

    def first_named(candidates):
        """Return the original label of the first candidate name present, else None."""
        for cand in candidates:
            if cand in normalized:
                return columns[normalized.index(cand)]
        return None

    def first_other_than(ordered, taken):
        """Return the first label in ``ordered`` that differs from ``taken``."""
        for col in ordered:
            if taken is None or col != taken:
                return col
        # every option is the already-taken column: reuse it rather than fail
        return ordered[0]

    input_col = first_named(input_candidates)
    target_col = first_named(target_candidates)
    if input_col is not None and target_col is not None:
        return input_col, target_col

    # Text columns are object-dtype columns or columns using pandas' string dtype.
    text_cols = [
        c for c in columns
        if df[c].dtype == object or isinstance(df[c].dtype, pd.StringDtype)
    ]
    pool = text_cols if text_cols else columns

    if input_col is None:
        input_col = first_other_than(pool, target_col)
    if target_col is None:
        # prefer the second column of the pool for the target, then the rest in order
        target_col = first_other_than(pool[1:] + pool[:1], input_col)
    return input_col, target_col

# --- preprocessing (no padding here: use dynamic padding in data_collator) ---
def preprocess_batch(batch):
    """Tokenize one batch of examples for seq2seq training.

    Each input text is prefixed with "generate: " and truncated to
    MAX_SOURCE_LENGTH tokens; each target text is truncated to
    MAX_TARGET_LENGTH tokens. Nothing is padded here, so the data collator can
    pad each training batch to its own longest sequence.

    Args:
        batch: Mapping with list-valued "input_text" and "target_text" entries,
            as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the inputs, with the target token ids added
        under "labels".
    """
    inputs = ["generate: " + s for s in batch["input_text"]]
    model_inputs = tokenizer(inputs, max_length=MAX_SOURCE_LENGTH, truncation=True, padding=False)
    # text_target= is the supported replacement for the deprecated
    # tokenizer.as_target_tokenizer() context manager.
    labels = tokenizer(
        text_target=batch["target_text"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding=False,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs




In [None]:
# --- iterate over files and train file-by-file to avoid memory growth ---

def load_training_model():
    """Load a fresh MODEL_NAME checkpoint configured for memory-lean training."""
    fresh_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    try:
        fresh_model.gradient_checkpointing_enable()   # reduce activations memory
    except Exception as err:
        # best-effort: keep training without checkpointing, but do not hide why
        print("Gradient checkpointing could not be enabled:", err)
    # ensure model does not use cached kv during training (saves memory)
    fresh_model.config.use_cache = False
    # Trainer would move the model as well; doing it here keeps `device` authoritative
    return fresh_model.to(device)


def argmax_logits_for_metrics(logits, labels):
    """Reduce evaluation logits to token ids before Trainer accumulates them.

    Without this hook the Trainer keeps the full vocabulary-sized logits of
    every validation example in memory and hands them to compute_metrics.
    """
    if isinstance(logits, tuple):
        # seq2seq models return extra tensors next to the LM logits
        logits = logits[0]
    return logits.argmax(dim=-1)


def compute_metrics(eval_pred):
    """Exact-match rate between decoded predictions and decoded labels.

    Predictions are the per-position argmax ids produced by
    argmax_logits_for_metrics, i.e. teacher-forced predictions rather than
    free-running generations. Positions where the label is the ignore index
    (-100) are blanked in both sequences so padding never affects the match.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]
    pad_id = tokenizer.pad_token_id
    pred_ids, label_ids = [], []
    for pred_row, label_row in zip(preds, labels):
        pred_ids.append([
            pad_id if (lab == -100 or tok < 0) else int(tok)
            for tok, lab in zip(pred_row, label_row)
        ])
        label_ids.append([pad_id if lab == -100 else int(lab) for lab in label_row])
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    exact = sum(int(a.strip() == b.strip()) for a, b in zip(decoded_preds, decoded_labels)) / max(1, len(decoded_preds))
    return {"exact_match": exact}


def build_trainer(model, output_dir, train_tokenized, val_tokenized, per_device, grad_accum):
    """Create a Trainer (with fresh optimizer state) for one training attempt."""
    # dynamic padding - more memory efficient than padding all to max_length
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100, pad_to_multiple_of=None)
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=per_device,
        per_device_eval_batch_size=per_device,
        gradient_accumulation_steps=grad_accum,
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        weight_decay=0.01,
        logging_steps=50,
        fp16=(USE_FP16 and torch.cuda.is_available()),
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        report_to="none",
        remove_unused_columns=False,
    )
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        processing_class=tokenizer,   # replaces the deprecated `tokenizer=` argument
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=argmax_logits_for_metrics,
    )


def release_memory():
    """Collect unreachable objects and return cached CUDA blocks to the driver."""
    import gc

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


# Folder suffix derived from the checkpoint name ("t5-small" -> "t5_small").
model_tag = MODEL_NAME.split("/")[-1].replace("-", "_")

for file_path in files:
    name = file_path.stem
    print("\n\n=== Processing file:", file_path.name, "-> model will be saved to:", name, "===\n")

    # 1) load data
    if file_path.suffix.lower() in [".csv"]:
        df = pd.read_csv(file_path, low_memory=False)
    else:
        df = pd.read_excel(file_path)
    print("Original shape:", df.shape)

    input_col, target_col = detect_input_target_columns(df)
    print("Detected columns:", input_col, "(input)  |  ", target_col, "(target)")

    df = df.dropna(subset=[input_col, target_col]).astype({input_col: str, target_col: str})
    print("After dropping NA:", df.shape)
    display(df[[input_col, target_col]].head(2))

    # preserve_index=False: the post-dropna index is not data and should not become a column
    hf_ds = Dataset.from_pandas(
        df[[input_col, target_col]].rename(columns={input_col: "input_text", target_col: "target_text"}),
        preserve_index=False,
    )
    hf_ds = hf_ds.shuffle(seed=SEED)
    split = hf_ds.train_test_split(test_size=0.1, seed=SEED)
    train_ds = split["train"]
    val_ds = split["test"]
    print("Train size:", len(train_ds), "Val size:", len(val_ds))

    # 2) tokenize (batched)
    train_tokenized = train_ds.map(preprocess_batch, batched=True, remove_columns=train_ds.column_names)
    val_tokenized = val_ds.map(preprocess_batch, batched=True, remove_columns=val_ds.column_names)

    # 3) compute per-device and grad_accum such that effective batch ~= BATCH_SIZE
    per_device = 1 if torch.cuda.is_available() else max(1, BATCH_SIZE)
    grad_accum = max(1, BATCH_SIZE // per_device)
    print(f"Training config: per_device_train_batch_size={per_device}, gradient_accumulation_steps={grad_accum} (effective batch ~ {per_device*grad_accum})")

    output_dir = os.path.join(OUTPUT_BASE, f"{name}_{model_tag}")
    os.makedirs(output_dir, exist_ok=True)

    # 4) plan the attempts: the normal settings, plus (optionally) one retry that
    #    halves the per-device batch while keeping the effective batch size.
    #    Per-device batch size is the setting that drives peak memory here;
    #    gradient accumulation does not.
    attempts = [(per_device, grad_accum)]
    if AUTO_RETRY_ON_OOM:
        retry_per_device = max(1, per_device // 2)
        attempts.append((retry_per_device, max(1, BATCH_SIZE // retry_per_device)))

    # 5) train; every attempt gets a fresh model AND a fresh Trainer so that no
    #    optimizer/scheduler state from a failed attempt refers to a stale model
    print("Starting training for:", file_path.name)
    ok = False
    for attempt, (attempt_bs, attempt_accum) in enumerate(attempts):
        is_retry = attempt > 0
        if is_retry:
            print(f"Retrying training with a fresh model and Trainer: per_device_train_batch_size={attempt_bs}, gradient_accumulation_steps={attempt_accum}")
        print("Loading model:", MODEL_NAME)
        model = load_training_model()
        trainer = build_trainer(model, output_dir, train_tokenized, val_tokenized, attempt_bs, attempt_accum)

        failure = None
        try:
            trainer.train()
        except Exception as err:
            is_oom = isinstance(err, RuntimeError) and "out of memory" in str(err).lower()
            if not is_retry and not is_oom:
                # anything but OOM on the first attempt is a real error: surface it
                raise
            failure = str(err)

        if failure is None:
            ok = True
            break

        if is_retry:
            print("Retry also failed:", failure)
        else:
            print("CUDA OOM detected during trainer.train(). Attempting cleanup and retry...")
        # Free memory outside the except block: while the exception is being
        # handled its traceback still references the frames holding the tensors.
        del trainer, model
        release_memory()

    if not ok:
        print(f"Training failed for {file_path.name}. Skipping to next file.")
        release_memory()
        continue

    # 6) Save model & tokenizer to output_dir
    print("Saving model to:", output_dir)
    # use_cache was disabled for training only; restore it so the saved config
    # does not slow down generation for whoever loads this model
    trainer.model.config.use_cache = True
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # 7) quick inference test (uses saved model path)
    device_for_pipeline = 0 if (device == "cuda") else -1
    gen_pipe = pipeline("text2text-generation", model=output_dir, tokenizer=output_dir, device=device_for_pipeline)
    sample = val_ds.shuffle(seed=SEED).select(range(min(3, len(val_ds))))
    for i, row in enumerate(sample):
        inp = row["input_text"]
        print(f"\n-- Example {i+1} --\nINPUT:\n{inp}\nTARGET:\n{row['target_text']}\nPREDICTION:")
        preds = gen_pipe("generate: " + inp, max_length=MAX_TARGET_LENGTH, do_sample=False)
        print(preds[0]["generated_text"])

    # 8) free memory fully before next dataset (the pipeline holds its own model copy)
    del gen_pipe, trainer, model
    release_memory()

print("\nAll files processed.")



=== Processing file: veda_fo_events_nse_hyperrealistic_100k.xlsx -> model will be saved to: veda_fo_events_nse_hyperrealistic_100k ===

Original shape: (100000, 17)
Detected columns: event_id (input)  |   time_ist (target)
After dropping NA: (100000, 17)


Unnamed: 0,event_id,time_ist
0,VEDA_FO_EVT_NSE_000001,2025-01-16 09:54:33
1,VEDA_FO_EVT_NSE_000002,2025-02-14 10:04:32


Train size: 90000 Val size: 10000


Map:   0%|          | 0/90000 [00:00<?, ? examples/s]



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Loading model: t5-small


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Training config: per_device_train_batch_size=4, gradient_accumulation_steps=1 (effective batch ~ 4)
Starting training for: veda_fo_events_nse_hyperrealistic_100k.xlsx


  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 