In [None]:
%%bash
# Locate the tokenized-dataset archive on the mounted Drive.
# Both commands are best effort: `|| true` keeps the cell from failing.
DRIVE_ROOT="/content/drive/MyDrive"
ARCHIVE_NAME="fever_tokenized_distilbert.zip"

# Search up to four directory levels below the Drive root for the archive.
find "$DRIVE_ROOT" -maxdepth 4 -type f -name "$ARCHIVE_NAME" -print || true
# Show the copy at the top-level location the next cell reads from.
ls -lh "$DRIVE_ROOT/$ARCHIVE_NAME" || true

In [None]:
%%bash
# Extract the tokenized dataset archive into the Drive root.
# Any existing item whose name could collide with the archive contents is
# renamed to <name>.bak.<timestamp> first, so `unzip -o` cannot overwrite it.
TS=$(date +%s)   # epoch seconds; makes backup names unique per run
BASE="/content/drive/MyDrive"
ZIP="$BASE/fever_tokenized_distilbert.zip"
DEST="$BASE"

if [ ! -f "$ZIP" ]; then
  echo "Zip not found at $ZIP. Upload it to My Drive and re-run this cell."
else
  echo "Backing up possible existing top-level dataset files..."
  CONFLICTS=("dataset_dict.json" "train" "validation" "test" "fever_tokenized_distilbert")
  for name in "${CONFLICTS[@]}"; do
    if [ -e "$DEST/$name" ]; then
      echo "Backing up $name -> ${name}.bak.$TS"
      mv "$DEST/$name" "$DEST/${name}.bak.$TS"
    fi
  done

  echo "Unzipping (overwrite) $ZIP -> $DEST"
  unzip -o "$ZIP" -d "$DEST"
  status=$?
  # Info-ZIP unzip: 0 = success, 1 = warnings but processing completed.
  # Anything higher means the archive was not (fully) extracted, so stop here
  # instead of reporting success and letting later cells fail confusingly.
  if [ "$status" -gt 1 ]; then
    echo "Unzip FAILED (exit $status) for $ZIP; dataset was not extracted." >&2
    exit "$status"
  fi
  echo "Unzip finished. Top-level listing:"
  ls -la "$DEST" | sed -n '1,200p'
fi

In [None]:
%%bash
# Gather the extracted dataset pieces into one folder on Drive.
DRIVE_ROOT="/content/drive/MyDrive"
TOKENIZED_DIR="$DRIVE_ROOT/fever_tokenized_distilbert"

mkdir -p "$TOKENIZED_DIR"

# Move each expected part into the folder; a missing part (or any other mv
# error) is silently ignored, matching the best-effort intent of this step.
for part in dataset_dict.json train validation test; do
  mv "$DRIVE_ROOT/$part" "$TOKENIZED_DIR/" 2>/dev/null || true
done

echo "Contents of tokenized folder now:"
ls -la "$TOKENIZED_DIR" | sed -n '1,200p' || true

In [None]:
%%bash
# Fetch the project repository into the Colab filesystem.
REPO_URL="https://github.com/kvj-085/NLP_project.git"
REPO_DIR="/content/repo"

# `|| true`: a failed clone (e.g. the directory already exists on re-run)
# does not fail the cell.
git clone "$REPO_URL" "$REPO_DIR" || true
# List the first 200 lines of the checkout so the result can be eyeballed.
ls -la "$REPO_DIR" | sed -n '1,200p'

In [None]:
import os, sys

repo_dir = "/content/repo"

# Stop immediately when the checkout is missing; nothing below can work without it.
repo_missing = not os.path.exists(repo_dir)
if repo_missing:
    raise SystemExit(f"Repo not found at {repo_dir} — run the clone/copy step first or update repo_dir")

# Put the repo at the front of the import path (once) and make it the working directory.
already_importable = repo_dir in sys.path
if not already_importable:
    sys.path.insert(0, repo_dir)
os.chdir(repo_dir)

# Report where we are and whether the `src` directory is present.
print("cwd:", os.getcwd())
top_level_entries = os.listdir('.')
print("repo top-level:", top_level_entries[:200])
print("src present?:", os.path.exists('src'))

In [None]:
%%bash
# Install only the listed top-level packages; --no-deps leaves the
# dependencies already present in the runtime untouched.
PACKAGES=(
  transformers
  datasets
  tokenizers
  huggingface-hub
  safetensors
  fsspec
  scikit-learn
  accelerate
)
pip install -q --no-deps "${PACKAGES[@]}"
echo "Top-level packages installed (no-deps). If a specific import fails, install that single package."

In [None]:
import transformers, datasets, torch
from sklearn.metrics import accuracy_score, f1_score
print("transformers", transformers.__version__)
print("datasets", datasets.__version__)
print("torch", torch.__version__, "cuda available:", torch.cuda.is_available())
!nvidia-smi || true

In [None]:
import os
from datasets import load_from_disk

tokenized_path = '/content/drive/MyDrive/fever_tokenized_distilbert'

# Check once whether the dataset folder is there, then branch on the result.
path_found = os.path.exists(tokenized_path)
print('exists:', path_found)

if not path_found:
    print('Tokenized dataset not found at', tokenized_path)
else:
    ds = load_from_disk(tokenized_path)
    # Number of rows in each split.
    print({name: len(part) for name, part in ds.items()})
    train_columns = ds['train'].column_names
    print('train columns:', train_columns)
    if 'input_ids' in train_columns:
        # Length of the first training example's token id sequence.
        first_example = ds['train'][0]
        print('sample input_ids len:', len(first_example['input_ids']))

In [None]:
# Import check for the project's training module: print where it was loaded
# from, or show the failure details and re-raise so the cell stops.
try:
    from src.models import train as train_module
    print("Imported src.models.train OK —", train_module.__file__)
except Exception as import_error:
    import traceback

    print("Import failed:", type(import_error).__name__, import_error)
    traceback.print_exc()
    raise

In [None]:
import os
# Set both Weights & Biases environment variables in one call, then echo
# them back so the effective values are visible in the cell output.
os.environ.update(WANDB_MODE='offline', WANDB_DISABLED='true')
print('WANDB_MODE=', os.environ.get('WANDB_MODE'))
print('WANDB_DISABLED=', os.environ.get('WANDB_DISABLED'))

In [None]:
from src.models.train import run_training
# Smoke test: call run_training with epochs=0 and a small batch before the
# full run, writing to a separate output directory.
# NOTE(review): how run_training treats epochs=0 is defined in
# src/models/train.py, which is not visible in this notebook — confirm there.
res = run_training(
    processed_data_dir='/content/drive/MyDrive/fever_tokenized_distilbert',
    model_name='distilbert-base-uncased',
    output_dir='/content/drive/MyDrive/outputs/finetune_test',
    # Pass the same num_labels as the full training call below, so the dry run
    # validates the configuration that will actually be trained rather than
    # whatever run_training defaults to.
    num_labels=3,
    epochs=0,
    batch_size=8,
    max_length=128,
    save_tokenized=False,
    gradient_accumulation_steps=1,
    fp16=False
)
print("Dry-run returned:", type(res))

In [None]:
%%bash
# Print one CSV row per GPU with the fields below (no header, no units).
# `|| true` keeps the cell green when nvidia-smi is unavailable.
GPU_FIELDS="index,name,memory.total,memory.free"
nvidia-smi --query-gpu="$GPU_FIELDS" --format=csv,noheader,nounits || true

In [None]:
import time
from src.models.train import run_training

processed_data_dir = '/content/drive/MyDrive/fever_tokenized_distilbert'
output_dir = '/content/drive/MyDrive/outputs/finetune_distilbert'

print("Starting training: model=distilbert-base-uncased, max_length=128")
start = time.time()
try:
    trainer, tokenized = run_training(
        processed_data_dir=processed_data_dir,
        model_name='distilbert-base-uncased',
        output_dir=output_dir,
        num_labels=3,
        epochs=3,
        batch_size=16,                      
        max_length=128,
        save_tokenized=False,
        gradient_accumulation_steps=2,       # effective batch = 16*2 = 32
        fp16=True
    )
    elapsed = time.time() - start
    print(f"Training finished in {elapsed/60:.1f} minutes. Check checkpoints in: {output_dir}")
except RuntimeError as e:
    # catch common CUDA OOM errors and give fallback advice
    msg = str(e)
    print("Training failed with RuntimeError:", msg)
    if 'out of memory' in msg.lower() or 'cuda' in msg.lower():
        print("\nCUDA OOM detected. Recommended fallback options:")
        print("- Reduce per-device `batch_size` (e.g. 8) and increase `gradient_accumulation_steps` (e.g. 4).")
        print("- Or try batch_size=8, gradient_accumulation_steps=4 (effective batch 32).")
        print("- After adjusting, restart the runtime and re-run the training cell.")
    else:
        raise

# show saved outputs (if any)
print("\nDrive outputs (top-level):")
!ls -la /content/drive/MyDrive/outputs | sed -n '1,200p' || true
print("\nCheckpoint folder listing (if exists):")
!ls -la "{output_dir}" | sed -n '1,200p' || true

In [None]:
%%bash
# Quick sanity check: list the contents of the training output directory.
RUN_DIR="/content/drive/MyDrive/outputs/finetune_distilbert"
ls -la "$RUN_DIR" | sed -n '1,200p'

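GPU evaluation can crash with a CUDA device-side assert when there is even a small data or label mismatch (for example, a label outside `[0, num_labels)` or a token id beyond the vocabulary size). CUDA reports such errors asynchronously and leaves the GPU context unusable, so recovering usually means restarting the runtime. Running evaluation on the CPU avoids these crashes: the same problem surfaces as an ordinary Python exception with a readable traceback, and the process stays alive. It is slower, but much more stable.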

In [None]:
# Manual CPU-only inference (no Trainer) — avoids CUDA device-side asserts
import os, glob, numpy as np, pandas as pd, torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_from_disk

tokenized_path = '/content/drive/MyDrive/fever_tokenized_distilbert'
outdir = '/content/drive/MyDrive/outputs/finetune_distilbert'
base_model_name = 'distilbert-base-uncased'
batch_size = 64


def _checkpoint_step(path):
    """Return the integer after the last '-' in a checkpoint directory name, or -1 if it is not a number."""
    suffix = os.path.basename(os.path.normpath(path)).rsplit("-", 1)[-1]
    try:
        return int(suffix)
    except ValueError:
        return -1


# Pick the most recent checkpoint. Order by the step number in the directory
# name first: a directory's mtime changes whenever files are added or copied
# into it, so mtime alone can select an older checkpoint. mtime is kept only
# as a tie-breaker. Entries that are not directories are ignored.
# NOTE: despite the variable name, this is the *latest* checkpoint, not
# necessarily the best-scoring one.
ckpts = sorted(
    (p for p in glob.glob(os.path.join(outdir, "checkpoint-*")) if os.path.isdir(p)),
    key=lambda p: (_checkpoint_step(p), os.path.getmtime(p)),
)
if not ckpts:
    raise SystemExit("No checkpoints found")
best_ckpt = ckpts[-1]
print("Using checkpoint:", best_ckpt)
print("Checkpoint files:", os.listdir(best_ckpt))

# load dataset
ds = load_from_disk(tokenized_path)
print("Splits:", {k: len(v) for k, v in ds.items()})
print("Validation columns:", ds['validation'].column_names)
print("Test columns:", ds['test'].column_names)

# Load tokenizer from the checkpoint; if that fails for any reason, fall back
# to the base model's tokenizer from the Hub.
try:
    tokenizer = AutoTokenizer.from_pretrained(best_ckpt, local_files_only=True)
    print("Loaded tokenizer from checkpoint (local).")
except Exception as e:
    print("Checkpoint tokenizer load failed:", repr(e))
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    print("Loaded tokenizer from Hub:", base_model_name)

# Load model to CPU explicitly and switch to eval mode for inference.
model = AutoModelForSequenceClassification.from_pretrained(best_ckpt, local_files_only=True)
model.to('cpu')
model.eval()
print("Model num_labels:", model.config.num_labels)

# Helper to run inference on a split using DataLoader
from torch.utils.data import DataLoader

def infer_split(split, split_name, model, batch_size=64, outdir=outdir):
    """Run CPU inference over one dataset split and save predictions to CSV.

    Args:
        split: dataset split exposing `column_names` and `with_format`
            (e.g. one entry of the DatasetDict loaded above).
        split_name: label used in the CSV file name (`predictions_<split_name>.csv`).
        model: sequence-classification model; called as `model(**inputs)` and
            expected to return an object with a `.logits` attribute.
        batch_size: number of examples per forward pass.
        outdir: directory the CSV is written to (created if missing).

    Returns:
        `(preds, labels)` where `preds` is a NumPy array of argmax class ids and
        `labels` is a NumPy array of the split's labels, or `None` when the
        split has no `label`/`labels` column (or yielded no labels).

    Raises:
        ValueError: if the split has none of the supported model input columns.
    """
    # determine which input fields exist
    input_fields = [f for f in ('input_ids', 'attention_mask', 'token_type_ids') if f in split.column_names]
    if not input_fields:
        raise ValueError(
            f"Split '{split_name}' has no model input columns; found {split.column_names}"
        )
    label_field = next((cand for cand in ('label', 'labels') if cand in split.column_names), None)

    # Use with_format (returns a new formatted dataset) rather than set_format,
    # which would change the caller's dataset in place and affect later cells
    # that read columns from it.
    fmt_cols = input_fields + ([label_field] if label_field else [])
    torch_split = split.with_format(type='torch', columns=fmt_cols)
    # shuffle=False keeps row order, so `idx` in the CSV matches dataset order.
    dl = DataLoader(torch_split, batch_size=batch_size, shuffle=False)

    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dl:
            # move inputs to CPU (they are already torch tensors)
            inputs = {k: batch[k].to('cpu') for k in input_fields}
            logits = model(**inputs).logits
            all_preds.extend(torch.argmax(logits, dim=-1).cpu().numpy().tolist())
            if label_field:
                all_labels.extend(batch[label_field].cpu().numpy().tolist())

    # Save CSV; when labels are unavailable the label column is filled with -1.
    os.makedirs(outdir, exist_ok=True)
    df = pd.DataFrame({
        "idx": np.arange(len(all_preds)),
        "prediction": all_preds,
        "label": (all_labels if len(all_labels) == len(all_preds) else [-1] * len(all_preds))
    })
    csv_path = os.path.join(outdir, f"predictions_{split_name}.csv")
    df.to_csv(csv_path, index=False)
    print(f"Saved {split_name} predictions to: {csv_path}")

    return np.array(all_preds), (np.array(all_labels) if all_labels else None)

from sklearn.metrics import accuracy_score, f1_score


def _labels_usable(labels, num_labels):
    """Return True when labels exist and every value lies in [0, num_labels)."""
    return labels is not None and labels.min() >= 0 and labels.max() < num_labels


def _print_metrics(title, y_true, y_pred):
    """Print accuracy and macro-F1 for one split."""
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    print(f"{title} — accuracy: {acc:.6f}, macro_f1: {macro_f1:.6f}")


num_labels = model.config.num_labels

# Run validation (compute metrics)
val_preds, val_labels = infer_split(ds['validation'], "validation", model, batch_size=batch_size)
if val_labels is None:
    print("No validation labels found: cannot compute metrics.")
elif not _labels_usable(val_labels, num_labels):
    # Same range guard as for the test split: out-of-range labels would
    # otherwise be scored as if they were a real class.
    print("Validation labels invalid for metric computation; metrics skipped. Predictions saved.")
else:
    _print_metrics("Validation", val_labels, val_preds)

# Run test (save predictions). If test labels are invalid (e.g., -1) we skip metrics.
# The label range is read from what infer_split returns, so this works whether
# the column is called `label` or `labels`, or is absent altogether.
test_preds, test_labels = infer_split(ds['test'], "test", model, batch_size=batch_size)
if test_labels is not None:
    print("Test label min,max:", test_labels.min(), test_labels.max())

if _labels_usable(test_labels, num_labels):
    _print_metrics("Test", test_labels, test_preds)
else:
    print("Test labels invalid for metric computation (likely -1); metrics skipped. Predictions saved.")

In [None]:
import os

# Export location for the final model + tokenizer.
final_save = "/content/drive/MyDrive/outputs/finetune_distilbert/final_model"
os.makedirs(final_save, exist_ok=True)

# Save the model with the safe_serialization flag; if save_pretrained rejects
# the keyword (TypeError), fall back to the plain call.
try:
    model.save_pretrained(final_save, safe_serialization=True)
except TypeError:
    model.save_pretrained(final_save)
    print("Model saved (pytorch) ->", final_save)
else:
    print("Model saved (safetensors) ->", final_save)

# Store the tokenizer next to the model so the folder is self-contained.
tokenizer.save_pretrained(final_save)
print("Tokenizer saved ->", final_save)

In [3]:
import os
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Directory holding the prediction CSVs. Set the PREDICTIONS_DIR environment
# variable to point elsewhere; the default keeps the original local path.
base_dir = os.environ.get("PREDICTIONS_DIR", "C:/Users/jeesh/Desktop/NLP proj/outputs/DistilBERT")
files = ["predictions_validation.csv"]

for fname in files:
    path = os.path.join(base_dir, fname)
    if not os.path.exists(path):
        print(f"Missing: {path}")
        continue

    df = pd.read_csv(path)
    if not {"label", "prediction"}.issubset(df.columns):
        print(f"{fname} missing required columns 'label' and 'prediction'.")
        continue

    # The inference cell writes label = -1 when a split has no usable labels.
    # Scoring those rows would treat -1 as a real class and distort every
    # metric, so rows without a valid (>= 0) label are excluded up front.
    has_label = df["label"] >= 0
    n_unlabeled = int((~has_label).sum())
    if n_unlabeled:
        print(f"{fname}: ignoring {n_unlabeled} row(s) without a valid label.")
    scored = df.loc[has_label]
    if scored.empty:
        print(f"{fname}: no labeled rows; metrics skipped.")
        continue

    y_true = scored["label"].astype(int).to_numpy()
    y_pred = scored["prediction"].to_numpy()

    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    # Every class seen in either column, as plain ints for clean display.
    labels_sorted = sorted(int(x) for x in set(y_true.tolist()) | set(y_pred.tolist()))
    cm = confusion_matrix(y_true, y_pred, labels=labels_sorted)

    print(f"\n=== {fname} ===")
    print(f"Accuracy: {acc:.6f}")
    print(f"Macro F1: {macro_f1:.6f}")
    print("\nConfusion Matrix:")
    print("           Predicted")
    print("           ", " ".join(f"{lab:>6}" for lab in labels_sorted))
    print("Actual")
    # Rows are true classes, columns are predicted classes (same label order).
    for lab, row in zip(labels_sorted, cm):
        print(f"  {lab:>6}   {' '.join(f'{val:>6}' for val in row)}")



=== predictions_validation.csv ===
Accuracy: 0.868537
Macro F1: 0.866913

Confusion Matrix:
           Predicted
                 0      1      2
Actual
       0     6094    572      0
       1     2054   4612      0
       2        3      0   6663
