## Step 1: Mount Google Drive

In [1]:
from google.colab import drive

# Mount Google Drive so later cells can read and write under /content/drive.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## Step 2: Verify GPU

In [2]:
# Print the GPU attached to this runtime, with driver/CUDA versions and memory use.
!nvidia-smi

Sun Dec  7 22:12:59 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   66C    P8             13W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Step 3: Verify Tokenized Dataset

In [3]:
import os

tokenized_path = '/content/drive/MyDrive/fever_tokenized_roberta'

if not os.path.exists(tokenized_path):
    raise FileNotFoundError(f"Tokenized dataset not found at {tokenized_path}. Run tokenization notebook first.")

# Verify contents
!ls -lh /content/drive/MyDrive/fever_tokenized_roberta/

total 13K
-rw------- 1 root root   43 Dec  7 21:51 dataset_dict.json
drwx------ 2 root root 4.0K Dec  7 21:51 test
drwx------ 2 root root 4.0K Dec  7 21:51 train
drwx------ 2 root root 4.0K Dec  7 21:51 validation


## Step 4: Clone Repository

In [4]:
# Clone your repo (update with your actual repo URL)
!git clone https://github.com/kvj-085/NLP_project.git
%cd NLP_project

Cloning into 'NLP_project'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 34 (delta 9), reused 32 (delta 7), pack-reused 0 (from 0)[K
Receiving objects: 100% (34/34), 128.75 KiB | 1.07 MiB/s, done.
Resolving deltas: 100% (9/9), done.
/content/NLP_project


## Step 5: Install Dependencies

In [5]:
# Install required packages with --no-deps to avoid breaking Colab system packages
!pip install --no-deps transformers datasets tokenizers huggingface_hub safetensors accelerate scikit-learn

print("\n=== Package versions ===")
!pip show transformers datasets torch


=== Package versions ===
Name: transformers
Version: 4.57.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.12/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers
---
Name: datasets
Version: 4.0.0
Summary: HuggingFace community-driven open-source library of datasets
Home-page: https://github.com/huggingface/datasets
Author: HuggingFace Inc.
Author-email: thomas@huggingface.co
License: Apache 2.0
Location: /usr/local/lib/python3.12/dist-packages
Requires: dill, filelock, fsspec, huggingface-hub, multiprocess, numpy, packaging, pandas, pyarrow, 

## Step 6: Verify Imports and Dataset

In [6]:
from datasets import load_from_disk
from transformers import AutoTokenizer

# Load the pre-tokenized dataset from Drive.
tokenized_path = '/content/drive/MyDrive/fever_tokenized_roberta'
ds = load_from_disk(tokenized_path)

# Summarise what was loaded: rows per split, the train schema, and the
# token length of the first training example.
split_sizes = {name: len(rows) for name, rows in ds.items()}
train_split = ds['train']
first_example = train_split[0]

print("Dataset splits:", split_sizes)
print("Train columns:", train_split.column_names)
print("Sample input_ids length:", len(first_example['input_ids']))

# Confirm the RoBERTa tokenizer can be fetched/loaded in this runtime.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
print("\nRoBERTa tokenizer loaded successfully.")

Dataset splits: {'train': 145449, 'validation': 19998, 'test': 19998}
Train columns: ['text', 'label', 'input_ids', 'attention_mask']
Sample input_ids length: 128


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


RoBERTa tokenizer loaded successfully.


## Step 7: Disable W&B

In [7]:
import os

# Keep Weights & Biases from reporting online during training.
# NOTE(review): the training output shows transformers warning that
# WANDB_DISABLED is deprecated (removal announced for v5) in favour of
# `report_to`; confirm whether run_training can forward that option.
os.environ.update({
    'WANDB_DISABLED': 'true',
    'WANDB_MODE': 'offline',
})

## Step 8: Train RoBERTa Model

In [None]:
from src.models.train import run_training

# Training configuration for T4 GPU, gathered in one place so every knob is
# visible before the (long-running) call below.
train_config = dict(
    processed_data_dir='/content/drive/MyDrive/fever_tokenized_roberta',
    model_name='roberta-base',
    output_dir='/content/drive/MyDrive/outputs/finetune_roberta',
    num_labels=3,
    epochs=3,
    batch_size=16,
    max_length=128,
    gradient_accumulation_steps=2,  # effective batch size = 16 * 2 = 32
    fp16=False,
    save_tokenized=False,  # already tokenized
)

trainer, tokenized = run_training(**train_config)

print("\n=== Training Complete ===")

⚙️  Running in WANDB offline mode
Detected tokenized dataset (input_ids present). Skipping tokenization step.


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
10,1.0291
20,0.8118
30,0.6681
40,0.5178
50,0.4682
60,0.4397
70,0.455
80,0.4704
90,0.4191
100,0.4517



=== Training Complete ===


## Step 9: List Checkpoints

In [9]:
import glob
import os

outdir = '/content/drive/MyDrive/outputs/finetune_roberta'


def checkpoint_step(path):
    """Return the training step encoded in a `checkpoint-<step>` directory name.

    Args:
        path: Path whose last component looks like `checkpoint-<step>`.

    Returns:
        The step as an int, or -1 when the suffix is not a plain number, so
        such directories sort before every real checkpoint.
    """
    suffix = os.path.basename(os.path.normpath(path)).rpartition("-")[2]
    return int(suffix) if suffix.isdecimal() else -1


# Order by step number rather than directory mtime: mtime changes whenever a
# directory's contents are touched, so it does not reliably identify the
# latest checkpoint.
ckpts = sorted(glob.glob(os.path.join(outdir, "checkpoint-*")), key=checkpoint_step)

print("Found checkpoints:")
for ckpt in ckpts:
    print(f"  {ckpt}")

if ckpts:
    print(f"\nLatest checkpoint: {ckpts[-1]}")

Found checkpoints:
  /content/drive/MyDrive/outputs/finetune_roberta/checkpoint-4546
  /content/drive/MyDrive/outputs/finetune_roberta/checkpoint-9092
  /content/drive/MyDrive/outputs/finetune_roberta/checkpoint-13638

Latest checkpoint: /content/drive/MyDrive/outputs/finetune_roberta/checkpoint-13638


## Step 10: Evaluate and Save Predictions (CPU-safe)

In [11]:
# Manual CPU-only inference to avoid CUDA device-side asserts
import os, glob, numpy as np, pandas as pd, torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_from_disk
from torch.utils.data import DataLoader

# Paths and settings are re-declared here so this cell does not depend on
# variables from the earlier cells.
tokenized_path = '/content/drive/MyDrive/fever_tokenized_roberta'
outdir = '/content/drive/MyDrive/outputs/finetune_roberta'
base_model_name = 'roberta-base'
batch_size = 64


def _checkpoint_step(path):
    """Return the step number from a `checkpoint-<step>` path, or -1 if it has none."""
    suffix = os.path.basename(os.path.normpath(path)).rpartition("-")[2]
    return int(suffix) if suffix.isdecimal() else -1


# Pick latest checkpoint. Ordering uses the step number in the directory name
# because directory mtime is not a reliable indicator of training order.
# NOTE: `best_ckpt` is the *latest* checkpoint, not necessarily the one with
# the best validation metric.
ckpts = sorted(glob.glob(os.path.join(outdir, "checkpoint-*")), key=_checkpoint_step)
if not ckpts:
    # A regular exception (instead of SystemExit) gives a normal notebook
    # traceback and matches the missing-dataset check earlier in the notebook.
    raise FileNotFoundError(f"No checkpoints found in {outdir}")
best_ckpt = ckpts[-1]
print("Using checkpoint:", best_ckpt)

# Load dataset
ds = load_from_disk(tokenized_path)
print("Splits:", {k: len(v) for k,v in ds.items()})

# Load tokenizer (fallback to hub). The fallback is deliberately broad: in the
# recorded run the checkpoint had no usable tokenizer files and loading raised
# TypeError, so the base model's tokenizer is used instead.
try:
    tokenizer = AutoTokenizer.from_pretrained(best_ckpt, local_files_only=True)
    print("Tokenizer loaded from checkpoint.")
except Exception as e:
    print("Checkpoint tokenizer load failed:", repr(e))
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    print("Loaded tokenizer from Hub:", base_model_name)

# Load model to CPU and switch to eval mode (disables dropout for inference)
model = AutoModelForSequenceClassification.from_pretrained(best_ckpt, local_files_only=True)
model.to('cpu')
model.eval()
print("Model num_labels:", model.config.num_labels)

# Helper to run inference on a split
def infer_split(split, split_name, model, batch_size=64, outdir=outdir):
    """Run batched, gradient-free inference over one split and save predictions to CSV.

    Args:
        split: Dataset split exposing `column_names` and `with_format`
            (here a split of the dataset loaded with `load_from_disk`).
        split_name: Name used in the output file `predictions_<split_name>.csv`.
        model: Classification model; called with the input tensors as keyword
            arguments and expected to return an object with `.logits`.
        batch_size: Number of examples per forward pass.
        outdir: Directory the CSV is written to (created if missing). The
            default is the value the global `outdir` had when this function
            was defined.

    Returns:
        Tuple `(preds, labels)`: `preds` is a NumPy array of argmax class ids;
        `labels` is a NumPy array of gold labels, or None when the split has
        no `label`/`labels` column (or is empty).
    """
    input_fields = [f for f in ('input_ids', 'attention_mask', 'token_type_ids') if f in split.column_names]
    label_field = next((cand for cand in ('label', 'labels') if cand in split.column_names), None)

    fmt_cols = input_fields + ([label_field] if label_field else [])
    # with_format returns a formatted copy; set_format would change the
    # caller's dataset in place and hide its other columns afterwards.
    torch_split = split.with_format(type='torch', columns=fmt_cols)
    dl = DataLoader(torch_split, batch_size=batch_size)

    # Feed inputs on whatever device the model lives on (CPU in this notebook),
    # falling back to CPU for models that expose no `device` attribute.
    device = getattr(model, 'device', 'cpu')

    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dl:
            inputs = {k: batch[k].to(device) for k in input_fields}
            logits = model(**inputs).logits
            all_preds.extend(torch.argmax(logits, dim=-1).cpu().numpy().tolist())
            if label_field:
                all_labels.extend(batch[label_field].cpu().numpy().tolist())

    # Save CSV. When no labels were collected, the label column is filled with
    # the placeholder -1 so the file keeps a fixed schema.
    os.makedirs(outdir, exist_ok=True)
    df = pd.DataFrame({
        "idx": np.arange(len(all_preds)),
        "prediction": all_preds,
        "label": (all_labels if len(all_labels) == len(all_preds) else [-1]*len(all_preds))
    })
    csv_path = os.path.join(outdir, f"predictions_{split_name}.csv")
    df.to_csv(csv_path, index=False)
    print(f"Saved {split_name} predictions to: {csv_path}")

    return np.array(all_preds), (np.array(all_labels) if all_labels else None)

from sklearn.metrics import accuracy_score, f1_score

# Validation split: predict, save, and report accuracy / macro-F1 when labels exist.
val_preds, val_labels = infer_split(ds['validation'], "validation", model, batch_size=batch_size)
if val_labels is not None:
    acc = accuracy_score(val_labels, val_preds)
    macro_f1 = f1_score(val_labels, val_preds, average='macro')
    print(f"Validation — accuracy: {acc:.6f}, macro_f1: {macro_f1:.6f}")

# Test split: show the label range first, then predict and save.
test_labels_arr = np.array(ds['test']['label'])
print("Test label min,max:", test_labels_arr.min(), test_labels_arr.max())
test_preds, test_labels = infer_split(ds['test'], "test", model, batch_size=batch_size)

# Metrics are only meaningful when every test label is a real class id.
test_labels_usable = (
    test_labels is not None
    and test_labels.min() >= 0
    and test_labels.max() < model.config.num_labels
)
if not test_labels_usable:
    print("Test labels invalid for metric computation; metrics skipped. Predictions saved.")
else:
    acc = accuracy_score(test_labels, test_preds)
    macro_f1 = f1_score(test_labels, test_preds, average='macro')
    print(f"Test — accuracy: {acc:.6f}, macro_f1: {macro_f1:.6f}")

Using checkpoint: /content/drive/MyDrive/outputs/finetune_roberta/checkpoint-13638
Splits: {'train': 145449, 'validation': 19998, 'test': 19998}
Checkpoint tokenizer load failed: TypeError('expected str, bytes or os.PathLike object, not NoneType')
Loaded tokenizer from Hub: roberta-base
Model num_labels: 3
Saved validation predictions to: /content/drive/MyDrive/outputs/finetune_roberta/predictions_validation.csv
Validation — accuracy: 0.870687, macro_f1: 0.868666
Test label min,max: -1 -1
Saved test predictions to: /content/drive/MyDrive/outputs/finetune_roberta/predictions_test.csv
Test labels invalid for metric computation; metrics skipped. Predictions saved.


## Step 11: Save Final Model and Tokenizer

In [12]:
final_save = "/content/drive/MyDrive/outputs/finetune_roberta/final_model"
os.makedirs(final_save, exist_ok=True)

# Save model: prefer safetensors; if save_pretrained rejects the
# `safe_serialization` keyword (TypeError), fall back to the default format.
try:
    model.save_pretrained(final_save, safe_serialization=True)
except TypeError:
    model.save_pretrained(final_save)
    print("Model saved (pytorch) ->", final_save)
else:
    print("Model saved (safetensors) ->", final_save)

# Save tokenizer next to the model so the folder is self-contained
tokenizer.save_pretrained(final_save)
print("Tokenizer saved ->", final_save)
print("\nDone! Final model saved to Drive.")

Model saved (safetensors) -> /content/drive/MyDrive/outputs/finetune_roberta/final_model
Tokenizer saved -> /content/drive/MyDrive/outputs/finetune_roberta/final_model

Done! Final model saved to Drive.


In [1]:
import os
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Folder holding the prediction CSVs. Set the PREDICTIONS_DIR environment
# variable to point elsewhere; the default keeps the original local path.
base_dir = os.environ.get("PREDICTIONS_DIR", "C:/Users/jeesh/Desktop/NLP proj/outputs/RoBERTa")
files = ["predictions_validation.csv"]


def labeled_rows(df):
    """Return only the rows of `df` whose `label` is a real class id (>= 0).

    The prediction CSVs use -1 as a placeholder for unknown labels; scoring
    those rows would count a non-existent class as ground truth.
    """
    return df[df["label"] >= 0]


for fname in files:
    path = os.path.join(base_dir, fname)
    if not os.path.exists(path):
        print(f"Missing: {path}")
        continue

    df = pd.read_csv(path)
    if not {"label", "prediction"}.issubset(df.columns):
        print(f"{fname} missing required columns 'label' and 'prediction'.")
        continue

    scored = labeled_rows(df)
    if scored.empty:
        print(f"{fname} has no labeled rows (labels are all -1); metrics skipped.")
        continue

    y_true = scored["label"].values
    y_pred = scored["prediction"].values

    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    # Plain ints (not NumPy scalars) so the labels print cleanly.
    labels_sorted = [int(x) for x in sorted(set(y_true) | set(y_pred))]
    cm = confusion_matrix(y_true, y_pred, labels=labels_sorted)

    print(f"\n=== {fname} ===")
    print(f"Accuracy: {acc:.6f}")
    print(f"Macro F1: {macro_f1:.6f}")
    print("\nConfusion Matrix:")
    print("           Predicted")
    # 11-character prefix matches the row prefix below so the header columns
    # line up with the counts.
    print(" " * 11 + " ".join(f"{lab:>6}" for lab in labels_sorted))
    print("Actual")
    for lab, row in zip(labels_sorted, cm):
        print(f"  {lab:>6}   {' '.join(f'{val:>6}' for val in row)}")



=== predictions_validation.csv ===
Accuracy: 0.870687
Macro F1: 0.868666

Confusion Matrix:
           Predicted
                 0      1      2
Actual
       0     6200    466      0
       1     2120   4546      0
       2        0      0   6666
