<a href="https://colab.research.google.com/github/jorge-martinez-gil/colab-notebooks/blob/main/GraphCodeBERT%2BFeatures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -*- coding: utf-8 -*-
"""
Runtime/Resource benchmarking for:
- Baseline: GraphCodeBERT classifier
- Ours: GraphCodeBERT + additional scalar feature

No wandb. Works on old/new transformers.
Dataset JSON fields: code1, code2, score (0/1), output (float)

Author: Jorge Martinez-Gil
"""

# ---- Disable external loggers (esp. wandb) ----
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_SILENT"] = "true"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"

import time, json, random
from dataclasses import dataclass
from typing import Optional, Dict, Any

import os, urllib.request
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

from transformers import (
    AutoTokenizer, AutoModel,
    Trainer, TrainingArguments
)
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# ----------------------------
# Config
# ----------------------------
MODEL_NAME = "microsoft/graphcodebert-base"
DATASET_URL = "https://www.jorgemar.com/data/data2.json"
DATASET_PATH = "data2.json"
MAX_LENGTH = 512
BATCH_SIZE = 8
EPOCHS = 3
EVAL_STEPS = 500          # used only if the runtime supports step-based eval
SAVE_STEPS = 500          # used only if the runtime supports step-based save
WARMUP_STEPS = 500
WEIGHT_DECAY = 0.01
SEED = 42
LAT_SAMPLES = 64          # samples for latency timing


# Download dataset if not present
if not os.path.exists(DATASET_PATH):
    print(f"Downloading dataset from {DATASET_URL}")
    urllib.request.urlretrieve(DATASET_URL, DATASET_PATH)

# ----------------------------
# Utils
# ----------------------------
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def count_params(model: nn.Module) -> int:
    return sum(p.numel() for p in model.parameters())

def gpu_name() -> str:
    return torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"

def reset_peak_mem():
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats(0)

def peak_gpu_gb() -> float:
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        alloc = torch.cuda.max_memory_allocated(0) / (1024 ** 3)
        reserv = torch.cuda.max_memory_reserved(0) / (1024 ** 3)
        return max(alloc, reserv)
    return 0.0

def compute_metrics(eval_pred):
    # Works with both older and newer HF EvalPrediction objects
    if isinstance(eval_pred, (tuple, list)):
        preds, labels = eval_pred
    else:
        preds, labels = eval_pred.predictions, eval_pred.label_ids
    preds = np.argmax(preds, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# ----------------------------
# Dataset
# ----------------------------
class CodePairDataset(Dataset):
    def __init__(self, file_path: str, tokenizer: AutoTokenizer, max_length: int = 512, use_feature: bool = True):
        with open(file_path, "r", encoding="utf-8") as f:
            self.data = json.load(f)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.use_feature = use_feature

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        item = self.data[idx]
        enc = self.tokenizer(
            text=item["code1"],
            text_pair=item["code2"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        enc = {k: v.squeeze(0) for k, v in enc.items()}  # remove batch dim
        enc["labels"] = torch.tensor(int(item["score"]), dtype=torch.long)
        if self.use_feature:
            enc["output_feature"] = torch.tensor(float(item["output"]), dtype=torch.float)
        else:
            enc["output_feature"] = torch.tensor(0.0, dtype=torch.float)  # placeholder for baseline
        return enc

    def __len__(self) -> int:
        return len(self.data)

# ----------------------------
# Models
# ----------------------------
class GCBaseline(nn.Module):
    """Baseline GraphCodeBERT classifier using CLS."""
    def __init__(self, num_labels: int = 2):
        super().__init__()
        self.num_labels = num_labels
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(getattr(self.encoder.config, "hidden_dropout_prob", 0.1))
        self.classifier = nn.Linear(self.encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None, output_feature=None):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls)
        logits = self.classifier(x)
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
        return SequenceClassifierOutput(loss=loss, logits=logits)

class GCOurs(nn.Module):
    """GraphCodeBERT + scalar feature projected and concatenated to CLS."""
    def __init__(self, num_labels: int = 2, feature_dim: int = 1):
        super().__init__()
        self.num_labels = num_labels
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(getattr(self.encoder.config, "hidden_dropout_prob", 0.1))
        self.feature_proj = nn.Linear(feature_dim, self.encoder.config.hidden_size)
        self.classifier = nn.Linear(self.encoder.config.hidden_size * 2, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None, output_feature=None):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls = outputs.last_hidden_state[:, 0, :]
        feat = self.feature_proj(output_feature.unsqueeze(-1))
        x = torch.cat([cls, feat], dim=1)
        x = self.dropout(x)
        logits = self.classifier(x)
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
        return SequenceClassifierOutput(loss=loss, logits=logits)

# ----------------------------
# Training / Eval Runner
# ----------------------------
@dataclass
class RunResult:
    variant: str
    params_m: float
    train_hours: float
    latency_ms: float
    gpu: str
    peak_mem_gb: float
    batch_size: int
    seq_len: int
    f1: Optional[float] = None
    precision: Optional[float] = None
    recall: Optional[float] = None
    accuracy: Optional[float] = None

def make_splits(dataset: Dataset, train_ratio=0.8):
    n = len(dataset)
    n_train = int(train_ratio * n)
    n_val = (n - n_train) // 2
    n_test = n - n_train - n_val
    return random_split(dataset, [n_train, n_val, n_test], generator=torch.Generator().manual_seed(SEED))

def measure_latency(model: nn.Module, test_dataset: Dataset, lat_samples: int = 64) -> float:
    dl = DataLoader(test_dataset, batch_size=1, shuffle=False)
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Warmup a few steps
    with torch.no_grad():
        for i, batch in enumerate(dl):
            batch = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}
            _ = model(**{k: v for k, v in batch.items() if k in ("input_ids", "attention_mask", "labels", "output_feature")})
            if i >= 3: break

    if torch.cuda.is_available(): torch.cuda.synchronize()
    t1 = time.perf_counter()
    with torch.no_grad():
        i = 0
        for batch in dl:
            batch = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}
            _ = model(**{k: v for k, v in batch.items() if k in ("input_ids", "attention_mask", "labels", "output_feature")})
            i += 1
            if i >= min(lat_samples, len(test_dataset)): break
    if torch.cuda.is_available(): torch.cuda.synchronize()
    t2 = time.perf_counter()
    return ((t2 - t1) / max(1, i)) * 1000.0

def build_training_args(variant_name: str) -> TrainingArguments:
    """
    Version-robust TrainingArguments:
    - Try new-style args first (eval/save strategies).
    - If unsupported, fall back to minimal args.
    """
    base = dict(
        output_dir=f"./results_{variant_name.replace(' ', '_').lower()}",
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        learning_rate=5e-5,
        warmup_steps=WARMUP_STEPS,
        weight_decay=WEIGHT_DECAY,
        logging_dir=f"./logs_{variant_name.replace(' ', '_').lower()}",
        seed=SEED,
    )
    # Try modern API
    try:
        return TrainingArguments(
            evaluation_strategy="steps",
            eval_steps=EVAL_STEPS,
            save_strategy="steps",
            save_steps=SAVE_STEPS,
            report_to=["none"],  # no wandb/tensorboard
            **base,
        )
    except TypeError:
        # Older API fallback
        return TrainingArguments(**base)

def run_variant(variant_name: str, model: nn.Module, dataset: CodePairDataset) -> RunResult:
    set_seed(SEED)
    train_ds, val_ds, test_ds = make_splits(dataset)
    training_args = build_training_args(variant_name)

    # Try to build Trainer with eval hooks; if it fails, fallback to train-only
    try:
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            compute_metrics=compute_metrics,
        )
        supports_eval = True
    except TypeError:
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
        )
        supports_eval = False

    # ---- Train & time
    reset_peak_mem()
    t0 = time.perf_counter()
    trainer.train()
    train_seconds = time.perf_counter() - t0
    train_hours = train_seconds / 3600.0

    # ---- Evaluate (test)
    if supports_eval:
        _ = trainer.evaluate(eval_dataset=val_ds)  # optional val metrics
        test_metrics = trainer.evaluate(eval_dataset=test_ds)
        f1 = float(test_metrics.get("eval_f1", float("nan")))
        prec = float(test_metrics.get("eval_precision", float("nan")))
        rec  = float(test_metrics.get("eval_recall", float("nan")))
        acc  = float(test_metrics.get("eval_accuracy", float("nan")))
    else:
        # Manual evaluation for older versions
        preds, labels = [], []
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device); model.eval()
        dl = DataLoader(test_ds, batch_size=1, shuffle=False)
        with torch.no_grad():
            for batch in dl:
                b = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}
                out = model(**{k: v for k, v in b.items() if k in ("input_ids", "attention_mask", "labels", "output_feature")})
                pred = int(out.logits.argmax(dim=-1).detach().cpu().item())
                label = int(b["labels"].item())
                preds.append(pred); labels.append(label)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
        acc = accuracy_score(labels, preds)
        prec = float(precision); rec = float(recall)

    # ---- Resource stats
    peak_mem = peak_gpu_gb()
    params_m = count_params(model) / 1e6

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    latency_ms = measure_latency(model, test_ds, LAT_SAMPLES)

    return RunResult(
        variant=variant_name,
        params_m=params_m,
        train_hours=train_hours,
        latency_ms=latency_ms,
        gpu=gpu_name(),
        peak_mem_gb=peak_mem,
        batch_size=BATCH_SIZE,
        seq_len=MAX_LENGTH,
        f1=f1, precision=prec, recall=rec, accuracy=acc
    )

# ----------------------------
# Printing helpers
# ----------------------------
def print_summary_table(results):
    print("\n=== RUNTIME/RESOURCE SUMMARY ===")
    header = f"{'Variant':40s} {'Params(M)':>10s} {'Train(h)':>10s} {'Infer(ms)':>10s} {'GPU':>12s} {'PeakMem(GB)':>12s} {'Batch':>7s} {'Seq':>5s}"
    print(header)
    print("-" * len(header))
    for r in results:
        print(f"{r.variant:40s} {r.params_m:10.2f} {r.train_hours:10.2f} {r.latency_ms:10.2f} {r.gpu:12s} {r.peak_mem_gb:12.2f} {r.batch_size:7d} {r.seq_len:5d}")

def print_metrics_table(results):
    print("\n=== TEST METRICS (for context) ===")
    header = f"{'Variant':40s} {'F1':>8s} {'Prec':>8s} {'Rec':>8s} {'Acc':>8s}"
    print(header)
    print("-" * len(header))
    for r in results:
        print(f"{r.variant:40s} {r.f1 if r.f1 is not None else float('nan'):8.4f} "
              f"{r.precision if r.precision is not None else float('nan'):8.4f} "
              f"{r.recall if r.recall is not None else float('nan'):8.4f} "
              f"{r.accuracy if r.accuracy is not None else float('nan'):8.4f}")

def print_latex_table(results):
    print("\n=== LaTeX (copy/paste) ===")
    print(r"\begin{table}[t]")
    print(r"\centering")
    print(r"\caption{Training and inference resources on NVIDIA A100 (same setup for all rows).}")
    print(r"\label{tab:runtime}")
    print(r"\setlength{\tabcolsep}{6pt}")
    print(r"\renewcommand{\arraystretch}{1.2}")
    print(r"\begin{tabular}{l"
          r"                S[table-format=3.2]"
          r"                S[table-format=2.2]"
          r"                S[table-format=2.2]"
          r"                l"
          r"                S[table-format=2.2]"
          r"                S[table-format=2.0]"
          r"                S[table-format=3.0]}")
    print(r"\toprule")
    print(r"{Variant} & {Params (M)} & {Train (h)} & {Infer (ms)} & {GPU} & {Peak Mem (GB)} & {Batch} & {Seq} \\")
    print(r"\midrule")
    for r in results:
        print(fr"{r.variant} &  {r.params_m:.2f} & {r.train_hours:.2f} & {r.latency_ms:.2f} & {r.gpu} & {r.peak_mem_gb:.2f} & {r.batch_size} & {r.seq_len} \\")
    print(r"\bottomrule")
    print(r"\end{tabular}")
    print(r"\vspace{2mm}")
    print(r"\footnotesize")
    print(fr"\textbf{{Setup:}} PyTorch \texttt{{{torch.__version__}}}, CUDA \texttt{{{torch.version.cuda}}}. "
          fr"Seed = {SEED}, epochs = {EPOCHS}, warmup = {WARMUP_STEPS}, weight decay = {WEIGHT_DECAY}, eval/save steps = {EVAL_STEPS}.")
    print(r"\end{table}")

# ----------------------------
# Main
# ----------------------------
def main():
    if not os.path.exists(DATASET_PATH):
        raise FileNotFoundError(f"Dataset not found at {DATASET_PATH}.")
    set_seed(SEED)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    ds_baseline = CodePairDataset(DATASET_PATH, tokenizer, MAX_LENGTH, use_feature=False)
    ds_ours = CodePairDataset(DATASET_PATH, tokenizer, MAX_LENGTH, use_feature=True)

    baseline_model = GCBaseline(num_labels=2)
    ours_model = GCOurs(num_labels=2, feature_dim=1)

    res_baseline = run_variant("GraphCodeBERT (baseline)", baseline_model, ds_baseline)
    res_ours = run_variant("Ours: + additional feature", ours_model, ds_ours)

    results = [res_baseline, res_ours]

    print_summary_table(results)
    print_metrics_table(results)
    print_latex_table(results)
    print("\nTip: keep batch size and seq length identical across rows; mention any param/memory delta in the caption.")

if __name__ == "__main__":
    main()





Downloading dataset from https://www.jorgemar.com/data/data2.json


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss



=== RUNTIME/RESOURCE SUMMARY ===
Variant                                   Params(M)   Train(h)  Infer(ms)          GPU  PeakMem(GB)   Batch   Seq
-----------------------------------------------------------------------------------------------------------------
GraphCodeBERT (baseline)                     124.65       0.04      31.14 Tesla T4             4.02       8   512
Ours: + additional feature                   124.65       0.04      31.42 Tesla T4             4.48       8   512

=== TEST METRICS (for context) ===
Variant                                        F1     Prec      Rec      Acc
----------------------------------------------------------------------------
GraphCodeBERT (baseline)                   0.9859   1.0000   0.9722   0.9783
Ours: + additional feature                 0.8788   0.9667   0.8056   0.8261

=== LaTeX (copy/paste) ===
\begin{table}[t]
\centering
\caption{Training and inference resources on NVIDIA A100 (same setup for all rows).}
\label{tab:runtime}
\setl