<a href="https://colab.research.google.com/github/jorge-martinez-gil/colab-notebooks/blob/main/GraphCodeBERT%2BFeatures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""
Runtime/Resource benchmarking for:
- Baseline: GraphCodeBERT classifier
- Ours: GraphCodeBERT + additional scalar feature

No wandb. Works on old/new transformers.
Dataset JSON fields: code1, code2, score (0/1), output (float)

Author: Jorge Martinez-Gil
"""

# ---- Disable external loggers ----
import os
# These switches are set before the ML libraries are imported below:
# Weights & Biases is turned off and transformers advisory warnings are muted.
os.environ.update({
    "WANDB_DISABLED": "true",
    "WANDB_MODE": "disabled",
    "WANDB_SILENT": "true",
    "TRANSFORMERS_NO_ADVISORY_WARNINGS": "true",
})

import time, json, random, urllib.request
from dataclasses import dataclass
from typing import Optional, Dict, Any

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# ----------------------------
# Config
# ----------------------------
# Hugging Face checkpoint name; loaded for both the tokenizer and the encoder.
MODEL_NAME = "microsoft/graphcodebert-base"
# Remote location of the dataset and the local file it is downloaded to.
DATASET_URL = "https://www.jorgemar.com/data/data2.json"
DATASET_PATH = "data2.json"
# Token length each code pair is truncated/padded to (padding="max_length").
MAX_LENGTH = 512
# Passed to TrainingArguments as per_device_train_batch_size.
BATCH_SIZE = 8
# Passed to TrainingArguments as num_train_epochs.
EPOCHS = 3
# NOTE(review): EVAL_STEPS, SAVE_STEPS, WARMUP_STEPS and WEIGHT_DECAY are not
# referenced by the code visible here -- confirm whether they are meant to be
# forwarded to TrainingArguments.
EVAL_STEPS = 500
SAVE_STEPS = 500
WARMUP_STEPS = 500
WEIGHT_DECAY = 0.01
# Seed handed to set_seed() at the start of main().
SEED = 42
# NOTE(review): LAT_SAMPLES is not referenced by the code visible here;
# presumably a sample count for latency measurements -- confirm.
LAT_SAMPLES = 64

# Fetch the dataset once. The download goes to a temporary sibling file that is
# moved into place only after the transfer completes: writing straight to
# DATASET_PATH would leave a truncated file behind on an interrupted download,
# and the existence check would then accept that broken file on every later run.
if not os.path.exists(DATASET_PATH):
    _tmp_path = DATASET_PATH + ".part"
    try:
        urllib.request.urlretrieve(DATASET_URL, _tmp_path)
        os.replace(_tmp_path, DATASET_PATH)
    finally:
        # After a successful os.replace the temp file no longer exists, so
        # this only cleans up the leftovers of a failed download.
        if os.path.exists(_tmp_path):
            os.remove(_tmp_path)

def set_seed(seed=42):
    """Seed every random number generator this script relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy, and PyTorch (the
    default generator and all CUDA devices), then sets the cuDNN flags to
    deterministic mode with benchmarking turned off.

    Args:
        seed: Integer seed shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def compute_metrics(eval_pred):
    """Compute binary classification metrics from raw model scores.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The predicted class is
            the argmax of ``predictions`` over its last axis.

    Returns:
        A dict with the keys ``accuracy``, ``f1``, ``precision`` and
        ``recall``. Precision, recall and F1 use binary averaging and are
        reported as 0 where they would otherwise be undefined.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    precision, recall, f1_score, _support = precision_recall_fscore_support(
        gold, predicted, average='binary', zero_division=0
    )
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score,
        "precision": precision,
        "recall": recall,
    }

class CodePairDataset(Dataset):
    """Dataset of code pairs loaded from a JSON file.

    The file is parsed once in the constructor; each record is expected to
    provide the keys ``code1``, ``code2``, ``score`` and ``output``.
    Tokenization happens lazily, one record per ``__getitem__`` call.
    """

    def __init__(self, path, tokenizer, max_length, use_feature=True):
        """Load the records and remember the tokenization settings.

        Args:
            path: Path of the UTF-8 encoded JSON file to load.
            tokenizer: Callable that encodes a pair of code strings.
            max_length: Length every encoded pair is truncated/padded to.
            use_feature: When False, the extra scalar feature is replaced
                by 0.0 instead of being read from the record.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.use_feature = use_feature
        with open(path, "r", encoding="utf-8") as handle:
            self.data = json.load(handle)

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode record ``idx`` and return it as a dict of tensors.

        The result holds the tokenizer outputs with their leading batch
        dimension squeezed away, plus ``labels`` (long tensor built from
        ``score``) and ``output_feature`` (float scalar tensor).
        """
        record = self.data[idx]
        encoded = self.tokenizer(
            record["code1"],
            record["code2"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        sample = {}
        for key, tensor in encoded.items():
            # The tokenizer returns a batch of one; drop that batch axis.
            sample[key] = tensor.squeeze(0)

        sample["labels"] = torch.tensor(record["score"], dtype=torch.long)

        if self.use_feature:
            feature_value = float(record["output"])
        else:
            feature_value = 0.0
        sample["output_feature"] = torch.tensor(feature_value)
        return sample

class GCBaseline(nn.Module):
    """Baseline classifier: pretrained encoder followed by a linear head.

    The encoder is loaded from ``MODEL_NAME``; the head maps the encoder's
    hidden size to two output logits.
    """

    def __init__(self):
        super().__init__()
        self.enc = AutoModel.from_pretrained(MODEL_NAME)
        hidden_size = self.enc.config.hidden_size
        self.cls = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask=None, labels=None, output_feature=None):
        """Classify a batch from the hidden state at the first position.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional mask passed to the encoder.
            labels: Optional targets; when given, a cross-entropy loss is
                computed against the logits.
            output_feature: Accepted but not used by this baseline.

        Returns:
            A ``SequenceClassifierOutput`` carrying ``logits`` and ``loss``
            (``None`` when no labels were supplied).
        """
        encoder_out = self.enc(input_ids, attention_mask)
        first_token_state = encoder_out.last_hidden_state[:, 0]
        logits = self.cls(first_token_state)

        if labels is None:
            loss = None
        else:
            criterion = nn.CrossEntropyLoss()
            loss = criterion(logits, labels)
        return SequenceClassifierOutput(loss=loss, logits=logits)

def main():
    """Fine-tune the baseline classifier on the code-pair dataset.

    Seeds the random number generators, loads the tokenizer for
    ``MODEL_NAME``, builds the dataset from ``DATASET_PATH``, and trains a
    ``GCBaseline`` model with the Hugging Face ``Trainer``. Trainer output is
    written under ``./out`` and external reporting is switched off.
    """
    set_seed(SEED)
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    ds = CodePairDataset(DATASET_PATH, tok, MAX_LENGTH)
    model = GCBaseline()
    # TrainingArguments carries its own seed (default 42) that the Trainer
    # applies itself. Forward SEED so the config constant actually governs the
    # Trainer's seeding instead of only matching the default by coincidence.
    args = TrainingArguments(output_dir="./out", per_device_train_batch_size=BATCH_SIZE,
                             num_train_epochs=EPOCHS, seed=SEED, report_to=["none"])
    Trainer(model=model, args=args, train_dataset=ds).train()


# Start training only when the file is executed as a script.
if __name__ == "__main__":
    main()
