<a href="https://colab.research.google.com/github/jitani04/AI-Bug-Detection/blob/main/CodeBert_CodeXGlue_6_AST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install dependencies

## Load the CodeXGLUE Defect Detection Dataset

In [None]:
# Imports
import pandas as pd
from datasets import load_dataset

# Fetch the defect-detection benchmark via `load_dataset`
ds = load_dataset("google/code_x_glue_cc_defect_detection")

# Materialise each split as a pandas DataFrame for exploration
train_df, val_df, test_df = (
    pd.DataFrame(ds[split_name])
    for split_name in ("train", "validation", "test")
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/21854 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2732 [00:00<?, ? examples/s]

In [None]:
# Ensure numeric labels: cast the "target" column of each exploration frame
# to int. The result is written back into the existing frames, so the
# column's original dtype is not kept.
# NOTE(review): the later cells visible in this notebook build their inputs
# from `ds` / `ds_aug` rather than from these DataFrames — confirm whether
# this cast is still needed.
for df in [train_df, val_df, test_df]:
    df["target"] = df["target"].astype(int)

In [None]:
import ast

def extract_ast_features(code):
    """
    Parse code with Python's ``ast`` module and summarise its structure.

    Args:
        code: Source text to analyse.

    Returns:
        A string of the form
        ``"[AST: funcs=N, loops=N, branches=N, depth=N, returns=N, exceptions=N]"``,
        or ``"[AST: invalid]"`` when ``ast.parse`` cannot parse ``code``.
        ``depth`` counts the root node as level 1.

    NOTE(review): ``ast.parse`` only understands Python syntax. The sample
    printed by this notebook is C source, which ends up in the
    ``"[AST: invalid]"`` branch — confirm this is the intended parser for
    the dataset.
    """
    try:
        tree = ast.parse(code)
    except Exception:
        # Best effort: anything the parser rejects is reported as invalid.
        return "[AST: invalid]"

    stats = {
        "funcs": 0, "loops": 0, "branches": 0,
        "depth": 0, "returns": 0, "exceptions": 0
    }

    # Counter name -> node types that increment it (checked in this order).
    counted_kinds = (
        ("funcs", (ast.FunctionDef, ast.AsyncFunctionDef)),
        ("loops", (ast.For, ast.While)),
        ("branches", (ast.If,)),
        ("returns", (ast.Return,)),
        ("exceptions", (ast.Try,)),
    )

    # Walk the tree with an explicit stack instead of recursion so that a
    # very deeply nested (but parseable) tree cannot raise RecursionError.
    pending = [(tree, 1)]
    while pending:
        node, depth = pending.pop()
        if depth > stats["depth"]:
            stats["depth"] = depth
        for child in ast.iter_child_nodes(node):
            for key, node_types in counted_kinds:
                if isinstance(child, node_types):
                    stats[key] += 1
                    break
            pending.append((child, depth + 1))

    return f"[AST: funcs={stats['funcs']}, loops={stats['loops']}, branches={stats['branches']}, depth={stats['depth']}, returns={stats['returns']}, exceptions={stats['exceptions']}]"

def augment_with_ast(example):
    """
    Attach an AST summary line to a dataset record.

    Reads ``example["func"]``, summarises it with ``extract_ast_features``
    and stores the source, a newline, and the summary under
    ``example["func_augmented"]``. The same (mutated) record is returned.
    """
    source = example["func"]
    summary = extract_ast_features(source)
    example["func_augmented"] = "\n".join((source, summary))
    return example

# Run the augmentation over every split held in `ds`
ds_aug = ds.map(function=augment_with_ast)

# Preview the first 300 characters of the first augmented training record
print("✅ Augmentation example:", ds_aug["train"][0]["func_augmented"][:300], sep="\n")

Map:   0%|          | 0/21854 [00:00<?, ? examples/s]

Map:   0%|          | 0/2732 [00:00<?, ? examples/s]

Map:   0%|          | 0/2732 [00:00<?, ? examples/s]

✅ Augmentation example:
static av_cold int vdadec_init(AVCodecContext *avctx)

{

    VDADecoderContext *ctx = avctx->priv_data;

    struct vda_context *vda_ctx = &ctx->vda_ctx;

    OSStatus status;

    int ret;



    ctx->h264_initialized = 0;



    /* init pix_fmts of codec */

    if (!ff_h264_vda_decoder.pix_fmts)


## Convert Dataset
Round-trip each augmented split through a pandas DataFrame to obtain a standalone Hugging Face `Dataset` for training.

In [None]:
from datasets import Dataset

# Rebuild each augmented split as a Dataset via an intermediate DataFrame
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(pd.DataFrame(ds_aug[split_name]))
    for split_name in ("train", "validation", "test")
)

## Tokenize Code

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the "microsoft/codebert-base" checkpoint; it is used
# as a notebook-level object by `tokenize_and_prepare_labels` below.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

def tokenize_and_prepare_labels(example):
    """
    Tokenize one record and attach its integer label.

    The text in ``example["func_augmented"]`` is encoded with the
    notebook-level ``tokenizer``, truncated and padded to 256 tokens, and
    ``example["target"]`` is stored as an ``int`` under ``"labels"``.

    Returns:
        The tokenizer output with the extra ``"labels"`` entry.
    """
    encoded = tokenizer(
        example["func_augmented"],
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    encoded["labels"] = int(example["target"])
    return encoded

# Tokenize every split and attach integer labels
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_and_prepare_labels)
    for split in (train_ds, val_ds, test_ds)
)

# Expose only the model inputs and labels, formatted as PyTorch tensors
columns = ["input_ids", "attention_mask", "labels"]
tokenized_train.set_format(type="torch", columns=columns)
tokenized_val.set_format(type="torch", columns=columns)
tokenized_test.set_format(type="torch", columns=columns)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Map:   0%|          | 0/21854 [00:00<?, ? examples/s]

Map:   0%|          | 0/2732 [00:00<?, ? examples/s]

Map:   0%|          | 0/2732 [00:00<?, ? examples/s]

## Train and Evaluate CodeBERT on CodeXGLUE

In [None]:
import transformers
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from torch.utils.data import DataLoader
import numpy as np
import torch

# Load the "microsoft/codebert-base" checkpoint with a sequence-classification
# head sized for two labels. The load-time message in this cell's output
# reports the classifier weights as newly initialised, so the model has to be
# fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/codebert-base", num_labels=2
)

# -------------------------------
# Metric computation
# -----------------------------
def compute_metrics(eval_pred):
    """
    Turn a ``(logits, labels)`` pair into classification metrics.

    The predicted class is the arg-max over the last axis of the logits.

    Returns:
        A dict with ``"precision"``, ``"recall"``, ``"f1"`` (each computed
        with ``zero_division=0``) and ``"accuracy"`` (fraction of predictions
        equal to the labels).
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    scorers = {
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    metrics = {
        name: scorer(labels, predicted, zero_division=0)
        for name, scorer in scorers.items()
    }
    metrics["accuracy"] = (predicted == labels).mean()
    return metrics

# -------------------------------
# Training arguments
# -------------------------------
training_args = TrainingArguments(
    # Where run artefacts are written
    output_dir="./results",
    # Optimisation schedule
    learning_rate=5e-6,
    num_train_epochs=6,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch; log every 50 steps
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    load_best_model_at_end=True,
)

class PredictionMonitorCallback(TrainerCallback):
    """
    Print how many validation examples are predicted as class 0 and class 1
    at the end of every training epoch.

    Validation data is read from the notebook-level ``tokenized_val`` and is
    processed in batches of 32.
    """

    def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """
        Run the model over the validation set and print the class counts.

        Args:
            args: Training arguments supplied by the Trainer (unused).
            state: Trainer state; ``state.epoch`` is shown in the message.
            control: Trainer control object (unused).
            **kwargs: Extra objects supplied to the hook; ``model`` is used
                when present.
        """
        # Prefer the model handed to the hook; fall back to the
        # notebook-level ``model`` when none is supplied.
        active_model = kwargs.get("model")
        if active_model is None:
            active_model = model

        # Remember the current mode so it can be restored afterwards, even
        # if inference raises part-way through.
        was_training = active_model.training
        active_model.eval()
        all_preds = []
        try:
            val_loader = DataLoader(tokenized_val, batch_size=32)
            with torch.no_grad():
                for batch in val_loader:
                    # Only the tensors the forward pass needs are moved to
                    # the model's device.
                    outputs = active_model(
                        input_ids=batch["input_ids"].to(active_model.device),
                        attention_mask=batch["attention_mask"].to(active_model.device),
                    )
                    preds = torch.argmax(outputs.logits, dim=-1)
                    all_preds.extend(preds.cpu().tolist())
        finally:
            active_model.train(was_training)

        # An explicit integer dtype keeps bincount valid for an empty list.
        counts = np.bincount(np.asarray(all_preds, dtype=np.int64), minlength=2)
        print(f"\n[Epoch {state.epoch}] Predicted counts: 0 -> {counts[0]}, 1 -> {counts[1]}")

import inspect

# Recent transformers releases take the tokenizer as `processing_class=` and
# deprecate `tokenizer=`. Pick whichever keyword the installed Trainer
# accepts so this cell works on both sides of the rename.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)
# Report the per-epoch distribution of predicted classes during training
trainer.add_callback(PredictionMonitorCallback)
trainer.train()

# Final evaluation on the held-out test split
results = trainer.evaluate(tokenized_test)
print("Test metrics:", results)

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjmitani4[0m ([33mjmitani4-ucla[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.6484,0.627614,0.69209,0.206403,0.317975,0.6153
2,0.6222,0.636173,0.909502,0.169334,0.285511,0.631772
3,0.5878,0.598503,0.611374,0.434709,0.508124,0.634334
4,0.5759,0.606387,0.672831,0.398484,0.500529,0.654466
5,0.5529,0.622584,0.740443,0.310025,0.437055,0.653001
6,0.5312,0.624122,0.666225,0.423757,0.518023,0.657394



[Epoch 1.0] Predicted counts: 0 -> 2378, 1 -> 354

[Epoch 2.0] Predicted counts: 0 -> 2511, 1 -> 221

[Epoch 3.0] Predicted counts: 0 -> 1888, 1 -> 844

[Epoch 4.0] Predicted counts: 0 -> 2029, 1 -> 703

[Epoch 5.0] Predicted counts: 0 -> 2235, 1 -> 497

[Epoch 6.0] Predicted counts: 0 -> 1977, 1 -> 755


Test metrics: {'eval_loss': 0.6224766373634338, 'eval_precision': 0.6252847380410023, 'eval_recall': 0.43745019920318723, 'eval_f1': 0.5147679324894515, 'eval_accuracy': 0.6211566617862372, 'eval_runtime': 34.5679, 'eval_samples_per_second': 79.033, 'eval_steps_per_second': 4.947, 'epoch': 6.0}


In [None]:
import torch
from torch.utils.data import DataLoader

# Pick a few examples from the validation set
sample_indices = [0, 1, 2, 3, 4]  # first 5 examples
val_samples = [tokenized_val[i] for i in sample_indices]

model.eval()
for i, sample in enumerate(val_samples):
    # Move tensors to model's device
    input_ids = sample["input_ids"].unsqueeze(0).to(model.device)
    attention_mask = sample["attention_mask"].unsqueeze(0).to(model.device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        pred = torch.argmax(outputs.logits, dim=-1).item()

    print(f"Example {i}:")
    print("Code snippet:\n", val_ds[sample_indices[i]]["func"][:500], "...")  # print first 500 chars
    print("Ground truth:", val_ds[sample_indices[i]]["target"])
    print("Model prediction:", pred)
    print("-"*50)


Example 0:
Code snippet:
 static void filter_mirror_setup(NetFilterState *nf, Error **errp)
{
    MirrorState *s = FILTER_MIRROR(nf);
    Chardev *chr;
    chr = qemu_chr_find(s->outdev);
    if (chr == NULL) {
        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
                  "Device '%s' not found", s->outdev);
    qemu_chr_fe_init(&s->chr_out, chr, errp); ...
Ground truth: True
Model prediction: 1
--------------------------------------------------
Example 1:
Code snippet:
 static inline int64_t sub64(const int64_t a, const int64_t b)

{

	return a - b;

}
 ...
Ground truth: True
Model prediction: 0
--------------------------------------------------
Example 2:
Code snippet:
 void fw_cfg_add_callback(FWCfgState *s, uint16_t key, FWCfgCallback callback,

                         void *callback_opaque, uint8_t *data, size_t len)

{

    int arch = !!(key & FW_CFG_ARCH_LOCAL);



    assert(key & FW_CFG_WRITE_CHANNEL);



    key &= FW_CFG_ENTRY_MASK;



    assert(key < FW_CFG_MAX_