In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import torch

In [None]:
# Pull only the first 50 rows of the test split of the CodeXGLUE
# defect-detection benchmark so the evaluation stays small.
dataset = load_dataset(
    "google/code_x_glue_cc_defect_detection",
    split="test[:50]",
)

# Print the first record to see which fields each example carries.
first_example = dataset[0]
print(first_example)




In [None]:
# Checkpoint used for the evaluation.
model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"

# Probe for the MPS backend once and derive every dependent setting from it:
# half precision + automatic placement when MPS is present, otherwise plain
# float32 with no device map.
use_mps = torch.backends.mps.is_available()
if use_mps:
    device, weight_dtype, placement = "mps", torch.float16, "auto"
else:
    device, weight_dtype, placement = "cpu", torch.float32, None

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=weight_dtype, device_map=placement
)

print(f"Using device: {device}")


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Using device: cpu


In [None]:
from tqdm import tqdm

# Ground-truth labels and model predictions, one entry per example
# (1 = buggy, 0 = not buggy).
y_true, y_pred = [], []

for example in tqdm(dataset, desc="Evaluating"):
    code = example["func"]
    prompt = f"Is the following code buggy? Answer only True or False.\n\n{code}\n\nAnswer:"

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Remember how many tokens belong to the prompt so they can be stripped
    # from the generated sequence below.
    prompt_len = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=10,
        do_sample=False,  # greedy decoding keeps the evaluation reproducible
        pad_token_id=tokenizer.eos_token_id,  # explicit, avoids the per-call warning
    )

    # generate() returns prompt + continuation. The prompt itself contains the
    # words "True or False", so decoding the whole sequence would make a
    # substring check for "true" succeed on every example. Decode only the
    # newly generated tokens.
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).lower()

    # Use whichever verdict appears first in the answer; an answer with
    # neither word falls back to 0, as before.
    true_at = answer.find("true")
    false_at = answer.find("false")
    pred = 1 if true_at != -1 and (false_at == -1 or true_at < false_at) else 0

    y_pred.append(pred)
    # Store the target as a plain int so it has the same type as the predictions.
    y_true.append(int(example["target"]))


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Evaluating:   2%|▏         | 1/50 [01:33<1:16:27, 93.62s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Evaluating:   4%|▍         | 2/50 [03:27<1:24:28, 105.60s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Evaluating:   6%|▌         | 3/50 [03:57<55:40, 71.08s/it]   Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Evaluating:   8%|▊         | 4/50 [07:11<1:31:34, 119.44s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Evaluating:  10%|█         | 5/50 [08:33<1:19:25, 105.89s/it]Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Evaluating:  12%|█▏        | 6/50 [08:48<55:12, 75.29s/it]   Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Evaluating:  14%|█▍        | 7/50 [09:17<43:07, 60.17s/it]Setting `pad_token_id` 

In [None]:
# Compute evaluation metrics. Label 1 is the positive class for
# precision/recall/F1; zero_division=0 reports 0 instead of warning when a
# denominator is empty.
acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
# Pin both classes so the matrix is always 2x2 (rows = true 0/1,
# columns = predicted 0/1), even if only one class shows up in the data.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

# Print results
print(f"Accuracy:  {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall:    {rec:.3f}")
print(f"F1 Score:  {f1:.3f}")
print("\nConfusion Matrix:\n", cm)


Accuracy:  0.600
Precision: 0.600
Recall:    1.000
F1 Score:  0.750

Confusion Matrix:
 [[ 0 20]
 [ 0 30]]
