# Load the Database

In [1]:
from datasets import load_dataset
import pandas as pd

# The official, stable CodeXGLUE dataset from Microsoft for Defect Detection
dataset_name = "code_x_glue_cc_defect_detection"

print(f"Attempting to load the canonical dataset: '{dataset_name}'")

try:
    # This dataset is large, so the download may take a moment
    dataset = load_dataset(dataset_name)
    
    print("\n✅ OFFICIAL BENCHMARK DATASET LOADED SUCCESSFULLY!")
    print("--- Dataset Structure ---")
    print(dataset)
    
    # Explore the training data
    df = dataset['train'].to_pandas()
    print("\nFirst 5 rows:")
    print(df.head())

    # Check the label column, which is named 'target'
    # 0 = correct, 1 = incorrect/defective
    print("\nLabel distribution:")
    print(df['target'].value_counts())
    
except Exception as e:
    print(f"\n❌ An error occurred during loading: {e}")

  from .autonotebook import tqdm as notebook_tqdm


Attempting to load the canonical dataset: 'code_x_glue_cc_defect_detection'


Downloading readme: 100%|██████████| 5.60k/5.60k [00:00<00:00, 6.14kB/s]
Downloading data: 100%|██████████| 17.8M/17.8M [00:06<00:00, 2.91MB/s]
Downloading data: 100%|██████████| 2.21M/2.21M [00:01<00:00, 1.86MB/s]
Downloading data: 100%|██████████| 2.23M/2.23M [00:01<00:00, 2.19MB/s]
Generating train split: 100%|██████████| 21854/21854 [00:00<00:00, 247718.59 examples/s]
Generating validation split: 100%|██████████| 2732/2732 [00:00<00:00, 238184.92 examples/s]
Generating test split: 100%|██████████| 2732/2732 [00:00<00:00, 254138.23 examples/s]


✅ OFFICIAL BENCHMARK DATASET LOADED SUCCESSFULLY!
--- Dataset Structure ---
DatasetDict({
    train: Dataset({
        features: ['id', 'func', 'target', 'project', 'commit_id'],
        num_rows: 21854
    })
    validation: Dataset({
        features: ['id', 'func', 'target', 'project', 'commit_id'],
        num_rows: 2732
    })
    test: Dataset({
        features: ['id', 'func', 'target', 'project', 'commit_id'],
        num_rows: 2732
    })
})

First 5 rows:
   id                                               func  target project  \
0   0  static av_cold int vdadec_init(AVCodecContext ...   False  FFmpeg   
1   1  static int transcode(AVFormatContext **output_...   False  FFmpeg   
2   2  static void v4l2_free_buffer(void *opaque, uin...   False  FFmpeg   
3   4  int av_opencl_buffer_write(cl_mem dst_cl_buf, ...   False  FFmpeg   
4   5  static int r3d_read_rdvo(AVFormatContext *s, A...    True  FFmpeg   

                                  commit_id  
0  973b1a6b9070e2bf17d1756




In [3]:
# load tokenizer 
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

In [4]:
# define tokenization function 

def tokenize_function(examples):
    """
    Applies the tokenizer to a batch of code examples from the 'func' column.
    """
    return tokenizer(
        examples["func"], 
        padding="max_length", 
        truncation=True, 
        max_length=512
    )

In [5]:
# apply the function to the dataset 

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 21854/21854 [01:40<00:00, 217.98 examples/s]
Map: 100%|██████████| 2732/2732 [00:12<00:00, 218.02 examples/s]
Map: 100%|██████████| 2732/2732 [00:12<00:00, 215.31 examples/s]


In [6]:
# Rename the 'target' column to 'labels' for the trainer
tokenized_datasets = tokenized_datasets.rename_column("target", "labels")

# Remove the original columns that the model doesn't need for training
tokenized_datasets = tokenized_datasets.remove_columns(["id", "func", "project", "commit_id"])

# Set the format of the dataset to PyTorch tensors
tokenized_datasets.set_format("torch")

# Let's inspect the final result of one example
print(tokenized_datasets['train'][0])

{'labels': tensor(False), 'input_ids': tensor([    0, 42653,  6402,  1215, 33912,  6979,   748,   417,  1829,   438,
         1215, 25153,  1640, 10612, 47436,  3204, 48522,  1009,  1469, 49575,
           43, 50118, 50118, 45152, 50140,  1437,  1437,  1437,   468,   495,
         2606,  3204, 15362, 48522,  1009, 49575,  5457,  6402, 49575, 46613,
        25943,  1215, 23687,   131, 50140,  1437,  1437,  1437, 29916,   748,
         6106,  1215, 46796,  1009,   705,  6106,  1215, 49575,  5457,   359,
        49575, 46613,   705,  6106,  1215, 49575,   131, 50140,  1437,  1437,
         1437,  8192, 47731,  2194,   131, 50140,  1437,  1437,  1437,  6979,
         5494,   131, 50140, 50140,  1437,  1437,  1437,   740, 43820, 46613,
          298, 29137,  1215, 49722,  5457,   321,   131, 50140, 50140,  1437,
         1437,  1437, 48565, 45511,   181,  3181,  1215, 40523,  1872,     9,
        45797, 48404, 50140,  1437,  1437,  1437,   114, 48209,  3145,  1215,
          298, 29137,  12

# Load the Pre Trained Model

In [7]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/codebert-base", 
    num_labels=2
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# define training arguments

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./kodo-codebert-finetuned-defect", # Directory to save the trained model
    evaluation_strategy="epoch",          # Evaluate performance at the end of each epoch
    num_train_epochs=1,                   # We'll start with 1 epoch for a quick first run
    per_device_train_batch_size=8,        # Number of examples per batch for training
    per_device_eval_batch_size=8,         # Number of examples per batch for evaluation
    logging_steps=100,                    # Log training progress every 100 steps
    save_strategy="epoch",                # Save a checkpoint at the end of each epoch
    load_best_model_at_end=True,          # Load the best model at the end of training
)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'