
**Navigation**
1. [Dependencies and Data Loading](#Dependencies-and-Data-Loading)
2. [Dataset Stratification](#Dataset-Stratification)
3. [Model Selection](#Model-Selection)

# **Dependencies and Data Loading**
Install and import all required libraries for the analysis.

In [1]:
!pip install pandas datasets transformers torch scikit-learn

import pandas as pd
import os
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import torch
from tqdm.auto import tqdm
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import BCEWithLogitsLoss
from sklearn.model_selection import train_test_split

# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

# Benchmark dataset hosted on the Hugging Face Hub.
# `langs` lists the programming languages covered; each has `<lang>_train` and `<lang>_test` splits.
langs = ['java', 'python', 'pharo']
# Category names per language. Later cells use position k of each list as the name of
# entry k in a sample's `labels` vector, so the order here must not be changed.
labels = {
    'java': ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational'],
    'python': ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary'],
    'pharo': ['Keyimplementationpoints', 'Example', 'Responsibilities', 'Classreferences', 'Intent', 'Keymessages', 'Collaborators']
}
# Downloads the dataset on first use; the trailing bare `ds` displays the split summary.
ds = load_dataset('NLBSE/nlbse25-code-comment-classification')
ds

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

java_train-00000-of-00001.parquet:   0%|          | 0.00/680k [00:00<?, ?B/s]

java_test-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

python_train-00000-of-00001.parquet:   0%|          | 0.00/126k [00:00<?, ?B/s]

python_test-00000-of-00001.parquet:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

pharo_train-00000-of-00001.parquet:   0%|          | 0.00/113k [00:00<?, ?B/s]

pharo_test-00000-of-00001.parquet:   0%|          | 0.00/30.6k [00:00<?, ?B/s]

Generating java_train split:   0%|          | 0/7614 [00:00<?, ? examples/s]

Generating java_test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Generating python_train split:   0%|          | 0/1884 [00:00<?, ? examples/s]

Generating python_test split:   0%|          | 0/406 [00:00<?, ? examples/s]

Generating pharo_train split:   0%|          | 0/1298 [00:00<?, ? examples/s]

Generating pharo_test split:   0%|          | 0/289 [00:00<?, ? examples/s]

DatasetDict({
    java_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 7614
    })
    java_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1725
    })
    python_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1884
    })
    python_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 406
    })
    pharo_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1298
    })
    pharo_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 289
    })
})

**Data Analysis**

Count of positive and negative labels per category in the original training split of each language.

In [2]:
def check_label_distribution(dataset, category_labels):
    """Tally positive and negative labels per category over an iterable of samples.

    Args:
        dataset: Iterable of mapping-like samples; each is expected to hold a
            'labels' sequence with one entry per category.
        category_labels: Category names; position ``k`` names entry ``k`` of
            every sample's label vector.

    Returns:
        pandas.DataFrame indexed by category with 'Positive' and 'Negative'
        count columns. An entry counts as positive only when it equals 1;
        anything else counts as negative. Samples lacking 'labels', or whose
        vector length differs from ``len(category_labels)``, trigger a printed
        warning and are left out of the counts.
    """
    positives = dict.fromkeys(category_labels, 0)
    negatives = dict.fromkeys(category_labels, 0)

    for i, sample in enumerate(dataset):
        # Guard clauses: report and skip anything that cannot be counted.
        if 'labels' not in sample:
            print(f"Warning: Sample {i} does not contain 'labels'.")
            continue

        label_vector = sample['labels']
        if len(label_vector) != len(category_labels):
            print(f"Warning: Sample {i} has {len(label_vector)} labels, expected {len(category_labels)}.")
            continue

        for category, label in zip(category_labels, label_vector):
            tally = positives if label == 1 else negatives
            tally[category] += 1

    # One column per category, then transpose so categories become the rows.
    return pd.DataFrame({
        category: {'Positive': positives[category], 'Negative': negatives[category]}
        for category in category_labels
    }).T

# Print the per-category positive/negative counts of every language's training split.
for lang in langs:
    print(f"{lang.capitalize()} Label Distribution:")
    train_split = ds[lang + '_train']
    category_names = labels[lang]
    label_distribution = check_label_distribution(train_split, category_names)
    print(label_distribution)

Java Label Distribution:
             Positive  Negative
summary          3610      4004
Ownership         267      7347
Expand            509      7105
usage            2093      5521
Pointer           904      6710
deprecation       117      7497
rational          311      7303
Python Label Distribution:
                  Positive  Negative
Usage                  578      1306
Parameters             572      1312
DevelopmentNotes       210      1674
Expand                 343      1541
Summary                347      1537
Pharo Label Distribution:
                         Positive  Negative
Keyimplementationpoints       178      1120
Example                       547       751
Responsibilities              245      1053
Classreferences                46      1252
Intent                        151      1147
Keymessages                   214      1084
Collaborators                  76      1222


# **Dataset Stratification**
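Ensure diversity and balance within the dataset: for each language, draw a 20% subsample of the training split, stratified by the number of positive labels per sample, then split that subsample 80/20 into train and test sets.
The aim is to improve model performance by keeping the label distribution of the subsample proportional to that of the full data.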

In [3]:
# Directory that receives every parquet file written by the stratification step below.
output_dir = "stratified_dataset"
# exist_ok=True keeps this cell re-runnable when the directory is already present.
os.makedirs(output_dir, exist_ok=True)

def process_language(language, dataset):
    """Build and save a stratified subsample of one language, plus a train/test split of it.

    Steps:
      1. Convert ``dataset`` to a DataFrame and drop rows containing missing
         or infinite values.
      2. Compute a per-row stratification key: the sum of the row's label
         vector (i.e. how many categories are positive).
      3. Keep a 20% subsample stratified on that key and save it.
      4. Split the subsample 80/20 into train and test sets and save both.

    Args:
        language: Language name; used in progress messages and file names.
        dataset: Object exposing ``to_pandas()`` whose frame has a 'labels'
            column holding one label sequence per row.

    Raises:
        ValueError: If fewer than two distinct stratification keys exist.

    Side effects:
        Writes ``<language>_stratified_set.parquet``,
        ``<language>_train_set.parquet`` and ``<language>_test_set.parquet``
        into the module-level ``output_dir``.
    """
    print(f"Processing {language} dataset...")

    df = dataset.to_pandas()

    df = df.dropna()
    df = df[~df.isin([float("inf"), float("-inf")]).any(axis=1)]

    # Build the label matrix on df's own index. Without `index=df.index` the
    # new frame gets a fresh 0..n-1 index, so after any row was dropped above
    # the sums would be attached to the wrong rows (and NaN to the rest).
    labels_df = pd.DataFrame(df['labels'].tolist(), index=df.index)

    # `assign` returns a new frame, avoiding a write into the filtered slice.
    df = df.assign(stratify_key=labels_df.sum(axis=1))

    if df['stratify_key'].nunique() < 2:
        raise ValueError(f"Cannot perform stratification for {language}. Not enough diversity in stratify_key.")

    # Only the 20% "test" side of this split is kept as the working subsample.
    _, stratified_set = train_test_split(
        df,
        test_size=0.2,  # 20% stratified split
        stratify=df['stratify_key'],
        random_state=42
    )

    stratified_set = stratified_set.drop(columns=['stratify_key'])

    stratified_set_path = os.path.join(output_dir, f"{language}_stratified_set.parquet")
    stratified_set.to_parquet(stratified_set_path, index=False)
    print(f"Saved {language} stratified set to '{stratified_set_path}'.")

    # NOTE(review): this second split is a plain random split (no `stratify=`),
    # so rare categories may be unevenly represented in the test set.
    train_set, test_set = train_test_split(
        stratified_set,
        test_size=0.2,  # 20% test split
        random_state=42
    )

    train_path = os.path.join(output_dir, f"{language}_train_set.parquet")
    test_path = os.path.join(output_dir, f"{language}_test_set.parquet")
    train_set.to_parquet(train_path, index=False)
    test_set.to_parquet(test_path, index=False)
    print(f"Saved {language} train set to '{train_path}' and test set to '{test_path}'.")

# Stratify each language's training split in turn, then confirm where the files went.
for lang in langs:
    split_name = f"{lang}_train"
    dataset = ds[split_name]
    process_language(language=lang, dataset=dataset)

print(f"All datasets saved in '{output_dir}'.")

Processing java dataset...
Saved java stratified set to 'stratified_dataset/java_stratified_set.parquet'.
Saved java train set to 'stratified_dataset/java_train_set.parquet' and test set to 'stratified_dataset/java_test_set.parquet'.
Processing python dataset...
Saved python stratified set to 'stratified_dataset/python_stratified_set.parquet'.
Saved python train set to 'stratified_dataset/python_train_set.parquet' and test set to 'stratified_dataset/python_test_set.parquet'.
Processing pharo dataset...
Saved pharo stratified set to 'stratified_dataset/pharo_stratified_set.parquet'.
Saved pharo train set to 'stratified_dataset/pharo_train_set.parquet' and test set to 'stratified_dataset/pharo_test_set.parquet'.
All datasets saved in 'stratified_dataset'.


**Stratified Data Analysis**

Count of positive and negative labels per category in the train and test sets derived from the stratified subsample.

In [4]:
def check_label_distribution(dataset, category_labels):
    """Tally positive and negative labels per category over the rows of a DataFrame.

    Args:
        dataset: pandas.DataFrame whose rows are expected to carry a 'labels'
            entry holding one value per category.
        category_labels: Category names; position ``k`` names entry ``k`` of
            every row's label vector.

    Returns:
        pandas.DataFrame indexed by category with 'Positive' and 'Negative'
        count columns. An entry counts as positive only when it equals 1;
        anything else counts as negative. Rows lacking 'labels', or whose
        vector length differs from ``len(category_labels)``, trigger a printed
        warning and are left out of the counts.
    """
    distribution = {}
    for category in category_labels:
        distribution[category] = {'Positive': 0, 'Negative': 0}

    for i, row in dataset.iterrows():
        # `in` on a row (a Series) tests its index, i.e. the column names.
        if 'labels' not in row:
            print(f"Warning: Sample {i} does not contain 'labels'.")
            continue

        label_vector = row['labels']
        if len(label_vector) != len(category_labels):
            print(f"Warning: Sample {i} has {len(label_vector)} labels, expected {len(category_labels)}.")
            continue

        for idx, category in enumerate(category_labels):
            bucket = 'Positive' if label_vector[idx] == 1 else 'Negative'
            distribution[category][bucket] += 1

    # Categories are the outer keys, so transpose to get one row per category.
    counts = pd.DataFrame(distribution)
    return counts.T

# For every language, reload the saved train/test parquet files and print their label counts.
for lang in langs:
    print(f"{lang.capitalize()} Label Distribution:")
    train_path = f"stratified_dataset/{lang}_train_set.parquet"
    test_path = f"stratified_dataset/{lang}_test_set.parquet"

    # Both files are read up front, before anything about the splits is printed.
    train_df = pd.read_parquet(train_path)
    test_df = pd.read_parquet(test_path)

    for heading, split_df in (
        ("Training Set Label Distribution:", train_df),
        ("Test Set Label Distribution:", test_df),
    ):
        print(heading)
        label_distribution = check_label_distribution(split_df, labels[lang])
        print(label_distribution)

    # Blank line between languages.
    print()

Java Label Distribution:
Training Set Label Distribution:
             Positive  Negative
summary           574       644
Ownership          54      1164
Expand             70      1148
usage             326       892
Pointer           158      1060
deprecation        18      1200
rational           47      1171
Test Set Label Distribution:
             Positive  Negative
summary           134       171
Ownership           5       300
Expand             26       279
usage              99       206
Pointer            41       264
deprecation         4       301
rational            6       299

Python Label Distribution:
Training Set Label Distribution:
                  Positive  Negative
Usage                   92       209
Parameters              93       208
DevelopmentNotes        34       267
Expand                  56       245
Summary                 54       247
Test Set Label Distribution:
                  Positive  Negative
Usage                   18        58
Parameters     

# **Model Selection**

### **Fine-Tuning**

In [5]:
def load_stratified_dataset(language):
    """Read the saved stratified training set of ``language`` from disk.

    Args:
        language: Language name used in the parquet file name.

    Returns:
        pandas.DataFrame loaded from
        ``stratified_dataset/<language>_train_set.parquet``.
    """
    training_frame = pd.read_parquet(f"stratified_dataset/{language}_train_set.parquet")
    return training_frame

def preprocess_data(examples, tokenizer):
    """Tokenize the 'combo' field of a batch of examples.

    Args:
        examples: Mapping with a 'combo' entry holding the text(s) to encode.
        tokenizer: Callable tokenizer; invoked with truncation enabled,
            padding to the longest item of the call, and a 512-token cap.

    Returns:
        Whatever ``tokenizer`` returns for those inputs.
    """
    encode_options = {
        'truncation': True,
        'padding': 'longest',
        'max_length': 512,
    }
    texts = examples['combo']
    return tokenizer(texts, **encode_options)

class MultiLabelClassificationModel(torch.nn.Module):
    """Wrapper around a pretrained sequence classifier that adds a BCE-with-logits loss.

    The wrapped Hugging Face model is stored as ``self.model``; ``forward``
    returns a dict so it can carry the loss next to the logits.
    """

    def __init__(self, model_name, problem_type, num_labels):
        """Load the pretrained classifier.

        Args:
            model_name: Checkpoint name or path given to ``from_pretrained``.
            problem_type: Forwarded to ``from_pretrained`` as ``problem_type``.
            num_labels: Number of output labels of the classification head.
        """
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            problem_type=problem_type,
            num_labels=num_labels,
        )

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the classifier and, when labels are supplied, compute the loss.

        Args:
            input_ids: Token ids passed to the wrapped model.
            attention_mask: Attention mask passed to the wrapped model.
            labels: Optional targets; cast to float before the loss.

        Returns:
            ``{'logits': ...}`` without labels, otherwise
            ``{'loss': ..., 'logits': ...}`` where the loss is the
            mean-reduced binary cross-entropy on the raw logits.
        """
        logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits

        if labels is None:
            return {'logits': logits}

        targets = labels.float()
        loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, targets)
        return {'loss': loss, 'logits': logits}

# Short name -> Hugging Face Hub checkpoint of every candidate encoder.
model_names = {
    "graphcodebert": "microsoft/graphcodebert-base",
    "codeberta": "huggingface/CodeBERTa-small-v1",
    "codebert": "microsoft/codebert-base"
}

# Fine-tune one model per (checkpoint, language) pair on the stratified training set.
for model_name, model_path in model_names.items():
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    for lang in langs:
        train_df = load_stratified_dataset(lang)
        train_dataset = Dataset.from_pandas(train_df).map(lambda x: preprocess_data(x, tokenizer), batched=True)

        # One output per category of this language.
        num_labels = len(labels[lang])
        model = MultiLabelClassificationModel(model_path, "multi_label_classification", num_labels)

        use_cuda = torch.cuda.is_available()
        if use_cuda:
            model = model.to('cuda')

        training_args = TrainingArguments(
            output_dir=f"./models/{model_name}/{lang}_model",
            eval_strategy="no",
            save_strategy="epoch",
            logging_dir=f"./logs_{model_name}_{lang}",
            logging_steps=50,
            num_train_epochs=5 if lang == 'java' else 8,
            per_device_train_batch_size=8,
            gradient_accumulation_steps=2,
            # Mixed precision needs a GPU; requesting it unconditionally would
            # make this cell fail on the CPU-only hosts the guards above allow for.
            fp16=use_cuda,
            save_total_limit=1,
            load_best_model_at_end=False,
            dataloader_pin_memory=True,
            disable_tqdm=True,
            report_to="none",
        )

        # NOTE(review): the recorded output shows a warning pointing at this call;
        # presumably the `tokenizer=` argument is deprecated in the installed
        # transformers release - confirm before upgrading.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            tokenizer=tokenizer,
        )

        print(f"Training {model_name} for {lang}...")
        trainer.train()

        # Save the wrapped Hugging Face model (not the wrapper) so it can be
        # reloaded later with AutoModelForSequenceClassification.from_pretrained.
        model.model.save_pretrained(f"./models/{model_name}/{lang}_model")
        tokenizer.save_pretrained(f"./models/{model_name}/{lang}_tokenizer")

        if use_cuda:
            torch.cuda.empty_cache()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Map:   0%|          | 0/1218 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training graphcodebert for java...
{'loss': 0.2991, 'grad_norm': 1.5920451879501343, 'learning_rate': 4.342105263157895e-05, 'epoch': 0.6535947712418301}
{'loss': 0.1544, 'grad_norm': 1.2345279455184937, 'learning_rate': 3.6842105263157895e-05, 'epoch': 1.3071895424836601}
{'loss': 0.1162, 'grad_norm': 2.761554718017578, 'learning_rate': 3.0263157894736844e-05, 'epoch': 1.9607843137254903}
{'loss': 0.0862, 'grad_norm': 1.3223257064819336, 'learning_rate': 2.368421052631579e-05, 'epoch': 2.6143790849673203}
{'loss': 0.0775, 'grad_norm': 3.224026918411255, 'learning_rate': 1.7105263157894737e-05, 'epoch': 3.2679738562091503}
{'loss': 0.0548, 'grad_norm': 1.6038814783096313, 'learning_rate': 1.0526315789473684e-05, 'epoch': 3.9215686274509802}
{'loss': 0.0477, 'grad_norm': 1.7221064567565918, 'learning_rate': 3.9473684210526315e-06, 'epoch': 4.57516339869281}
{'train_runtime': 175.2082, 'train_samples_per_second': 34.759, 'train_steps_per_second': 2.169, 'train_loss': 0.11350367508436504,

Map:   0%|          | 0/301 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training graphcodebert for python...
{'loss': 0.5067, 'grad_norm': 3.809748888015747, 'learning_rate': 3.4539473684210524e-05, 'epoch': 2.6315789473684212}
{'loss': 0.3461, 'grad_norm': 3.0974979400634766, 'learning_rate': 1.8092105263157896e-05, 'epoch': 5.2631578947368425}
{'loss': 0.215, 'grad_norm': 6.205741882324219, 'learning_rate': 1.6447368421052632e-06, 'epoch': 7.894736842105263}
{'train_runtime': 110.8429, 'train_samples_per_second': 21.724, 'train_steps_per_second': 1.371, 'train_loss': 0.35354549751469966, 'epoch': 8.0}


Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training graphcodebert for pharo...


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

{'loss': 0.3907, 'grad_norm': 1.946009874343872, 'learning_rate': 2.6923076923076923e-05, 'epoch': 3.8461538461538463}
{'loss': 0.2292, 'grad_norm': 1.4139641523361206, 'learning_rate': 2.884615384615385e-06, 'epoch': 7.6923076923076925}
{'train_runtime': 125.7299, 'train_samples_per_second': 13.235, 'train_steps_per_second': 0.827, 'train_loss': 0.30508991617422837, 'epoch': 8.0}


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/994k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/483k [00:00<?, ?B/s]

Map:   0%|          | 0/1218 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/336M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training codeberta for java...


model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

{'loss': 0.2679, 'grad_norm': 1.6513632535934448, 'learning_rate': 4.342105263157895e-05, 'epoch': 0.6535947712418301}
{'loss': 0.1524, 'grad_norm': 2.0526621341705322, 'learning_rate': 3.6842105263157895e-05, 'epoch': 1.3071895424836601}
{'loss': 0.1123, 'grad_norm': 2.407440662384033, 'learning_rate': 3.0263157894736844e-05, 'epoch': 1.9607843137254903}
{'loss': 0.0848, 'grad_norm': 1.816125750541687, 'learning_rate': 2.368421052631579e-05, 'epoch': 2.6143790849673203}
{'loss': 0.0698, 'grad_norm': 1.7831331491470337, 'learning_rate': 1.7105263157894737e-05, 'epoch': 3.2679738562091503}
{'loss': 0.0514, 'grad_norm': 1.1147102117538452, 'learning_rate': 1.0526315789473684e-05, 'epoch': 3.9215686274509802}
{'loss': 0.0418, 'grad_norm': 1.237090826034546, 'learning_rate': 3.9473684210526315e-06, 'epoch': 4.57516339869281}
{'train_runtime': 92.6955, 'train_samples_per_second': 65.699, 'train_steps_per_second': 4.099, 'train_loss': 0.10589251110428258, 'epoch': 4.967320261437909}


Map:   0%|          | 0/301 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training codeberta for python...
{'loss': 0.444, 'grad_norm': 3.2594780921936035, 'learning_rate': 3.355263157894737e-05, 'epoch': 2.6315789473684212}
{'loss': 0.2215, 'grad_norm': 2.8022868633270264, 'learning_rate': 1.7105263157894737e-05, 'epoch': 5.2631578947368425}
{'loss': 0.1173, 'grad_norm': 1.682822823524475, 'learning_rate': 6.578947368421053e-07, 'epoch': 7.894736842105263}
{'train_runtime': 79.3611, 'train_samples_per_second': 30.342, 'train_steps_per_second': 1.915, 'train_loss': 0.25897426589539174, 'epoch': 8.0}


Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training codeberta for pharo...
{'loss': 0.3643, 'grad_norm': 2.211106538772583, 'learning_rate': 2.5961538461538464e-05, 'epoch': 3.8461538461538463}
{'loss': 0.202, 'grad_norm': 1.3123348951339722, 'learning_rate': 1.9230769230769234e-06, 'epoch': 7.6923076923076925}
{'train_runtime': 95.2461, 'train_samples_per_second': 17.471, 'train_steps_per_second': 1.092, 'train_loss': 0.2785806501140961, 'epoch': 8.0}


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Map:   0%|          | 0/1218 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training codebert for java...


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

{'loss': 0.3185, 'grad_norm': 1.6502324342727661, 'learning_rate': 4.342105263157895e-05, 'epoch': 0.6535947712418301}
{'loss': 0.1687, 'grad_norm': 2.2428698539733887, 'learning_rate': 3.6842105263157895e-05, 'epoch': 1.3071895424836601}
{'loss': 0.1386, 'grad_norm': 3.9783682823181152, 'learning_rate': 3.0263157894736844e-05, 'epoch': 1.9607843137254903}
{'loss': 0.1103, 'grad_norm': 2.185314416885376, 'learning_rate': 2.368421052631579e-05, 'epoch': 2.6143790849673203}
{'loss': 0.1061, 'grad_norm': 2.644301414489746, 'learning_rate': 1.7105263157894737e-05, 'epoch': 3.2679738562091503}
{'loss': 0.0837, 'grad_norm': 3.890254020690918, 'learning_rate': 1.0526315789473684e-05, 'epoch': 3.9215686274509802}
{'loss': 0.0662, 'grad_norm': 2.0282890796661377, 'learning_rate': 3.9473684210526315e-06, 'epoch': 4.57516339869281}
{'train_runtime': 190.6711, 'train_samples_per_second': 31.94, 'train_steps_per_second': 1.993, 'train_loss': 0.1361026123950356, 'epoch': 4.967320261437909}


Map:   0%|          | 0/301 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training codebert for python...
{'loss': 0.4936, 'grad_norm': 6.098516464233398, 'learning_rate': 3.355263157894737e-05, 'epoch': 2.6315789473684212}
{'loss': 0.3187, 'grad_norm': 3.5834848880767822, 'learning_rate': 1.7105263157894737e-05, 'epoch': 5.2631578947368425}
{'loss': 0.2048, 'grad_norm': 2.4634642601013184, 'learning_rate': 6.578947368421053e-07, 'epoch': 7.894736842105263}
{'train_runtime': 109.0456, 'train_samples_per_second': 22.083, 'train_steps_per_second': 1.394, 'train_loss': 0.3370391287301716, 'epoch': 8.0}


Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training codebert for pharo...
{'loss': 0.403, 'grad_norm': 1.90989351272583, 'learning_rate': 2.5961538461538464e-05, 'epoch': 3.8461538461538463}
{'loss': 0.2617, 'grad_norm': 1.3353289365768433, 'learning_rate': 1.9230769230769234e-06, 'epoch': 7.6923076923076925}
{'train_runtime': 99.5514, 'train_samples_per_second': 16.715, 'train_steps_per_second': 1.045, 'train_loss': 0.32778747437091976, 'epoch': 8.0}


### **Results**

Summarize and display results for each model below their respective headings.

In [6]:
from torch.cuda.amp import autocast

# Display name -> directory holding the fine-tuned models and tokenizers of that encoder.
models = {
    "GraphCodeBERT": "./models/graphcodebert",
    "CodeBERTa": "./models/codeberta",
    "CodeBERT": "./models/codebert"
}

# One record per (model, language, category) with its precision / recall / F1.
all_scores = []

for model_name, model_dir in models.items():
    for lang in langs:
        model = AutoModelForSequenceClassification.from_pretrained(f"{model_dir}/{lang}_model")
        tokenizer = AutoTokenizer.from_pretrained(f"{model_dir}/{lang}_tokenizer")

        model.eval()
        if torch.cuda.is_available():
            model = model.to('cuda')

        test_df = pd.read_parquet(f"stratified_dataset/{lang}_test_set.parquet")
        # max_length=512 mirrors the cap used when tokenizing the training data,
        # instead of relying on whatever limit the saved tokenizer config carries.
        inputs = tokenizer(
            test_df['combo'].tolist(),
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt",
        )

        dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
        batch_size = 32
        dataloader = DataLoader(dataset, batch_size=batch_size)

        total_outputs = []

        with torch.no_grad():
            for input_batch in dataloader:
                input_ids, attention_mask = input_batch
                if torch.cuda.is_available():
                    input_ids, attention_mask = input_ids.to('cuda'), attention_mask.to('cuda')

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                # Sigmoid turns each logit into an independent per-category probability.
                probabilities = torch.sigmoid(outputs.logits)
                total_outputs.append(probabilities.cpu().numpy())

        total_outputs = np.concatenate(total_outputs, axis=0)
        # Transpose so row i holds every sample's value for category i.
        y_pred = (total_outputs > 0.5).astype(int).T
        y_true = np.array(test_df['labels'].tolist()).T

        # Calculate precision, recall, and F1 for each category
        for i, label in enumerate(labels[lang]):
            true_is_pos = y_true[i] == 1
            pred_is_pos = y_pred[i] == 1

            # Confusion counts via boolean masks; cast to plain ints so the
            # stored scores stay ordinary Python numbers.
            tp = int(np.sum(true_is_pos & pred_is_pos))
            fp = int(np.sum((y_true[i] == 0) & pred_is_pos))
            fn = int(np.sum(true_is_pos & (y_pred[i] == 0)))

            # A metric with an empty denominator is reported as 0.
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            f1 = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0

            all_scores.append({
                'model': model_name,
                'lang': lang,
                'cat': label,
                'precision': precision,
                'recall': recall,
                'f1': f1
            })

results_df = pd.DataFrame(all_scores)

# Mean of each metric over all languages and categories, per model.
average_scores = results_df.groupby('model')[['precision', 'recall', 'f1']].mean().reset_index()

**GraphCodeBERT**

In [7]:
# Rows of the score table that belong to GraphCodeBERT.
graphcodebert_results = results_df.loc[results_df['model'] == "GraphCodeBERT"]

print("Results for GraphCodeBERT:\n")
print(graphcodebert_results[['lang', 'cat', 'precision', 'recall', 'f1']])

# Unweighted means over every (language, category) row of this model.
avg_precision, avg_recall, avg_f1 = (
    graphcodebert_results[metric].mean() for metric in ('precision', 'recall', 'f1')
)

print("\nAverage Scores for GraphCodeBERT:")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")

Results for GraphCodeBERT:

      lang                      cat  precision    recall        f1
0     java                  summary   0.871429  0.910448  0.890511
1     java                Ownership   1.000000  1.000000  1.000000
2     java                   Expand   0.411765  0.269231  0.325581
3     java                    usage   0.977778  0.888889  0.931217
4     java                  Pointer   0.972973  0.878049  0.923077
5     java              deprecation   1.000000  0.750000  0.857143
6     java                 rational   0.500000  0.166667  0.250000
7   python                    Usage   0.846154  0.611111  0.709677
8   python               Parameters   0.818182  0.666667  0.734694
9   python         DevelopmentNotes   0.000000  0.000000  0.000000
10  python                   Expand   0.454545  0.384615  0.416667
11  python                  Summary   0.642857  0.818182  0.720000
12   pharo  Keyimplementationpoints   1.000000  0.500000  0.666667
13   pharo                  Exampl

**CodeBERTa**

In [8]:
# Rows of the score table that belong to CodeBERTa.
is_codeberta = results_df['model'].eq("CodeBERTa")
codeberta_results = results_df[is_codeberta]

print("Results for CodeBERTa:\n")
report_columns = ['lang', 'cat', 'precision', 'recall', 'f1']
print(codeberta_results[report_columns])

# Unweighted means over every (language, category) row of this model.
avg_precision = codeberta_results['precision'].mean()
avg_recall = codeberta_results['recall'].mean()
avg_f1 = codeberta_results['f1'].mean()

print("\nAverage Scores for CodeBERTa:")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")

Results for CodeBERTa:

      lang                      cat  precision    recall        f1
19    java                  summary   0.893617  0.940299  0.916364
20    java                Ownership   1.000000  1.000000  1.000000
21    java                   Expand   0.600000  0.230769  0.333333
22    java                    usage   0.936170  0.888889  0.911917
23    java                  Pointer   0.948718  0.902439  0.925000
24    java              deprecation   1.000000  0.750000  0.857143
25    java                 rational   0.400000  0.333333  0.363636
26  python                    Usage   0.705882  0.666667  0.685714
27  python               Parameters   0.750000  0.777778  0.763636
28  python         DevelopmentNotes   0.000000  0.000000  0.000000
29  python                   Expand   0.250000  0.230769  0.240000
30  python                  Summary   0.333333  0.181818  0.235294
31   pharo  Keyimplementationpoints   1.000000  0.166667  0.285714
32   pharo                  Example   

**CodeBERT**

In [9]:
# Rows of the score table that belong to CodeBERT.
codebert_results = results_df[results_df['model'] == "CodeBERT"]

print("Results for CodeBERT:\n")
print(codebert_results[['lang', 'cat', 'precision', 'recall', 'f1']])

# Unweighted means over every (language, category) row of this model.
codebert_means = codebert_results[['precision', 'recall', 'f1']].mean()
avg_precision = codebert_means['precision']
avg_recall = codebert_means['recall']
avg_f1 = codebert_means['f1']

print("\nAverage Scores for CodeBERT:")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")

Results for CodeBERT:

      lang                      cat  precision    recall        f1
38    java                  summary   0.893617  0.940299  0.916364
39    java                Ownership   1.000000  1.000000  1.000000
40    java                   Expand   0.384615  0.192308  0.256410
41    java                    usage   0.957447  0.909091  0.932642
42    java                  Pointer   0.923077  0.878049  0.900000
43    java              deprecation   0.500000  0.250000  0.333333
44    java                 rational   0.000000  0.000000  0.000000
45  python                    Usage   0.764706  0.722222  0.742857
46  python               Parameters   0.826087  0.703704  0.760000
47  python         DevelopmentNotes   0.000000  0.000000  0.000000
48  python                   Expand   0.333333  0.307692  0.320000
49  python                  Summary   0.692308  0.818182  0.750000
50   pharo  Keyimplementationpoints   0.000000  0.000000  0.000000
51   pharo                  Example   0