**Navigation**
1. [Dependencies and Data Loading](#Dependencies-and-Data-Loading)
2. [Class Weighting](#Class-Weighting)
3. [Fine-Tuning](#Fine-Tuning)
4. [Evaluation](#Evaluation)
5. [Submission Score](#Submission-Score)

# **Dependencies and Data Loading**
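Install and import the required libraries, seed every random number generator for reproducibility, define the label categories for each language, and load the NLBSE'25 code comment classification dataset.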

In [1]:
!pip install pandas datasets transformers torch scikit-learn

import os
import random
import time
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import BCEWithLogitsLoss
from tqdm.auto import tqdm

# One seed shared by every RNG the notebook touches (Python, NumPy, PyTorch
# CPU/CUDA and the transformers helper).
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all, set_seed):
    seed_fn(SEED)

# Enables the `progress_apply` family on pandas objects.
tqdm.pandas()

# Category names per programming language; the dict order also defines the
# order in which the languages are processed.
labels = {
    'java': [
        'summary', 'Ownership', 'Expand', 'usage',
        'Pointer', 'deprecation', 'rational',
    ],
    'python': [
        'Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary',
    ],
    'pharo': [
        'Keyimplementationpoints', 'Example', 'Responsibilities',
        'Classreferences', 'Intent', 'Keymessages', 'Collaborators',
    ],
}
langs = list(labels)

# Splits are named "<lang>_train" / "<lang>_test".
ds = load_dataset('NLBSE/nlbse25-code-comment-classification')
ds

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

java_train-00000-of-00001.parquet:   0%|          | 0.00/680k [00:00<?, ?B/s]

java_test-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

python_train-00000-of-00001.parquet:   0%|          | 0.00/126k [00:00<?, ?B/s]

python_test-00000-of-00001.parquet:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

pharo_train-00000-of-00001.parquet:   0%|          | 0.00/113k [00:00<?, ?B/s]

pharo_test-00000-of-00001.parquet:   0%|          | 0.00/30.6k [00:00<?, ?B/s]

Generating java_train split:   0%|          | 0/7614 [00:00<?, ? examples/s]

Generating java_test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Generating python_train split:   0%|          | 0/1884 [00:00<?, ? examples/s]

Generating python_test split:   0%|          | 0/406 [00:00<?, ? examples/s]

Generating pharo_train split:   0%|          | 0/1298 [00:00<?, ? examples/s]

Generating pharo_test split:   0%|          | 0/289 [00:00<?, ? examples/s]

DatasetDict({
    java_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 7614
    })
    java_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1725
    })
    python_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1884
    })
    python_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 406
    })
    pharo_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1298
    })
    pharo_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 289
    })
})

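# **Class Weighting**
Compute one weight per label, inversely proportional to how often that label is positive in the training split; the fine-tuning step passes these weights to the loss as `pos_weight`.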


In [2]:
def compute_class_weights(labels):
    """Return one balancing weight per label column.

    The weight of label ``j`` is ``num_samples / (num_labels * positives_j)``,
    so labels that are rarely positive receive larger weights.

    Args:
        labels: 2-D multi-hot label matrix (rows are samples, columns are
            labels), given as a nested list, array or tensor.

    Returns:
        A 1-D ``torch.float32`` tensor with one weight per label column.
    """
    # `as_tensor` accepts lists and existing tensors alike without the
    # copy-construct UserWarning that `torch.tensor(tensor)` emits.
    label_matrix = torch.as_tensor(labels)
    num_samples = label_matrix.shape[0]

    # A label with no positive sample would otherwise divide by zero and
    # produce an infinite weight; treat it as if it had a single positive.
    class_counts = label_matrix.sum(dim=0).clamp(min=1)
    class_weights = num_samples / (len(class_counts) * class_counts)
    return class_weights.float()

# Per-language weights, stored as plain Python lists keyed by language name.
class_weights_dict = {}
for lang in langs:
    train_labels = ds[lang + "_train"]["labels"]
    class_weights_dict[lang] = compute_class_weights(train_labels).tolist()

for lang in class_weights_dict:
    weights = class_weights_dict[lang]
    print(f"Class weights for {lang}: {weights}")

Class weights for java: [0.301305890083313, 4.073836326599121, 2.136963129043579, 0.5196914672851562, 1.2032238245010376, 9.296703338623047, 3.49747371673584]
Class weights for python: [0.6519031524658203, 0.658741295337677, 1.7942856550216675, 1.0985422134399414, 1.0858789682388306]
Class weights for pharo: [1.0417335033416748, 0.3389919102191925, 0.7568512558937073, 4.031055927276611, 1.228003740310669, 0.8664886951446533, 2.439849615097046]


  return torch.tensor(class_weights).float()


# **Fine-Tuning**

In [None]:
from transformers import set_seed
import torch.nn as nn

tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")

def preprocess_data(examples):
    """Tokenize the `combo` text of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length but deliberately
    NOT padded here: `map(batched=True)` hands this function large chunks, so
    padding at this stage would stretch every example to the longest sequence
    of its chunk. The Trainer is given the tokenizer and therefore pads each
    mini-batch to its own longest sequence at collation time.

    Args:
        examples: batch dict from `Dataset.map(batched=True)`; only the
            'combo' column is read.

    Returns:
        The tokenizer output for the batch (added to the dataset as columns).
    """
    return tokenizer(examples['combo'], truncation=True)

# Only the training splits are tokenized here; the test splits are tokenized
# in the evaluation section.
for lang in langs:
    ds[lang + '_train'] = ds[lang + '_train'].map(preprocess_data, batched=True)

class MultiLabelClassificationModel(torch.nn.Module):
    """Sequence-classification model trained with (optionally weighted) BCE.

    Wraps `AutoModelForSequenceClassification` and computes the loss itself
    with `BCEWithLogitsLoss`, so per-label weights can be supplied as
    `pos_weight`.

    Args:
        model_name: checkpoint name or path passed to `from_pretrained`.
        problem_type: forwarded to `from_pretrained`.
        num_labels: number of output labels.
        class_weights: optional per-label weights (tensor or sequence of
            floats). When omitted, the loss is unweighted.
    """

    def __init__(self, model_name, problem_type, num_labels, class_weights=None):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, problem_type=problem_type, num_labels=num_labels
        )
        if class_weights is not None:
            class_weights = torch.as_tensor(class_weights, dtype=torch.float32)
        self.class_weights = class_weights

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and, if `labels` is given, compute the loss.

        Returns:
            ``{'logits': ...}`` without labels, otherwise
            ``{'loss': ..., 'logits': ...}``.
        """
        logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits
        if labels is None:
            return {'logits': logits}

        # `class_weights` defaults to None; fall back to an unweighted loss
        # instead of failing on `None.to(...)`.
        pos_weight = None
        if self.class_weights is not None:
            pos_weight = self.class_weights.to(logits.device)

        loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        loss = loss_fn(logits, labels.float())
        return {'loss': loss, 'logits': logits}

# Hyperparameters: one batch size for all languages, epochs tuned per language.
shared_hyperparams = {
    "batch_size": 4,
    "epochs": {
        "java": 6,
        "python": 11,
        "pharo": 12
    }
}

# Fine-tune one independent model per language.
for lang in langs:
    print(f"Length of {lang} train dataset: {len(ds[lang + '_train'])}")
    print(f"Length of {lang} test dataset: {len(ds[lang + '_test'])}")

    train_dataset = ds[lang + '_train']

    class_weights = torch.tensor(class_weights_dict[lang], dtype=torch.float32)
    print(f"Class weights for {lang}: {class_weights}")

    num_labels = len(labels[lang])
    model = MultiLabelClassificationModel(
        "microsoft/graphcodebert-base",
        problem_type="multi_label_classification",
        num_labels=num_labels,
        class_weights=class_weights
    )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.to('cuda')

    training_args = TrainingArguments(
        output_dir=f"./graphcodebert_models_{lang}",
        eval_strategy="no",
        save_strategy="epoch",
        logging_dir=f"./logs_{lang}",
        logging_steps=50,
        num_train_epochs=shared_hyperparams["epochs"][lang],
        per_device_train_batch_size=shared_hyperparams["batch_size"],
        # Effective batch size is batch_size * 2.
        gradient_accumulation_steps=2,
        # Mixed precision and pinned memory only make sense with a GPU;
        # requesting fp16 without one makes TrainingArguments fail.
        fp16=use_cuda,
        save_total_limit=1,
        load_best_model_at_end=False,
        dataloader_pin_memory=use_cuda,
        disable_tqdm=True,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
    )

    print(f"Training model for {lang}...")
    trainer.train()

    # Save only the wrapped Hugging Face model (not the loss wrapper) so it
    # can be reloaded with AutoModelForSequenceClassification.
    model.model.save_pretrained(f"./graphcodebert_models/{lang}_model")
    tokenizer.save_pretrained(f"./graphcodebert_models/{lang}_tokenizer")

    # Drop the references first; otherwise the cached GPU memory is still in
    # use and `empty_cache()` cannot release it before the next language.
    del trainer, model
    if use_cuda:
        torch.cuda.empty_cache()

print("GraphCodeBERT Training complete for all languages!")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Map:   0%|          | 0/7614 [00:00<?, ? examples/s]

Map:   0%|          | 0/1884 [00:00<?, ? examples/s]

Map:   0%|          | 0/1298 [00:00<?, ? examples/s]

Length of java train dataset: 7614
Length of java test dataset: 1725
Class weights for java: tensor([0.3013, 4.0738, 2.1370, 0.5197, 1.2032, 9.2967, 3.4975])


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training model for java...
{'loss': 0.7923, 'grad_norm': 1.6053199768066406, 'learning_rate': 4.956232492997199e-05, 'epoch': 0.052521008403361345}
{'loss': 0.5596, 'grad_norm': 1.4600449800491333, 'learning_rate': 4.912464985994398e-05, 'epoch': 0.10504201680672269}
{'loss': 0.3982, 'grad_norm': 1.2515009641647339, 'learning_rate': 4.868697478991597e-05, 'epoch': 0.15756302521008403}
{'loss': 0.442, 'grad_norm': 1.2667076587677002, 'learning_rate': 4.824929971988796e-05, 'epoch': 0.21008403361344538}
{'loss': 0.3728, 'grad_norm': 3.01339054107666, 'learning_rate': 4.782037815126051e-05, 'epoch': 0.26260504201680673}
{'loss': 0.3371, 'grad_norm': 6.397977828979492, 'learning_rate': 4.738270308123249e-05, 'epoch': 0.31512605042016806}
{'loss': 0.3664, 'grad_norm': 1.450764536857605, 'learning_rate': 4.6945028011204486e-05, 'epoch': 0.36764705882352944}
{'loss': 0.3392, 'grad_norm': 4.975523471832275, 'learning_rate': 4.6507352941176475e-05, 'epoch': 0.42016806722689076}
{'loss': 0.3146,

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training model for python...
{'loss': 1.0647, 'grad_norm': 3.400925874710083, 'learning_rate': 4.909090909090909e-05, 'epoch': 0.21231422505307856}
{'loss': 0.9412, 'grad_norm': 3.4967925548553467, 'learning_rate': 4.81431334622824e-05, 'epoch': 0.42462845010615713}
{'loss': 0.8483, 'grad_norm': 2.1231682300567627, 'learning_rate': 4.717601547388782e-05, 'epoch': 0.6369426751592356}
{'loss': 0.7537, 'grad_norm': 6.506028652191162, 'learning_rate': 4.620889748549323e-05, 'epoch': 0.8492569002123143}
{'loss': 0.6618, 'grad_norm': 5.086826801300049, 'learning_rate': 4.5241779497098646e-05, 'epoch': 1.059447983014862}
{'loss': 0.5099, 'grad_norm': 6.174143314361572, 'learning_rate': 4.4274661508704066e-05, 'epoch': 1.2717622080679405}
{'loss': 0.4952, 'grad_norm': 5.4806108474731445, 'learning_rate': 4.330754352030948e-05, 'epoch': 1.484076433121019}
{'loss': 0.4935, 'grad_norm': 7.531985282897949, 'learning_rate': 4.23404255319149e-05, 'epoch': 1.6963906581740975}
{'loss': 0.5321, 'grad_n

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training model for pharo...
{'loss': 0.8803, 'grad_norm': 3.182511329650879, 'learning_rate': 4.871399176954733e-05, 'epoch': 0.3076923076923077}
{'loss': 0.6687, 'grad_norm': 2.5290520191192627, 'learning_rate': 4.742798353909465e-05, 'epoch': 0.6153846153846154}
{'loss': 0.6182, 'grad_norm': 5.395261287689209, 'learning_rate': 4.616769547325103e-05, 'epoch': 0.9230769230769231}
{'loss': 0.4595, 'grad_norm': 2.8278560638427734, 'learning_rate': 4.4881687242798354e-05, 'epoch': 1.2276923076923076}
{'loss': 0.4205, 'grad_norm': 4.680913925170898, 'learning_rate': 4.359567901234568e-05, 'epoch': 1.5353846153846153}
{'loss': 0.4141, 'grad_norm': 3.6043167114257812, 'learning_rate': 4.230967078189301e-05, 'epoch': 1.843076923076923}
{'loss': 0.3944, 'grad_norm': 2.052513837814331, 'learning_rate': 4.102366255144034e-05, 'epoch': 2.147692307692308}
{'loss': 0.3326, 'grad_norm': 4.205859661102295, 'learning_rate': 3.973765432098765e-05, 'epoch': 2.4553846153846153}
{'loss': 0.254, 'grad_norm

# **Evaluation**
Each fine-tuned model is evaluated on the test split of its language using per-category precision, recall, and F1. Inference runtime and FLOPs are measured in the same pass because they also enter the submission score.

In [4]:
# Number of repeated forward passes per batch inside the timed/profiled region;
# the totals are divided by this value when reported.
NUM_TIMING_RUNS = 10

total_flops = 0
total_time = 0
score_rows = []
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for lan in langs:
    model_name = f"harisathar04/graphic-nlbse-{lan}"
    # `token=False` is the current spelling of the deprecated
    # `use_auth_token=False`: never send a Hub token for these downloads.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, token=False)
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=False)

    # To evaluate the locally fine-tuned checkpoints instead, load them from disk:
    # model = AutoModelForSequenceClassification.from_pretrained(f'./graphcodebert_models/{lan}_model')
    # tokenizer = AutoTokenizer.from_pretrained(f'./graphcodebert_models/{lan}_tokenizer')
    model.eval()
    model = model.to(device)

    test_dataset = ds[f'{lan}_test']
    inputs = tokenizer(test_dataset['combo'], truncation=True, padding=True, return_tensors="pt")
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
    dataloader = DataLoader(dataset, batch_size=32)

    total_outputs = []
    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            with torch.profiler.profile(with_flops=True) as p:
                # CUDA kernels run asynchronously: synchronise before and after
                # the timed region so the wall-clock covers the actual compute.
                if device.type == 'cuda':
                    torch.cuda.synchronize()
                begin = time.perf_counter()
                for _ in range(NUM_TIMING_RUNS):
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                if device.type == 'cuda':
                    torch.cuda.synchronize()
                total_time += (time.perf_counter() - begin)

            # Profiled FLOPs of this batch, averaged over the batches of the language.
            batch_flops = sum(e.flops for e in p.key_averages() if e.flops is not None) / len(dataloader)
            total_flops += batch_flops / 1e9

            probs = torch.sigmoid(outputs.logits)
            total_outputs.append(probs.cpu().numpy())

    # Transpose to (num_labels, num_samples) so each row is one category.
    y_pred = (np.concatenate(total_outputs, axis=0) > 0.5).astype(int).T
    y_true = np.array(test_dataset['labels']).T
    assert y_pred.shape == y_true.shape, f"prediction/label shape mismatch for {lan}"

    for i in range(len(y_pred)):
        pred_pos, pred_neg = y_pred[i] == 1, y_pred[i] == 0
        true_pos, true_neg = y_true[i] == 1, y_true[i] == 0
        tp = int(np.sum(true_pos & pred_pos))
        fp = int(np.sum(true_neg & pred_pos))
        fn = int(np.sum(true_pos & pred_neg))
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0
        score_rows.append({
            'lan': lan,
            'cat': labels[lan][i],
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

print("Compute in GFLOPs:", total_flops / NUM_TIMING_RUNS)
print("Avg runtime in seconds:", total_time / NUM_TIMING_RUNS)

scores = pd.DataFrame(score_rows)

Compute in GFLOPs: 3245.0810635754215
Avg runtime in seconds: 39.87161800861359


In [5]:
# Macro averages: unweighted mean over every (language, category) row.
metric_means = {
    metric: scores[metric].mean()
    for metric in ('precision', 'recall', 'f1')
}
avg_precision = metric_means['precision']
avg_recall = metric_means['recall']
avg_f1 = metric_means['f1']

print(f"Average Precision: {avg_precision}")
print(f"Average Recall: {avg_recall}")
print(f"Average F1 Score: {avg_f1}")

scores

Average Precision: 0.7065471532790458
Average Recall: 0.7246359082485911
Average F1 Score: 0.7120645989790534


Unnamed: 0,lan,cat,precision,recall,f1
0,java,summary,0.897321,0.901345,0.899329
1,java,Ownership,1.0,1.0,1.0
2,java,Expand,0.459184,0.441176,0.45
3,java,usage,0.928218,0.87007,0.898204
4,java,Pointer,0.818182,0.978261,0.891089
5,java,deprecation,0.733333,0.733333,0.733333
6,java,rational,0.333333,0.382353,0.356164
7,python,Usage,0.841121,0.743802,0.789474
8,python,Parameters,0.846774,0.820312,0.833333
9,python,DevelopmentNotes,0.340909,0.365854,0.352941


# **Submission Score**

In [6]:
# Budgets against which runtime and compute are normalised.
max_avg_runtime = 5
max_avg_flops = 5000

def submission_score(avg_f1, avg_runtime, avg_flops):
    """Combine F1, runtime and FLOPs into a single score.

    F1 contributes 60%; runtime and FLOPs contribute 20% each, measured as the
    unused fraction of their budget and floored at zero once the budget is
    exceeded.
    """
    runtime_headroom = (max_avg_runtime - avg_runtime) / max_avg_runtime
    flops_headroom = (max_avg_flops - avg_flops) / max_avg_flops
    runtime_term = max(0, runtime_headroom)
    flops_term = max(0, flops_headroom)
    return 0.6 * avg_f1 + 0.2 * runtime_term + 0.2 * flops_term

# Totals from the evaluation section cover 10 repeated forward passes per batch.
avg_runtime, avg_flops = total_time / 10, total_flops / 10

raw_score = submission_score(avg_f1, avg_runtime, avg_flops)
final_score = round(raw_score, 2)
print(f"Submission Score: {final_score}")

Submission Score: 0.5
