<a href="https://colab.research.google.com/github/ikoojos/Algorithm-Debt-Research/blob/master/Final_CLS_Fine_Tuned_RoBERTa_and_CLS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%cd '/content/drive/My Drive/AD Final Experiments'

[Errno 2] No such file or directory: '/content/drive/My Drive/AD Final Experiments'
/content


In [2]:
import sys
import os
from google.colab import drive
drive.mount('/content/drive')
sys.path.append('/content/drive/My Drive/AD Final Experiments')

import importlib
import numpy as np
import pandas as pd
from itertools import product

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier

from nltk.tokenize import word_tokenize
#from gensim.models import Word2Vec

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

from preprocessing import preprocess_data
from splitting import split_data
from utils import *
from evaluate_model import evaluate_best_model
from lr_tuning import hyperparameter_tuning

for module in ['preprocessing', 'splitting', 'utils', 'evaluate_model', 'lr_tuning']:
    importlib.reload(sys.modules[module])


Mounted at /content/drive


In [3]:
file_path = '/content/drive/My Drive/AD Identification using SATD/liu_datset_processed.csv'
data = preprocess_data(file_path)

In [4]:
pip install transformers



In [5]:
pip install wandb




In [6]:
import wandb

wandb.init(mode="disabled")

In [7]:
class_mapping = {label: idx for idx, label in enumerate(data['TDType'].unique())}
data['label'] = data['TDType'].map(class_mapping)


X_train_temp, X_test, y_train_temp, y_test = train_test_split(data['Comments'], data['label'], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=0.2, random_state=42)



In [8]:
keywords = ['shape', 'input', 'tensor', 'number', 'matrix']

def extract_custom_features(texts):
    features = []
    for t in texts:
        t_lower = str(t).lower()
        features.append([int(kw in t_lower) for kw in keywords])
    return np.array(features)

custom_train = extract_custom_features(X_train)
custom_val   = extract_custom_features(X_val)
custom_test  = extract_custom_features(X_test)

# ===============================


# ===============================


In [9]:
# 6. Define dataset class
# ===============================
class CustomADDataset(Dataset):
    def __init__(self, texts, labels, custom_features, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.custom_features = custom_features
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        custom_feat = torch.tensor(self.custom_features[idx], dtype=torch.float)

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'custom_features': custom_feat,
            'labels': label
        }


In [10]:
# ===============================
# 7. Instantiate tokenizer & datasets
# ===============================
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_dataset = CustomADDataset(X_train, y_train, custom_train, tokenizer)
val_dataset   = CustomADDataset(X_val, y_val, custom_val, tokenizer)
test_dataset  = CustomADDataset(X_test, y_test, custom_test, tokenizer)

# ===============================

# ===============================



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [11]:
from torch.optim import AdamW

In [12]:
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
# 8. Define model with custom features
# ===============================
class RobertaWithCustomFeatures(nn.Module):
    def __init__(self, num_labels, num_custom_features, roberta_model_name='roberta-base', dropout=0.1):
        super(RobertaWithCustomFeatures, self).__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        hidden_size = self.roberta.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size + num_custom_features, num_labels)

    def forward(self, input_ids, attention_mask, custom_features, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        combined = torch.cat((cls_embedding, custom_features), dim=1)
        combined = self.dropout(combined)
        logits = self.classifier(combined)

        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            return loss, logits
        return logits


In [13]:
# 9. Initialize model, optimizer, dataloaders
# ===============================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaWithCustomFeatures(num_labels=len(class_mapping), num_custom_features=len(keywords))
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=16)
test_loader  = DataLoader(test_dataset, batch_size=16)

optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 10

# ===============================


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
print(device)   # should say "cuda"


cuda


In [15]:
model.to(device)


RobertaWithCustomFeatures(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (L

In [16]:
from tqdm import tqdm

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        custom_features = batch['custom_features'].to(device)
        labels = batch['labels'].to(device)

        loss, logits = model(input_ids, attention_mask, custom_features, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss/len(train_loader):.4f}")


Epoch 1/10: 100%|██████████| 1555/1555 [02:25<00:00, 10.66it/s]


Epoch 1 - Loss: 0.4983


Epoch 2/10: 100%|██████████| 1555/1555 [02:22<00:00, 10.92it/s]


Epoch 2 - Loss: 0.3346


Epoch 3/10: 100%|██████████| 1555/1555 [02:22<00:00, 10.91it/s]


Epoch 3 - Loss: 0.2539


Epoch 4/10: 100%|██████████| 1555/1555 [02:22<00:00, 10.91it/s]


Epoch 4 - Loss: 0.1866


Epoch 5/10: 100%|██████████| 1555/1555 [02:23<00:00, 10.84it/s]


Epoch 5 - Loss: 0.1367


Epoch 6/10: 100%|██████████| 1555/1555 [02:23<00:00, 10.83it/s]


Epoch 6 - Loss: 0.0985


Epoch 7/10: 100%|██████████| 1555/1555 [02:23<00:00, 10.85it/s]


Epoch 7 - Loss: 0.0776


Epoch 8/10: 100%|██████████| 1555/1555 [02:23<00:00, 10.85it/s]


Epoch 8 - Loss: 0.0652


Epoch 9/10: 100%|██████████| 1555/1555 [02:23<00:00, 10.85it/s]


Epoch 9 - Loss: 0.0554


Epoch 10/10: 100%|██████████| 1555/1555 [02:23<00:00, 10.84it/s]

Epoch 10 - Loss: 0.0480





In [17]:
# 11. Evaluation
# ===============================
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        custom_features = batch['custom_features'].to(device)
        labels = batch['labels'].to(device)

        logits = model(input_ids, attention_mask, custom_features)
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())



In [18]:
print("Classification Report:")


Classification Report:


In [19]:
print(classification_report(all_labels, all_preds, target_names=class_mapping.keys()))


                        precision    recall  f1-score   support

             ALGORITHM       0.53      0.47      0.50       200
         COMPATIBILITY       0.51      0.52      0.51        89
                DEFECT       0.53      0.63      0.58       135
                DESIGN       0.85      0.85      0.85      2206
         DOCUMENTATION       0.50      0.57      0.53        23
        IMPLEMENTATION       0.70      0.76      0.73       387
                  TEST       0.82      0.69      0.75       143
WITHOUT_CLASSIFICATION       0.97      0.97      0.97      4592

              accuracy                           0.89      7775
             macro avg       0.68      0.68      0.68      7775
          weighted avg       0.89      0.89      0.89      7775



In [20]:
print("F1 Score (Weighted):", f1_score(all_labels, all_preds, average='weighted'))

F1 Score (Weighted): 0.8935138905417956
