<a href="https://colab.research.google.com/github/ikoojos/Algorithm-Debt-Research/blob/master/Albert_DL_and_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import sys
import os
from google.colab import drive
# Mount Google Drive inside the Colab VM so the project folder is reachable.
drive.mount('/content/drive')
# Work from the project folder: the relative paths used later in this notebook
# ('./results', './logs', the '_Alber_*_embeddings.csv' files) resolve against it.
%cd '/content/drive/My Drive/AD Final Experiments'
# Put the project folder on the import path so the local helper modules
# imported below (preprocessing, splitting, utils, ...) can be found.
sys.path.append('/content/drive/My Drive/AD Final Experiments')

import importlib
import numpy as np
import pandas as pd
from itertools import product

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier

from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AlbertTokenizer, AlbertForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback
)

from preprocessing import preprocess_data
from splitting import split_data
from utils import *
from evaluate_model import evaluate_best_model
from lr_tuning import hyperparameter_tuning

# Pick up on-disk edits to the project helper modules without restarting the
# kernel.
for module in ['preprocessing', 'splitting', 'utils', 'evaluate_model', 'lr_tuning']:
    importlib.reload(sys.modules[module])

# `importlib.reload` only refreshes the module objects. Names that were copied
# into this namespace with `from module import name` keep pointing at the
# pre-reload objects, so the from-imports are repeated here to re-bind them.
from preprocessing import preprocess_data
from splitting import split_data
from utils import *
from evaluate_model import evaluate_best_model
from lr_tuning import hyperparameter_tuning

# Load the dataset through the project's preprocessing helper.
file_path = '/content/drive/My Drive/AD Identification using SATD/liu_datset_processed.csv'
data = preprocess_data(file_path)



Mounted at /content/drive
/content/drive/My Drive/AD Final Experiments


In [4]:
# NOTE(review): `transformers` is already imported by the first code cell, so
# this install runs after the import it is meant to satisfy, and no version is
# pinned. Consider moving it above the imports and pinning a version.
!pip install transformers



In [5]:

import wandb
# Start Weights & Biases in "disabled" mode. Presumably this keeps the Trainer
# used below from logging to (or prompting for) a W&B account — TODO confirm.
wandb.init(mode="disabled")


In [8]:
# Give every distinct TDType an integer id, numbered in order of first
# appearance in the data, and store it next to the original column.
class_mapping = {
    td_type: class_id
    for class_id, td_type in enumerate(data['TDType'].unique())
}
data['label'] = data['TDType'].map(class_mapping)

# Stratified 80/20 split into (train + validation) and test ...
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    data['Comments'],
    data['label'],
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)
# ... then a stratified 80/20 split of the remainder into train and validation
# (64% / 16% / 20% of the full dataset overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp,
    y_train_temp,
    test_size=0.2,
    random_state=42,
    stratify=y_train_temp,
)



# ALBERT tokenizer plus a classification head with one output per class.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(class_mapping),
)

class CustomDataset(Dataset):
    """Torch dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Texts to encode. Either a pandas object (read positionally via
            ``.iloc``) or any positionally indexable sequence such as a list.
            Every item is passed through ``str()`` before tokenizing.
        labels: Integer class ids, aligned positionally with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``; its
            result must provide ``input_ids`` and ``attention_mask`` tensors
            with a leading batch axis of size 1.
        max_length: Length every encoding is truncated / padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx`` of a pandas object or plain sequence."""
        return values.iloc[idx] if hasattr(values, 'iloc') else values[idx]

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        text = str(self._item_at(self.texts, idx))
        label = self._item_at(self.labels, idx)

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # Drop only the leading batch axis. A bare ``squeeze()`` would also
            # remove the sequence axis whenever it has length 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Wrap each split in a CustomDataset (default max_length), in the order
# train -> validation -> test.
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# Fine-tuning configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the highest validation accuracy when training ends.
training_args = TrainingArguments(
    # Where checkpoints and logs are written (relative to the working directory).
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Training length and batch sizes.
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Per-epoch evaluation / checkpointing and best-model selection.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with ``'accuracy'`` and the support-weighted ``'f1'``.
    """
    from sklearn.metrics import accuracy_score, f1_score

    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    metrics = {}
    metrics['accuracy'] = accuracy_score(y_true, y_hat)
    metrics['f1'] = f1_score(y_true, y_hat, average='weighted')
    return metrics

from sklearn.metrics import classification_report

# Fine-tune ALBERT. Training stops early once validation accuracy has not
# improved for 6 consecutive evaluations (one evaluation per epoch).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=6)]
)

trainer.train()

# Score the held-out test split with the fine-tuned model.
test_results = trainer.predict(test_dataset)

y_test_pred = np.argmax(test_results.predictions, axis=1)
print("\nClassification Report on Test Set:")
# Pass `labels` together with `target_names` so each row is tied to its class
# id explicitly. Without `labels`, scikit-learn infers the label set from the
# data, and a class missing from both y_test and the predictions would make the
# names and rows disagree. `zero_division=0` keeps the reported value for
# never-predicted classes at 0 without emitting a warning.
print(classification_report(
    y_test,
    y_test_pred,
    labels=list(class_mapping.values()),
    target_names=list(class_mapping.keys()),
    zero_division=0,
))

# Embedding extraction
def extract_embeddings(model, dataset, batch_size=16):
    """Return the first-token ([CLS]) hidden state of every item in ``dataset``.

    Args:
        model: Model exposing an ``albert`` encoder sub-module, a ``device``
            attribute and ``eval()``. It is switched to evaluation mode and
            left in that mode.
        dataset: Dataset whose items are dicts containing ``input_ids`` and
            ``attention_mask`` tensors.
        batch_size: Number of items encoded per forward pass. Defaults to 16.

    Returns:
        numpy array with one row per dataset item, in dataset order, holding
        the encoder's last hidden state at sequence position 0.
    """
    embeddings = []
    dataloader = DataLoader(dataset, batch_size=batch_size)

    model.eval()
    # Inference only, so gradients are not tracked.
    with torch.no_grad():
        for batch in dataloader:
            inputs = {
                'input_ids': batch['input_ids'].to(model.device),
                'attention_mask': batch['attention_mask'].to(model.device),
            }
            # Call the encoder directly to bypass the classification head.
            outputs = model.albert(**inputs)
            hidden_states = outputs.last_hidden_state[:, 0, :]  # [CLS] token embeddings
            embeddings.append(hidden_states.cpu().numpy())

    return np.concatenate(embeddings, axis=0)

# Encode the three splits with the fine-tuned model, in the order
# train -> validation -> test.
embeddings_train, embeddings_val, embeddings_test = (
    extract_embeddings(model, split_dataset)
    for split_dataset in (train_dataset, val_dataset, test_dataset)
)

def save_embeddings_to_csv(embeddings, file_path, labels=None):
    """Write an embedding matrix to ``file_path`` as CSV, without the index.

    Each embedding dimension becomes one column. When ``labels`` is given it
    is appended as a final column named ``'label'``.
    """
    table = pd.DataFrame(embeddings)
    has_labels = labels is not None
    if has_labels:
        table['label'] = labels
    table.to_csv(file_path, index=False)

# Persist each split's embeddings with its integer labels as the last column.
# The files are written relative to the current working directory.
save_embeddings_to_csv(
    embeddings_train,
    file_path='_Alber_train_embeddings.csv',
    labels=y_train.values,
)
save_embeddings_to_csv(
    embeddings_val,
    file_path='_Alber_val_embeddings.csv',
    labels=y_val.values,
)
save_embeddings_to_csv(
    embeddings_test,
    file_path='_Alber_test_embeddings.csv',
    labels=y_test.values,
)

print("Embeddings saved to CSV files.")


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8086,0.724477,0.795498,0.753198
2,0.5261,0.666082,0.82283,0.774175
3,0.6061,0.586889,0.829904,0.781486
4,0.5017,0.548273,0.846463,0.819925
5,0.5002,0.518915,0.858521,0.834792
6,0.4514,0.497344,0.863987,0.838425
7,0.3964,0.526401,0.865434,0.839044
8,0.2809,0.516733,0.866238,0.850007
9,0.3769,0.575333,0.865595,0.848494
10,0.1504,0.580025,0.872508,0.862053



Classification Report on Test Set:
                        precision    recall  f1-score   support

             ALGORITHM       0.29      0.16      0.21       187
         COMPATIBILITY       0.40      0.02      0.04        91
                DEFECT       0.31      0.22      0.26       132
                DESIGN       0.79      0.87      0.83      2178
         DOCUMENTATION       0.00      0.00      0.00        32
        IMPLEMENTATION       0.63      0.64      0.64       393
                  TEST       0.72      0.63      0.67       131
WITHOUT_CLASSIFICATION       0.96      0.97      0.97      4631

              accuracy                           0.87      7775
             macro avg       0.51      0.44      0.45      7775
          weighted avg       0.86      0.87      0.86      7775

Embeddings saved to CSV files.


In [9]:
# `accuracy_score` is imported explicitly. Without it the name is only
# available if the `from utils import *` in the first cell happens to provide
# it, and a missing name would be swallowed by the `except` in the loop below.
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Reload the embeddings written by the fine-tuning cell.
train = pd.read_csv('_Alber_train_embeddings.csv')
val = pd.read_csv('_Alber_val_embeddings.csv')
test = pd.read_csv('_Alber_test_embeddings.csv')

# Select the feature columns by name rather than by position, so this does not
# depend on 'label' being the last column of the file.
# NOTE: X_val / y_val / X_test / y_test held the raw comment texts and labels
# in the fine-tuning cell; from here on they hold embedding features.
X_train_final = train.drop(columns='label').to_numpy()
y_train_final = train['label']

X_val = val.drop(columns='label').to_numpy()
y_val = val['label']

X_test = test.drop(columns='label').to_numpy()
y_test = test['label']


param_grid = {
    'C': [0.01, 1, 10],
    'penalty': ['l2'],
    'max_iter': [1, 10, 100, 200]
}

best_score = -1
best_params = None
best_model = None

# The solver is the same for every configuration, so it is set once.
solver = 'lbfgs'

# Exhaustive search: fit scaler + logistic regression on the training
# embeddings and keep the configuration with the highest validation accuracy
# (the first one wins on ties).
for C, penalty, max_iter in product(param_grid['C'], param_grid['penalty'], param_grid['max_iter']):
    try:
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', LogisticRegression(C=C, penalty=penalty, max_iter=max_iter, solver=solver, random_state=42, class_weight='balanced'))
        ])

        pipeline.fit(X_train_final, y_train_final)
        y_val_pred = pipeline.predict(X_val)
        score = accuracy_score(y_val, y_val_pred)


        if score > best_score:
            best_score = score
            best_params = {'C': C, 'penalty': penalty, 'max_iter': max_iter}
            best_model = pipeline

    except Exception as e:
        # Best effort: a failing configuration is reported and skipped.
        print(f"Skipping configuration C={C}, penalty={penalty}, max_iter={max_iter} due to error: {e}")

# Fail here with a clear message instead of later on `None.predict(...)`.
if best_model is None:
    raise RuntimeError(
        "No logistic-regression configuration could be fitted; "
        "see the 'Skipping configuration' messages above."
    )

def evaluate_best_model(model, params, score, X_test, y_test):
    """Print the selected hyper-parameters and the model's test-set metrics.

    NOTE: this definition replaces the ``evaluate_best_model`` imported from
    ``evaluate_model`` in the first cell for the rest of the session.

    Args:
        model: Fitted estimator exposing ``predict``.
        params: Hyper-parameters of ``model``; printed as-is.
        score: Validation score that selected ``model``; printed as-is.
        X_test: Test features passed to ``model.predict``.
        y_test: True test labels.
    """
    # Imported here so the function does not rely on names that another cell
    # (or a star import) happened to leave in the kernel namespace.
    from sklearn.metrics import accuracy_score, classification_report

    print(f"Best Params: {params}")
    print(f"Validation Best Score: {score}")
    y_test_pred = model.predict(X_test)
    print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
    print("\nTest Classification Report:")
    print(classification_report(y_test, y_test_pred))

evaluate_best_model(best_model, best_params, best_score, X_test, y_test)


Best Params: {'C': 0.01, 'penalty': 'l2', 'max_iter': 10}
Validation Best Score: 0.8244372990353698

Test Accuracy: 0.832411575562701

Test Classification Report:
              precision    recall  f1-score   support

           0       0.23      0.28      0.25       187
           1       0.29      0.22      0.25        91
           2       0.18      0.45      0.25       132
           3       0.85      0.76      0.80      2178
           4       0.03      0.28      0.05        32
           5       0.75      0.51      0.61       393
           6       0.71      0.64      0.67       131
           7       0.98      0.95      0.96      4631

    accuracy                           0.83      7775
   macro avg       0.50      0.51      0.48      7775
weighted avg       0.88      0.83      0.85      7775

