<a href="https://colab.research.google.com/github/ikoojos/Algorithm-Debt-Research/blob/master/Albert_DL_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
from google.colab import drive
import importlib

# Mount Google Drive into the Colab runtime so the project folder below is reachable.
drive.mount('/content/drive')
# Work from the project folder: the relative CSV paths used later in this notebook
# resolve against the current working directory.
%cd '/content/drive/My Drive/AD Final Experiments'
# Put the project folder on the import path so the local helper modules
# (preprocessing, splitting, utils, ...) imported below can be found.
sys.path.append('/content/drive/My Drive/AD Final Experiments')

# Load (or re-load) the project modules *before* binding any names from them.
# importlib.reload() re-executes a module in place, but names that were already
# copied into this namespace with `from module import name` keep pointing at the
# old objects. Reloading after the from-imports therefore had no visible effect
# on the functions used in this notebook; reloading first and importing
# afterwards makes edits to the .py files actually take effect on re-run.
for module in ['preprocessing', 'splitting', 'utils', 'evaluate_model', 'lr_tuning']:
    importlib.reload(importlib.import_module(module))

from preprocessing import preprocess_data
from splitting import split_data
from utils import *
from evaluate_model import evaluate_best_model
from lr_tuning import hyperparameter_tuning

import torch
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

from transformers import AlbertTokenizer, AlbertForSequenceClassification, AlbertModel, Trainer, TrainingArguments

from itertools import product
import warnings

# NOTE(review): this silences every warning category for the rest of the session.
# Presumably that also hides non-convergence warnings from the small `max_iter`
# values tried in the grid search further down — confirm this is intended.
warnings.filterwarnings("ignore")

Mounted at /content/drive
/content/drive/My Drive/AD Final Experiments


In [2]:
# Run the project's preprocessing on the dataset CSV, then let split_data
# produce the train / validation / test partitions.
file_path = '/content/drive/My Drive/AD Identification using SATD/liu_datset_processed.csv'
data = preprocess_data(file_path)
(
    X_train_final, X_val, X_test,
    y_train_final, y_val, y_test,
) = split_data(data)

In [4]:
pip install transformers




In [5]:
import wandb

# Start wandb in "disabled" mode so nothing is logged to Weights & Biases.
# NOTE(review): presumably a guard for the Hugging Face `Trainer` imported in the
# first cell; no Trainer is used in the cells visible here — confirm it is still needed.
wandb.init(mode="disabled")

In [None]:
# Single checkpoint name so the tokenizer and the encoder are always loaded
# from the same pretrained weights.
ALBERT_CHECKPOINT = 'albert-base-v2'

liu_data = data  # alias for the object returned by preprocess_data in the earlier cell
tokenizer = AlbertTokenizer.from_pretrained(ALBERT_CHECKPOINT)
model = AlbertModel.from_pretrained(ALBERT_CHECKPOINT)

def encode_text(text):
    """Tokenize ``text`` with the notebook-level ALBERT ``tokenizer``.

    Padding and truncation are enabled and the result is returned as
    PyTorch tensors (``return_tensors='pt'``), ready to be unpacked into
    the model call.
    """
    return tokenizer(text, return_tensors='pt', truncation=True, padding=True)

def extract_albert_features(text):
    """Embed ``text`` by mean-pooling ALBERT's final hidden states.

    The text is tokenized with ``encode_text``, passed through the
    notebook-level ``model`` without gradient tracking, and the
    ``last_hidden_state`` is averaged over dimension 1. All size-1
    dimensions are then squeezed away and the result is returned as a
    NumPy array.

    NOTE(review): the average is taken over every position, without an
    attention mask. That is harmless when one text is encoded at a time (as
    the callers in this notebook do), but padded batches would include pad
    positions in the mean — verify before batching.
    """
    encoded = encode_text(text)
    with torch.no_grad():  # inference only: no autograd graph is needed
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Two-stage hold-out: 20% of the data becomes the test set, then 20% of the
# remainder becomes the validation set (64% / 16% / 20% overall).
# NOTE(review): this overwrites X_val / X_test / y_val / y_test produced by
# split_data in the earlier cell — confirm that re-splitting here is intended.
X_train, X_test, y_train, y_test = train_test_split(
    liu_data['Comments'],
    liu_data['TDType'],
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=42,
)

# Embed each comment individually and stack the vectors into one matrix per
# split (evaluated in the order train, validation, test).
X_train_albert, X_val_albert, X_test_albert = (
    np.vstack(comments.apply(extract_albert_features))
    for comments in (X_train, X_val, X_test)
)

# One frame per split: embedding dimensions as columns, class label appended last.
train_df = pd.DataFrame(X_train_albert).assign(label=y_train.values)
val_df = pd.DataFrame(X_val_albert).assign(label=y_val.values)
test_df = pd.DataFrame(X_test_albert).assign(label=y_test.values)

In [11]:
# Persist each split's embeddings (plus label column) so the expensive ALBERT
# pass does not have to be repeated; paths are relative to the working directory.
for frame, csv_name in (
    (train_df, 'albert_train_embeddings.csv'),
    (val_df, 'albert_val_embeddings.csv'),
    (test_df, 'albert_test_embeddings.csv'),
):
    frame.to_csv(csv_name, index=False)

print("Embeddings saved to CSV files successfully!")


Embeddings saved to CSV files successfully!


In [3]:
# Reload the embeddings exported to CSV by the earlier cell.
train, val, test = (
    pd.read_csv(csv_name)
    for csv_name in (
        'albert_train_embeddings.csv',
        'albert_val_embeddings.csv',
        'albert_test_embeddings.csv',
    )
)

# Features are every column except the last one; the target is the 'label'
# column (assumes 'label' is the last column, as written by the export cell).
X_train, y_train = train.iloc[:, :-1], train['label']
X_val, y_val = val.iloc[:, :-1], val['label']
X_test, y_test = test.iloc[:, :-1], test['label']

# Standardise using statistics estimated on the training split only, then
# apply the same transformation to validation and test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = scaler.transform(X_val), scaler.transform(X_test)

# Search space for the logistic-regression grid search.
# NOTE(review): `max_iter` is searched as if it were a hyperparameter; the
# small values cap the solver after very few iterations — confirm this is
# deliberate rather than a convergence setting.
param_grid = {
    'C': [0.01, 1, 10],
    'penalty': ['l2'],
    'max_iter': [1, 10, 100, 200],
}

# Hyperparameter tuning function
def hyperparameter_tuning(X_train, y_train, X_val, y_val, param_grid,
                          cv=5, scoring='accuracy', n_jobs=-1, random_state=42):
    """Grid-search a class-balanced logistic regression and report validation accuracy.

    Note: this definition shadows the ``hyperparameter_tuning`` imported from
    ``lr_tuning`` in the first cell.

    Parameters
    ----------
    X_train, y_train : training features / labels used for the cross-validated search.
    X_val, y_val : held-out validation features / labels, used only for the
        printed validation accuracy.
    param_grid : dict passed to ``GridSearchCV`` as the search space.
    cv : number of cross-validation folds (default 5, the previous hard-coded value).
    scoring : metric used to rank parameter combinations (default ``'accuracy'``).
    n_jobs : parallel jobs for the search (default -1).
    random_state : seed for ``LogisticRegression`` (default 42).

    Returns
    -------
    (best_model, best_params, best_score)
        ``best_score`` is ``GridSearchCV.best_score_`` (the cross-validated score
        of the winning parameters), *not* the validation accuracy that is printed.
    """
    # Imported here because no visible notebook cell imports it explicitly;
    # without this the function depends on a name leaking in from a star import.
    from sklearn.metrics import accuracy_score

    estimator = LogisticRegression(class_weight='balanced', random_state=random_state)
    grid_search = GridSearchCV(estimator, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # best_estimator_ is the winning configuration as refit by GridSearchCV.
    best_model = grid_search.best_estimator_
    val_accuracy = accuracy_score(y_val, best_model.predict(X_val))

    print("\nBest Parameters:", best_params)
    print("\nValidation Accuracy:", val_accuracy)

    return best_model, best_params, best_score

def evaluate_best_model(model, X_test, y_test):
    """Print a per-class classification report for ``model`` on the test split.

    Note: this definition shadows the ``evaluate_best_model`` imported from
    ``evaluate_model`` in the first cell.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test, y_test : test features and true labels.

    Returns
    -------
    None. The report is only printed.
    """
    # Imported here because no visible notebook cell imports it explicitly;
    # without this the function depends on a name leaking in from a star import.
    from sklearn.metrics import classification_report

    y_test_pred = model.predict(X_test)

    print("\nTest Classification Report:")
    print(classification_report(y_test, y_test_pred))


# Tune on the scaled training split (validation split is used for the printed
# accuracy), then report test-set metrics for the selected model.
best_model, best_params, best_score = hyperparameter_tuning(
    X_train=X_train_scaled,
    y_train=y_train,
    X_val=X_val_scaled,
    y_val=y_val,
    param_grid=param_grid,
)
evaluate_best_model(model=best_model, X_test=X_test_scaled, y_test=y_test)



Best Parameters: {'C': 1, 'max_iter': 200, 'penalty': 'l2'}

Validation Accuracy: 0.7354340836012861

Test Classification Report:
                        precision    recall  f1-score   support

             ALGORITHM       0.19      0.49      0.28       200
         COMPATIBILITY       0.21      0.58      0.31        89
                DEFECT       0.15      0.38      0.21       135
                DESIGN       0.78      0.54      0.64      2206
         DOCUMENTATION       0.16      0.43      0.24        23
        IMPLEMENTATION       0.38      0.63      0.48       387
                  TEST       0.38      0.62      0.47       143
WITHOUT_CLASSIFICATION       0.95      0.87      0.91      4592

              accuracy                           0.74      7775
             macro avg       0.40      0.57      0.44      7775
          weighted avg       0.82      0.74      0.77      7775

