<a href="https://colab.research.google.com/github/ikoojos/Algorithm-Debt-Research/blob/master/Instructor_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
from google.colab import drive
from itertools import product
from sklearn.linear_model import LogisticRegression
import importlib


import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np


# Mount Google Drive
drive.mount('/content/drive')
sys.path.append('/content/drive/My Drive/AD Final Experiments')

# Import custom modules
from preprocessing import preprocess_data
from splitting import split_data
from utils import *
from evaluate_model import evaluate_best_model
from lr_tuning import hyperparameter_tuning


for module in ['preprocessing', 'splitting', 'utils', 'evaluate_model', 'lr_tuning']:
    importlib.reload(sys.modules[module])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd '/content/drive/My Drive/AD Final Experiments'
!pip install -r requirements.txt

/content/drive/My Drive/AD Final Experiments


In [3]:
!pip install sentence-transformers
!pip install InstructorEmbedding



In [4]:
!pip install --upgrade huggingface_hub
!pip install huggingface_hub==0.25.2

Collecting huggingface_hub
  Using cached huggingface_hub-0.26.5-py3-none-any.whl.metadata (13 kB)
Using cached huggingface_hub-0.26.5-py3-none-any.whl (447 kB)
Installing collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.25.2
    Uninstalling huggingface-hub-0.25.2:
      Successfully uninstalled huggingface-hub-0.25.2
Successfully installed huggingface_hub-0.26.5
Collecting huggingface_hub==0.25.2
  Using cached huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Using cached huggingface_hub-0.25.2-py3-none-any.whl (436 kB)
Installing collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.26.5
    Uninstalling huggingface-hub-0.26.5:
      Successfully uninstalled huggingface-hub-0.26.5
Successfully installed huggingface_hub-0.25.2


In [27]:
drive.mount('/content/drive')

sys.path.append('/content/drive/My Drive/AD Final Experiments')

file_path = '/content/drive/My Drive/AD Identification using SATD/liu_datset_processed.csv'
data = preprocess_data(file_path)
#X_train_final, X_val, X_test, y_train_final, y_val, y_test = split_data(data)

print("Data preprocessing complete!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data preprocessing complete!


In [12]:
from InstructorEmbedding import INSTRUCTOR

# Initialize Instructor Model
model = INSTRUCTOR('hkunlp/instructor-large')

# Load Dataset
liu_data = data

# Map Labels to Integers
class_mapping = {label: idx for idx, label in enumerate(liu_data['TDType'].unique())}
liu_data['label'] = liu_data['TDType'].map(class_mapping)

# Split Dataset into Train, Validation, and Test Sets
X_train, X_test, y_train, y_test = train_test_split(liu_data['Comments'], liu_data['label'], test_size=0.2, random_state=42, stratify=liu_data['label'])
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


# Prepare Comments with Instructions
def prepare_text_instruction_pairs(texts):
    instruction = "Represent the following technical debt comments for classification:"
    return [[instruction, text] for text in texts]

train_texts = prepare_text_instruction_pairs(X_train_final.tolist())
val_texts = prepare_text_instruction_pairs(X_val.tolist())
test_texts = prepare_text_instruction_pairs(X_test.tolist())

# Generate Embeddings
def calculate_embeddings(model, texts):
    return model.encode(texts)

train_embeddings = calculate_embeddings(model, train_texts)
val_embeddings = calculate_embeddings(model, val_texts)
test_embeddings = calculate_embeddings(model, test_texts)

# Combine Embeddings and Labels into DataFrames
train_df = pd.DataFrame(train_embeddings)
train_df['label'] = y_train_final.values

val_df = pd.DataFrame(val_embeddings)
val_df['label'] = y_val.values

test_df = pd.DataFrame(test_embeddings)
test_df['label'] = y_test.values

# Save to CSV
train_df.to_csv('train_embeddings__.csv', index=False)
val_df.to_csv('val_embeddings__.csv', index=False)
test_df.to_csv('test_embeddings__.csv', index=False)

print("Embeddings__ for Train, Validation, and Test sets have been saved.")


load INSTRUCTOR_Transformer
max_seq_length  512
Embeddings__ for Train, Validation, and Test sets have been saved.


In [21]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
from itertools import product
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Load Precomputed Embeddings
train_df = pd.read_csv('train_embeddings__.csv')
val_df = pd.read_csv('val_embeddings__.csv')
test_df = pd.read_csv('test_embeddings__.csv')

# Separate Features (X) and Labels (y)
X_train, y_train = train_df.iloc[:, :-1], train_df['label']
X_val, y_val = val_df.iloc[:, :-1], val_df['label']
X_test, y_test = test_df.iloc[:, :-1], test_df['label']

# Define Hyperparameter Grid for Logistic Regression
param_grid = {
    'C': [0.01, 1, 10],
    'penalty': ['l2'],  # Regularization (ElasticNet is not supported for dense embeddings)
    'max_iter': [100, 200]
}

# Initialize Variables to Track the Best Model
best_score = -1
best_params = None
best_model = None

# Hyperparameter Tuning Loop
for C, penalty, max_iter in product(param_grid['C'], param_grid['penalty'], param_grid['max_iter']):
    solver = 'lbfgs'  # Solver suitable for 'l2' regularization

    try:
        # Build Pipeline with StandardScaler and Logistic Regression
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', LogisticRegression(
                C=C, penalty=penalty, max_iter=max_iter, solver=solver,
                random_state=42, class_weight='balanced'
            ))
        ])

        # Train the Model
        pipeline.fit(X_train, y_train)

        # Validate the Model on the Validation Set
        y_val_pred = pipeline.predict(X_val)
        score = accuracy_score(y_val, y_val_pred)

        # Update the Best Model if Current Validation Score is Higher
        if score > best_score:
            best_score = score
            best_params = {'C': C, 'penalty': penalty, 'max_iter': max_iter}
            best_model = pipeline

    except Exception as e:
        print(f"Skipping configuration C={C}, penalty={penalty}, max_iter={max_iter} due to error: {e}")

# Evaluate the Best Model on the Test Set
if best_model:
    y_test_pred = best_model.predict(X_test)

    print("Best Model Parameters:", best_params)
    print("Validation Accuracy of Best Model:", best_score)

    print("\nTest Set Evaluation:")
    print("Accuracy:", accuracy_score(y_test, y_test_pred))
    print("F1 Score:", f1_score(y_test, y_test_pred, average='weighted'))
    print("Classification Report:\n", classification_report(y_test, y_test_pred))
else:
    print("No valid model was found during hyperparameter tuning.")


Best Model Parameters: {'C': 1, 'penalty': 'l2', 'max_iter': 100}
Validation Accuracy of Best Model: 0.7755627009646302

Test Set Evaluation:
Accuracy: 0.7742765273311897
F1 Score: 0.7960433578396509
Classification Report:
               precision    recall  f1-score   support

           0       0.20      0.54      0.30       187
           1       0.23      0.52      0.32        91
           2       0.25      0.63      0.36       132
           3       0.83      0.57      0.67      2178
           4       0.37      0.69      0.48        32
           5       0.42      0.69      0.53       393
           6       0.44      0.66      0.52       131
           7       0.96      0.90      0.93      4631

    accuracy                           0.77      7775
   macro avg       0.46      0.65      0.51      7775
weighted avg       0.84      0.77      0.80      7775

