<a href="https://colab.research.google.com/github/ikoojos/Algorithm-Debt-Research/blob/master/LR_and_Custom_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only setup: mount Google Drive and make the experiment folder the
# working directory, so relative paths used later in the notebook (e.g. the
# exported feature CSV) resolve inside that folder.
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/AD Final Experiments'

# General imports
import sys
import pandas as pd
import numpy as np
from itertools import product
import importlib
import warnings

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score
)
from scipy.sparse import hstack, csr_matrix
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Custom modules
# Make the project folder importable; the guard keeps sys.path from growing
# by one duplicate entry every time this cell is re-run.
if '/content/drive/My Drive/AD Final Experiments' not in sys.path:
    sys.path.append('/content/drive/My Drive/AD Final Experiments')

# Reload custom modules to ensure latest updates.
# This must happen BEFORE the `from ... import ...` lines below: reload()
# re-executes a module in place but does not rebind names that were already
# copied out of it, so reloading afterwards would leave the notebook using the
# stale objects. On a fresh kernel nothing is loaded yet and the loop is a no-op.
for module in ['preprocessing', 'splitting', 'utils', 'evaluate_model', 'lr_tuning']:
    if module in sys.modules:
        importlib.reload(sys.modules[module])

from preprocessing import preprocess_data
from splitting import split_data
from utils import *
from evaluate_model import evaluate_best_model
from lr_tuning import hyperparameter_tuning

# Ignore all warnings.
# NOTE(review): this is a blanket filter for the whole kernel session, so any
# warning raised later in this process (deprecations, solver warnings such as
# non-convergence, etc.) is hidden as well — consider narrowing it to specific
# categories if those messages matter for interpreting the results.
warnings.filterwarnings("ignore")


Mounted at /content/drive
/content/drive/My Drive/AD Final Experiments


In [2]:
# Drive is already mounted and the project folder is already on sys.path from
# the setup cell, so neither step is repeated here.

# NOTE(review): the 'datset' spelling presumably matches the actual file name
# on Drive — do not "correct" it without renaming the file as well.
file_path = '/content/drive/My Drive/AD Identification using SATD/liu_datset_processed.csv'
data = preprocess_data(file_path)

print("Data preprocessing Complete!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data preprocessing Complete!


In [22]:
# Features are the lower-cased, whitespace-trimmed comment texts; the target is
# the technical-debt type label.
df = data
y = df['TDType']
X = df['Comments'].apply(lambda comment: comment.lower().strip())

# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability).
# NOTE(review): the split is not stratified; with rare classes, consider
# whether stratifying on y is wanted.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

class CustomFeatureExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer producing binary keyword-presence features.

    For every input text one 0/1 column named ``contains_<keyword>`` is
    produced per distinct keyword. A column is 1 when the keyword occurs as a
    substring of the text (so ``'size'`` also matches ``'resize'``); the
    comparison is case-insensitive.

    Parameters
    ----------
    keywords : iterable of str
        Keywords to look for. Duplicates collapse into a single column.
    save_csv : bool, default=False
        When true (and ``csv_path`` is set), ``transform`` also writes the
        texts, their features and, if given, the labels to ``csv_path``.
    csv_path : str or None, default=None
        Destination of the optional CSV export.
    """

    def __init__(self, keywords, save_csv=False, csv_path=None):
        self.keywords = keywords
        self.save_csv = save_csv
        self.csv_path = csv_path

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Compute the keyword-presence matrix for ``X``.

        Parameters
        ----------
        X : iterable of str
            Texts to featurize.
        y : array-like or None, default=None
            Optional labels; only used to add a ``TDType`` column to the CSV
            export.

        Returns
        -------
        scipy.sparse.csr_matrix
            Integer matrix of shape ``(n_texts, n_distinct_keywords)``. The
            column count is kept even when ``X`` is empty, so the output
            width never depends on the number of rows.
        """
        # Materialise once: X is needed both for the features and for the CSV,
        # and a one-shot iterable would otherwise be exhausted after one pass.
        texts = list(X)

        # Column name -> lower-cased search term. A dict keeps first-seen
        # order and collapses duplicate keywords into one column. Lower-casing
        # the keyword matters because the text is lower-cased before matching:
        # a keyword containing uppercase letters could otherwise never match.
        needle_by_column = {f'contains_{kw}': kw.lower() for kw in self.keywords}
        columns = list(needle_by_column)
        needles = list(needle_by_column.values())

        rows = []
        for text in texts:
            lowered = text.lower()  # lower-case each text once, not once per keyword
            rows.append([int(needle in lowered) for needle in needles])

        # The explicit reshape pins the shape for the degenerate cases
        # (no texts and/or no keywords), where np.array alone would collapse
        # the column dimension.
        features = np.array(rows, dtype=np.int64).reshape(len(texts), len(columns))

        if self.save_csv and self.csv_path:
            combined_df = pd.concat(
                [
                    pd.DataFrame({'Comments': texts}),
                    pd.DataFrame(features, columns=columns),
                ],
                axis=1,
            )

            if y is not None:
                # Drop any existing index so labels line up with the texts by position.
                combined_df['TDType'] = pd.Series(y).reset_index(drop=True)

            combined_df.to_csv(self.csv_path, index=False)
            print(f"CSV saved to {self.csv_path}")

        # Return sparse matrix of features for the pipeline
        return csr_matrix(features)

# Hand-picked terms whose presence is encoded as extra binary features.
keywords = [
    'shape', 'input', 'tensor', 'output', 'size', 'convolution', 'value',
    'efficient', 'matrix', 'model', 'node', 'function', 'batch',
]

# One-off export: write the training comments, their keyword features and
# their labels to a CSV for inspection. This extractor is separate from the
# one used inside the pipeline.
debug_extractor = CustomFeatureExtractor(
    keywords,
    csv_path="Custom AD Features.csv",
    save_csv=True,
)
debug_extractor.transform(X_train_raw, y_train)


# Bag-of-words counts and the keyword indicators are computed side by side and
# concatenated, then fed to a class-weighted logistic regression.
pipeline = Pipeline(steps=[
    (
        'features',
        FeatureUnion(transformer_list=[
            ('vectorizer', CountVectorizer()),
            # Disable saving in pipeline
            ('custom', CustomFeatureExtractor(keywords, save_csv=False)),
        ]),
    ),
    ('clf', LogisticRegression(random_state=42, class_weight='balanced')),
])

# Hyper-parameters of the classifier step to search over.
param_grid = dict(
    clf__C=[0.01, 1, 10],
    clf__penalty=['l2'],
    clf__max_iter=[100, 200],
)

# Exhaustive 5-fold search, selecting by macro-averaged F1, using all cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_raw, y_train)

best_model, best_params, best_score = (
    grid_search.best_estimator_,
    grid_search.best_params_,
    grid_search.best_score_,
)

# best_score is GridSearchCV's best_score_: the mean cross-validated macro-F1
# of the selected candidate, i.e. a validation-fold estimate. It is not a score
# measured on the training set, so it is labelled accordingly.
print("Best Parameters:", best_params)
print(f"Best Mean Cross-Validated F1 Score (Macro): {best_score:.2f}")

# Evaluate the refitted best pipeline on the held-out test split.
y_pred_test = best_model.predict(X_test_raw)

print("\nEvaluation on Test Set:")
print(f"F1 Score (Macro): {f1_score(y_test, y_pred_test, average='macro'):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))

CSV saved to Custom AD Features.csv
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'clf__C': 10, 'clf__max_iter': 100, 'clf__penalty': 'l2'}
Best F1 Score (Macro) on Training Set: 0.66

Evaluation on Test Set:
F1 Score (Macro): 0.68
Classification Report:
                        precision    recall  f1-score   support

             ALGORITHM       0.51      0.56      0.54       200
         COMPATIBILITY       0.56      0.55      0.55        89
                DEFECT       0.50      0.58      0.53       135
                DESIGN       0.86      0.82      0.84      2206
         DOCUMENTATION       0.62      0.57      0.59        23
        IMPLEMENTATION       0.63      0.75      0.69       387
                  TEST       0.71      0.76      0.74       143
WITHOUT_CLASSIFICATION       0.97      0.96      0.96      4592

              accuracy                           0.88      7775
             macro avg       0.67      0.69      0.68      7775
       