<a href="https://colab.research.google.com/github/ikoojos/Algorithm-Debt-Research/blob/master/Custom_features_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime so the project folder is reachable as a local path.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory (the same folder is appended to sys.path below),
# so the relative CSV paths used later in the notebook resolve inside it.
%cd '/content/drive/My Drive/AD Final Experiments'

# General imports
import sys
import pandas as pd
import numpy as np
from itertools import product
import importlib
import warnings

from scipy.sparse import hstack, csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline

# Custom modules
sys.path.append('/content/drive/My Drive/AD Final Experiments')
from preprocessing import preprocess_data
from splitting import split_data
from utils import *
from evaluate_model import evaluate_best_model
from lr_tuning import hyperparameter_tuning
from feature_extractor import CustomFeatureExtractor, keywords

# Reload custom modules so edits made to the .py files since the kernel started are picked up.
# 'feature_extractor' is included because the notebook imports from it as well.
for module in ['preprocessing', 'splitting', 'utils', 'evaluate_model', 'lr_tuning', 'feature_extractor']:
    importlib.reload(sys.modules[module])

# importlib.reload() re-executes a module but does NOT rebind names that were copied into
# this namespace with `from module import ...`; those would keep pointing at the pre-reload
# objects. Repeat the imports so the notebook actually uses the reloaded definitions.
from preprocessing import preprocess_data
from splitting import split_data
from utils import *
from evaluate_model import evaluate_best_model
from lr_tuning import hyperparameter_tuning
from feature_extractor import CustomFeatureExtractor, keywords

# Ignore all warnings
# NOTE(review): this is a blanket filter, so deprecation and data-quality warnings raised by
# later cells are hidden as well.
warnings.filterwarnings("ignore")


Mounted at /content/drive
/content/drive/My Drive/AD Final Experiments


In [2]:
# Location of the input CSV on Drive. Note it sits in a different Drive folder
# ('AD Identification using SATD') than the working directory set in the first cell.
file_path = '/content/drive/My Drive/AD Identification using SATD/liu_datset_processed.csv'
# Load and preprocess the dataset with the project helper from preprocessing.py.
# The result is later indexed by the 'Comments' and 'TDType' columns, so it is presumably a
# pandas DataFrame — confirm against preprocess_data.
data = preprocess_data(file_path)

In [4]:
# Model inputs: the raw comment text, lower-cased and stripped of surrounding whitespace
# with pandas' vectorized string accessor (no per-row Python lambda).
# Targets: the class labels in the 'TDType' column.
df = data
X = df['Comments'].str.lower().str.strip()
y = df['TDType']

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the recorded class supports are heavily
# imbalanced; passing stratify=y would be worth considering, but it changes the reported numbers.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Export the custom features of the training split to CSV (apparently for inspection).
# A dedicated extractor is built with save_csv=True and its own csv_path; the value returned
# by transform() is discarded, so the call is made only for its side effect.
# NOTE(review): transform() is called without a prior fit() and is also handed the labels —
# presumably the extractor is stateless and writes the labels next to the features; confirm
# in feature_extractor.py.
debug_extractor = CustomFeatureExtractor(keywords, save_csv=True, csv_path="Custom_AD_Train_Features_RF.csv")
debug_extractor.transform(X_train_raw, y_train)

# Same export for the held-out test split, written to a separate file.
debug_extractor_test = CustomFeatureExtractor(keywords, save_csv=True, csv_path="Custom_AD_Test_Features_RF.csv")
debug_extractor_test.transform(X_test_raw, y_test)

# Feature stage: token counts from CountVectorizer placed side by side with the output of
# CustomFeatureExtractor(keywords). CSV export is switched off for the extractor used here.
feature_union = FeatureUnion(transformer_list=[
    ('vectorizer', CountVectorizer()),
    ('custom', CustomFeatureExtractor(keywords, save_csv=False)),
])

# Classifier stage: random forest with balanced class weights and a fixed seed.
rf_classifier = RandomForestClassifier(class_weight='balanced', random_state=42)

# Full model: feature extraction followed by the classifier.
pipeline = Pipeline(steps=[
    ('features', feature_union),
    ('clf', rf_classifier),
])

# Candidate hyperparameters for the 'clf' step: 2 * 4 * 3 * 3 = 72 combinations.
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
}

# Exhaustive search over param_grid with 5-fold cross-validation on the training split,
# ranking candidates by macro-averaged F1 and using all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', n_jobs=-1, verbose=1)
grid_search.fit(X_train_raw, y_train)

# best_estimator_ is the winning pipeline refit on the whole training split (refit defaults to True).
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
# best_score_ is the MEAN CROSS-VALIDATED score of the best candidate (average over the
# held-out folds), not a score measured on the full training set — the label says so.
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print(f"Best Mean Cross-Validated F1 Score (Macro): {best_score:.2f}")

# Score the tuned pipeline once on the held-out test split.
y_pred_test = best_model.predict(X_test_raw)
test_f1_macro = f1_score(y_test, y_pred_test, average='macro')

# Report: headline macro F1, then the per-class breakdown and the confusion matrix.
# Each header/body pair is emitted with a newline separator, which prints the same text
# as two consecutive print() calls.
print("\nEvaluation on Test Set:")
print(f"F1 Score (Macro): {test_f1_macro:.2f}")
print("Classification Report:", classification_report(y_test, y_pred_test), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_test), sep="\n")

CSV saved to Custom_AD_Train_Features_RF.csv
CSV saved to Custom_AD_Test_Features_RF.csv
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 10, 'clf__n_estimators': 100}
Best F1 Score (Macro) on Training Set: 0.65

Evaluation on Test Set:
F1 Score (Macro): 0.69
Classification Report:
                        precision    recall  f1-score   support

             ALGORITHM       0.84      0.39      0.53       200
         COMPATIBILITY       0.89      0.44      0.59        89
                DEFECT       0.71      0.45      0.55       135
                DESIGN       0.82      0.90      0.85      2206
         DOCUMENTATION       0.73      0.48      0.58        23
        IMPLEMENTATION       0.73      0.66      0.70       387
                  TEST       0.82      0.68      0.74       143
WITHOUT_CLASSIFICATION       0.96      0.97      0.96      4592

              accuracy        