In [None]:
import warnings
import time
import math
import statistics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import wilcoxon

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import (
    RandomizedSearchCV, GridSearchCV, cross_val_score,
    StratifiedKFold, RepeatedStratifiedKFold, cross_validate
)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import (
    VotingClassifier, StackingClassifier
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, f1_score,
    roc_auc_score, matthews_corrcoef, make_scorer, get_scorer
)

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import mutual_info_classif

# NOTE(review): this installs an "ignore" filter for every warning category,
# so solver convergence warnings and deprecation notices raised further down
# are not shown. Consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Empty result tables, filled in by the evaluation cells further down.
#   DetDF - one row per classifier and cross-validation fold
#   ResDF - one row per classifier (fold averages); same columns as DetDF
#   StaDF - one row per pairwise statistical comparison of two classifiers
DetDF = pd.DataFrame(
    columns=[
        'Classifier',
        'Accuracy', 'Recall', 'Precision', 'F1 score', 'AUC', 'MCC',
        'Time',
        'Dataset',
    ]
)
ResDF = pd.DataFrame(columns=list(DetDF.columns))
StaDF = pd.DataFrame(
    columns=[
        'Classifier_1', 'Classifier_2',
        'Stat', 'Sig-level', 'p-value', 'Null Hypo',
        'Win', 'Lost',
        'Effect Size', 'Effect Type',
        'Dataset',
    ]
)

# CSV files to evaluate, one per code smell.
smell_datasets = [
    'Large Class.csv',
    'Long Method.csv',
    'Long Base Class List.csv',
    'Long Parameter List.csv',
    'Long Scope Chaining.csv',
]

# The evaluation loop iterates over `fnames`; it is an alias of the list above.
fnames = smell_datasets

In [None]:
class FS_GainRatio(BaseEstimator, TransformerMixin):
    """Feature selector that keeps features with an above-average gain ratio.

    The gain ratio of a feature is its mutual information with the target
    divided by an "intrinsic value" term computed from the column's
    normalised values. Features whose gain ratio is greater than or equal
    to the mean gain ratio over all features are kept.

    Parameters
    ----------
    num_features : int, default=1
        Stored on the instance but currently not used by ``fit`` or
        ``transform``; the number of kept features is decided by the mean
        threshold.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``mutual_info_classif`` so the selection can be made
        reproducible.

    Attributes
    ----------
    selected_features : list of int
        Column indices kept by ``transform``, ordered by decreasing gain
        ratio. Empty until ``fit`` has been called.
    """

    def __init__(self, num_features=1, random_state=None):
        self.num_features = num_features
        self.random_state = random_state
        self.selected_features = []

    def fit(self, X, y):
        """Score every column of ``X`` against ``y`` and record which to keep.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        # Accept DataFrames as well as arrays; the maths below is positional.
        X = np.asarray(X, dtype=float)

        info_gain = mutual_info_classif(X, y, random_state=self.random_state)

        # Zero entries and all-zero columns produce log2(0) and 0/0 here.
        # The resulting NaNs are dropped by nansum, so silence the
        # floating-point warnings locally.
        with np.errstate(divide='ignore', invalid='ignore'):
            proportions = X / np.sum(X, axis=0)
            intrinsic_value = -np.nansum(proportions * np.log2(proportions), axis=0)

        # Avoid dividing by zero for columns whose intrinsic value vanished.
        intrinsic_value[intrinsic_value == 0] = np.finfo(float).eps

        gain_ratio_scores = info_gain / intrinsic_value
        mean_score = np.mean(gain_ratio_scores)

        # Best-first ordering, restricted to features at or above the mean.
        ranked = np.argsort(gain_ratio_scores)[::-1]
        self.selected_features = [i for i in ranked if gain_ratio_scores[i] >= mean_score]
        return self

    def transform(self, X):
        """Return only the columns of ``X`` selected during ``fit``."""
        # np.asarray makes positional column indexing work for DataFrames too.
        return np.asarray(X)[:, self.selected_features]

In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration: cross-validation schemes, metrics, search spaces,
# pipelines, tuned base learners and ensembles.
# ---------------------------------------------------------------------------

# One seed for every stochastic component so that a re-run reproduces the
# same folds, the same sampled hyper-parameters and the same fitted models.
SEED = 42

# Kept so that any cell referring to these names keeps working. The pipelines
# below are built with their own fresh instances instead of sharing these.
scaler = MinMaxScaler()
selector = FS_GainRatio()

inner_cv = StratifiedKFold(n_splits=5)
# Seeded: every cross_validate call that uses outer_cv must see the same
# folds, otherwise per-fold scores of two classifiers are not paired.
outer_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=SEED)
n_jobs = -1


def _auc_from_scores(estimator, X, y):
    """ROC AUC scorer that ranks samples by continuous scores when available.

    ``make_scorer(roc_auc_score)`` passes hard class predictions to
    ``roc_auc_score``. This scorer uses the probability of the second class in
    ``classes_`` (or the decision function) and only falls back to
    ``predict`` for estimators that expose neither, e.g. a hard-voting
    ensemble.
    """
    if hasattr(estimator, 'predict_proba'):
        y_score = estimator.predict_proba(X)[:, 1]
    elif hasattr(estimator, 'decision_function'):
        y_score = estimator.decision_function(X)
    else:
        y_score = estimator.predict(X)
    return roc_auc_score(y, y_score)


scoring = {
    'accuracy': make_scorer(accuracy_score),
    'recall': make_scorer(recall_score),
    'precision': make_scorer(precision_score),
    'f1': make_scorer(f1_score),
    'roc_auc': _auc_from_scores,
    'mcc': make_scorer(matthews_corrcoef)
}

# Hyper-parameter search spaces, keyed like ml_pipelines. Parameter names are
# prefixed with the name of the final pipeline step.
ml_param_grids = {
    'DT': {
        'dt__criterion': ['gini', 'entropy'],
        'dt__max_depth': [None, 5, 10, 20, 30],
        'dt__min_samples_split': [2, 5, 10],
        'dt__min_samples_leaf': [1, 2, 4]
    },
    'KNN': {
        'knn__n_neighbors': [3, 5, 7, 10],
        'knn__weights': ['uniform', 'distance'],
        'knn__metric': ['euclidean', 'manhattan', 'minkowski']
    },
    'LR': {
        'lr__C': [0.01, 0.1, 1, 10, 100],
        'lr__solver': ['lbfgs', 'liblinear'],
        'lr__penalty': ['l2']
    },
    'SVM': {
        'svm__C': [0.1, 1, 10, 100],
        'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'svm__gamma': ['scale', 'auto']
    },
    'MLP': {
        'mlp__hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'mlp__activation': ['relu', 'tanh'],
        'mlp__solver': ['adam', 'sgd'],
        'mlp__alpha': [0.0001, 0.001, 0.01]
    },
    'SGD': {
        'sgd__penalty': ['l2', 'l1', 'elasticnet'],
        'sgd__alpha': [0.0001, 0.001, 0.01],
        'sgd__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'sgd__eta0': [0.01, 0.1, 1, 10]  # eta0 must be > 0 for the non-'optimal' schedules
    },
    'NB': {
        'nb__var_smoothing': np.logspace(-10, -2, 50)
    },
    'GP': {
        'gp__kernel': [1.0 * RBF(length_scale) for length_scale in [0.1, 1, 10]]
    }
}


def _make_pipeline(step_name, estimator):
    """Return scaler -> gain-ratio selector -> estimator with fresh step objects."""
    return Pipeline([
        ('scaler', MinMaxScaler()),
        ('selector', FS_GainRatio()),
        (step_name, estimator)
    ])


ml_pipelines = {
    'DT': _make_pipeline('dt', DecisionTreeClassifier(random_state=SEED)),
    'KNN': _make_pipeline('knn', KNeighborsClassifier()),
    'LR': _make_pipeline('lr', LogisticRegression(random_state=SEED)),
    # probability=True is required for soft voting and probability-based AUC.
    'SVM': _make_pipeline('svm', SVC(probability=True, random_state=SEED)),
    'MLP': _make_pipeline('mlp', MLPClassifier(random_state=SEED)),
    # log_loss gives SGDClassifier a predict_proba method (needed for soft voting).
    'SGD': _make_pipeline('sgd', SGDClassifier(loss='log_loss', random_state=SEED)),
    'NB': _make_pipeline('nb', GaussianNB()),
    'GP': _make_pipeline('gp', GaussianProcessClassifier(random_state=SEED))
}

# Each base learner is a randomized hyper-parameter search over its pipeline,
# tuned on MCC with the inner CV. Entries are (name, estimator) tuples, the
# form the scikit-learn ensembles document for `estimators`.
ml_models = [
    (
        name,
        RandomizedSearchCV(
            pipeline,
            ml_param_grids.get(name, {}),
            cv=inner_cv,
            scoring='matthews_corrcoef',
            refit=True,
            n_jobs=n_jobs,
            random_state=SEED
        )
    )
    for name, pipeline in ml_pipelines.items()
]

hard_voting = VotingClassifier(estimators=ml_models, voting='hard', n_jobs=n_jobs)
soft_voting = VotingClassifier(estimators=ml_models, voting='soft', n_jobs=n_jobs)
stacking = StackingClassifier(estimators=ml_models, n_jobs=n_jobs)

ensemble_models = [
    ('Hard Voting', hard_voting),
    ('Soft Voting', soft_voting),
    ('Stacking', stacking)
]

# Name placeholders only; the networks themselves are built in build_deep_models.
dl_models = [
    ('CNN', ''),
    ('LSTM', ''),
    ('GRU', '')
]

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Conv1D, Dense, Flatten, Dropout, MaxPooling1D, Input, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Outer evaluation folds for the deep models. Seeded so that repeated runs
# (and every architecture within a run) are scored on the same splits.
kf = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)

def compute_scores(y_true, y_pred_prob, threshold=0.5):
    """Compute the classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred_prob : array-like
        Predicted scores for the positive class.
    threshold : float, default=0.5
        Scores strictly greater than this value are predicted as class 1.

    Returns
    -------
    dict
        Keys 'accuracy', 'recall', 'precision', 'f1', 'roc_auc' and 'mcc'.
        'roc_auc' is NaN when ``y_true`` contains fewer than two distinct
        labels, because the AUC is undefined in that case.
    """
    # Accept plain lists as well as arrays.
    y_true = np.asarray(y_true)
    y_pred_prob = np.asarray(y_pred_prob)
    y_pred = (y_pred_prob > threshold).astype(int)

    # roc_auc_score raises ValueError on single-class input; report NaN
    # instead so one degenerate fold does not abort the whole evaluation.
    if np.unique(y_true).size < 2:
        auc = float('nan')
    else:
        auc = roc_auc_score(y_true, y_pred_prob)

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'roc_auc': auc,
        'mcc': matthews_corrcoef(y_true, y_pred)
    }

def _make_deep_model(model_name, n_features):
    """Return a fresh, untrained network for 'LSTM', 'CNN' or 'GRU'."""
    if model_name == "LSTM":
        body = [
            LSTM(16, return_sequences=True, kernel_regularizer=l2(0.005)),
            BatchNormalization(),
            Dropout(0.4),
            LSTM(8, kernel_regularizer=l2(0.005)),
            BatchNormalization(),
            Dropout(0.4),
            Dense(1, activation='sigmoid', kernel_regularizer=l2(0.005))
        ]
    elif model_name == "CNN":
        body = [
            Conv1D(filters=16, kernel_size=3, activation='relu', kernel_regularizer=l2(0.005)),
            BatchNormalization(),
            MaxPooling1D(pool_size=2),
            Flatten(),
            Dense(8, activation='relu', kernel_regularizer=l2(0.005)),
            Dropout(0.4),
            Dense(1, activation='sigmoid')
        ]
    elif model_name == "GRU":
        body = [
            GRU(16, return_sequences=True, kernel_regularizer=l2(0.005)),
            BatchNormalization(),
            Dropout(0.4),
            GRU(8, kernel_regularizer=l2(0.005)),
            BatchNormalization(),
            Dropout(0.4),
            Dense(1, activation='sigmoid', kernel_regularizer=l2(0.005))
        ]
    else:
        raise ValueError("Unknown deep model: " + str(model_name))

    # Each feature is fed as one time step with a single channel.
    return Sequential([Input(shape=(n_features, 1))] + body)


def build_deep_models(X, y, dataset_name=None):
    """Cross-validate the LSTM, CNN and GRU networks and record their scores.

    For every architecture and every fold of the module-level ``kf`` splitter
    a new network is built, trained on the training fold and scored on the
    held-out fold with ``compute_scores``. Per-fold rows are appended to the
    global ``DetDF`` and the per-architecture averages to the global ``ResDF``.

    To keep the test fold unseen until scoring:
      * the MinMaxScaler is fitted on the training fold only;
      * a new model is created per fold, so no weights carry over from folds
        in which the current test samples were part of the training data;
      * early stopping and learning-rate scheduling monitor a stratified
        validation subset carved out of the training fold, not the test fold.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Binary labels.
    dataset_name : str, optional
        Value written to the 'Dataset' column. When omitted, the global
        ``fname`` is used if it exists (this is how the evaluation loop
        calls the function).

    Returns
    -------
    dict
        Maps each architecture name to the model trained on the last fold.
    """
    global DetDF, ResDF

    if dataset_name is None:
        # Backward compatibility with callers that rely on the loop variable.
        dataset_name = globals().get('fname')

    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    n_features = X.shape[1]

    trained_models = {}
    for model_name in ("LSTM", "CNN", "GRU"):
        fold_results = []
        for train_index, test_index in kf.split(X, y):
            # Scale with statistics from the training fold only.
            fold_scaler = MinMaxScaler()
            X_train = np.expand_dims(fold_scaler.fit_transform(X[train_index]), axis=-1)
            X_test = np.expand_dims(fold_scaler.transform(X[test_index]), axis=-1)
            y_train, y_test = y[train_index], y[test_index]

            # Hold out 20% of the training fold (stratified) for the callbacks.
            val_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
            fit_index, val_index = next(val_splitter.split(X_train, y_train))

            # Release graph state of the previous fold's model before building a new one.
            tf.keras.backend.clear_session()
            model = _make_deep_model(model_name, n_features)
            model.compile(optimizer=Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])

            early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
            lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=0)

            start_time = time.time()
            model.fit(X_train[fit_index], y_train[fit_index], epochs=20, batch_size=32, verbose=0,
                      validation_data=(X_train[val_index], y_train[val_index]),
                      callbacks=[early_stopping, lr_scheduler])

            y_pred_prob = model.predict(X_test, verbose=0).flatten()

            # Time covers training plus prediction on the test fold.
            elapsed_time = time.time() - start_time

            scores = compute_scores(y_test, y_pred_prob)
            fold_results.append({
                'Classifier': model_name,
                'Accuracy': scores['accuracy'],
                'Recall': scores['recall'],
                'Precision': scores['precision'],
                'F1 score': scores['f1'],
                'AUC': scores['roc_auc'],
                'MCC': scores['mcc'],
                'Time': elapsed_time,
                'Dataset': dataset_name
            })
            trained_models[model_name] = model

        # One concat per architecture instead of one per fold.
        DetDF = pd.concat([DetDF, pd.DataFrame(fold_results)], ignore_index=True)

        # mean(numeric_only=True) drops the string columns, so restore both.
        avg_results = pd.DataFrame(fold_results).mean(numeric_only=True).to_dict()
        avg_results['Classifier'] = model_name
        avg_results['Dataset'] = dataset_name
        ResDF = pd.concat([ResDF, pd.DataFrame([avg_results])], ignore_index=True)

    return trained_models

In [None]:
# Evaluate every deep, classical and ensemble model on each dataset and
# append the scores to ResDF (per-model averages) and DetDF (per-fold rows).
for fname in fnames:
    print("Dataset = " + fname)

    data = pd.read_csv(fname)

    X = data.drop(columns=["class"]).values
    y = data["class"].values

    # Deep models write their own rows to DetDF / ResDF.
    build_deep_models(X,y)

    # Draw the outer folds once per dataset and hand the same list to every
    # cross_validate call. Letting each call invoke outer_cv.split() itself
    # can give every classifier different folds, which breaks fold-wise
    # (paired) comparisons between classifiers.
    outer_splits = list(outer_cv.split(X, y))

    results = {}
    for name, model in list(ml_models) + list(ensemble_models):
        print("Processing Model = " + name)
        results[name] = cross_validate(model, X, y, cv=outer_splits, scoring=scoring,
                                       n_jobs=n_jobs, return_estimator=False)

    # Output column -> key in the cross_validate result dict.
    metric_keys = {
        'Accuracy': 'test_accuracy',
        'Recall': 'test_recall',
        'Precision': 'test_precision',
        'F1 score': 'test_f1',
        'AUC': 'test_roc_auc',
        'MCC': 'test_mcc'
    }

    summary_rows = []
    detail_frames = []
    for name, cv_result in results.items():
        per_fold = {column: cv_result[key] for column, key in metric_keys.items()}
        per_fold['Time'] = cv_result['score_time'] + cv_result['fit_time']  # Total time = fit + score time

        # Summary row: mean over folds, rounded to two decimals.
        summary_row = {'Classifier': name}
        summary_row.update({column: np.round(values.mean(), 2) for column, values in per_fold.items()})
        summary_row['Dataset'] = fname
        summary_rows.append(summary_row)

        # Detail rows: one per fold, rounded to two decimals.
        detail_frames.append(
            pd.DataFrame({'Classifier': name, **per_fold, 'Dataset': fname}).round(2)
        )

    # Append once per dataset instead of once per row.
    ResDF = pd.concat([ResDF, pd.DataFrame(summary_rows)], ignore_index=True)
    DetDF = pd.concat([DetDF] + detail_frames, ignore_index=True)


In [None]:
from datetime import date

# Write both result tables to date-stamped Excel files in the working directory.
# The date is taken once so both files always carry the same prefix.
today_stamp = str(date.today())

# `header` is documented as a bool or a list of column aliases; True writes the
# DataFrame's own column names.
ResDF.to_excel(today_stamp + '_SummaryResults.xlsx', header=True)
DetDF.to_excel(today_stamp + '_DetailedResults.xlsx', header=True)