# XAI-ACSM


## Baseline Models

In [None]:
import random
import numpy as np
import numpy as npl
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import matthews_corrcoef # average == 'macro'.
from sklearn.metrics import roc_auc_score # multiclas 'ovo' average == 'macro'.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.mixture import GaussianMixture
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_predict
import os
from sklearn.metrics import auc, precision_recall_curve, roc_auc_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Step 1: Directories holding the per-feature CSV files.
# Each training file in ``train_dir_path`` is paired further down with a file
# named "<name>_test.csv" inside ``test_dir_path``.
train_dir_path = 'Train'  # Replace with your actual folder path
test_dir_path = 'Test'    # Test directory path

# Custom cross-validation function
def custom_cv(y, nr_fold):
    """Return an iterator of stratified (train_indices, test_indices) splits.

    Parameters
    ----------
    y : sequence of class labels; its length fixes the number of samples.
    nr_fold : number of folds to generate.

    The splitter shuffles with ``random_state=0``, so repeated calls with the
    same ``y`` produce the same folds.
    """
    splitter = StratifiedKFold(n_splits=nr_fold, shuffle=True, random_state=0)
    # Only the labels drive the stratification, so a zero vector of matching
    # length stands in for the feature matrix.
    placeholder_X = np.zeros(len(y))
    return splitter.split(placeholder_X, y)

# Define cross-validation function
def cv(clf, X, y, nr_fold):
    """Score ``clf`` with an interleaved k-fold split and return mean metrics.

    Sample ``i`` belongs to the test part of fold ``i % nr_fold``; everything
    else is that fold's training part. Inside every fold a fresh
    ``StandardScaler`` is fitted on the training part and applied to both
    parts before the classifier is fitted.

    Parameters
    ----------
    clf : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        It is refitted in place on every fold, so on return it holds the fit
        of the last fold.
    X : array-like feature matrix, one row per sample.
    y : array-like labels; only the values 0 and 1 are counted.
    nr_fold : number of folds.

    Returns
    -------
    tuple
        Fold means of (accuracy, sensitivity, specificity, MCC, ROC AUC,
        F1, area under the precision-recall curve).
    """
    # Work on plain arrays so that boolean masks and positional comparisons
    # behave the same whether the caller passes lists, arrays or pandas data.
    X = np.asarray(X)
    y = np.asarray(y)
    ix = np.arange(len(y))

    # Every confusion-matrix cell starts slightly above zero so that none of
    # the ratios below can divide by zero (this also keeps the MCC denominator
    # strictly positive).
    eps = 0.00001

    all_acc, all_sens, all_spec, all_mcc = [], [], [], []
    all_roc_auc, all_f1, all_auc_pr = [], [], []

    for j in range(nr_fold):
        test_ix = (ix % nr_fold) == j
        train_ix = ~test_ix
        train_y, test_y = y[train_ix], y[test_ix]

        # Fit the scaler on the training part only, then reuse it on the test part.
        scaler = StandardScaler()
        train_X = scaler.fit_transform(X[train_ix])
        test_X = scaler.transform(X[test_ix])

        clf.fit(train_X, train_y)
        p = np.ravel(clf.predict(test_X))
        pr = clf.predict_proba(test_X)[:, 1]  # column 1 is used as the score

        truth = np.ravel(test_y)
        TP = eps + np.count_nonzero((truth == 1) & (p == 1))
        FN = eps + np.count_nonzero((truth == 1) & (p == 0))
        FP = eps + np.count_nonzero((truth == 0) & (p == 1))
        TN = eps + np.count_nonzero((truth == 0) & (p == 0))

        ACC = (TP + TN) / (TP + FP + TN + FN)
        SENS = TP / (TP + FN)
        SPEC = TN / (TN + FP)
        MCC = ((TP * TN) - (FP * FN)) / np.sqrt(
            (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)
        )
        ROC_AUC = roc_auc_score(test_y, pr)
        F1 = TP / (TP + (0.5 * (FP + FN)))

        precision, recall, _ = precision_recall_curve(test_y, pr)
        AUC_PR = auc(recall, precision)

        all_acc.append(ACC)
        all_sens.append(SENS)
        all_spec.append(SPEC)
        all_mcc.append(MCC)
        all_roc_auc.append(ROC_AUC)
        all_f1.append(F1)
        all_auc_pr.append(AUC_PR)

    return (
        np.mean(all_acc),
        np.mean(all_sens),
        np.mean(all_spec),
        np.mean(all_mcc),
        np.mean(all_roc_auc),
        np.mean(all_f1),
        np.mean(all_auc_pr),
    )

# Define test function
def test(clf, X, y, Xt, yt):
    """Fit ``clf`` on (X, y) and score it on the held-out set (Xt, yt).

    No scaling is applied here; ``X`` and ``Xt`` are used exactly as given.

    Parameters
    ----------
    clf : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        It is refitted in place on ``X``/``y``.
    X, y : training features and labels.
    Xt, yt : evaluation features and labels; only label values 0 and 1 are
        counted.

    Returns
    -------
    tuple
        (accuracy, sensitivity, specificity, MCC, ROC AUC, F1,
        area under the precision-recall curve, TP, TN, FP, FN).
        The four counts include the small smoothing offset described below.
    """
    clf.fit(X, y)
    p = np.ravel(clf.predict(Xt))
    pr = clf.predict_proba(Xt)[:, 1]  # column 1 is used as the score

    # Compare positionally on a plain array so pandas inputs with a
    # non-default index are handled the same way as numpy arrays.
    truth = np.ravel(np.asarray(yt))

    # Every confusion-matrix cell starts slightly above zero so that none of
    # the ratios below can divide by zero (this also keeps the MCC denominator
    # strictly positive).
    eps = 0.00001
    TP = eps + np.count_nonzero((truth == 1) & (p == 1))
    FN = eps + np.count_nonzero((truth == 1) & (p == 0))
    FP = eps + np.count_nonzero((truth == 0) & (p == 1))
    TN = eps + np.count_nonzero((truth == 0) & (p == 0))

    ACC = (TP + TN) / (TP + FP + TN + FN)
    SENS = TP / (TP + FN)
    SPEC = TN / (TN + FP)
    MCC = ((TP * TN) - (FP * FN)) / np.sqrt(
        (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)
    )
    ROC_AUC = roc_auc_score(yt, pr)
    F1 = TP / (TP + (0.5 * (FP + FN)))

    precision, recall, _ = precision_recall_curve(yt, pr)
    AUC_PR = auc(recall, precision)

    return ACC, SENS, SPEC, MCC, ROC_AUC, F1, AUC_PR, TP, TN, FP, FN

# Baseline experiment driver.
# For every training CSV (first column skipped, last column = label) and its
# matching "<name>_test.csv", each classifier is tuned with a grid search,
# scored by cross-validation and on the test file, and one result row is
# appended to the CSV below. The class-0 probability of every tuned model is
# also collected as a stacked feature column.
output_file_path = "Baseline.csv"

feat_train = None
feat_test = None
allclf = []

# The results file is opened in append mode so earlier runs are kept. Write the
# header only when the file is new or empty; otherwise every run would insert
# another header row in the middle of the data.
write_header = (
    not os.path.exists(output_file_path)
    or os.path.getsize(output_file_path) == 0
)

# ``PLS`` is neither defined nor imported in this script, so referencing it
# unconditionally raises NameError. It is only used when an estimator with
# that name has been made available beforehand (e.g. by an earlier cell).
pls_factory = globals().get("PLS")
if pls_factory is None:
    print("PLS estimator is not defined; the PLS baseline is skipped")

with open(output_file_path, "a") as file:
    if write_header:
        file.write("Prob Feature, acc, sens, spec, mcc, roc, f1, auc_pr, best param, acc, sens, spec, mcc, roc, f1, auc_pr, tp, tn, fp, fn\n")

    # Sorted so the order of result rows and stacked feature columns does not
    # depend on the (unspecified) order returned by os.listdir.
    for filename in sorted(os.listdir(train_dir_path)):
        if not filename.endswith('.csv'):
            continue

        # Use the filename (without the extension) as the feature name
        feat = os.path.splitext(filename)[0]

        # The test file is looked up by name; without it nothing can be evaluated.
        test_file_path = os.path.join(test_dir_path, f"{feat}_test.csv")
        if not os.path.exists(test_file_path):
            print(f"No test file {test_file_path}; skipping {feat}")
            continue

        train_df = pd.read_csv(os.path.join(train_dir_path, filename))
        X_train = train_df.iloc[:, 1:-1].values
        y_train = train_df.iloc[:, -1].values

        test_df = pd.read_csv(test_file_path)
        X_test = test_df.iloc[:, 1:-1].values
        y_test = test_df.iloc[:, -1].values

        # Fit the scaler on the training data and only transform the test data.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # (name, estimator, parameter grid, name of the tuned parameter).
        # Built inside the loop so every file starts from unfitted estimators.
        classifiers = [
            ("SVMRBF", SVC(random_state=0, probability=True), {'C': np.geomspace(2**-8, 2**8, num=17)}, 'C'),
            ("RF", RandomForestClassifier(random_state=0), {'n_estimators': [20, 50, 100, 200]}, 'n_estimators'),
            ("ET", ExtraTreesClassifier(random_state=0), {'n_estimators': [20, 50, 100, 200]}, 'n_estimators'),
            ("XGB", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=0), {'n_estimators': [20, 50, 100, 200]}, 'n_estimators'),
            ("LGBM", LGBMClassifier(random_state=0), {'n_estimators': [20, 50, 100, 200]}, 'n_estimators'),
            ("ADA", AdaBoostClassifier(random_state=0), {'n_estimators': [20, 50, 100, 200]}, 'n_estimators'),
            ("MLP", MLPClassifier(random_state=0, max_iter=10000), {'hidden_layer_sizes': [(20,), (50,), (100,), (200,)]}, 'hidden_layer_sizes'),
            ("KNN", KNeighborsClassifier(), {'n_neighbors': [3, 5, 7, 9, 11]}, 'n_neighbors'),
            # NOTE(review): with two classes LDA presumably accepts only
            # n_components=1, so the larger grid values are expected to fail
            # inside the search - confirm this grid is intended.
            ("LDA", LinearDiscriminantAnalysis(), {'n_components': [1, 2, 3, 4]}, 'n_components'),
            ("NB", GaussianNB(), {'var_smoothing': np.logspace(0, -9, num=100)}, 'var_smoothing'),
            ("DT", DecisionTreeClassifier(random_state=0), {'min_samples_leaf': [1, 2, 4, 8]}, 'min_samples_leaf'),
            # Keyword names corrected (were misspelled, which raised TypeError).
            ("LR", LogisticRegression(random_state=0, max_iter=10000), {'C': np.logspace(-3, 3, num=100)}, 'C'),
        ]
        if pls_factory is not None:
            classifiers.append(("PLS", pls_factory(), {'iter': np.arange(10, 1000, 10)}, 'iter'))

        for clf_name, clf, param_grid, param_name in classifiers:
            bestclf = GridSearchCV(estimator=clf, param_grid=param_grid, cv=custom_cv(y_train, 10))
            bestclf.fit(X_train, y_train)
            bestparam = np.round(bestclf.best_params_[param_name], 3)
            best_estimator = bestclf.best_estimator_
            allclf.append(best_estimator)

            # Column 0 of predict_proba is kept as the stacked feature:
            # out-of-fold for the training rows, direct prediction for the
            # test rows. The test-row prediction has to happen before cv()
            # below, which refits this same estimator object on fold subsets.
            featx = cross_val_predict(best_estimator, X_train, y_train, cv=custom_cv(y_train, 10), method="predict_proba")[:, 0]
            featxt = best_estimator.predict_proba(X_test)[:, 0]

            if feat_train is None:
                # First model: start the stacked feature matrices.
                feat_train = featx.reshape(-1, 1)
                feat_test = featxt.reshape(-1, 1)
            else:
                # Later models: append one more column to each matrix.
                feat_train = np.hstack((feat_train, featx.reshape(-1, 1)))
                feat_test = np.hstack((feat_test, featxt.reshape(-1, 1)))

            print("Feature engineering process")
            acc, sens, spec, mcc, roc_auc, f1, auc_pr = cv(best_estimator, X_train, y_train, 10)
            file.write(f"{clf_name}-{feat},{acc},{sens},{spec},{mcc},{roc_auc},{f1},{auc_pr},{bestparam}")
            print(f"{clf_name}-{feat},{acc},{sens},{spec},{mcc},{roc_auc},{f1},{auc_pr},{bestparam}\n")

            # Second half of the same CSV row: metrics on the held-out test file.
            acc, sens, spec, mcc, roc_auc, f1, auc_pr, TP, TN, FP, FN = test(best_estimator, X_train, y_train, X_test, y_test)
            file.write(f",{acc},{sens},{spec},{mcc},{roc_auc},{f1},{auc_pr},{TP},{TN},{FP},{FN}\n")
            print(f",{acc},{sens},{spec},{mcc},{roc_auc},{f1},{auc_pr},{TP},{TN},{FP},{FN}\n")

            print(f"Finished {feat}\n")

# Persist the stacked probability features (one column per tuned model).
feat_train_df = pd.DataFrame(feat_train)
feat_test_df = pd.DataFrame(feat_test)
feat_train_df.to_csv('feat_train_all.csv', index=False)
feat_test_df.to_csv('feat_test_all.csv', index=False)

## Average Ensemble Learning

In [None]:
# Multi-dimensional version
import pandas as pd
import os
import numpy as np

# Average-ensemble step: for every CSV in the folder, average the first
# ``dim`` feature columns of each row, turn the average into a 0/1 label and
# save it next to the file's last column (the target class).

# Folder path where your CSV files are stored
folder_path = 'Two-step Selection' # keep stack_train_switch2.csv and stack_test_switch2.csv

# Prefix of the files written by this cell. They land in the same folder that
# is scanned for inputs, so they must be excluded from the input list or a
# second run would average its own outputs.
output_prefix = "M2V-Handcraft_averaged_"

# List of all input CSV files in the folder (assuming all files have the .csv extension)
csv_files = [
    f for f in os.listdir(folder_path)
    if f.endswith('.csv') and not f.startswith(output_prefix)
]

# List of dimensions to switch between deep learning and handcrafted feature
dimensions = [8]

# Placeholders kept from the original cell; nothing below fills them.
row_averages = []
target_classes = []  # To store the target class for each file

# Loop through each CSV file and process it
for csv_file in csv_files:
    file_path = os.path.join(folder_path, csv_file)
    print(f"Reading file: {csv_file}")  # Show which file is being read

    # Read the CSV file into a DataFrame (assuming it has a header row)
    df = pd.read_csv(file_path, header=0)  # If no header, you can change it to header=None

    # The last column is the target class; everything before it is a feature.
    target_class = df.iloc[:, -1]
    feature_df = df.iloc[:, :-1]

    # Create one output file per requested dimension
    for dim in dimensions:
        if dim > feature_df.shape[1]:
            # Slicing the feature frame (not the full frame) guarantees the
            # target column never leaks into the average for narrow files.
            print(f"File {csv_file} has only {feature_df.shape[1]} feature columns; averaging all of them for dimension {dim}.")

        # Select feature columns from the start up to the given dimension
        df_dim = feature_df.iloc[:, :dim]

        # Average of each row across the selected feature columns
        row_avg = df_dim.mean(axis=1)

        # Binarize: 0 when the average is >= 0.5, otherwise 1.
        # NOTE(review): this matches columns holding class-0 probabilities -
        # confirm against how the input files were produced.
        row_avg_bin = (~(row_avg >= 0.5)).astype(int)

        # Combine the binarized averages with the target class
        final_output = pd.concat([row_avg_bin, target_class], axis=1)
        final_output.columns = [f'Avg_{dim}', 'Target_Class']

        # Define the output file name based on the dimension
        output_file_path = os.path.join(folder_path, f"{output_prefix}{dim}_{csv_file}")

        # Save the result to a new CSV file (row index included)
        final_output.to_csv(output_file_path, index=True)

        print(f"Output saved to {output_file_path}")
