# XAI-ACSM


## Probabilistic Feature and Baseline Models

In [None]:
# Source of the helper module PLS.py, generated next to the notebook so that it
# can be imported (and pickled by scikit-learn) like a regular module.
#
# Differences from a naive wrapper that matter for model selection:
#   * __init__ only stores the hyper-parameter.  The regressor is created in
#     fit(), so values assigned through set_params() (which is how
#     GridSearchCV applies a parameter grid) are actually used.
#   * _scores() accepts both the (n, 1) and the (n,) prediction layouts, so the
#     wrapper does not depend on which one the installed scikit-learn returns.
PLS_MODULE_SOURCE = '''\
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.base import BaseEstimator, ClassifierMixin
class PLS(BaseEstimator, ClassifierMixin):
    """Binary classifier that thresholds a 2-component PLS regression score."""
    def __init__(self, iter=500):
        # Only store the hyper-parameter here so set_params()/clone() take effect.
        self.iter = iter
    def fit(self, X, y):
        self.classes_ = np.unique(y)
        # Build the regressor at fit time so it always uses the current self.iter.
        self.clf = PLSRegression(n_components=2, max_iter=self.iter)
        self.clf.fit(X,y)
        return self
    def _scores(self, X):
        # First target column of the regression output, as a 1-D array.
        raw = np.asarray(self.clf.predict(X))
        return raw.reshape(raw.shape[0], -1)[:, 0]
    def predict(self, X):
        return np.round(np.clip(self._scores(X), 0.000001, 0.999999))
    def predict_proba(self, X):
        positive = np.clip(self._scores(X), 0.000001, 0.99999)
        return np.column_stack((1 - positive, positive))
'''

# The context manager guarantees the handle is flushed and closed even if the
# write fails part-way.
with open('PLS.py', 'w') as pls_module:
    pls_module.write(PLS_MODULE_SOURCE)

In [None]:
import random
import numpy as np
import numpy as npl
from PLS import PLS
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import matthews_corrcoef # average == 'macro'.
from sklearn.metrics import roc_auc_score # multiclas 'ovo' average == 'macro'.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.mixture import GaussianMixture
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_predict
import os
from sklearn.metrics import auc, precision_recall_curve, roc_auc_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Step 1: Directories containing the training and test CSV files
train_dir_path = 'Train'  # Replace with your actual folder path
test_dir_path = 'Test'    # Test directory path

# Stratified, shuffled fold generator (fixed seed) used for model selection
def custom_cv(y, nr_fold):
    """Return an iterator of (train_indices, test_indices) pairs for ``y``.

    The folds come from a ``StratifiedKFold`` with ``nr_fold`` splits,
    shuffling enabled and ``random_state=0``.  Only the labels drive the
    split, so a zero vector of matching length stands in for the features.
    """
    splitter = StratifiedKFold(shuffle=True, random_state=0, n_splits=nr_fold)
    placeholder_features = np.zeros(len(y))
    return splitter.split(placeholder_features, y)

# Shared metric helpers for cv() and test()
def _confusion_counts(y_true, y_pred):
    """Count TP, FP, TN and FN for 0/1 labels, each starting from 1e-5.

    The small initial value keeps every ratio derived from the counts free of
    divisions by zero.  Pairs whose label or prediction is neither 0 nor 1 are
    not counted.
    """
    tp = fp = tn = fn = 0.00001
    for truth, guess in zip(y_true, y_pred):
        if truth == 1:
            if guess == 1:
                tp += 1
            elif guess == 0:
                fn += 1
        elif truth == 0:
            if guess == 1:
                fp += 1
            elif guess == 0:
                tn += 1
    return tp, fp, tn, fn


def _binary_metrics(y_true, y_pred, y_score):
    """Compute the metric tuple shared by cv() and test().

    Returns ``(ACC, SENS, SPEC, MCC, ROC_AUC, F1, AUC_PR, TP, TN, FP, FN)``.
    ``y_pred`` holds hard 0/1 predictions, ``y_score`` the class-1 scores used
    for the ROC and precision-recall curves.
    """
    tp, fp, tn, fn = _confusion_counts(y_true, y_pred)

    acc = (tp + tn) / (tp + fp + tn + fn)
    sens = tp / (tp + fn)
    spec = tn / (tn + fp)
    # Every factor is at least 2e-5 because of the smoothing, so the
    # denominator can never be zero and needs no guard.
    mcc = ((tp * tn) - (fp * fn)) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    roc_auc = roc_auc_score(y_true, y_score)
    f1 = tp / (tp + (0.5 * (fp + fn)))

    precision, recall, _ = precision_recall_curve(y_true, y_score)
    auc_pr = auc(recall, precision)

    return acc, sens, spec, mcc, roc_auc, f1, auc_pr, tp, tn, fp, fn


# Define cross-validation function
def cv(clf, X, y, nr_fold):
    """Cross-validate ``clf`` and return the fold-averaged metrics.

    Sample ``i`` belongs to fold ``i % nr_fold`` (interleaved, not shuffled).
    In every fold a fresh ``StandardScaler`` is fitted on the training part
    and applied to the held-out part before ``clf`` is fitted.  ``clf`` is
    fitted in place, so it is left trained on the last fold.

    Parameters
    ----------
    clf : estimator providing ``fit``, ``predict`` and ``predict_proba``.
    X, y : arrays that support boolean-mask indexing; labels are 0/1.
    nr_fold : number of folds.

    Returns
    -------
    tuple
        Mean ``(ACC, SENS, SPEC, MCC, ROC_AUC, F1, AUC_PR)`` over the folds.
    """
    ix = np.arange(len(y))  # Generate index array
    # One list per reported metric, in return order.
    per_metric = [[] for _ in range(7)]

    for j in range(nr_fold):
        held_out = (ix % nr_fold) == j
        train_X, test_X = X[~held_out], X[held_out]
        train_y, test_y = y[~held_out], y[held_out]

        # Fit the scaler on the training part only to avoid leaking
        # held-out statistics into the model.
        scaler = StandardScaler()
        train_X = scaler.fit_transform(train_X)
        test_X = scaler.transform(test_X)

        clf.fit(train_X, train_y)
        p = clf.predict(test_X)
        pr = clf.predict_proba(test_X)[:, 1]

        # The raw confusion counts are not part of cv()'s result.
        fold_metrics = _binary_metrics(test_y, p, pr)[:7]
        for bucket, value in zip(per_metric, fold_metrics):
            bucket.append(value)

    return tuple(np.mean(bucket) for bucket in per_metric)


# Define test function
def test(clf, X, y, Xt, yt):
    """Fit ``clf`` on ``(X, y)`` and score it on ``(Xt, yt)``.

    No scaling is applied here; the caller is expected to pass features that
    are already on a common scale.

    Returns
    -------
    tuple
        ``(ACC, SENS, SPEC, MCC, ROC_AUC, F1, AUC_PR, TP, TN, FP, FN)`` where
        the four counts carry the 1e-5 smoothing offset.
    """
    clf.fit(X, y)
    p = clf.predict(Xt)
    pr = clf.predict_proba(Xt)[:, 1]
    return _binary_metrics(yt, p, pr)

# Writing to CSV
output_file_path = "Baseline.csv"

feat_train = None
feat_test = None
allclf = []

# One entry per (classifier, descriptor) pair.  The columns are stacked once
# after the loop instead of re-allocating the matrix with hstack every time.
feature_names = []
train_feature_columns = []
test_feature_columns = []

# Labels of the first descriptor set processed; written as the last column of
# the stacked tables because the feature-selection step reads the last column
# as the label.
# NOTE(review): assumes every descriptor file lists the same samples in the
# same order -- stacking the columns side by side already relies on this.
stack_train_labels = None
stack_test_labels = None

# The report is opened in append mode, so the header is only written when the
# file is new or empty; re-running no longer injects header rows mid-file.
header_needed = not os.path.exists(output_file_path) or os.path.getsize(output_file_path) == 0

with open(output_file_path, "a") as report:
    if header_needed:
        report.write("Prob Feature, acc, sens, spec, mcc, roc, f1, auc_pr, best param, acc, sens, spec, mcc, roc, f1, auc_pr, tp, tn, fp, fn\n")

    # Loop through all CSV files in the training directory (sorted so that the
    # column order of the stacked features is reproducible)
    for filename in sorted(os.listdir(train_dir_path)):
        if not filename.endswith('.csv'):
            continue

        # Use the filename (without the extension) as the feature name
        feat = os.path.splitext(filename)[0]

        # Locate the corresponding test file based on the filename
        test_file_path = os.path.join(test_dir_path, f"{feat}_test.csv")
        if not os.path.exists(test_file_path):
            print(f"Skipping {feat}: {test_file_path} not found")
            continue

        # First column is skipped, last column is the label
        train_df = pd.read_csv(os.path.join(train_dir_path, filename))
        X_train = train_df.iloc[:, 1:-1].values
        y_train = train_df.iloc[:, -1].values

        test_df = pd.read_csv(test_file_path)
        X_test = test_df.iloc[:, 1:-1].values
        y_test = test_df.iloc[:, -1].values

        # Fit the scaler on the training data only, then apply it to both sets
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        if stack_train_labels is None:
            stack_train_labels, stack_test_labels = y_train, y_test
        elif not (np.array_equal(stack_train_labels, y_train) and np.array_equal(stack_test_labels, y_test)):
            print(f"Warning: labels in {feat} differ from the first descriptor set; stacked rows may be misaligned")

        # (name, estimator, parameter grid, name of the tuned parameter)
        classifiers = [
            ("SVMRBF", SVC(random_state=0, probability=True), {'C': np.geomspace(2**-8, 2**8, num=17)}, 'C'),
            ("RF", RandomForestClassifier(random_state=0), {'n_estimators': [20, 50, 100, 200]}, 'n_estimators'),
            ("ET", ExtraTreesClassifier(random_state=0), {'n_estimators': [20, 50, 100, 200]}, 'n_estimators'),
            ("XGB", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=0), {'n_estimators': [20, 50, 100, 200]}, 'n_estimators'),
            ("LGBM", LGBMClassifier(random_state=0), {'n_estimators': [20, 50, 100, 200]}, 'n_estimators'),
            ("ADA", AdaBoostClassifier(random_state=0), {'n_estimators': [20, 50, 100, 200]}, 'n_estimators'),
            ("MLP", MLPClassifier(random_state=0, max_iter=10000), {'hidden_layer_sizes': [(20,), (50,), (100,), (200,)]}, 'hidden_layer_sizes'),
            ("KNN", KNeighborsClassifier(), {'n_neighbors': [3, 5, 7, 9, 11]}, 'n_neighbors'),
            ("LDA", LinearDiscriminantAnalysis(), {'n_components': [1, 2, 3, 4]}, 'n_components'),
            ("NB", GaussianNB(), {'var_smoothing': np.logspace(0, -9, num=100)}, 'var_smoothing'),
            ("DT", DecisionTreeClassifier(random_state=0), {'min_samples_leaf': [1, 2, 4, 8]}, 'min_samples_leaf'),
            ("LR", LogisticRegression(random_state=0, max_iter=10000), {'C': np.logspace(-3, 3, num=100)}, 'C'),
            ("PLS", PLS(), {'iter': np.arange(10, 1000, 10)}, 'iter')
        ]

        for clf_name, clf, param_grid, param_name in classifiers:
            bestclf = GridSearchCV(estimator=clf, param_grid=param_grid, cv=custom_cv(y_train, 10))
            bestclf.fit(X_train, y_train)
            bestparam = np.round(bestclf.best_params_[param_name], 3)
            allclf.append(bestclf.best_estimator_)

            # Probabilistic feature = first predict_proba column: out-of-fold
            # for the training set, from the refitted best model for the test set.
            featx = cross_val_predict(allclf[-1], X_train, y_train, cv=custom_cv(y_train, 10), method="predict_proba")[:, 0]
            featxt = allclf[-1].predict_proba(X_test)[:, 0]

            # Same "<classifier>-<descriptor>" key that is written to Baseline.csv,
            # so the stacked columns can be selected by name afterwards.
            feature_names.append(f"{clf_name}-{feat}")
            train_feature_columns.append(featx)
            test_feature_columns.append(featxt)

            print("stack feature process")
            acc, sens, spec, mcc, roc_auc, f1, auc_pr = cv(bestclf.best_estimator_, X_train, y_train, 10)
            report.write(f"{clf_name}-{feat},{acc},{sens},{spec},{mcc},{roc_auc},{f1},{auc_pr},{bestparam}")
            print(f"{clf_name}-{feat},{acc},{sens},{spec},{mcc},{roc_auc},{f1},{auc_pr},{bestparam}\n")

            acc, sens, spec, mcc, roc_auc, f1, auc_pr, TP, TN, FP, FN = test(bestclf.best_estimator_, X_train, y_train, X_test, y_test)
            report.write(f",{acc},{sens},{spec},{mcc},{roc_auc},{f1},{auc_pr},{TP},{TN},{FP},{FN}\n")
            print(f",{acc},{sens},{spec},{mcc},{roc_auc},{f1},{auc_pr},{TP},{TN},{FP},{FN}\n")

            print(f"Finished {feat}\n")

if feature_names:
    feat_train = np.column_stack(train_feature_columns)
    feat_test = np.column_stack(test_feature_columns)
    feat_train_df = pd.DataFrame(feat_train, columns=feature_names)
    feat_test_df = pd.DataFrame(feat_test, columns=feature_names)
    feat_train_df['label'] = stack_train_labels
    feat_test_df['label'] = stack_test_labels
else:
    # Nothing was processed: keep writing (empty) tables as before.
    feat_train_df = pd.DataFrame(feat_train)
    feat_test_df = pd.DataFrame(feat_test)
feat_train_df.to_csv('feat_train_all.csv', index=False)
feat_test_df.to_csv('feat_test_all.csv', index=False)

## Two-step Feature Selection

In [None]:
import pandas as pd

def switch_and_save(input_csv1, input_csv2, columns_to_select, exclude_columns=False, include_last_column=True, output_csv1=None, output_csv2=None):
    """Keep (or drop) a set of columns in two CSV files and save the results.

    Both inputs are processed identically.  Column names are stripped of
    surrounding whitespace and lower-cased, and the requested names are
    lower-cased, so matching is case-insensitive; the outputs carry the
    normalised (lower-case) names.

    Parameters
    ----------
    input_csv1, input_csv2 : paths of the CSV files to read.
    columns_to_select : iterable of column names to keep or drop.
    exclude_columns : if True, drop the listed columns (names that are not
        present are ignored); if False, keep only the listed columns, in the
        given order.
    include_last_column : if True, the last column of each input is added to
        its output under the name ``'label'``.
    output_csv1, output_csv2 : destination paths; ``None`` skips that output.

    Raises
    ------
    KeyError
        If ``exclude_columns`` is False and a requested column is missing
        from one of the inputs.
    """
    # Step 1: Read the CSV files
    df1 = pd.read_csv(input_csv1)
    df2 = pd.read_csv(input_csv2)

    # Debug: Print the columns to check for mismatches
    print(f"Columns in {input_csv1}: {df1.columns}")
    print(f"Columns in {input_csv2}: {df2.columns}")

    wanted = [col.lower() for col in columns_to_select]

    results = []
    for source_path, df in ((input_csv1, df1), (input_csv2, df2)):
        # Normalise the header: strip spaces, ignore case
        df.columns = df.columns.str.strip().str.lower()

        # Step 2: Filter the columns based on selection or exclusion
        if exclude_columns:
            selected = df.drop(columns=wanted, errors='ignore')
        else:
            missing = [col for col in wanted if col not in df.columns]
            if missing:
                raise KeyError(f"{source_path} is missing requested columns: {missing}")
            # Copy so that adding 'label' below writes to an independent
            # frame rather than to a slice of the source frame.
            selected = df[wanted].copy()

        # Step 3: Optionally append the last input column as the label
        if include_last_column:
            selected['label'] = df.iloc[:, -1]

        results.append(selected)

    # Step 4: Save the results to two new CSV files (without index)
    for selected, destination in zip(results, (output_csv1, output_csv2)):
        if destination is not None:
            selected.to_csv(destination, index=False)

# Usage
# Input tables handed to switch_and_save: first the training CSV, then the test CSV
input_csv1, input_csv2 = 'feat_train_all.csv', 'feat_test_all.csv'

# columns_to_select is taken from the list of best baseline results; each entry is named <classifier>-<descriptor>
columns_to_select = ['SVMRBF-R6D300', 'SVMRBF-CKD', 'SVMRBF-R8D300', 'XGB-CKD', 'SVMRBF-R9D300', 'XGB-FP4C', 'SVMRBF-R7D300', 'SVMRBF-CKDExt', 'SVMRBF-R4D300', 'XGB-PubChem', 'SVMRBF-R5D300', 'ET-CKDExt', 'SVMRBF-R2D300', 'XGB-CKDExt', 'SVMRBF-R3D300', 'ET-FP4C', 'SVMRBF-R1D300', 'RF-CKDExt', 'MLP-R6D300', 'LGBM-PubChem', 'MLP-R9D300', 'ET-CKD', 'MLP-R4D300', 'LGBM-CKDExt', 'MLP-R7D300', 'LGBM-CKD', 'MLP-R5D300', 'MLP-CKD', 'MLP-R8D300', 'LGBM-FP4C', 'MLP-R1D300', 'LGBM-KRC', 'SVMRBF-R7D100', 'SVMRBF-RDKit', 'SVMRBF-R9D100', 'RF-FP4C', 'SVMRBF-R8D100', 'MLP-FP4C', 'SVMRBF-R4D100', 'MLP-CKDExt', 'SVMRBF-R5D100', 'XGB-KRC', 'SVMRBF-R6D100', 'SVMRBF-Circle', 'MLP-R2D300', 'ET-PubChem', 'MLP-R3D300', 'LGBM-RDKit', 'SVMRBF-R3D100', 'SVMRBF-FP4C', 'MLP-R8D100', 'MLP-RDKit', 'MLP-R5D100', 'ET-Circle', 'SVMRBF-R2D100', 'RF-CKD', 'MLP-R9D100', 'ET-RDKit', 'MLP-R6D100', 'SVMRBF-PubChem', 'MLP-R4D100', 'SVMRBF-Hybrid', 'SVMRBF-R1D100', 'ET-Hybrid', 'MLP-R7D100', 'RF-PubChem', 'LGBM-R7D300', 'XGB-KR', 'MLP-R2D100', 'LGBM-Hybrid', 'XGB-R7D300', 'RF-RDKit', 'MLP-R3D100', 'LGBM-KR', 'XGB-R8D300', 'RF-Hybrid', 'LGBM-R5D300', 'XGB-Circle', 'MLP-R1D100', 'LGBM-Circle', 'XGB-R9D300', 'XGB-RDKit', 'XGB-R5D300', 'XGB-Hybrid', 'XGB-R4D300', 'RF-Circle', 'XGB-R6D300', 'MLP-PubChem', 'XGB-R1D300', 'MLP-Circle', 'LGBM-R6D300', 'XGB-MACCS', 'LGBM-R8D300', 'LR-RDKit', 'LGBM-R9D300', 'ET-KRC', 'LGBM-R4D300', 'MLP-Hybrid', 'XGB-R7D100', 'MLP-KRC', 'LGBM-R7D100', 'RF-MACCS', 'LR-R6D300', 'SVMRBF-MACCS', 'XGB-R2D300', 'ET-MACCS', 'LR-R7D300', 'RF-KRC', 'LGBM-R2D300', 'MLP-KR', 'LGBM-R1D300', 'ET-KR', 'LGBM-R3D300', 'LGBM-MACCS', 'XGB-R6D100', 'PLS-RDKit', 'XGB-R3D100', 'RF-KR', 'XGB-R9D100', 'SVMRBF-KRC', 'LR-R8D300', 'MLP-MACCS', 'XGB-R3D300', 'ADA-KRC', 'LGBM-R5D100', 'LR-PubChem', 'XGB-R5D100', 'LDA-CKD', 'LR-R9D300', 'LDA-KRC', 'LR-R5D300', 'ADA-FP4C', 'LGBM-R8D100', 'ET-FP4', 'LR-R4D300', 'XGB-CKDGraph', 'XGB-R4D100', 'XGB-FP4', 'XGB-R8D100', 'RF-FP4', 'ET-R7D300', 'LDA-PubChem', 
'ET-R5D300', 'RF-CKDGraph', 'ET-R9D300', 'LDA-RDKit', 'LGBM-R4D100', 'LDA-CKDExt', 'LGBM-R6D100', 'LR-CKD', 'ET-R6D300', 'ADA-KR', 'LGBM-R9D100', 'LR-Circle', 'LR-R2D300', 'LDA-FP4C', 'LR-R1D300', 'SVMRBF-KR', 'XGB-R2D100', 'LR-KRC', 'ET-R8D300', 'LGBM-CKDGraph', 'LGBM-R3D100', 'MLP-FP4', 'ET-R6D100', 'KNN-CKDExt', 'LDA-R8D300', 'LR-KR', 'ET-R5D100', 'ET-CKDGraph', 'LDA-R9D300', 'LDA-Circle', 'XGB-R1D100', 'SVMRBF-FP4', 'LGBM-R2D100', 'LGBM-FP4', 'LDA-R7D300', 'LDA-KR', 'ET-R4D300', 'PLS-Circle', 'LDA-R6D300', 'SVMRBF-CKDGraph', 'ET-R8D100', 'KNN-CKD', 'RF-R6D300', 'LR-CKDExt', 'RF-R8D300', 'LR-Hybrid', 'RF-R7D300', 'KNN-FP4C', 'RF-R5D300', 'MLP-CKDGraph', 'ET-R7D100', 'KNN-MACCS', 'LGBM-R1D100', 'LR-FP4C', 'ET-R9D100', 'ADA-CKD', 'RF-R9D300', 'ADA-PubChem', 'KNN-R7D300', 'LDA-Hybrid', 'KNN-R8D300', 'ADA-RDKit', 'KNN-R6D300', 'ADA-CKDExt', 'LR-R3D300', 'KNN-PubChem', 'LDA-R5D300', 'DT-FP4C', 'LDA-R1D300', 'PLS-KR', 'KNN-R9D300', 'KNN-Hybrid', 'LDA-R2D300', 'PLS-FP4C', 'LDA-R3D300', 'ADA-Circle', 'KNN-R5D300', 'KNN-RDKit', 'ET-R4D100', 'KNN-FP4', 'LDA-R4D300', 'DT-PubChem', 'RF-R6D100', 'KNN-Circle', 'ET-R3D300', 'PLS-KRC', 'RF-R7D100', 'ADA-Hybrid', 'RF-R4D300', 'NB-RDKit', 'KNN-R6D100', 'XGB-AP2D', 'RF-R4D100', 'LDA-MACCS', 'RF-R5D100', 'XGB-Estate', 'SVMRBF-R0D100', 'RF-AP2D', 'ET-R2D100', 'SVMRBF-Estate', 'ET-R1D300', 'LR-MACCS', 'ET-R2D300', 'ADA-MACCS', 'SVMRBF-R0D300', 'LGBM-Estate', 'KNN-R8D100', 'DT-CKDExt', 'RF-R8D100', 'ET-AP2D', 'KNN-R7D100', 'LGBM-AP2D', 'RF-R9D100', 'LR-CKDGraph', 'KNN-R4D300', 'RF-Estate', 'KNN-R9D100', 'ET-Estate', 'KNN-R1D300', 'NB-Circle', 'KNN-R3D300', 'LDA-CKDGraph', 'ET-R3D100', 'DT-CKD', 'KNN-R5D100', 'PLS-CKDExt', 'KNN-R2D300', 'DT-RDKit', 'ET-R1D100', 'KNN-CKDGraph', 'MLP-R0D100', 'PLS-CKD', 'RF-R1D300', 'DT-KR', 'RF-R3D300', 'DT-KRC', 'RF-R2D300', 'SVMRBF-AP2D', 'KNN-R4D100', 'DT-MACCS', 'LGBM-R0D300', 'DT-FP4', 'RF-R3D100', 'MLP-Estate', 'LDA-R9D100', 'KNN-KR', 'KNN-R1D100', 'MLP-AP2D', 'RF-R2D100', 'ADA-CKDGraph', 
'MLP-R0D300', 'KNN-KRC', 'LDA-R8D100', 'LDA-FP4', 'KNN-R2D100', 'DT-CKDGraph', 'XGB-R0D300', 'KNN-Estate', 'LDA-R6D100', 'ADA-FP4', 'LDA-R7D100', 'DT-Hybrid', 'KNN-R3D100', 'LR-FP4', 'RF-R1D100', 'PLS-Hybrid', 'LDA-R5D100', 'DT-Estate', 'ADA-R5D300', 'PLS-FP4', 'LR-R7D100', 'PLS-PubChem', 'ADA-R6D300', 'NB-KR', 'LDA-R4D100', 'NB-KRC', 'ET-R0D300', 'DT-Circle', 'LDA-R3D100', 'NB-Hybrid', 'ADA-R7D300', 'NB-CKDExt', 'ADA-R9D300', 'LDA-Estate', 'LDA-R2D100', 'ADA-Estate', 'XGB-R0D100', 'PLS-MACCS', 'ADA-R8D100', 'PLS-Estate', 'ADA-R8D300', 'LR-Estate', 'LDA-R1D100', 'KNN-AP2D', 'ET-R0D100', 'NB-CKD', 'ADA-R7D100', 'DT-AP2D', 'LGBM-R0D100', 'PLS-CKDGraph', 'ADA-R4D300', 'LDA-AP2D', 'ADA-R9D100', 'NB-PubChem', 'ADA-R6D100', 'LR-AP2D', 'RF-R0D300', 'ADA-AP2D', 'ADA-R3D300', 'NB-FP4C', 'LR-R6D100', 'NB-FP4', 'ADA-R1D300', 'NB-MACCS', 'ADA-R2D300', 'PLS-AP2D', 'ADA-R5D100', 'NB-CKDGraph', 'RF-R0D100', 'NB-Estate', 'KNN-R0D300', 'NB-AP2D', 'ADA-R4D100', 'LR-R8D100', 'LR-R9D100', 'KNN-R0D100', 'ADA-R1D100', 'ADA-R2D100', 'ADA-R3D100', 'LR-R5D100', 'LR-R0D100', 'LDA-R0D100', 'LDA-R0D300', 'LR-R0D300', 'LR-R4D100', 'ADA-R0D300', 'LR-R1D100', 'ADA-R0D100', 'PLS-R6D300', 'LR-R3D100', 'PLS-R7D100', 'PLS-R5D300', 'PLS-R6D100', 'LR-R2D100', 'PLS-R8D100', 'PLS-R9D100', 'DT-R7D300', 'PLS-R7D300', 'PLS-R5D100', 'PLS-R9D300', 'PLS-R8D300', 'DT-R9D300', 'PLS-R4D300', 'DT-R5D300', 'DT-R6D100', 'NB-R7D300', 'DT-R8D300', 'NB-R8D100', 'NB-R9D100', 'NB-R7D100', 'DT-R6D300', 'NB-R6D300', 'DT-R7D100', 'PLS-R4D100', 'NB-R8D300', 'DT-R9D100', 'PLS-R3D300', 'DT-R8D100', 'NB-R9D300', 'PLS-R1D300', 'PLS-R2D300', 'DT-R5D100', 'NB-R6D100', 'PLS-R3D100', 'PLS-R1D100', 'NB-R5D300', 'DT-R4D100', 'DT-R1D300', 'PLS-R2D100', 'DT-R1D100', 'DT-R4D300', 'DT-R2D100', 'DT-R2D300', 'NB-R5D100', 'DT-R3D300', 'NB-R4D300', 'PLS-R0D300', 'DT-R3D100', 'DT-R0D300', 'NB-R4D100', 'DT-R0D100', 'NB-R1D100', 'NB-R3D100', 'PLS-R0D100', 'NB-R1D300', 'NB-R2D100', 'NB-R3D300', 'NB-R2D300', 'NB-R0D100', 'NB-R0D300']

# Where the filtered training and test tables are written
output_csv1 = 'train_switch2.csv'
output_csv2 = 'test_switch2.csv'

# Selection mode
include_last_column = True  # True: append the last input column to each output as the label
exclude_columns = False  # False: keep only columns_to_select; True: drop them instead

switch_and_save(
    input_csv1,
    input_csv2,
    columns_to_select,
    exclude_columns=exclude_columns,
    include_last_column=include_last_column,
    output_csv1=output_csv1,
    output_csv2=output_csv2,
)


## Average Ensemble Learning

In [None]:
# Multi-dimensional version
import pandas as pd
import os
import numpy as np

# Folder path where your CSV files are stored
folder_path = 'Two-step Selection' # keep stack_train_switch2.csv and stack_test_switch2.csv

# Prefix of the files written by this cell.  They land in the same folder as
# the inputs, so they must be filtered out of the input list; otherwise a
# re-run would average its own previous outputs.
output_prefix = "M2V-Handcraft_averaged_"

# Input CSV files in the folder (sorted for a reproducible processing order)
csv_files = sorted(
    f for f in os.listdir(folder_path)
    if f.endswith('.csv') and not f.startswith(output_prefix)
)

# Number of leading feature columns to average: 2, 4, ..., 440.
# The trailing -1 means "all feature columns".
dimensions = list(range(2, 441, 2)) + [-1]

# Initialize an empty list to store the row averages for each file
row_averages = []
target_classes = []  # To store the target class for each file

# Loop through each CSV file and process it
for csv_file in csv_files:
    file_path = os.path.join(folder_path, csv_file)
    print(f"Reading file: {csv_file}")  # Show which file is being read

    # Read the CSV file into a DataFrame (assuming it has a header row)
    df = pd.read_csv(file_path, header=0)  # If no header, you can change it to header=None

    # The last column is the target class; everything before it is a feature
    target_class = df.iloc[:, -1]
    features = df.iloc[:, :-1]
    n_features = features.shape[1]

    # Create one output file per dimension
    for dim in dimensions:
        if dim == -1:
            df_dim = features
        elif dim <= n_features:
            df_dim = features.iloc[:, :dim]
        else:
            # Slicing the full frame here would pull the target column into
            # the average, so dimensions beyond the feature count are skipped.
            print(f"Skipping dimension {dim} for file {csv_file} due to insufficient columns.")
            continue

        # Average of each row across the selected feature columns
        row_avg = df_dim.mean(axis=1)

        # Binarise the average: 0 if >= 0.5, else 1.
        # NOTE(review): the inversion presumably reflects that the feature
        # columns hold class-0 probabilities -- confirm against the producer.
        row_avg_bin = pd.Series(np.where(row_avg >= 0.5, 0, 1), index=row_avg.index)

        # Combine the binarised averages and the target class
        final_output = pd.concat([row_avg_bin, target_class], axis=1)
        final_output.columns = [f'Avg_{dim}', 'Target_Class']

        # Define the output file name based on the dimension
        output_file_path = os.path.join(folder_path, f"{output_prefix}{dim}_{csv_file}")

        # Save the result to a new CSV file (row index included)
        final_output.to_csv(output_file_path, index=True)

        print(f"Output saved to {output_file_path}")
