## Effect of Matrix Transformations on data with Standard Algorithms Classifiers 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import random
import math
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, confusion_matrix, classification_report

# Ignore the FutureWarning
warnings.simplefilter(action='ignore', category=FutureWarning)
# Ignore the ConvergenceWarning and UserWarning
warnings.simplefilter(action='ignore', category=ConvergenceWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
# Load the Dataset
url = "https://drive.google.com/file/d/16YkA1qJ4FHcBIvXZc17ifKSUzb_Xihth/view?usp=sharing"
url = "https://drive.google.com/uc?id=" + url.split('/')[-2]

# Dataset visualization as DataFrame
dataset = pd.read_csv(url, header = 0)
print("Dataframe visualization: ")
dataset

seed_new = 23

Dataframe visualization: 


In [3]:
def calculate_relevance_array(dataset):
    # Assuming the last column contains the class labels
    classes = dataset.iloc[:, -1].unique()

    # Create an empty DataFrame to store the mean per class
    mean_per_class_df = pd.DataFrame(index=classes, columns=dataset.columns[: -1])

    # Calculate mean for each feature per class
    for class_label in classes:
        class_data = dataset[dataset.iloc[:, -1] == class_label].iloc[:, :-1]
        mean_per_class_df.loc[class_label] = class_data.mean()

    # Calculate the difference between the mean of Class 0 and Class 1
    mean_difference = mean_per_class_df.loc[classes[0]] - mean_per_class_df.loc[classes[1]]
    # Sort the array in descending order and extract the indices
    relevance_indices = mean_difference.sort_values(ascending = False).index.to_numpy()

    # Sort the array in descending order
    relevance_array = mean_difference.sort_values(ascending = False)

    return relevance_array, relevance_indices

relevance_array, relevance_indices = calculate_relevance_array(dataset)

print("Relevance Indices (Descending Sorted):")
print(relevance_indices)

Relevance Indices (Descending Sorted):
['116' '142' '31' ... '604' '613' '614']


## KNN Classifier

In [4]:
from sklearn.neighbors import KNeighborsClassifier

def knn_classifier_with_kfcv(dataset, k_neighbors, num_folds, relevance_indices, output_file):
    # Assuming the last column contains the class labels
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]

    # Initialize kNN classifier
    knn_classifier = KNeighborsClassifier(n_neighbors = k_neighbors)

    # Initialize k-Fold Cross-Validation
    kf = KFold(n_splits = num_folds, shuffle = True, random_state = seed_new)

    # Results dictionary to store metrics for each iteration
    results = {'Accuracy': [], 'Relevance_Indices': []}
    # Initialize an empty list to accumulate indices
    accumulated_indices = []
    
    # Initialize variables to keep track of the best accuracy and its corresponding indices
    best_accuracy = 0.0
    best_relevance_indices = None
    best_classification_report = None

    # Iterate through relevance indices
    for feature_index in relevance_indices:
        # Initialize lists to store metrics for each fold
        accuracy_list = []

        # Lists to store the predicted and actual labels
        predicted_labels = []
        actual_labels = []

        # Iterate through each fold
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Add one feature at a time based on relevance indices
            X_train_current = X_train.iloc[:, : int(feature_index) + 1]
            X_test_current = X_test.iloc[:, : int(feature_index) + 1]
            
            # Fit kNN classifier
            knn_classifier.fit(X_train_current, y_train)
            # Predict on test set
            y_pred = knn_classifier.predict(X_test_current)
            predicted_labels.extend(y_pred)
            actual_labels.extend(y_test)

            # Calculate accuracy
            accuracy = accuracy_score(y_test, y_pred)
            # Append accuracy to the list
            accuracy_list.append(accuracy)

        # Calculate mean accuracy
        mean_accuracy = np.mean(accuracy_list)

        # Append mean accuracy and accumulated relevance indices for each feature to results dictionary
        results['Accuracy'].append(mean_accuracy)
        accumulated_indices.append(feature_index)
        results['Relevance_Indices'].append(accumulated_indices.copy())

        # Update the best accuracy and its corresponding indices
        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            best_relevance_indices = accumulated_indices.copy()

            # Calculate the best classification report
            labels = ['Clase 0', 'Clase 1']
            best_classification_report = classification_report(actual_labels, predicted_labels, target_names=labels)

    # Convert lists to NumPy arrays
    results['Accuracy'] = np.array(results['Accuracy'])
    results['Relevance_Indices'] = np.array(results['Relevance_Indices'], dtype=object)

    # Sort the accuracies and their corresponding indices in ascending order
    sorted_indices = np.argsort(results['Accuracy'])
    results['Accuracy'] = results['Accuracy'][sorted_indices]
    results['Relevance_Indices'] = results['Relevance_Indices'][sorted_indices]

    # Print the best accuracy and its corresponding indices
    print(f"Best Accuracy: {best_accuracy}")
    print(f"Corresponding Relevance Indices: {best_relevance_indices}")
    print("Number of Relevance Features: ", len(best_relevance_indices))
    
    # Print the best classification report
    print("Best Classification Report:")
    print(best_classification_report)

    # Save the best classification report to a file
    with open(output_file, 'w') as file:
        file.write("Best Classification Report:\n")
        file.write(best_classification_report)

    return results

# Run kNN classifier with kFCV and save the best classification report to a file
results = knn_classifier_with_kfcv(dataset, k_neighbors = 1, num_folds = 10, relevance_indices = relevance_indices, output_file = 'best_classification_report_1nn.txt')

Best Accuracy: 0.8666666666666666
Corresponding Relevance Indices: ['116', '142']
Number of Relevance Features:  2
Best Classification Report:
              precision    recall  f1-score   support

     Clase 0       1.00      0.71      0.83        14
     Clase 1       0.78      1.00      0.88        14

    accuracy                           0.86        28
   macro avg       0.89      0.86      0.85        28
weighted avg       0.89      0.86      0.85        28



In [5]:
# Run kNN classifier with kFCV and save the best classification report to a file
results = knn_classifier_with_kfcv(dataset, k_neighbors = 3, num_folds = 10, relevance_indices = relevance_indices, output_file = 'best_classification_report_3nn.txt')

Best Accuracy: 0.7666666666666666
Corresponding Relevance Indices: ['116', '142', '31', '132']
Number of Relevance Features:  4
Best Classification Report:
              precision    recall  f1-score   support

     Clase 0       1.00      0.57      0.73        14
     Clase 1       0.70      1.00      0.82        14

    accuracy                           0.79        28
   macro avg       0.85      0.79      0.78        28
weighted avg       0.85      0.79      0.78        28



In [6]:
# Run kNN classifier with kFCV and save the best classification report to a file
results = knn_classifier_with_kfcv(dataset, k_neighbors = 5, num_folds = 10, relevance_indices = relevance_indices, output_file = 'best_classification_report_5nn.txt')

Best Accuracy: 0.7666666666666666
Corresponding Relevance Indices: ['116', '142', '31', '132', '1048', '734', '695', '837', '125', '70', '867', '240', '563', '147', '323', '479', '968', '124', '505', '334', '966', '773', '777', '191', '146', '916', '227', '471', '765', '735', '609', '503', '98', '757', '118', '504', '969', '825', '336', '223', '183', '509', '980', '351', '936', '673', '544', '467', '422', '664', '1010', '829', '545', '115', '35', '705', '912', '270', '758', '547', '709', '748', '1', '586', '508', '1030', '610', '808', '8', '1003', '264', '538', '583', '533', '300', '863', '974', '247', '929', '458', '287', '760', '917', '76', '970', '790', '913', '216', '445', '921', '779', '726', '314', '740', '820', '636', '712', '350', '66', '356', '600', '129', '4', '1063', '535', '697', '301', '156', '906', '510', '501', '202', '792', '891', '603', '439', '74', '332', '631', '180', '1037', '396', '878', '10', '160', '490', '985', '597', '395', '536', '228', '596', '710', '1056', '

## SVM Classifier

In [7]:
from sklearn.svm import SVC

def svm_classifier_with_kfcv(dataset, relevance_indices, num_folds, output_file):
    # Assuming the last column contains the class labels
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]

    # Initialize SVM classifier
    svm_classifier = SVC()

    # Initialize k-Fold Cross-Validation
    kf = KFold(n_splits = num_folds, shuffle = True, random_state = seed_new)

    # Results dictionary to store metrics for each iteration
    results = {'Accuracy': [], 'Relevance_Indices': []}
    # Initialize an empty list to accumulate indices
    accumulated_indices = []
    
    # Initialize variables to keep track of the best accuracy and its corresponding indices
    best_accuracy = 0.0
    best_relevance_indices = None
    best_classification_report = None

    # Iterate through relevance indices
    for feature_index in relevance_indices:
        # Initialize lists to store metrics for each fold
        accuracy_list = []

        # Lists to store the predicted and actual labels
        predicted_labels = []
        actual_labels = []

        # Iterate through each fold
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Add one feature at a time based on relevance indices
            X_train_current = X_train.iloc[:, : int(feature_index) + 1]
            X_test_current = X_test.iloc[:, : int(feature_index) + 1]
            
            # Fit SVM classifier
            svm_classifier.fit(X_train_current, y_train)
            # Predict on test set
            y_pred = svm_classifier.predict(X_test_current)
            predicted_labels.extend(y_pred)
            actual_labels.extend(y_test)

            # Calculate accuracy
            accuracy = accuracy_score(y_test, y_pred)
            # Append accuracy to the list
            accuracy_list.append(accuracy)

        # Calculate mean accuracy
        mean_accuracy = np.mean(accuracy_list)

        # Append mean accuracy and accumulated relevance indices for each feature to results dictionary
        results['Accuracy'].append(mean_accuracy)
        accumulated_indices.append(feature_index)
        results['Relevance_Indices'].append(accumulated_indices.copy())

        # Update the best accuracy and its corresponding indices
        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            best_relevance_indices = accumulated_indices.copy()

            # Calculate the best classification report
            labels = ['Clase 0', 'Clase 1']
            best_classification_report = classification_report(actual_labels, predicted_labels, target_names=labels)

    # Convert lists to NumPy arrays
    results['Accuracy'] = np.array(results['Accuracy'])
    results['Relevance_Indices'] = np.array(results['Relevance_Indices'], dtype=object)

    # Sort the accuracies and their corresponding indices in ascending order
    sorted_indices = np.argsort(results['Accuracy'])
    results['Accuracy'] = results['Accuracy'][sorted_indices]
    results['Relevance_Indices'] = results['Relevance_Indices'][sorted_indices]

    # Print the best accuracy and its corresponding indices
    print(f"Best Accuracy: {best_accuracy}")
    print(f"Corresponding Relevance Indices: {best_relevance_indices}")
    print("Number of Relevance Features: ", len(best_relevance_indices))
    
    # Print the best classification report
    print("Best Classification Report:")
    print(best_classification_report)

    # Save the best classification report to a file
    with open(output_file, 'w') as file:
        file.write("Best Classification Report:\n")
        file.write(best_classification_report)

    return results

# Run kNN classifier with kFCV and save the best classification report to a file
results = svm_classifier_with_kfcv(dataset, num_folds = 10, relevance_indices = relevance_indices, output_file = 'best_classification_report_svm.txt')

Best Accuracy: 1.0
Corresponding Relevance Indices: ['116', '142', '31', '132', '1048', '734', '695', '837', '125', '70', '867', '240', '563', '147', '323']
Number of Relevance Features:  15
Best Classification Report:
              precision    recall  f1-score   support

     Clase 0       1.00      1.00      1.00        14
     Clase 1       1.00      1.00      1.00        14

    accuracy                           1.00        28
   macro avg       1.00      1.00      1.00        28
weighted avg       1.00      1.00      1.00        28



## Random Forest Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

def random_forest_classifier_with_kfcv(dataset, relevance_indices, num_folds, output_file):
    # Assuming the last column contains the class labels
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]

    # Initialize Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=10)

    # Initialize k-Fold Cross-Validation
    kf = KFold(n_splits = num_folds, shuffle = True, random_state = seed_new)

    # Results dictionary to store metrics for each iteration
    results = {'Accuracy': [], 'Relevance_Indices': []}
    # Initialize an empty list to accumulate indices
    accumulated_indices = []
    
    # Initialize variables to keep track of the best accuracy and its corresponding indices
    best_accuracy = 0.0
    best_relevance_indices = None
    best_classification_report = None

    # Iterate through relevance indices
    for feature_index in relevance_indices:
        # Initialize lists to store metrics for each fold
        accuracy_list = []

        # Lists to store the predicted and actual labels
        predicted_labels = []
        actual_labels = []

        # Iterate through each fold
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Add one feature at a time based on relevance indices
            X_train_current = X_train.iloc[:, : int(feature_index) + 1]
            X_test_current = X_test.iloc[:, : int(feature_index) + 1]
            
            # Fit Random Forest classifier
            rf_classifier.fit(X_train_current, y_train)
            # Predict on test set
            y_pred = rf_classifier.predict(X_test_current)
            predicted_labels.extend(y_pred)
            actual_labels.extend(y_test)

            # Calculate accuracy
            accuracy = accuracy_score(y_test, y_pred)
            # Append accuracy to the list
            accuracy_list.append(accuracy)

        # Calculate mean accuracy
        mean_accuracy = np.mean(accuracy_list)

        # Append mean accuracy and accumulated relevance indices for each feature to results dictionary
        results['Accuracy'].append(mean_accuracy)
        accumulated_indices.append(feature_index)
        results['Relevance_Indices'].append(accumulated_indices.copy())

        # Update the best accuracy and its corresponding indices
        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            best_relevance_indices = accumulated_indices.copy()

            # Calculate the best classification report
            labels = ['Clase 0', 'Clase 1']
            best_classification_report = classification_report(actual_labels, predicted_labels, target_names=labels)

    # Convert lists to NumPy arrays
    results['Accuracy'] = np.array(results['Accuracy'])
    results['Relevance_Indices'] = np.array(results['Relevance_Indices'], dtype=object)

    # Sort the accuracies and their corresponding indices in ascending order
    sorted_indices = np.argsort(results['Accuracy'])
    results['Accuracy'] = results['Accuracy'][sorted_indices]
    results['Relevance_Indices'] = results['Relevance_Indices'][sorted_indices]

    # Print the best accuracy and its corresponding indices
    print(f"Best Accuracy: {best_accuracy}")
    print(f"Corresponding Relevance Indices: {best_relevance_indices}")
    print("Number of Relevance Features: ", len(best_relevance_indices))
    
    # Print the best classification report
    print("Best Classification Report:")
    print(best_classification_report)

    # Save the best classification report to a file
    with open(output_file, 'w') as file:
        file.write("Best Classification Report:\n")
        file.write(best_classification_report)

    return results

# Run SVM classifier with kFCV
results = random_forest_classifier_with_kfcv(dataset, relevance_indices, num_folds = 10, output_file = 'best_classification_report_rf.txt')

Best Accuracy: 0.9666666666666666
Corresponding Relevance Indices: ['116', '142', '31', '132', '1048', '734', '695', '837', '125', '70', '867', '240', '563', '147', '323', '479', '968', '124', '505', '334', '966', '773', '777', '191', '146', '916', '227', '471', '765', '735', '609', '503', '98', '757', '118', '504', '969', '825', '336', '223', '183', '509', '980', '351', '936', '673', '544', '467', '422', '664', '1010', '829', '545', '115', '35', '705', '912', '270', '758', '547', '709', '748', '1', '586', '508', '1030', '610', '808', '8', '1003', '264', '538', '583', '533', '300', '863', '974', '247', '929', '458', '287', '760', '917', '76', '970', '790', '913', '216', '445', '921', '779', '726', '314', '740', '820', '636', '712', '350', '66', '356', '600', '129', '4', '1063', '535', '697', '301', '156', '906', '510', '501', '202', '792', '891', '603', '439', '74', '332', '631', '180', '1037', '396', '878', '10', '160', '490', '985', '597', '395', '536', '228', '596', '710', '1056', '

## Adaboost Classifier

In [9]:
from sklearn.ensemble import AdaBoostClassifier

def adaboost_classifier_with_kfcv(dataset, relevance_indices, num_folds, output_file):
    # Assuming the last column contains the class labels
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]

    # Initialize AdaBoost classifier
    adaboost_classifier = AdaBoostClassifier(n_estimators = 50, random_state = seed_new)

    # Initialize k-Fold Cross-Validation
    kf = KFold(n_splits = num_folds, shuffle = True, random_state = 42)

    # Results dictionary to store metrics for each iteration
    results = {'Accuracy': [], 'Relevance_Indices': []}
    # Initialize an empty list to accumulate indices
    accumulated_indices = []
    
    # Initialize variables to keep track of the best accuracy and its corresponding indices
    best_accuracy = 0.0
    best_relevance_indices = None
    best_classification_report = None

    # Iterate through relevance indices
    for feature_index in relevance_indices:
        # Initialize lists to store metrics for each fold
        accuracy_list = []

        # Lists to store the predicted and actual labels
        predicted_labels = []
        actual_labels = []

        # Iterate through each fold
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Add one feature at a time based on relevance indices
            X_train_current = X_train.iloc[:, : int(feature_index) + 1]
            X_test_current = X_test.iloc[:, : int(feature_index) + 1]

            # Fit AdaBoost classifier
            adaboost_classifier.fit(X_train_current, y_train)
            # Predict on test set
            y_pred = adaboost_classifier.predict(X_test_current)
            predicted_labels.extend(y_pred)
            actual_labels.extend(y_test)

            # Calculate accuracy
            accuracy = accuracy_score(y_test, y_pred)
            # Append accuracy to the list
            accuracy_list.append(accuracy)

        # Calculate mean accuracy
        mean_accuracy = np.mean(accuracy_list)

        # Append mean accuracy and accumulated relevance indices for each feature to results dictionary
        results['Accuracy'].append(mean_accuracy)
        accumulated_indices.append(feature_index)
        results['Relevance_Indices'].append(accumulated_indices.copy())

        # Update the best accuracy and its corresponding indices
        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            best_relevance_indices = accumulated_indices.copy()

            # Calculate the best classification report
            labels = ['Clase 0', 'Clase 1']
            best_classification_report = classification_report(actual_labels, predicted_labels, target_names=labels)

    # Convert lists to NumPy arrays
    results['Accuracy'] = np.array(results['Accuracy'])
    results['Relevance_Indices'] = np.array(results['Relevance_Indices'], dtype=object)

    # Sort the accuracies and their corresponding indices in ascending order
    sorted_indices = np.argsort(results['Accuracy'])
    results['Accuracy'] = results['Accuracy'][sorted_indices]
    results['Relevance_Indices'] = results['Relevance_Indices'][sorted_indices]

    # Print the best accuracy and its corresponding indices
    print(f"Best Accuracy: {best_accuracy}")
    print(f"Corresponding Relevance Indices: {best_relevance_indices}")
    print("Number of Relevance Features: ", len(best_relevance_indices))
    
    # Print the best classification report
    print("Best Classification Report:")
    print(best_classification_report)

    # Save the best classification report to a file
    with open(output_file, 'w') as file:
        file.write("Best Classification Report:\n")
        file.write(best_classification_report)

    return results

# Run MLP classifier with kFCV
results = adaboost_classifier_with_kfcv(dataset, relevance_indices, num_folds = 10, output_file = 'best_classification_report_adaboost.txt')

Best Accuracy: 0.8333333333333333
Corresponding Relevance Indices: ['116', '142', '31', '132', '1048', '734', '695', '837', '125', '70', '867', '240', '563', '147', '323', '479', '968', '124', '505', '334', '966', '773', '777', '191', '146', '916', '227', '471', '765', '735', '609', '503', '98', '757', '118', '504', '969', '825', '336', '223', '183', '509', '980', '351', '936', '673']
Number of Relevance Features:  46
Best Classification Report:
              precision    recall  f1-score   support

     Clase 0       0.85      0.79      0.81        14
     Clase 1       0.80      0.86      0.83        14

    accuracy                           0.82        28
   macro avg       0.82      0.82      0.82        28
weighted avg       0.82      0.82      0.82        28



## MLP Classifier

In [10]:
from sklearn.neural_network import MLPClassifier

def mlp_classifier_with_kfcv(dataset, relevance_indices, num_folds, output_file):
    # Assuming the last column contains the class labels
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]

    # Initialize MLP classifier with 10 hidden layers
    mlp_classifier = MLPClassifier(hidden_layer_sizes = (20,), max_iter = 1000, early_stopping = True, random_state = 42)

    # Initialize k-Fold Cross-Validation
    kf = KFold(n_splits = num_folds, shuffle = True, random_state = seed_new)

    # Results dictionary to store metrics for each iteration
    results = {'Accuracy': [], 'Relevance_Indices': []}
    # Initialize an empty list to accumulate indices
    accumulated_indices = []
    
    # Initialize variables to keep track of the best accuracy and its corresponding indices
    best_accuracy = 0.0
    best_relevance_indices = None
    best_classification_report = None

   # Iterate through relevance indices
    for feature_index in relevance_indices:
        # Initialize lists to store metrics for each fold
        accuracy_list = []

        # Lists to store the predicted and actual labels
        predicted_labels = []
        actual_labels = []

        # Iterate through each fold
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Add one feature at a time based on relevance indices
            X_train_current = X_train.iloc[:, : int(feature_index) + 1]
            X_test_current = X_test.iloc[:, : int(feature_index) + 1]

            # Fit MLP classifier
            mlp_classifier.fit(X_train_current, y_train)
            # Predict on test set
            y_pred = mlp_classifier.predict(X_test_current)
            predicted_labels.extend(y_pred)
            actual_labels.extend(y_test)

            # Calculate accuracy
            accuracy = accuracy_score(y_test, y_pred)
            # Append accuracy to the list
            accuracy_list.append(accuracy)

        # Calculate mean accuracy
        mean_accuracy = np.mean(accuracy_list)

        # Append mean accuracy and accumulated relevance indices for each feature to results dictionary
        results['Accuracy'].append(mean_accuracy)
        accumulated_indices.append(feature_index)
        results['Relevance_Indices'].append(accumulated_indices.copy())

        # Update the best accuracy and its corresponding indices
        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            best_relevance_indices = accumulated_indices.copy()

            # Calculate the best classification report
            labels = ['Clase 0', 'Clase 1']
            best_classification_report = classification_report(actual_labels, predicted_labels, target_names = labels)

    # Convert lists to NumPy arrays
    results['Accuracy'] = np.array(results['Accuracy'])
    results['Relevance_Indices'] = np.array(results['Relevance_Indices'], dtype=object)

    # Sort the accuracies and their corresponding indices in ascending order
    sorted_indices = np.argsort(results['Accuracy'])
    results['Accuracy'] = results['Accuracy'][sorted_indices]
    results['Relevance_Indices'] = results['Relevance_Indices'][sorted_indices]

    # Print the best accuracy and its corresponding indices
    print(f"Best Accuracy: {best_accuracy}")
    print(f"Corresponding Relevance Indices: {best_relevance_indices}")
    print("Number of Relevance Features: ", len(best_relevance_indices))
    
    # Print the best classification report
    print("Best Classification Report:")
    print(best_classification_report)

    # Save the best classification report to a file
    with open(output_file, 'w') as file:
        file.write("Best Classification Report:\n")
        file.write(best_classification_report)

    return results

# Run MLP classifier with kFCV
results = mlp_classifier_with_kfcv(dataset, relevance_indices, num_folds = 10, output_file = 'best_classification_report_mlp.txt')

Best Accuracy: 0.7666666666666666
Corresponding Relevance Indices: ['116', '142', '31', '132', '1048', '734', '695', '837', '125', '70', '867', '240', '563', '147', '323', '479', '968', '124', '505', '334', '966', '773', '777', '191', '146', '916', '227', '471', '765', '735', '609', '503', '98', '757', '118', '504', '969', '825', '336', '223', '183', '509', '980', '351', '936', '673', '544', '467', '422', '664', '1010', '829', '545', '115', '35', '705', '912', '270', '758', '547', '709', '748', '1', '586', '508', '1030', '610', '808', '8', '1003', '264', '538', '583', '533', '300', '863', '974', '247', '929', '458', '287', '760', '917', '76', '970', '790', '913', '216', '445', '921', '779', '726', '314', '740', '820', '636', '712', '350', '66', '356', '600', '129', '4', '1063', '535', '697', '301', '156', '906', '510', '501', '202', '792', '891', '603', '439', '74', '332', '631', '180', '1037', '396', '878', '10', '160', '490', '985', '597', '395', '536', '228', '596', '710', '1056', '

## Naive Bayes Classifier

In [11]:
from sklearn.naive_bayes import GaussianNB

def naive_bayes_classifier_with_kfcv(dataset, relevance_indices, num_folds, output_file):
    # Assuming the last column contains the class labels
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]

    # Initialize Gaussian Naive Bayes classifier
    nb_classifier = GaussianNB()

    # Initialize k-Fold Cross-Validation
    kf = KFold(n_splits = num_folds, shuffle = True, random_state = seed_new)

    # Results dictionary to store metrics for each iteration
    results = {'Accuracy': [], 'Relevance_Indices': []}
    # Initialize an empty list to accumulate indices
    accumulated_indices = []
    
    # Initialize variables to keep track of the best accuracy and its corresponding indices
    best_accuracy = 0.0
    best_relevance_indices = None
    best_classification_report = None

    # Iterate through relevance indices
    for feature_index in relevance_indices:
        # Initialize lists to store metrics for each fold
        accuracy_list = []

        # Lists to store the predicted and actual labels
        predicted_labels = []
        actual_labels = []

        # Iterate through each fold
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Add one feature at a time based on relevance indices
            X_train_current = X_train.iloc[:, : int(feature_index) + 1]
            X_test_current = X_test.iloc[:, : int(feature_index) + 1]

            # Fit Gaussian Naive Bayes classifier
            nb_classifier.fit(X_train_current, y_train)

            # Predict on test set
            X_test_current = X_test.iloc[:, :int(feature_index) + 1]
            y_pred = nb_classifier.predict(X_test_current)

            predicted_labels.extend(y_pred)
            actual_labels.extend(y_test)

            # Calculate accuracy
            accuracy = accuracy_score(y_test, y_pred)
            # Append accuracy to the list
            accuracy_list.append(accuracy)

        # Calculate mean accuracy
        mean_accuracy = np.mean(accuracy_list)

        # Append mean accuracy and accumulated relevance indices for each feature to results dictionary
        results['Accuracy'].append(mean_accuracy)
        accumulated_indices.append(feature_index)
        results['Relevance_Indices'].append(accumulated_indices.copy())

        # Update the best accuracy and its corresponding indices
        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            best_relevance_indices = accumulated_indices.copy()

            # Calculate the best classification report
            labels = ['Clase 0', 'Clase 1']
            best_classification_report = classification_report(actual_labels, predicted_labels, target_names = labels)

    # Convert lists to NumPy arrays
    results['Accuracy'] = np.array(results['Accuracy'])
    results['Relevance_Indices'] = np.array(results['Relevance_Indices'], dtype=object)

    # Sort the accuracies and their corresponding indices in ascending order
    sorted_indices = np.argsort(results['Accuracy'])
    results['Accuracy'] = results['Accuracy'][sorted_indices]
    results['Relevance_Indices'] = results['Relevance_Indices'][sorted_indices]

    # Print the best accuracy and its corresponding indices
    print(f"Best Accuracy: {best_accuracy}")
    print(f"Corresponding Relevance Indices: {best_relevance_indices}")
    print("Number of Relevance Features: ", len(best_relevance_indices))
    
    # Print the best classification report
    print("Best Classification Report:")
    print(best_classification_report)

    # Save the best classification report to a file
    with open(output_file, 'w') as file:
        file.write("Best Classification Report:\n")
        file.write(best_classification_report)

    return results

# Run NB classifier with kFCV
results = naive_bayes_classifier_with_kfcv(dataset, relevance_indices, num_folds = 10, output_file = 'best_classification_report_nb.txt')

Best Accuracy: 0.8333333333333333
Corresponding Relevance Indices: ['116', '142', '31', '132', '1048', '734', '695', '837', '125', '70', '867', '240', '563', '147', '323', '479', '968', '124', '505', '334', '966', '773', '777', '191', '146', '916', '227', '471', '765', '735', '609', '503', '98', '757', '118', '504', '969', '825', '336', '223', '183', '509', '980', '351', '936', '673', '544', '467', '422']
Number of Relevance Features:  49
Best Classification Report:
              precision    recall  f1-score   support

     Clase 0       0.74      1.00      0.85        14
     Clase 1       1.00      0.64      0.78        14

    accuracy                           0.82        28
   macro avg       0.87      0.82      0.82        28
weighted avg       0.87      0.82      0.82        28

