Import the necessary libraries

In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score
from keras.layers import Dense
from keras.models import Sequential
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

Import and preprocess the data

In [2]:
data = 'Data/dataset_5.csv'
detectionData = pd.read_csv(data)

X_columns = ["Ia","Ib", "Ic" ,"Va", "Vb" ,"Vc"]
X_detection = detectionData[X_columns]
y_columns = ["F"]
y_detection = detectionData[y_columns]
additional_columns = ["A", "B", "C", "G", "L", "Rf"]
additional_data = detectionData[additional_columns]

X_train, X_test, y_train, y_test, additional_data_train, additional_data_test = train_test_split(X_detection, y_detection, additional_data, test_size=0.2, random_state=42)

Define training and evaluation functions

In [3]:
# Define a dictionary to store the cross-validation metrics of the algorithms
cv_metrics = {'Model': [], 'Accuracy': []}
test_metrics = {'Model': [], 'Accuracy': []}

# Define a function to train and evaluate each algorithm
def train_and_evaluate_model(model, model_name, X_train, y_train):
    # Define the scoring metrics for multi-class classification
    scoring = {
        'accuracy': make_scorer(accuracy_score),
    }

    # Perform cross-validation using StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_validate(model, X_train, y_train, cv=skf, scoring=scoring)
    
    # Store the cross-validation metrics
    cv_metrics['Model'].append(model_name)
    cv_metrics['Accuracy'].append(scores['test_accuracy'].mean())
    print(f"{model_name}: Cross-validation metrics calculated")
    
    # Fit the model on the entire training set
    model.fit(X_train, y_train)
    return model

# Define a function to evaluate the model on the test set and store the metrics
def evaluate_on_test_set(model, model_name, X_test, y_test):
    y_pred = model.predict(X_test)
    test_metrics['Model'].append(model_name)
    test_metrics['Accuracy'].append(accuracy_score(y_test, y_pred))
    print(f"{model_name}: Test metrics calculated")

Define the models

In [4]:
models = [
    (LogisticRegression(random_state=42, max_iter=1000), "Logistic Regression"),
    (SVC(random_state=42), "Support Vector Machines"),
    (KNeighborsClassifier(), "K-Nearest Neighbors"),
    (DecisionTreeClassifier(random_state=42), "Decision Trees"),
    (RandomForestClassifier(random_state=42), "Random Forest"),
    (GradientBoostingClassifier(random_state=42), "Gradient Boosting"),
    (MLPClassifier(random_state=42, max_iter=1000), "Neural Networks"),
    (GaussianNB(), "Naive Bayes"),
    (AdaBoostClassifier(random_state=42), "AdaBoost"),
    (XGBClassifier(random_state=42), "XGBoost"),
    (LGBMClassifier(random_state=42), "LightGBM"),
    (CatBoostClassifier(random_state=42, verbose=0), "CatBoost"),
    (detectionANN, "Artificial Neural Network")
    ]

Train and evaluate the models

In [5]:
# Train and evaluate each model
for model, model_name in models:
    fitted_model = train_and_evaluate_model(model, model_name, X_train, y_train.values.ravel())
    evaluate_on_test_set(fitted_model, model_name, X_test, y_test.values.ravel())

# Convert the dictionary of cross-validation metrics to a DataFrame
cv_metrics_df = pd.DataFrame(cv_metrics)
test_metrics_df = pd.DataFrame(test_metrics)
print("\nCross-validation Metrics:")
print(cv_metrics_df)
print("\nTest Metrics:")
print(test_metrics_df)

Logistic Regression: Cross-validation metrics calculated
Logistic Regression: Test metrics calculated
Support Vector Machines: Cross-validation metrics calculated
Support Vector Machines: Test metrics calculated
K-Nearest Neighbors: Cross-validation metrics calculated
K-Nearest Neighbors: Test metrics calculated
Decision Trees: Cross-validation metrics calculated
Decision Trees: Test metrics calculated
Random Forest: Cross-validation metrics calculated
Random Forest: Test metrics calculated
Gradient Boosting: Cross-validation metrics calculated
Gradient Boosting: Test metrics calculated
Neural Networks: Cross-validation metrics calculated
Neural Networks: Test metrics calculated
Naive Bayes: Cross-validation metrics calculated
Naive Bayes: Test metrics calculated




AdaBoost: Cross-validation metrics calculated




AdaBoost: Test metrics calculated
XGBoost: Cross-validation metrics calculated
XGBoost: Test metrics calculated
[LightGBM] [Info] Number of positive: 26698, number of negative: 26806
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000738 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 53504, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498991 -> initscore=-0.004037
[LightGBM] [Info] Start training from score -0.004037
[LightGBM] [Info] Number of positive: 26698, number of negative: 26806
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000978 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 53504,

TypeError: Cannot clone object '<Sequential name=sequential, built=False>' (type <class 'keras.src.models.sequential.Sequential'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.

In [6]:
cv_metrics_df = pd.DataFrame(cv_metrics)
test_metrics_df = pd.DataFrame(test_metrics)
print("\nCross-validation Metrics:")
print(cv_metrics_df)
print("\nTest Metrics:")
print(test_metrics_df)


Cross-validation Metrics:
                      Model  Accuracy
0       Logistic Regression  0.492344
1   Support Vector Machines  0.953768
2       K-Nearest Neighbors  0.980173
3            Decision Trees  0.974836
4             Random Forest  0.975748
5         Gradient Boosting  0.943361
6           Neural Networks  0.984958
7               Naive Bayes  0.800538
8                  AdaBoost  0.933463
9                   XGBoost  0.981624
10                 LightGBM  0.978783
11                 CatBoost  0.981160

Test Metrics:
                      Model  Accuracy
0       Logistic Regression  0.486962
1   Support Vector Machines  0.957656
2       K-Nearest Neighbors  0.983134
3            Decision Trees  0.977093
4             Random Forest  0.977811
5         Gradient Boosting  0.945455
6           Neural Networks  0.985766
7               Naive Bayes  0.802033
8                  AdaBoost  0.937261
9                   XGBoost  0.983014
10                 LightGBM  0.980502
11      