## Creating train and test data

In [1]:
# Essential for modelling
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, RandomizedSearchCV, ParameterGrid
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Other packages
from ucimlrepo import fetch_ucirepo
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

fetch = fetch_ucirepo(id=545)
X = fetch.data.features
y = fetch.data.targets
rice = pd.concat([X, y], axis=1)

# Splitting into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=2023)

# Encoding Class (Cammeo and Osmancik into 0 and 1)
label_encoder = LabelEncoder()
y_LE = y.copy()
y_LE['Class'] = label_encoder.fit_transform(y_LE['Class'])

# Splitting into train and test data with labels encoded (needed for XGBoost)
X_train_LE, X_test_LE, y_train_LE, y_test_LE = train_test_split(X, y_LE, test_size = 0.2, random_state=2023)

# Normalize the data to use when needed
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train_LE = scaler.fit_transform(X_train_LE)
X_test_LE = scaler.transform(X_test_LE)

## Modelling the data and making predictions

In [2]:
##### K-NEAREST NEIGHBORS

from sklearn.neighbors import KNeighborsClassifier
print("Starting K-Nearest Neighbors...")

KNN_results = []

for k in range(1, 51):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train.values.ravel())  # Using ravel() to convert to 1D array
    yhat = knn.predict(X_test)
    accuracy = accuracy_score(y_test, yhat)
    KNN_results.append({'K-value': k, 'Accuracy': accuracy*100})

KNN_accuracy_df = pd.DataFrame(KNN_results)

# Find index of maximum accuracy
KNN_max_index = KNN_accuracy_df['Accuracy'].idxmax()
KNN_k_max = int(KNN_accuracy_df.loc[KNN_max_index]['K-value'])
KNN_acc_max = KNN_accuracy_df.loc[KNN_max_index]['Accuracy']

# Creating the model using the optimum value for K
KNN_model = KNeighborsClassifier(n_neighbors = KNN_k_max)
KNN_model.fit(X_train, y_train.values.ravel())
KNN_pred = KNN_model.predict(X_test)
KNN_accuracy = accuracy_score(y_test.values.ravel(), KNN_pred)
KNN_cr = classification_report(y_test.values.ravel(), KNN_pred)
KNN_cm = confusion_matrix(y_test.values.ravel(), KNN_pred)

##################################################

##### DECISION TREE

from sklearn.tree import DecisionTreeClassifier
print("Starting Decision Tree...")

DT_results = []

for i in range(1,11):
    clf_tree = DecisionTreeClassifier(criterion="entropy", random_state = 100, max_depth = i)
    clf_tree.fit(X_train, y_train.values.ravel())
    yhat = clf_tree.predict(X_test)
    accuracy = accuracy_score(y_test, yhat)
    DT_results.append({'Depth': i, 'Accuracy': accuracy*100})
    
DT_accuracy_df = pd.DataFrame(DT_results)

# Find index of maximum accuracy
DT_max_index = DT_accuracy_df['Accuracy'].idxmax()
DT_depth_max = int(DT_accuracy_df.loc[DT_max_index]['Depth'])
DT_acc_max = DT_accuracy_df.loc[DT_max_index]['Accuracy']

# Creating the model using the optimum value for Depth
DT_model = DecisionTreeClassifier(criterion="entropy", random_state = 100, max_depth = DT_depth_max)
DT_model.fit(X_train, y_train.values.ravel())
DT_pred = DT_model.predict(X_test)
DT_accuracy = accuracy_score(y_test.values.ravel(), DT_pred)
DT_cr = classification_report(y_test.values.ravel(), DT_pred)
DT_cm = confusion_matrix(y_test.values.ravel(), DT_pred)

##################################################

##### RANDOM FOREST

from sklearn.ensemble import RandomForestClassifier
print("Starting Random Forest...")

max_depth_values = list(range(2, 11))  # Test max_depth from 2 to 10
n_estimators_values = list(range(2, 103, 5))  # Test n_estimators from 2 to 102 with step size 5

RF_results = []

for max_depth in max_depth_values:
    for n_estimators in n_estimators_values:
        clf_forest = RandomForestClassifier(
            n_estimators=n_estimators, 
            criterion="entropy", 
            random_state=100, 
            max_depth=max_depth
        )
        clf_forest.fit(X_train, y_train.values.ravel())
        yhat = clf_forest.predict(X_test)
        accuracy = accuracy_score(y_test, yhat)
        RF_results.append({'Depth': max_depth, 'Estimators': n_estimators, 'Accuracy': accuracy*100})

RF_accuracy_df = pd.DataFrame(RF_results)

# Find index of maximum accuracy
RF_max_index = RF_accuracy_df['Accuracy'].idxmax()
RF_depth_max = int(RF_accuracy_df.loc[RF_max_index]['Depth'])
RF_estimators_max = int(RF_accuracy_df.loc[RF_max_index]['Estimators'])
RF_acc_max = RF_accuracy_df.loc[RF_max_index]['Accuracy']

# Creating the model using the optimum value for Depth
RF_model = RandomForestClassifier(n_estimators=RF_estimators_max, criterion="entropy", random_state=100, max_depth=RF_depth_max)
RF_model.fit(X_train, y_train.values.ravel())
RF_pred = RF_model.predict(X_test)
RF_accuracy = accuracy_score(y_test.values.ravel(), RF_pred)
RF_cr = classification_report(y_test.values.ravel(), RF_pred)
RF_cm = confusion_matrix(y_test.values.ravel(), RF_pred)

##################################################

##### SUPPORT VECTOR MACHINES

from sklearn.svm import SVC
print("Starting Support Vector Machines...")

# Grid search for the best parameters
SVM_param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'poly', 'sigmoid']}
SVM_grid = GridSearchCV(SVC(), SVM_param_grid, refit = True, verbose = 0)
SVM_grid.fit(X_train, y_train.values.ravel())
print(SVM_grid.best_estimator_)

# Creating the model using the best parameters from the Grid Search
SVM_pred = SVM_grid.predict(X_test)
SVM_accuracy = accuracy_score(y_test.values.ravel(), SVM_pred)
SVM_cr = classification_report(y_test.values.ravel(), SVM_pred)
SVM_cm = confusion_matrix(y_test.values.ravel(), SVM_pred)

##################################################

##### NAIVE BAYES

from sklearn.naive_bayes import GaussianNB
print("Starting Naive Bayes...")

# Unlike the other models, Naive Bayes has no parameters that need tuning
NB_model = GaussianNB()
NB_model.fit(X_train, y_train.values.ravel())
NB_pred = NB_model.predict(X_test)
NB_accuracy = accuracy_score(y_test.values.ravel(), NB_pred)
NB_cr = classification_report(y_test.values.ravel(), NB_pred)
NB_cm = confusion_matrix(y_test.values.ravel(), NB_pred)

##################################################

##### LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
print("Starting Logistic Regression...")

# Define ranges for hyperparameters
penalty_values = ['l1', 'l2']
C_values = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]  # List of C values to test
solver_values = ['liblinear', 'saga', 'sag', 'lbfgs']  # Different solvers supporting both 'l1' and 'l2' penalties

LR_results = []

for params in ParameterGrid({'penalty': penalty_values, 'C': C_values, 'solver': solver_values}):
    if (params['solver'] in ['lbfgs', 'sag']) and params['penalty'] == 'l1':
        continue  # Skip 'lbfgs' and 'sag' solvers with 'l1' penalty
    clf_lr = LogisticRegression(**params)
    clf_lr.fit(X_train, y_train.values.ravel())
    yhat = clf_lr.predict(X_test)
    accuracy = accuracy_score(y_test.values.ravel(), yhat)
    LR_results.append({'Penalty': params['penalty'], 'C': params['C'], 'Solver': params['solver'], 'Accuracy': accuracy})

LR_results_df = pd.DataFrame(LR_results)

# Find the row index with the maximum accuracy
best_row_index = LR_results_df['Accuracy'].idxmax()
best_hyperparameters = LR_results_df.loc[best_row_index, ['C', 'Solver', 'Penalty', 'Accuracy']]
best_C = best_hyperparameters['C']
best_solver = best_hyperparameters['Solver']
best_penalty = best_hyperparameters['Penalty']

# Fitting the model with the best hyperparameters
LR_model = LogisticRegression(C=best_C, solver=best_solver, penalty=best_penalty)
LR_model.fit(X_train, y_train.values.ravel())
LR_pred = LR_model.predict(X_test)
LR_accuracy = accuracy_score(y_test.values.ravel(), LR_pred)
LR_cr = classification_report(y_test.values.ravel(), LR_pred)
LR_cm = confusion_matrix(y_test.values.ravel(), LR_pred)

##################################################

##### STOCHASTIC GRADIENT

from sklearn.linear_model import SGDClassifier
print("Starting Stochastic Gradient Descent...")

# Define hyperparameters for grid search
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
    'loss': ['hinge', 'modified_huber', 'squared_hinge', 'perceptron'],
    'penalty': ['l1', 'l2'],
}

# Initialize SGDClassifier
sgd = SGDClassifier(random_state=42)

# Perform grid search
grid_search = GridSearchCV(sgd, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train.values.ravel())

# Get best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Fit a model using the best parameters
SGD_model = SGDClassifier(**best_params)
SGD_model.fit(X_train, y_train.values.ravel())
SGD_pred = SGD_model.predict(X_test)
SGD_accuracy = accuracy_score(y_test, SGD_pred)
SGD_cr = classification_report(y_test.values.ravel(), SGD_pred)
SGD_cm = confusion_matrix(y_test.values.ravel(), SGD_pred)

##################################################

##### XGBOOST

import xgboost as xgb
print("Starting XGBoost...")

XGB_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Define the grid of parameters to search
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'reg_alpha': [0, 0.1, 0.5],  # L1 regularization term (alpha)
    'reg_lambda': [0, 0.1, 0.5],  # L2 regularization term (lambda)
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=XGB_model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_LE, y_train_LE.values.ravel())

# Get the best parameters and fit a model
best_params = grid_search.best_params_
XGB_model = xgb.XGBClassifier(**best_params)
XGB_model.fit(X_train_LE, y_train_LE.values.ravel())
XGB_pred = XGB_model.predict(X_test_LE)
XGB_accuracy = accuracy_score(y_test_LE, XGB_pred)
XGB_cr = classification_report(y_test_LE.values.ravel(), XGB_pred)
XGB_cm = confusion_matrix(y_test_LE.values.ravel(), XGB_pred)

##################################################

##### PERCEPTRON

from sklearn.linear_model import Perceptron
print("Starting Perceptron...")

# Define the Perceptron classifier
perceptron_classifier = Perceptron(random_state=42)

# Define the grid of parameters to search
param_grid = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0]}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=perceptron_classifier, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train.values.ravel())

# Get the best parameters and the corresponding accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_

# Fit a model with the best params
perceptron_classifier = Perceptron(**best_params)
perceptron_classifier.fit(X_train, y_train.values.ravel())

# Make predictions on test data
perceptron_pred = perceptron_classifier.predict(X_test)
perceptron_accuracy = accuracy_score(y_test.values.ravel(), perceptron_pred)
perceptron_cr = classification_report(y_test.values.ravel(), perceptron_pred)
perceptron_cm = confusion_matrix(y_test.values.ravel(), perceptron_pred)

##################################################

##### GRADIENT BOOSTING

from sklearn.ensemble import GradientBoostingClassifier
print("Starting Gradient Boosting...")

gbm = GradientBoostingClassifier()
param_grid_gbm = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7]
}

grid_search_gbm = GridSearchCV(gbm, param_grid=param_grid_gbm, cv=5)
grid_search_gbm.fit(X_train, y_train.values.ravel())

best_gbm = grid_search_gbm.best_estimator_
best_gbm.fit(X_train, y_train.values.ravel())
gbm_pred = best_gbm.predict(X_test)
gbm_accuracy = accuracy_score(y_test.values.ravel(), gbm_pred)
gbm_cr = classification_report(y_test.values.ravel(), gbm_pred)
gbm_cm = confusion_matrix(y_test.values.ravel(), gbm_pred)

##################################################

##### ADABOOST

from sklearn.ensemble import AdaBoostClassifier
print("Starting AdaBoost...")

adaboost = AdaBoostClassifier()
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5]
}

grid_search_adaboost = GridSearchCV(adaboost, param_grid=param_grid_adaboost, cv=5)
grid_search_adaboost.fit(X_train, y_train.values.ravel())

best_adaboost = grid_search_adaboost.best_estimator_
best_adaboost.fit(X_train, y_train.values.ravel())
adaboost_pred = best_adaboost.predict(X_test)
adaboost_accuracy = accuracy_score(y_test.values.ravel(), adaboost_pred)
adaboost_cr = classification_report(y_test.values.ravel(), adaboost_pred)
adaboost_cm = confusion_matrix(y_test.values.ravel(), adaboost_pred)

##################################################

##### BAGGING

from sklearn.ensemble import BaggingClassifier
print("Starting Bagging...")

bagging = BaggingClassifier()
param_grid_bagging = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 1.0],
    'max_features': [0.5, 1.0]
}

grid_search_bagging = GridSearchCV(bagging, param_grid=param_grid_bagging, cv=5)
grid_search_bagging.fit(X_train, y_train.values.ravel())

best_bagging = grid_search_bagging.best_estimator_
best_bagging.fit(X_train, y_train.values.ravel())
bagging_pred = best_bagging.predict(X_test)
bagging_accuracy = accuracy_score(y_test.values.ravel(), bagging_pred)
bagging_cr = classification_report(y_test.values.ravel(), bagging_pred)
bagging_cm = confusion_matrix(y_test.values.ravel(), bagging_pred)

##################################################

##### EXTRA TREES

from sklearn.ensemble import ExtraTreesClassifier
print("Starting Extra Trees...")

extra_trees = ExtraTreesClassifier()
param_grid_extra_trees = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

grid_search_extra_trees = GridSearchCV(extra_trees, param_grid=param_grid_extra_trees, cv=5)
grid_search_extra_trees.fit(X_train, y_train.values.ravel())

best_extra_trees = grid_search_extra_trees.best_estimator_
best_extra_trees.fit(X_train, y_train.values.ravel())
extra_trees_pred = best_extra_trees.predict(X_test)
extra_trees_accuracy = accuracy_score(y_test.values.ravel(), extra_trees_pred)
extra_trees_cr = classification_report(y_test.values.ravel(), extra_trees_pred)
extra_trees_cm = confusion_matrix(y_test.values.ravel(), extra_trees_pred)

##################################################

##### MULTI-LAYER PERCEPTRON

from sklearn.neural_network import MLPClassifier
print("Starting Multi-Layer Perceptron...")

mlp_classifier = MLPClassifier()
param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01]
}

grid_search_mlp = GridSearchCV(mlp_classifier, param_grid=param_grid_mlp, cv=5)
grid_search_mlp.fit(X_train, y_train.values.ravel())

best_mlp = grid_search_mlp.best_estimator_
best_mlp.fit(X_train, y_train.values.ravel())
mlp_pred = best_mlp.predict(X_test)
mlp_accuracy = accuracy_score(y_test.values.ravel(), mlp_pred)
mlp_cr = classification_report(y_test.values.ravel(), mlp_pred)
mlp_cm = confusion_matrix(y_test.values.ravel(), mlp_pred)

##################################################

##### GAUSSIAN PROCESS

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
print("Starting Gaussian Process...")

# Define the Gaussian Process Classifier
kernel = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier(kernel=kernel)

# Define the grid of parameters to search
param_grid_gpc = {
    "max_iter_predict": [100, 200, 300],
    # Add more parameters to be tuned as needed for the GPC model
}

# Perform Grid Search with cross-validation
grid_search_gpc = GridSearchCV(estimator=gpc, param_grid=param_grid_gpc, cv=3, scoring='accuracy')
grid_search_gpc.fit(X_train, y_train.values.ravel())

# Get the best parameters and the corresponding accuracy
best_params_gpc = grid_search_gpc.best_params_
best_accuracy_gpc = grid_search_gpc.best_score_

# Fit a model with the best params
gpc = GaussianProcessClassifier(**best_params_gpc)
gpc.fit(X_train, y_train.values.ravel())

# Make predictions on test data
gpc_pred = gpc.predict(X_test)
gpc_accuracy = accuracy_score(y_test.values.ravel(), gpc_pred)
gpc_cr = classification_report(y_test.values.ravel(), gpc_pred)
gpc_cm = confusion_matrix(y_test.values.ravel(), gpc_pred)

##################################################

##### QUADRATIC DISCRIMINANT

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
print("Starting Quadratic Discriminant...")

# Define the Quadratic Discriminant Analysis model
qda = QuadraticDiscriminantAnalysis()

# Define the grid of parameters to search
param_grid_qda = {
    "tol": [0.0001, 0.001, 0.01],
    # Add more parameters to be tuned as needed for the QDA model
}

# Perform Grid Search with cross-validation
grid_search_qda = GridSearchCV(estimator=qda, param_grid=param_grid_qda, cv=3, scoring='accuracy')
grid_search_qda.fit(X_train, y_train.values.ravel())

# Get the best parameters and the corresponding accuracy
best_params_qda = grid_search_qda.best_params_
best_accuracy_qda = grid_search_qda.best_score_

# Fit a model with the best params
qda = QuadraticDiscriminantAnalysis(**best_params_qda)
qda.fit(X_train, y_train.values.ravel())

# Make predictions on test data
qda_pred = qda.predict(X_test)
qda_accuracy = accuracy_score(y_test.values.ravel(), qda_pred)
qda_cr = classification_report(y_test.values.ravel(), qda_pred)
qda_cm = confusion_matrix(y_test.values.ravel(), qda_pred)

##################################################

##### CATBOOST
print("Starting CatBoost...")

from catboost import CatBoostClassifier

# Define the CatBoost Classifier
catboost = CatBoostClassifier(verbose=False)

# Define the grid of parameters to search
param_grid_catboost = {
    "depth": [4, 6, 8],
    # Add more parameters to be tuned as needed for the CatBoost model
}

# Perform Grid Search with cross-validation
grid_search_catboost = GridSearchCV(estimator=catboost, param_grid=param_grid_catboost, cv=3, scoring='accuracy')
grid_search_catboost.fit(X_train, y_train.values.ravel())

# Get the best parameters and the corresponding accuracy
best_params_catboost = grid_search_catboost.best_params_
best_accuracy_catboost = grid_search_catboost.best_score_

# Fit a model with the best params
catboost = CatBoostClassifier(**best_params_catboost, verbose=False)
catboost.fit(X_train, y_train.values.ravel())

# Make predictions on test data
catboost_pred = catboost.predict(X_test)
catboost_accuracy = accuracy_score(y_test.values.ravel(), catboost_pred)
catboost_cr = classification_report(y_test.values.ravel(), catboost_pred)
catboost_cm = confusion_matrix(y_test.values.ravel(), catboost_pred)

Starting K-Nearest Neighbors...
Starting Decision Tree...
Starting Random Forest...
Starting Support Vector Machines...
SVC(C=100, gamma=0.01)
Starting Naive Bayes...
Accuracy: 92.0%

Classification Report
               precision    recall  f1-score   support

      Cammeo       0.89      0.92      0.90       308
    Osmancik       0.94      0.92      0.93       454

    accuracy                           0.92       762
   macro avg       0.92      0.92      0.92       762
weighted avg       0.92      0.92      0.92       762

Confusion Matrix
 [[283  25]
 [ 36 418]]
Starting Logistic Regression...
Starting Stochastic Gradient Descent...
Starting XGBoost...
Starting Perceptron...
Starting Gradient Boosting...
Starting AdaBoost...
Starting Bagging...
Starting Extra Trees...
Starting Multi-Layer Perceptron...
Starting Gaussian Process...
Starting Quadratic Discriminant...
Best Parameters for Quadratic Discriminant Analysis: {'tol': 0.0001}
Starting CatBoost...




## Model Summary

In [4]:
models = [
    "K-Nearest Neighbors", "Decision Tree", "Random Forest", "Support Vector Machines",
    "Naive Bayes", "Logistic Regression", "Stochastic Gradient", "XGBoost",
    "Perceptron", "Gradient Boosting", "Adaboost", "Bagging", "Extra Trees",
    "Multi-layer Perceptron", "Gaussian Process", "Quadratic Discriminant", "CatBoost"
]

accuracies = [
    KNN_accuracy, DT_accuracy, RF_accuracy, SVM_accuracy, NB_accuracy,
    LR_accuracy, SGD_accuracy, XGB_accuracy, perceptron_accuracy,
    gbm_accuracy, adaboost_accuracy, bagging_accuracy, extra_trees_accuracy,
    mlp_accuracy, gpc_accuracy, qda_accuracy, catboost_accuracy
]

classification_reports = [
    KNN_cr, DT_cr, RF_cr, SVM_cr, NB_cr,
    LR_cr, SGD_cr, XGB_cr, perceptron_cr,
    gbm_cr, adaboost_cr, bagging_cr, extra_trees_cr,
    mlp_cr, gpc_cr, qda_cr, catboost_cr
]

confusion_matrices = [
    KNN_cm, DT_cm, RF_cm, SVM_cm, NB_cm,
    LR_cm, SGD_cm, XGB_cm, perceptron_cm,
    gbm_cm, adaboost_cm, bagging_cm, extra_trees_cm,
    mlp_cm, gpc_cm, qda_cm, catboost_cm
]

predictions = [
    KNN_pred, DT_pred, RF_pred, SVM_pred, NB_pred,
    LR_pred, SGD_pred, XGB_pred, perceptron_pred,
    gbm_pred, adaboost_pred, bagging_pred, extra_trees_pred,
    mlp_pred, gpc_pred, qda_pred, catboost_pred
]

accuracy_df = pd.DataFrame({'Model Name': models, 'Accuracy': accuracies})
accuracy_df['Accuracy'] = round(accuracy_df['Accuracy']*100, 3)
accuracy_df = accuracy_df.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)

# Confusion Matrices
TP_list, TN_list, FP_list, FN_list = [], [], [], []

for cm in confusion_matrices:
    TP = cm[1][1] if len(cm) == 2 else 0
    TN = cm[0][0] if len(cm) == 2 else cm[1][1]
    FP = cm[0][1] if len(cm) == 2 else 0
    FN = cm[1][0] if len(cm) == 2 else 0
    
    TP_list.append(TP)
    TN_list.append(TN)
    FP_list.append(FP)
    FN_list.append(FN)

# Create the DataFrame
cm_data = {
    'Model Name': models,
    'TP': TP_list,
    'TN': TN_list,
    'FP': FP_list,
    'FN': FN_list
}

cm_df = pd.DataFrame(cm_data)

# Classification Reports

# Initialize lists to store metrics
model_names = []
precision_list, recall_list, f1_list = [], [], []

for model, prediction in zip(models, predictions):
    if model == "XGBoost":
        report = classification_report(y_test_LE.values.ravel(), prediction, output_dict=True)
    else:
        report = classification_report(y_test.values.ravel(), prediction, output_dict=True)
    
    precision = round(report['weighted avg']['precision']*100, 3)
    recall = round(report['weighted avg']['recall']*100, 3)
    f1 = round(report['weighted avg']['f1-score']*100, 3)
    
    model_names.append(model)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Create the DataFrame including accuracy and ROC-AUC score
cr_data = {
    'Model Name': model_names,
    'Precision': precision_list,
    'Recall': recall_list,
    'F1-Score': f1_list
}

cr_df = pd.DataFrame(cr_data)

metrics = accuracy_df.merge(cr_df, on='Model Name', how='inner')
metrics = metrics.merge(cm_df, on='Model Name', how='inner')
metrics['Correct'] = metrics['TP'] + metrics['TN']
metrics['Wrong'] = metrics['FP'] + metrics['FN']
metrics

Unnamed: 0,Model Name,Accuracy,Precision,Recall,F1-Score,TP,TN,FP,FN,Correct,Wrong
0,CatBoost,94.357,94.409,94.357,94.369,428,291,17,26,719,43
1,XGBoost,94.357,94.38,94.357,94.364,430,289,19,24,719,43
2,K-Nearest Neighbors,94.226,94.307,94.226,94.242,426,292,16,28,718,44
3,Random Forest,94.226,94.27,94.226,94.237,428,290,18,26,718,44
4,Logistic Regression,94.094,94.187,94.094,94.113,425,292,16,29,717,45
5,Support Vector Machines,93.963,94.026,93.963,93.978,426,290,18,28,716,46
6,Adaboost,93.963,93.963,93.963,93.963,431,285,23,23,716,46
7,Decision Tree,93.832,93.824,93.832,93.824,433,282,26,21,715,47
8,Multi-layer Perceptron,93.832,93.926,93.832,93.851,424,291,17,30,715,47
9,Gradient Boosting,93.701,93.764,93.701,93.716,425,289,19,29,714,48
