# Model Performance Table

Create table of model performance on train, val, test, etc.

In [1]:
# Packages
import pandas as pd
import os
import numpy as np
import dataframe_image as dfi # NOTE: YOU MUST HAVE GOOGLE CHROME INSTALLED FOR THIS TO WORK CORRECTLY

## List of Models

In [2]:
list_of_models = ['Logistic Regression', 'SVM', 'XGBoost', 'Random Forest'] # Add Random Forest Later

## Load Train and Validation Accuracies and Tuned Hyperparameters

In [3]:
# For each folder in list of models, load '../../Output/Classifier Fitting/' + folder Excel files with 'Training Statistics' in the name
dataframes = []
for model in list_of_models:
    #print('loading data for ' + model)
    # Get file names
    training_statistics_files = [f for f in os.listdir('../../Output/Classifier Fitting/' + model) if 'Training Statistics' in f]
    # Parse run type by splitting file name on spaces and taking first part
    run_types = [f.split(' ')[0] for f in training_statistics_files]
    # Load data
    for i in range(len(training_statistics_files)):
        loaded_df = pd.read_excel('../../Output/Classifier Fitting/' + model + '/' + training_statistics_files[i])
        # Add model to dataframe
        loaded_df['Model'] = model
        # Add run type to dataframe
        loaded_df['Run Type'] = run_types[i]
        # Joblib of tuned hyperparameters
        tuned_hyperparameters = pd.read_pickle('../../Output/Classifier Fitting/' + model + '/' + run_types[i] + ' Tuned Hyperparameters.joblib')
        #print(tuned_hyperparameters)
        #print(type(tuned_hyperparameters))
        # If 'C' is a key, print its type
        # if 'C' in tuned_hyperparameters:
        #     print(type(tuned_hyperparameters['C']))
        # Round all items to 2 decimal places in dictionary
        tuned_hyperparameters = {k: round(v, 2) if type(v) == np.float64 else v for k, v in tuned_hyperparameters.items()}
        print(tuned_hyperparameters)
        # Convert to string and remove {} and ' characters
        tuned_hyperparameters = str(tuned_hyperparameters).replace('{', '').replace('}', '').replace("'", '')
        # Add tuned hyperparameters to dataframe
        loaded_df['Tuned Hyperparameters'] = tuned_hyperparameters
        # Append to list of dataframes
        dataframes.append(loaded_df)

# Stack dataframes
train_info_df = pd.concat(dataframes)
# Keep Model, Run Type, mean_cross_validated_accuracy, train_accuracy_best_model
train_info_df = train_info_df[['Model', 'Run Type', 'Tuned Hyperparameters', 'train_accuracy_best_model', 'mean_cross_validated_accuracy']]
# Rename columns
train_info_df = train_info_df.rename(columns={'train_accuracy_best_model': 'Train Accuracy', 'mean_cross_validated_accuracy': 'Mean Cross-Validated Accuracy'})
train_info_df


{'C': 0.1, 'class_weight': 'balanced', 'l1_ratio': 1.0, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
{'C': 0.1, 'class_weight': None, 'l1_ratio': 0.5, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
{'C': 0.01, 'class_weight': 'balanced', 'l1_ratio': 0.25, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
{'C': 0.1, 'class_weight': None, 'l1_ratio': 1.0, 'multi_class': 'multinomial', 'penalty': 'elasticnet', 'solver': 'saga'}
{'C': 1.0, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}
{'C': 0.1, 'degree': 2, 'gamma': 0.1, 'kernel': 'poly'}
{'C': 0.1, 'degree': 2, 'gamma': 0.1, 'kernel': 'poly'}
{'C': 1, 'degree': 2, 'gamma': 0.001, 'kernel': 'rbf'}
{'colsample_bytree': 1, 'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
{'colsample_bytree': 1, 'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
{'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 200, 'subsampl

Unnamed: 0,Model,Run Type,Tuned Hyperparameters,Train Accuracy,Mean Cross-Validated Accuracy
0,Logistic Regression,All_Data,"C: 0.1, class_weight: balanced, l1_ratio: 1.0,...",0.984175,0.915708
0,Logistic Regression,All_Features_PCA,"C: 0.1, class_weight: None, l1_ratio: 0.5, mul...",0.992171,0.904881
0,Logistic Regression,Individual_Features_PCA,"C: 0.01, class_weight: balanced, l1_ratio: 0.2...",0.958021,0.915876
0,Logistic Regression,VGG,"C: 0.1, class_weight: None, l1_ratio: 1.0, mul...",0.918041,0.863565
0,SVM,All_Data,"C: 1.0, degree: 2, gamma: 0.1, kernel: rbf",0.633333,0.483333
0,SVM,All_Features_PCA,"C: 0.1, degree: 2, gamma: 0.1, kernel: poly",1.0,0.503248
0,SVM,Individual_Features_PCA,"C: 0.1, degree: 2, gamma: 0.1, kernel: poly",1.0,0.579708
0,SVM,VGG,"C: 1, degree: 2, gamma: 0.001, kernel: rbf",0.943195,0.869732
0,XGBoost,All_Data,"colsample_bytree: 1, learning_rate: 0.3, max_d...",1.0,0.90938
0,XGBoost,All_Features_PCA,"colsample_bytree: 1, learning_rate: 0.3, max_d...",1.0,0.855573


## Add Test Accuracy

In [4]:
# Load predictions
# For each folder in list of models, load all files in '../../Data/Predictions/' + folder
dataframes = []
for model in list_of_models:
    #print('loading data for ' + model)
    # Get file names
    prediction_files = [f for f in os.listdir('../../Data/Predictions/' + model)]
    # Load data
    for i in range(len(prediction_files)):
        loaded_df = pd.read_excel('../../Data/Predictions/' + model + '/' + prediction_files[i])
        # Rename column name containing 'Classification' to 'Prediction'
        loaded_df = loaded_df.rename(columns={loaded_df.columns[-1]: 'Prediction'})
        # Add model to dataframe
        loaded_df['Model'] = model
        # Parse run_type by splitting filename on 'Predictions_' and taking second part
        run_type = prediction_files[i].split('Predictions_')[1]
        # Split again to remove '.xlsx'
        run_type = run_type.split('.')[0]
        # Add run type to dataframe
        loaded_df['Run Type'] = run_type
        # Append to list of dataframes
        dataframes.append(loaded_df)

# Stack dataframes
predictions_df = pd.concat(dataframes)
# By Model and Run Type, get the share of rows where Class = Prediction
predictions_df['Correct'] = predictions_df['Class'] == predictions_df['Prediction']
# Get share of correct predictions
test_acc_df = predictions_df.groupby(['Model', 'Run Type'])['Correct'].mean().reset_index()
# Rename to Test Accuracy
test_acc_df = test_acc_df.rename(columns={'Correct': 'Test Accuracy'})
test_acc_df


Unnamed: 0,Model,Run Type,Test Accuracy
0,Logistic Regression,All_Data,0.910678
1,Logistic Regression,All_Features_PCA,0.890531
2,Logistic Regression,Individual_Features_PCA,0.909335
3,Random Forest,All_Data,0.847549
4,Random Forest,All_Features_PCA,0.70047
5,Random Forest,Individual_Features_PCA,0.790463
6,SVM,All_Features_PCA,0.574211
7,SVM,Individual_Features_PCA,0.631296
8,SVM,VGG,0.864338
9,XGBoost,All_Data,0.901276


## Merge Dataframes and Output

In [5]:
model_performance_table = (train_info_df.merge(test_acc_df, on=['Model', 'Run Type'])
                                        # Replace underscores with spaces in Run Type
                                        .assign(Run_type=lambda x: x['Run Type'].str.replace('_', ' '))
                                        .drop(columns=['Run Type'])
                                        .rename(columns={'Run_type': 'Run Type'})
                                        # Reorder to put Run Type second
                                        .reindex(columns=['Model', 'Run Type', 'Tuned Hyperparameters', 'Train Accuracy', 'Mean Cross-Validated Accuracy', 'Test Accuracy'])
                                        .rename(columns={'Run Type': 'Features'})
                                        # Recode 'All Data' to 'All Features' in Features column
                                        .replace({'Features': {'All Data': 'All Features'}})
                                        .style
                                        .format(thousands=",", decimal=".")
                                        .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
                                        .set_properties(**{'text-align': 'center'})
                                        .hide(axis="index"))

# Save to PNG
dfi.export(model_performance_table, '../../Output/Classifier Evaluation/Model Performance Table.png')

# Save to Excel
model_performance_table.to_excel('../../Output/Classifier Evaluation/Model Performance Table.xlsx', index=False)

model_performance_table

Model,Features,Tuned Hyperparameters,Train Accuracy,Mean Cross-Validated Accuracy,Test Accuracy
Logistic Regression,All Features,"C: 0.1, class_weight: balanced, l1_ratio: 1.0, multi_class: ovr, penalty: elasticnet, solver: saga",0.984175,0.915708,0.910678
Logistic Regression,All Features PCA,"C: 0.1, class_weight: None, l1_ratio: 0.5, multi_class: ovr, penalty: elasticnet, solver: saga",0.992171,0.904881,0.890531
Logistic Regression,Individual Features PCA,"C: 0.01, class_weight: balanced, l1_ratio: 0.25, multi_class: ovr, penalty: elasticnet, solver: saga",0.958021,0.915876,0.909335
SVM,All Features PCA,"C: 0.1, degree: 2, gamma: 0.1, kernel: poly",1.0,0.503248,0.574211
SVM,Individual Features PCA,"C: 0.1, degree: 2, gamma: 0.1, kernel: poly",1.0,0.579708,0.631296
SVM,VGG,"C: 1, degree: 2, gamma: 0.001, kernel: rbf",0.943195,0.869732,0.864338
XGBoost,All Features,"colsample_bytree: 1, learning_rate: 0.3, max_depth: 3, n_estimators: 200, subsample: 0.8",1.0,0.90938,0.901276
XGBoost,All Features PCA,"colsample_bytree: 1, learning_rate: 0.3, max_depth: 3, n_estimators: 200, subsample: 0.8",1.0,0.855573,0.862995
XGBoost,Individual Features PCA,"colsample_bytree: 0.8, learning_rate: 0.3, max_depth: 3, n_estimators: 200, subsample: 0.8",1.0,0.888056,0.881128
XGBoost,VGG,"learning_rate: 0.5, max_depth: 3, min_child_weight: 1, n_estimators: 300",1.0,0.863733,0.854265
