In [1]:
# -----------------------------------------------------------
# Dissertation Project: An Empirical Study on the Classification 
# Performance of Deep Learning vs. Gradient Boosting 
# on heterogeneous tabular data
#
# This module aggregates the training and inference time results of all
# models (mean and standard deviation per model folder)
#
# Author: Adam Mabrouk
# Supervisor: Ben Ralph
# Institution: University of Bath
# Created on: 01/01/2024
# Version: 1.0
# -----------------------------------------------------------

# Pandas version: 2.0.3
# Python version: 3.11.5

import os
import pandas as pd
from statistics import mean, pstdev
import glob

# Names of the timing metrics to summarise. Each name is used verbatim as a
# column label when reading the time CSV files and as a key in the results.
metrics = ['Final Training Time (seconds)', 
           'Inference Time (seconds)']

def read_csv_file(file_path):
    """ Load one 'model run times' CSV file.
    Args:
        file_path (str): location of the CSV file.
    Returns:
        pandas DataFrame holding the parsed file contents. """
    time_frame = pd.read_csv(file_path)
    return time_frame

def calculate_metrics(values):
    """ Summarise a list of time values by its mean and its population
    standard deviation, both rounded to 3 decimal places.
    Args:
        values (list of float): the time values to summarise.
    Returns:
        Dictionary with keys 'mean' and 'std' """
    average = mean(values)
    # pstdev is the population (not sample) standard deviation
    spread = pstdev(values)
    return dict(mean=round(average, 3), std=round(spread, 3))

def process_files(file_paths):
    """ Pool the timing metrics of several CSV files and summarise them.

    Every file is loaded with read_csv_file and, for each name in the
    module-level `metrics` list, the column of that name is appended to one
    list of values shared by all files. Each pooled list is then summarised
    with calculate_metrics.
    Args:
        file_paths (list of str): paths of the CSV files to read.
    Returns:
        Dictionary mapping each metric name to the summary dictionary
        returned by calculate_metrics. Metrics for which no values were
        collected are left out, so the result is an empty dictionary when
        file_paths is empty or the files contain no rows.
    Raises:
        KeyError: if a file does not contain one of the metric columns. """
    metric_values = {metric: [] for metric in metrics}

    for file_path in file_paths:
        time_data = read_csv_file(file_path)
        if time_data is None:
            continue
        for metric in metrics:
            metric_values[metric].extend(time_data[metric].tolist())

    # statistics.mean / pstdev raise StatisticsError on empty data, so a
    # folder without any timing rows must not reach calculate_metrics.
    return {metric: calculate_metrics(values)
            for metric, values in metric_values.items() if values}

def process_folder(folder_path):
    """ Print the timing summary of one folder.

    All '*.csv' files directly inside the folder are passed to
    process_files; if that yields any results, one line per metric with its
    mean and standard deviation is printed under a folder header.
    Args:
        folder_path (str): folder containing the time CSV files. """
    csv_pattern = os.path.join(folder_path, '*.csv')
    folder_results = process_files(glob.glob(csv_pattern))
    if not folder_results:
        return

    print(f'------- {folder_path} -------\n')
    for metric in folder_results:
        summary = folder_results[metric]
        print(f"{metric}: Mean = {summary['mean']}, Standard Deviation = {summary['std']}")
    print("\n")

def process_all_folders(base_folder):
    """ Print the timing summary of every sub-folder of the base folder.
    Args:
        base_folder (str): folder whose direct sub-folders hold the time
        CSV files. Entries that are not directories are ignored. """
    candidates = (os.path.join(base_folder, entry)
                  for entry in os.listdir(base_folder))
    for candidate in candidates:
        if os.path.isdir(candidate):
            process_folder(candidate)

# Entry point: summarise every model sub-folder found under 'Time_results'
# (the path is relative to the current working directory).
process_all_folders('Time_results')

------- Time_results/time_tabnet -------

Final Training Time (seconds): Mean = 78.995, Standard Deviation = 3.456
Inference Time (seconds): Mean = 0.157, Standard Deviation = 0.012


------- Time_results/time_node -------

Final Training Time (seconds): Mean = 11.68, Standard Deviation = 1.336
Inference Time (seconds): Mean = 0.086, Standard Deviation = 0.003


------- Time_results/time_xgboost -------

Final Training Time (seconds): Mean = 0.207, Standard Deviation = 0.034
Inference Time (seconds): Mean = 0.005, Standard Deviation = 0.001


