In [None]:
import pickle
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os


In [None]:
# Directory holding the pickled result files read by the loaders in this notebook.
# Set the ML_VARS_DIR environment variable to point somewhere else; when it is
# unset, the original machine-specific location is used unchanged.
ml_vars_dir = os.environ.get(
    "ML_VARS_DIR", "/Users/jorismachon/Documents/thesis/ML_data/vars"
)

In [None]:
def load_variables(file_name):
    """Unpickle and return the object stored in ``file_name``.

    Args:
        file_name: File name (or relative path) joined onto the module-level
            ``ml_vars_dir`` directory.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.
    """
    # NOTE: pickle.load can execute arbitrary code; only open files you produced.
    with open(os.path.join(ml_vars_dir, file_name), 'rb') as source:
        return pickle.load(source)

In [None]:
import os
import pickle

def load_all_variables(directory, prefix):
    """Unpickle every ``<prefix>*.pkl`` file in ``directory`` into one dict.

    Files are visited in sorted name order, so the returned dict iterates in
    that order. Each key is the file name with ``prefix`` stripped from the
    front and the extension removed via ``os.path.splitext`` (a remainder that
    is only ``.pkl`` is kept whole, because splitext ignores a leading dot).

    Args:
        directory: Directory to scan (not recursive).
        prefix: Required start of the file name.

    Returns:
        dict mapping the shortened name to the unpickled object.
    """
    # NOTE: pickle.load can execute arbitrary code; only open files you produced.
    wanted = [
        entry
        for entry in sorted(os.listdir(directory))
        if entry.startswith(prefix) and entry.endswith('.pkl')
    ]

    result = {}
    for entry in wanted:
        short_name, _ = os.path.splitext(entry[len(prefix):])
        with open(os.path.join(directory, entry), 'rb') as source:
            result[short_name] = pickle.load(source)
    return result

In [None]:
def load_show_metrics(file_name):
    """Load one pickled results file and print its summary metrics.

    For each of ``tnList``, ``fnList``, ``fpList``, ``tpList``,
    ``precisionList``, ``f1List`` and ``mccList`` the mean and standard
    deviation are printed as stored; for ``aucs`` both are multiplied by 100
    before printing.

    Only the keys that are printed are read; any other entry in the file is
    ignored and does not need to be present.

    Args:
        file_name: Name of the pickle file, resolved by ``load_variables``.

    Raises:
        KeyError: If one of the eight keys listed above is missing.
    """
    print("Showing metrics for file", file_name)
    loaded_variables = load_variables(file_name)

    # Compute every (mean, std) pair up front so a missing key fails before
    # any metric line has been printed.
    keys = ('tnList', 'fnList', 'fpList', 'tpList',
            'precisionList', 'f1List', 'mccList', 'aucs')
    stats = {
        key: (np.mean(loaded_variables[key]), np.std(loaded_variables[key]))
        for key in keys
    }

    # Tuples are concatenated to fill the format placeholders in order.
    print("TN: %.02f %% ± %.02f %% - FN: %.02f %% ± %.02f %%"
          % (stats['tnList'] + stats['fnList']))
    print("FP: %.02f %% ± %.02f %% - TP: %.02f %% ± %.02f %%"
          % (stats['fpList'] + stats['tpList']))
    print("Precision: %.02f %% ± %.02f %% - F1: %.02f %% ± %.02f %% - MCC: %.02f %% ± %.02f %%"
          % (stats['precisionList'] + stats['f1List'] + stats['mccList']))

    # AUC is the only metric scaled here; the others are printed as stored.
    auc_mean, auc_std = stats['aucs']
    print("AUC: %.02f %% ± %.02f %%" % (100 * auc_mean, 100 * auc_std))
  

In [None]:
import matplotlib.pyplot as plt

def load_plot_metrics(file_name):
    """Load one pickled results file and show box plots of its metric lists.

    Two figures are shown: a 2x2 grid of horizontal box plots for ``tnList``,
    ``fpList``, ``fnList`` and ``tpList``, followed by a single vertical box
    plot of ``f1List``.

    Args:
        file_name: Name of the pickle file, resolved by ``load_variables``.
    """
    print("Showing metrics for file", file_name)
    metrics = load_variables(file_name)

    # Pull every series first so a missing key fails before any figure exists.
    f1_values = metrics['f1List']
    panels = (
        ('TN List', metrics['tnList']),
        ('FP List', metrics['fpList']),
        ('FN List', metrics['fnList']),
        ('TP List', metrics['tpList']),
    )

    # Row-major walk over the grid: TN, FP on the top row; FN, TP below.
    _, grid = plt.subplots(2, 2, figsize=(10, 10))
    for axis, (heading, series) in zip(grid.flat, panels):
        axis.boxplot(series, vert=False)
        axis.set_title(heading)
    plt.tight_layout()
    plt.show()

    # Separate figure for the F1 values.
    plt.figure()
    plt.boxplot(f1_values)
    plt.title('F1 Distribution')
    plt.show()

In [None]:
def plot_boxplots(data, feature_name, y_label, title):
    """Draw one box plot per entry of ``data`` on a shared axis and show it.

    Args:
        data: Mapping of label -> mapping; ``data[label][feature_name]`` is the
            series plotted for that label. Labels become the x tick labels,
            in the mapping's iteration order.
        feature_name: Key selecting the series inside each entry.
        y_label: Y-axis label; " (%)" is appended to it.
        title: Figure title.
    """
    _, ax = plt.subplots()

    names = list(data.keys())
    for position, name in enumerate(names):
        # No per-box label is passed: the tick labels are set once below, and
        # the `labels` keyword is deprecated in recent Matplotlib releases.
        ax.boxplot(data[name][feature_name], positions=[position], widths=0.6,
                   vert=True, patch_artist=True)

    ax.set_xticks(range(len(names)))
    ax.set_xticklabels(names, rotation=90)
    ax.set_ylabel(y_label + " (%)")
    ax.set_title(title)

    plt.tight_layout()
    plt.show()

In [None]:
def calc_recall(data):
    """Print the recall list of every entry in ``data``.

    For each entry the ``'TP'`` and ``'FN'`` sequences are paired element-wise
    and ``tp / (tp + fn)`` is computed as a fraction (not a percentage). A pair
    whose sum is zero yields ``0.0`` instead of raising ``ZeroDivisionError``.

    Args:
        data: Mapping of label -> mapping with ``'TP'`` and ``'FN'`` sequences.

    Returns:
        None. One line per entry is printed: index, recall list, label.
    """
    for i, (label, values) in enumerate(data.items()):
        recallList = [
            tp / (tp + fn) if (tp + fn) != 0 else 0.0
            for tp, fn in zip(values['TP'], values['FN'])
        ]
        print(i, " ", recallList, " ", label)

In [None]:
def draw_recall_boxplots(data):
    """Draw one recall box plot per entry of ``data`` and show the figure.

    Recall is computed element-wise from each entry's ``'TP'`` and ``'FN'``
    sequences as ``100 * tp / (tp + fn)``, so the values match the
    "Recall (%)" axis label. A pair whose sum is zero yields ``0.0`` instead
    of raising ``ZeroDivisionError``.

    Args:
        data: Mapping of label -> mapping with ``'TP'`` and ``'FN'`` sequences.
            Labels become the x tick labels, in the mapping's iteration order.
    """
    _, ax = plt.subplots()

    names = list(data.keys())
    for position, name in enumerate(names):
        entry = data[name]
        recall_percent = [
            100 * tp / (tp + fn) if (tp + fn) != 0 else 0.0
            for tp, fn in zip(entry['TP'], entry['FN'])
        ]
        # No per-box label is passed: the tick labels are set once below, and
        # the `labels` keyword is deprecated in recent Matplotlib releases.
        ax.boxplot(recall_percent, positions=[position], widths=0.6,
                   vert=True, patch_artist=True)

    ax.set_xticks(range(len(names)))
    ax.set_xticklabels(names, rotation=90)
    ax.set_ylabel('Recall (%)')
    ax.set_title('AdaBoost - LOGO\nRecall for datasets with nulls')

    plt.tight_layout()
    plt.show()

In [None]:
# Recall box plots for every pickle whose name starts with the given prefix.
draw_recall_boxplots(
    load_all_variables(ml_vars_dir, "abc_LOGO_with_nulls_dataset")
)

In [None]:
# 'f1List' box plots for the 'abc_LOGO_with_nulls_dataset' result files.
plot_boxplots(
    load_all_variables(ml_vars_dir, 'abc_LOGO_with_nulls_dataset'),
    'f1List',
    'F1 Score',
    'AdaBoost - LOGO\nF1 for datasets with nulls',
)

In [None]:
# 'f1List' box plots for the 'gbc_LOGO_with_nulls_dataset' result files.
plot_boxplots(
    load_all_variables(ml_vars_dir, 'gbc_LOGO_with_nulls_dataset'),
    'f1List',
    'F1 Score',
    'Gradient Boost - LOGO\nF1 for datasets with nulls',
)

In [None]:
# 'precisionList' box plots for the 'abc_LOGO_with_nulls_dataset' result files.
plot_boxplots(
    load_all_variables(ml_vars_dir, 'abc_LOGO_with_nulls_dataset'),
    'precisionList',
    'Precision',
    'Adaboost - LOGO\nPrecision for datasets with nulls',
)


In [None]:
# 'f1List' box plots for the 'abc_k5_r1_with_nulls_dataset' result files.
plot_boxplots(
    load_all_variables(ml_vars_dir, 'abc_k5_r1_with_nulls_dataset'),
    'f1List',
    'F1',
    'Adaboost - Kfold 5x1\nF1 for datasets with nulls',
)


In [None]:
# 'precisionList' box plots for the 'abc_k5_r1_with_nulls_dataset' result files.
# The y-axis label is 'Precision' so it matches the plotted series and the title.
plot_boxplots(
    load_all_variables(ml_vars_dir, 'abc_k5_r1_with_nulls_dataset'),
    'precisionList',
    'Precision',
    'Adaboost - Kfold 5x1\nPrecision for datasets with nulls',
)


In [None]:
# 'f1List' box plots for the 'abc_LOGO_without_nulls_significant_dataset' result files.
plot_boxplots(
    load_all_variables(ml_vars_dir, 'abc_LOGO_without_nulls_significant_dataset'),
    'f1List',
    'F1',
    'Adaboost - LOGO\nF1 for datasets without nulls - significant',
)


In [None]:
# 'precisionList' box plots for the 'abc_LOGO_without_nulls_significant_dataset'
# result files. The title names the "significant" variant so it matches the
# dataset prefix that is actually loaded.
plot_boxplots(
    load_all_variables(ml_vars_dir, 'abc_LOGO_without_nulls_significant_dataset'),
    'precisionList',
    'Precision',
    'Adaboost - LOGO\nPrecision for datasets without nulls - significant',
)

In [None]:
# 'f1List' box plots for the 'abc_LOGO_without_nulls_significant_dataset' result files.
# NOTE(review): an earlier cell plots the same prefix and metric; only the title
# punctuation differs. Consider keeping just one of the two.
plot_boxplots(
    load_all_variables(ml_vars_dir, 'abc_LOGO_without_nulls_significant_dataset'),
    'f1List',
    'F1',
    'Adaboost - LOGO\nF1 for datasets without nulls, significant',
)

In [None]:
# 'f1List' box plots for the 'abc_LOGO_with_nulls_significant_dataset' result files.
plot_boxplots(
    load_all_variables(ml_vars_dir, 'abc_LOGO_with_nulls_significant_dataset'),
    'f1List',
    'F1',
    'Adaboost - LOGO\nF1 for datasets with nulls, significant',
)

In [None]:
# 'f1List' box plots for the 'rfc_LOGO_with_nulls_dataset' result files.
plot_boxplots(
    load_all_variables(ml_vars_dir, 'rfc_LOGO_with_nulls_dataset'),
    'f1List',
    'F1',
    'RFC - LOGO\nF1 for datasets with nulls',
)

New procedure

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_boxplots_table(data, feature_name, y_label, title):
    """Show box plots of ``data`` above a table marking each box's bit pattern.

    Entry number ``k`` (1-based, in the mapping's iteration order) is drawn at
    x position ``k``. Below the plot, a four-row table ('Process', 'Product',
    'Textual', 'Author') gets an 'X' in column ``k`` for every set bit of ``k``
    written as a 4-bit binary number, most significant bit in the first row.
    NOTE(review): this presumes the iteration order of ``data`` matches that
    binary encoding of the feature groups — confirm against the file naming.

    Args:
        data: Mapping of label -> mapping; ``data[label][feature_name]`` is the
            series plotted for that label.
        feature_name: Key selecting the series inside each entry.
        y_label: Y-axis label; " (%)" is appended to it.
        title: Title of the box-plot axis.

    Raises:
        ValueError: If ``data`` has more entries than the table rows can encode
            (more than 15 for four rows).
    """
    legends = ['Process', 'Product', 'Textual', 'Author']
    n_bits = len(legends)

    # Checked before any figure is created: a 1-based index needing more than
    # n_bits bits would shift marks onto the wrong rows or index past the table.
    if len(data) >= 2 ** n_bits:
        raise ValueError(
            "plot_boxplots_table supports at most %d entries, got %d"
            % (2 ** n_bits - 1, len(data))
        )

    fig, ax = plt.subplots(2, 1, figsize=(10, 10), gridspec_kw={'height_ratios': [4, 1]})

    # One row per legend; column 0 holds the legend text, columns 1.. the marks.
    table_data = [[legend] + [''] * len(data) for legend in legends]

    # Empty box at position 0 so the plot has a slot above the legend column.
    ax[0].boxplot([], positions=[0], widths=0.6, vert=True, patch_artist=True)

    for i, (label, values) in enumerate(data.items()):
        dataList = values[feature_name]
        # Shifted one slot to the right of the empty placeholder box.
        ax[0].boxplot(dataList, positions=[i + 1], widths=0.6, vert=True, patch_artist=True)

        # Mark the set bits of the 1-based index in this entry's table column.
        binary = format(i + 1, '0%db' % n_bits)
        for bit_index, bit in enumerate(binary):
            if bit == '1':
                table_data[bit_index][i + 1] = 'X'

    ax[0].set_xticks(range(len(data) + 1))  # includes the placeholder slot
    ax[0].set_xticklabels([''] + list(data.keys()), rotation=90)
    ax[0].set_ylabel(y_label + " (%)")
    ax[0].set_xlabel('subset index')
    ax[0].set_title(title)

    # The second axis only hosts the table, so its frame and ticks are hidden.
    ax[1].axis('tight')
    ax[1].axis('off')
    ax[1].table(cellText=table_data, cellLoc='center', loc='center')

    plt.tight_layout()
    plt.show()

In [None]:
# 'f1List' box plots with the marker table for 'abc_LOGO_with_nulls_dataset'.
plot_boxplots_table(
    load_all_variables(ml_vars_dir, 'abc_LOGO_with_nulls_dataset'),
    'f1List',
    'F1',
    'AdaBoost - LOGO\nF1 for datasets with nulls',
)


In [None]:
# 'precisionList' box plots with the marker table for 'abc_LOGO_with_nulls_dataset'.
plot_boxplots_table(
    load_all_variables(ml_vars_dir, 'abc_LOGO_with_nulls_dataset'),
    'precisionList',
    'Precision',
    'AdaBoost - LOGO\nPrecision for datasets with nulls',
)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_recall_boxplots_table(data, feature_name, y_label, title):
    """Show recall box plots of ``data`` above a table marking each bit pattern.

    Recall is computed element-wise from each entry's ``'TP'`` and ``'FN'``
    sequences as ``100 * tp / (tp + fn)``, or 0 when the sum is zero. Entry
    number ``k`` (1-based, in the mapping's iteration order) is drawn at x
    position ``k``. Below the plot, a four-row table ('Process', 'Product',
    'Textual', 'Author') gets an 'X' in column ``k`` for every set bit of ``k``
    written as a 4-bit binary number, most significant bit in the first row.
    NOTE(review): this presumes the iteration order of ``data`` matches that
    binary encoding of the feature groups — confirm against the file naming.

    Args:
        data: Mapping of label -> mapping with ``'TP'`` and ``'FN'`` sequences.
        feature_name: Unused; accepted so the signature stays positional-
            compatible with existing calls.
        y_label: Y-axis label; " (%)" is appended to it.
        title: Title of the box-plot axis.

    Raises:
        ValueError: If ``data`` has more entries than the table rows can encode
            (more than 15 for four rows).
    """
    legends = ['Process', 'Product', 'Textual', 'Author']
    n_bits = len(legends)

    # Checked before any figure is created: a 1-based index needing more than
    # n_bits bits would shift marks onto the wrong rows or index past the table.
    if len(data) >= 2 ** n_bits:
        raise ValueError(
            "plot_recall_boxplots_table supports at most %d entries, got %d"
            % (2 ** n_bits - 1, len(data))
        )

    fig, ax = plt.subplots(2, 1, figsize=(10, 10), gridspec_kw={'height_ratios': [4, 1]})

    # One row per legend; column 0 holds the legend text, columns 1.. the marks.
    table_data = [[legend] + [''] * len(data) for legend in legends]

    # Empty box at position 0 so the plot has a slot above the legend column.
    ax[0].boxplot([], positions=[0], widths=0.6, vert=True, patch_artist=True)

    for i, (label, values) in enumerate(data.items()):
        TP = values['TP']
        FN = values['FN']
        # Recall in percent; a zero denominator is reported as 0.
        recall = [100*tp / (tp + fn) if tp + fn != 0 else 0 for tp, fn in zip(TP, FN)]
        # Shifted one slot to the right of the empty placeholder box.
        ax[0].boxplot(recall, positions=[i + 1], widths=0.6, vert=True, patch_artist=True)

        # Mark the set bits of the 1-based index in this entry's table column.
        binary = format(i + 1, '0%db' % n_bits)
        for bit_index, bit in enumerate(binary):
            if bit == '1':
                table_data[bit_index][i + 1] = 'X'

    ax[0].set_xticks(range(len(data) + 1))  # includes the placeholder slot
    ax[0].set_xticklabels([''] + list(data.keys()), rotation=90)
    ax[0].set_ylabel(y_label + " (%)")
    ax[0].set_xlabel('subset index')
    ax[0].set_title(title)

    # The second axis only hosts the table, so its frame and ticks are hidden.
    ax[1].axis('tight')
    ax[1].axis('off')
    ax[1].table(cellText=table_data, cellLoc='center', loc='center')

    plt.tight_layout()
    plt.show()

In [None]:
# Recall box plots with the marker table for 'abc_LOGO_with_nulls_dataset'.
# The second argument (feature_name) is not read by the function, hence ''.
plot_recall_boxplots_table(
    load_all_variables(ml_vars_dir, 'abc_LOGO_with_nulls_dataset'),
    '',
    'Recall',
    'AdaBoost - LOGO\nRecall for datasets with nulls',
)

In [None]:
# 'f1List' box plots with the marker table for 'rfc_LOGO_with_nulls_dataset'.
plot_boxplots_table(
    load_all_variables(ml_vars_dir, 'rfc_LOGO_with_nulls_dataset'),
    'f1List',
    'F1',
    'Random Forest - LOGO\nF1 for datasets with nulls',
)


In [None]:
# 'f1List' box plots with the marker table for 'gbc_LOGO_with_nulls_dataset'.
plot_boxplots_table(
    load_all_variables(ml_vars_dir, 'gbc_LOGO_with_nulls_dataset'),
    'f1List',
    'F1',
    'Gradient Boost - LOGO\nF1 for datasets with nulls',
)
