In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# General definitions
PRECISION = "precision"
RECALL = "recall"
F1SCORE = "f1-score"
SUPPORT = "support"
METRICS = [PRECISION, RECALL, F1SCORE, SUPPORT]

model_results = r".\model_results"
csv_model_result_file_name = "model_results.csv"

### Loading the model results and preparing them for visualization

In [3]:
merged_dataframes = {PRECISION: pd.DataFrame(), RECALL: pd.DataFrame(), F1SCORE: pd.DataFrame(),
                     SUPPORT: pd.DataFrame()
                     }
# Loop through all the models, folder structure and file name have to be as expected
for model, _, _ in os.walk(model_results):
    if model != model_results:  # Just ignore the root folder
        current_model_string = model.split('\\')[-1]
        # read dataframe and append model suffix to the columns, makes them unique for the later merge
        res = pd.read_csv(os.path.join(model, csv_model_result_file_name), sep=";")
        res.columns = [str(col) + current_model_string[-2:] if idx != 0 else col for idx, col in
                       enumerate(res.columns)]
        # Loop through the different metrics and create a merge dataframe for each one
        for metric in METRICS:
            # Get corresponding container
            df = merged_dataframes[metric]
            # Cut only precision part
            res_reduced = res[["Unnamed: 0", metric + current_model_string[-2:]]]
            if df.size == 0:
                df = res_reduced
            else:
                df = df.merge(res_reduced, how='right')
            previous_model_string = current_model_string
            merged_dataframes[metric] = df

We have now 4 dataframes, one for each metric, and we can easily highlight which model performs best for which metric

### Lets start with defining our own custom highlighting function, for Series, and some cleaning up functions

In [4]:
# Very largely based on the example from the panda's doc
def highlight_max(data, color='yellow'):
    attr = 'background-color: {}'.format(color)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]

In [5]:
def apply_highlighting(df):
    return df.style.apply(highlight_max, color='lightgreen', axis=1)

In [6]:
def reduce_dataframe(df):
    """ Removes the extra entries that do not correspond to categories, as well as reports
    the number of categories for which all models fail to perform and removes these 
    categories also from the reduced dataframe """
    # Set the index column for conveniancy
    df = df.set_index("Unnamed: 0")
    # Drop undesired rows
    df.drop(index=['micro avg', 'macro avg', 'weighted avg', 'samples avg'], inplace=True)
    # Find out how many categories are blank for all models
    only_zeroes = (df.T == 0).any().sum()
    print("%d categories have zeros for all models!" % only_zeroes)
    # Clean up and return dataframe
    return df #df[(df.T != 0).any()]

### Lets checkout which models performs the best, metric per metric

#### Precision

In [7]:
precision_df = reduce_dataframe(merged_dataframes[PRECISION])
apply_highlighting(precision_df)

14 categories have zeros for all models!


Unnamed: 0_level_0,precision_a,precision_b,precision_c
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
related,0.831205,0.825564,0.845039
request,0.884058,0.87963,0.868275
offer,0.0,0.0,0.0
aid_related,0.8375,0.825153,0.768317
medical_help,0.4,0.444444,0.655172
medical_products,0.777778,0.857143,0.828571
search_and_rescue,0.0,0.666667,0.636364
security,1.0,0.0,0.25
military,0.0,0.0,0.454545
child_alone,0.0,0.0,0.0


#### Recall

In [8]:
recall_df = reduce_dataframe(merged_dataframes[RECALL])
apply_highlighting(recall_df)

14 categories have zeros for all models!


Unnamed: 0_level_0,recall_a,recall_b,recall_c
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
related,0.959051,0.956147,0.946081
request,0.42069,0.43628,0.4875
offer,0.0,0.0,0.0
aid_related,0.490393,0.496998,0.697215
medical_help,0.005102,0.009346,0.088785
medical_products,0.02518,0.022059,0.105455
search_and_rescue,0.0,0.013699,0.044304
security,0.010753,0.0,0.01
military,0.0,0.0,0.028736
child_alone,0.0,0.0,0.0


#### F1-score

In [9]:
f1score_df = reduce_dataframe(merged_dataframes[F1SCORE])
apply_highlighting(f1score_df)

14 categories have zeros for all models!


Unnamed: 0_level_0,f1-score_a,f1-score_b,f1-score_c
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
related,0.890563,0.88607,0.89271
request,0.570093,0.583269,0.624416
offer,0.0,0.0,0.0
aid_related,0.61858,0.620352,0.731041
medical_help,0.010076,0.018307,0.156379
medical_products,0.04878,0.043011,0.187097
search_and_rescue,0.0,0.026846,0.08284
security,0.021277,0.0,0.019231
military,0.0,0.0,0.054054
child_alone,0.0,0.0,0.0
