In [23]:
import os
import pandas as pd
import matplotlib as mpl
import matplotlib.cm as cm
from IPython.display import display, HTML

### Read Data
The counts for TP, TN, FP, and FN are pulled from the confusion matrix of each model run. The calculations for Precision, Recall, F1, and Accuracy are done with Excel formulas.

In [24]:
# The metrics CSV lives in a sub-folder of the current working directory.
folder_name, file_name = "alldata", "eval metrics all models.csv"
path_parts = (os.getcwd(), folder_name, file_name)
file_path = os.path.join(*path_parts)

# Load the evaluation metrics into a DataFrame.
df = pd.read_csv(file_path)

In [25]:
# Display the first row to check which columns were loaded.
df.head(1)

Unnamed: 0,Model,Dataset,TN,FP,TP,FN,Precision,Recall,F1,Accuracy
0,NN,Pre-trained Bert,374,513,892,221,0.634875,0.801438,0.708499,0.633


### Create pivot table + heatmap

In [26]:
# --- F1 score: Model x Dataset grid, its value range, and a light-green ramp ---
pivot_f1 = df.pivot(index='Model', columns='Dataset', values='F1')
f1_min = pivot_f1.min().min()
f1_max = pivot_f1.max().max()
# Normalizer maps [f1_min, f1_max] onto the colormap's input range.
norm_f1 = mpl.colors.Normalize(vmin=f1_min, vmax=f1_max)
cmap_f1 = mpl.colors.LinearSegmentedColormap.from_list("light_greens", ["#e0f2e9", "#a1d99b"])

# --- Accuracy: same layout, its own value range, and a light-blue ramp ---
pivot_acc = df.pivot(index='Model', columns='Dataset', values='Accuracy')
acc_min = pivot_acc.min().min()
acc_max = pivot_acc.max().max()
# Each metric is normalized over its own range, independently of the other.
norm_acc = mpl.colors.Normalize(vmin=acc_min, vmax=acc_max)
cmap_acc = mpl.colors.LinearSegmentedColormap.from_list("light_blues", ["#e0f7fa", "#81d4fa"])


def get_color(val, norm, cmap, missing_color="#ffffff"):
    """Return a hex color for a value based on the given normalization and colormap.

    Parameters
    ----------
    val : scalar
        The value to colour.
    norm : callable
        Called with ``val``; its result is passed to ``cmap``
        (e.g. an ``mpl.colors.Normalize`` instance).
    cmap : callable
        Called with the normalized value; the first three components of its
        result are used as the RGB colour.
    missing_color : str, optional
        Colour returned when ``val`` is missing (NaN, None, pd.NA).
        Defaults to white.

    Returns
    -------
    str
        A hex colour string, e.g. ``"#a1d99b"``.
    """
    # A missing value would otherwise reach the colormap, which maps it to its
    # "bad" colour; the RGB part of that is black by default in matplotlib,
    # which renders as an unreadable black cell. Return a neutral colour instead.
    if pd.isna(val):
        return missing_color
    rgb = cmap(norm(val))[:3]
    return mpl.colors.rgb2hex(rgb)

def format_cell(f1, acc):
    """Build the HTML for one table cell.

    The cell is a vertical flex container holding two labelled spans, one for
    F1 and one for Accuracy. Each span shows its value to two decimals on a
    background colour taken from that metric's normalizer and colormap.
    """
    # Resolve both background colours first, then render the spans in order.
    entries = (
        ("F1", f1, get_color(f1, norm_f1, cmap_f1)),
        ("Acc", acc, get_color(acc, norm_acc, cmap_acc)),
    )
    spans = "".join(
        f'<span style="background-color: {color}; padding: 2px;">{label}: {value:.2f}</span>'
        for label, value, color in entries
    )
    return (
        '<div style="display: flex; flex-direction: column; text-align: center;">'
        + spans
        + '</div>'
    )

# Assemble one HTML snippet per (model, dataset) pair, keyed first by model
# and then by dataset, following the row/column order of the F1 pivot.
formatted_data = {
    model_name: {
        dataset: format_cell(
            pivot_f1.loc[model_name, dataset],
            pivot_acc.loc[model_name, dataset],
        )
        for dataset in pivot_f1.columns
    }
    for model_name in pivot_f1.index
}

# Outer keys become rows (models); inner keys become columns (datasets).
pivot_html = (
    pd.DataFrame
    .from_dict(formatted_data, orient='index')
    .rename_axis(index='Model', columns='Dataset')
)

# escape=False keeps the cell markup as real HTML instead of escaped text.
html_table = pivot_html.to_html(escape=False)
display(HTML(html_table))

Dataset,Fine-tuned Bert CE,Fine-tuned Bert CS,OpenAI Full Text,OpenAI Summarized,Pre-trained Bert
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Decision Tree,F1: 0.63Acc: 0.63,F1: 0.58Acc: 0.58,F1: 0.61Acc: 0.52,F1: 0.61Acc: 0.54,F1: 0.66Acc: 0.61
Logistic Regression,F1: 0.67Acc: 0.67,F1: 0.62Acc: 0.63,F1: 0.61Acc: 0.54,F1: 0.59Acc: 0.56,F1: 0.70Acc: 0.65
NN,F1: 0.67Acc: 0.68,F1: 0.62Acc: 0.61,F1: 0.55Acc: 0.49,F1: 0.63Acc: 0.55,F1: 0.71Acc: 0.63
Random Forest,F1: 0.69Acc: 0.68,F1: 0.66Acc: 0.66,F1: 0.73Acc: 0.66,F1: 0.79Acc: 0.74,F1: 0.68Acc: 0.66
XGBoost,F1: 0.65Acc: 0.65,F1: 0.65Acc: 0.65,F1: 0.70Acc: 0.60,F1: 0.67Acc: 0.60,F1: 0.73Acc: 0.68
