In [None]:
import os
import sys
from pathlib import Path

os.chdir(Path(os.getcwd()).parents[0])
sys.path.append(os.getcwd())

import glob
import numpy as np
import torch
import pandas as pd

from scipy.stats import pearsonr
from scipy.stats import sem

from src.utils.plot_utils import *


def NormalizeData(data, min, max):
    """Min-max scale ``data`` with explicitly supplied bounds.

    Args:
        data: Scalar or array to rescale.
        min: Value that is mapped to 0.
        max: Value that is mapped to (just under) 1.

    Returns:
        ``(data - min) / (max - min + 1e-11)``. The small constant keeps the
        division defined when ``max`` equals ``min``.
    """
    span = (max - min) + 0.00000000001
    shifted = data - min
    return shifted / span

### Import Evaluation Scores

In [None]:
# File loading per dataset
file_image_inet = "/image/eval_scores_imagenet.npz"
file_image_oct = "/image/eval_scores_oct.npz"
file_image_r45 = "/image/eval_scores_resisc45.npz"

file_volume_adr = "/volume/eval_scores_adrenalmnist3d.npz"
file_volume_org = "/volume/eval_scores_organmnist3d.npz"
file_volume_ves = "/volume/eval_scores_vesselmnist3d.npz"

file_pc_coma = "/point_cloud/eval_scores_coma.npz"
file_pc_m40 = "/point_cloud/eval_scores_modelnet40.npz"
file_pc_shpn = "/point_cloud/eval_scores_shapenet.npz"

file_loc = os.getcwd() + "/data/evaluation_scores"


def _load_eval_scores(relative_path):
    """Read the three score arrays stored in one evaluation-score archive.

    Args:
        relative_path: Archive path relative to ``file_loc`` (with leading "/").

    Returns:
        List ``[arr_0, arr_1, arr_2]`` holding the arrays saved under those
        keys. The ranking cells index this list with their ``model`` loop
        variable.
    """
    # NOTE(review): allow_pickle=True unpickles arbitrary objects, so only
    # load archives that come from a trusted source.
    # The context manager closes the archive handle once the arrays are read.
    with np.load(file_loc + relative_path, allow_pickle=True) as archive:
        return [archive["arr_0"], archive["arr_1"], archive["arr_2"]]


arr_image_inet = _load_eval_scores(file_image_inet)
arr_image_oct = _load_eval_scores(file_image_oct)
arr_image_r45 = _load_eval_scores(file_image_r45)

arr_volume_adr = _load_eval_scores(file_volume_adr)
arr_volume_org = _load_eval_scores(file_volume_org)
arr_volume_ves = _load_eval_scores(file_volume_ves)

arr_pc_coma = _load_eval_scores(file_pc_coma)
arr_pc_m40 = _load_eval_scores(file_pc_m40)
arr_pc_shpn = _load_eval_scores(file_pc_shpn)

### Ranking Computation

In [None]:
# Full Ranking
arr_image = [arr_image_inet, arr_image_oct, arr_image_r45]
arr_volume = [arr_volume_adr, arr_volume_org, arr_volume_ves]
arr_pc = [arr_pc_coma, arr_pc_m40, arr_pc_shpn]
arr_modalities = [arr_image, arr_volume, arr_pc]

# Rank of every XAI method per (dataset, model, xai, eval metric).
arr_ranking = np.full([3, 3, 14, 20], np.nan, dtype=float)

# Metric indices whose ranking order is reversed (larger score is better).
bup_order = [0, 1, 2, 4, 5, 7, 9, 12, 17]

for dataset in range(3):
    for model in range(3):
        # Index 0 of arr_modalities is arr_image: only the image datasets are ranked here.
        scores = arr_modalities[0][dataset][model]
        for metric in range(20):
            # Order the first 14 methods by their median score over the last axis.
            order = np.median(scores[:14, metric, :], -1).argsort()
            if metric in bup_order:
                order = order[::-1]  # reverse ranking to bottom up if larger is better

            # argsort of the ordering gives each method's position; +1 so that
            # ranks start at 1. Computed once per metric, not once per method.
            arr_ranking[dataset, model, :, metric] = order.argsort() + 1

### Variance test versus random ranking

In [None]:
from scipy.stats import randint, levene
import scipy

alpha = 0.1  # significance level of the Levene test

# 1 where a method's rank variance is significantly below random, else 0.
arr_sign_test = np.empty(
    [3, 3, 14, 3], dtype=int  # dataset, model, xai methods, eval_criteria
)

# Reference sample of uniformly random ranks. Ranks run from 1 to 14 and
# randint's upper bound is exclusive, hence (1, 15). Seeded so that the
# resulting table is reproducible.
random_sample = randint.rvs(1, 15, size=999999, random_state=0)

# Metric index ranges of the three evaluation criteria.
# NOTE(review): this is a 10/7/3 split, while the figure cells label 11/6/3
# metrics per criterion; confirm which grouping is intended.
criteria_slices = [slice(0, 10), slice(10, 17), slice(17, 20)]

for crit, metric_slice in enumerate(criteria_slices):
    for method in range(14):
        for model in range(3):
            for dataset in range(3):
                metric_sample = arr_ranking[dataset, model, method, metric_slice]

                # 16.25 == (14**2 - 1) / 12, the variance of a uniform draw
                # from the ranks 1..14; a sample at least that spread out is
                # counted as not significant without running the test.
                if metric_sample.var() > 16.25:
                    arr_sign_test[dataset, model, method, crit] = 0
                else:
                    test_pvalue = levene(
                        random_sample, metric_sample, center="median"
                    ).pvalue
                    arr_sign_test[dataset, model, method, crit] = (
                        1 if test_pvalue < alpha else 0
                    )

In [None]:
# Share of (dataset, model) combinations with a significant test result,
# per XAI method and evaluation criterion: shape (xai methods, eval_criteria).
table_sign_test = np.round(arr_sign_test.mean(axis=(0, 1)), 2)

# One row per criterion, one column per XAI method.
table_sign_test = pd.DataFrame(table_sign_test).transpose()

# Extra row: average over the three criteria, weighted 0.5 / 0.35 / 0.15.
weighted_row = np.round(
    np.average(table_sign_test, axis=0, weights=[0.5, 0.35, 0.15]), 2
).reshape(1, -1)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
table_sign_test = pd.concat(
    [table_sign_test, pd.DataFrame(weighted_row)], ignore_index=True
)
# Extra column: mean over all methods for every row.
table_sign_test["mean"] = np.round(table_sign_test.mean(axis=1), 2)

table_sign_test.columns = [
    "OC",
    "LI",
    "KS",
    "VG",
    "IxG",
    "GB",
    "GC",
    "SC",
    "C+",
    "IG",
    "EG",
    "DL",
    "DLS",
    "LRP",
    "Average",
]

table_sign_test.index = ["Faithfulness", "Robustness", "Complexity", "Weighted Average"]

# Resolve the output folder from the working directory, the same way the
# score files are located, so the path keeps its separator when "src" is not
# part of the working directory.
figure_dir = os.path.join(os.getcwd(), "data", "figures")
os.makedirs(figure_dir, exist_ok=True)
table_sign_test.to_csv(os.path.join(figure_dir, "variance_sign_test.csv"))
table_sign_test

### Metric differences Figure

In [None]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# A comprehension instead of list(map(...)): it keeps working even if the
# name `list` has been rebound in the notebook session.
colors = [px.colors.qualitative.G10[i] for i in [0, 2, 3, 4]]
metrics_faith = [
    "FC",
    "FE",
    "MC",
    "PF",
    "RP",
    "INS",
    "DEL",
    "IROF",
    "ROAD",
    "SUF",
    "INF",
]
metrics_robust = [
    "LLE",
    "MS",
    "CON",
    "RIS",
    "ROS",
    "RRS",
]
metrics_complex = ["SP", "CP", "ECP"]

# Row positions of the four XAI methods shown in the figure.
methods = [1, 6, 9, 12]

# Labels of the 14 ranked XAI methods, in row order.
xai_labels = [
    "OC",
    "LIME",
    "KS",
    "VG",
    "IxG",
    "GB",
    "GC",
    "SC",
    "C+",
    "IG",
    "EG",
    "DL",
    "DLS",
    "LRP",
]

fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=["Faithfulness Metrics", "Robustness Metrics", "Complexity Metrics"],
    column_widths=[0.5, 0.35, 0.15],
    shared_yaxes=True,
)


def _method_ranks(metric_index):
    """Return a (metric x selected method) frame of ranks for dataset 0, model 0.

    Args:
        metric_index: List or slice selecting metric columns of ``arr_ranking``.

    Returns:
        DataFrame with one row per selected metric and one column per entry of
        ``methods``, labelled with the method names.
    """
    ranks = pd.DataFrame(arr_ranking[0, 0][:, metric_index], index=xai_labels)
    return ranks.iloc[methods, :].transpose()


def _add_rank_panel(fig, ranks, metric_names, col, decimals, legend):
    """Add one rank line plus an "Average" marker per method to a subplot.

    Args:
        fig: Figure created by ``make_subplots``.
        ranks: Frame returned by ``_method_ranks``.
        metric_names: X-axis labels, one per row of ``ranks``.
        col: 1-based subplot column.
        decimals: Number of decimals shown in the average label.
        legend: If True the line traces carry the method name in the legend.
    """
    for i, method_name in enumerate(ranks.columns):
        method_ranks = ranks.iloc[:, i]
        legend_kwargs = dict(name=method_name) if legend else dict(showlegend=False)
        fig.add_trace(
            go.Scatter(
                x=metric_names,
                y=method_ranks,
                mode="lines+markers",
                marker=dict(color=colors[i], size=8),
                **legend_kwargs,
            ),
            col=col,
            row=1,
        )

        mean_rank = np.mean(method_ranks)
        fig.add_trace(
            go.Scatter(
                x=["Average"],
                y=[mean_rank],  # exactly one value for the single x label
                mode="markers+text",
                text=[str(np.round(mean_rank, decimals))],
                textposition="middle left",
                showlegend=False,
                marker=dict(color=colors[i], size=8, symbol="square"),
            ),
            col=col,
            row=1,
        )


faith = _method_ranks([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16])
# NOTE(review): the two ranks below are overwritten by hand (last faithfulness
# metric, first and fourth plotted method) - confirm where these values come from.
faith.iloc[10, 0] = 12
faith.iloc[10, 3] = 4

robust = _method_ranks(slice(10, 16))
complex = _method_ranks(slice(17, 20))

_add_rank_panel(fig, faith, metrics_faith, col=1, decimals=2, legend=True)
_add_rank_panel(fig, robust, metrics_robust, col=2, decimals=1, legend=False)
_add_rank_panel(fig, complex, metrics_complex, col=3, decimals=0, legend=False)

fig.update_yaxes(
    autorange="reversed",
    range=[1, 14],
    tickvals=[1, 5, 10, 14],
    zeroline=False,
    showticklabels=True,
)

fig.update_yaxes(title="Rank", col=1, row=1)

fig.update_xaxes(tickangle=35)

fig.update_layout(
    height=400,
    width=1500,
    legend_title_text="XAI Method",
    template="plotly_white",
    font=dict(
        family="Helvetica",
        color="#000000",
        size=12,
    ),
    title_font=dict(family="Helvetica", color="#000000", size=12),
)

fig = left_align_facet_plot_titles(fig)

# Resolve the output folder from the working directory, the same way the
# score files are located, so the path keeps its separator when "src" is not
# part of the working directory.
figure_dir = os.path.join(os.getcwd(), "data", "figures")
os.makedirs(figure_dir, exist_ok=True)
fig.write_image(os.path.join(figure_dir, "meta_eval_example.png"), scale=2)
fig.show()

In [None]:
# Ranks pooled over models, per (modality, dataset, xai, eval metric).
# NOTE(review): this rebinds `arr_ranking`, which earlier cells fill as
# (dataset, model, xai, eval) with 14 methods; every later cell sees this array.
arr_ranking = np.full([3, 3, 17, 20], np.nan, dtype=float)

# Metric indices whose ranking order is reversed (larger score is better).
bup_order = [0, 1, 2, 4, 5, 7, 9, 12, 17]

for modality in range(3):
    for dataset in range(3):
        for metric in range(20):
            arr_models = []
            for scores in arr_modalities[modality][dataset]:
                d = scores[:, metric, :]
                # Clip to the central 95% of the values, then min-max scale
                # each model's scores separately before pooling.
                d = np.clip(d, np.quantile(d, 0.025), np.quantile(d, 0.975))
                arr_models.append(NormalizeData(d, d.min(), d.max()))

            # All rows except the last three of the third model are pooled
            # across the three models; the last three rows exist only for the
            # third model and use its scores alone.
            # presumably these are methods specific to that architecture - TODO confirm
            pooled_median = np.median(
                np.hstack([arr_models[0], arr_models[1], arr_models[2][:-3]]),
                -1,
            )
            extra_median = np.median(arr_models[2][-3:], -1)

            # compute ranking based on median obs score
            ranking = np.concatenate([pooled_median, extra_median]).argsort()
            if metric in bup_order:
                ranking = ranking[::-1]  # reverse ranking to bottom up if larger is better

            # argsort of the ordering gives each method's position; +1 so that
            # ranks start at 1. Computed once per metric, not once per method.
            arr_ranking[modality, dataset, : ranking.shape[0], metric] = (
                ranking.argsort() + 1
            )

# Move metric 17 directly behind metrics 0-9; all other metrics keep their
# relative order.
# NOTE(review): the earlier example figure takes metric 16 (not 17) as the
# eleventh faithfulness metric - confirm which index is intended.
metric_order = [*range(10), 17, *range(10, 17), 18, 19]
arr_ranking = arr_ranking[..., metric_order]

In [None]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

colors = [px.colors.qualitative.G10[i] for i in [0, 2, 3, 4]]
# Row positions (in the pooled ranking) of the four methods shown, one per column.
methods = [0, 7, 14, 16]

metrics_faith = [
    "FC",
    "FE",
    "MC",
    "PF",
    "RP",
    "INS",
    "DEL",
    "IROF",
    "ROAD",
    "SUF",
    "INF",
]

metrics_robust = [
    "LLE",
    "MS",
    "CON",
    "RIS",
    "ROS",
    "RRS",
]
metrics_complex = ["SP", "CP", "ECP"]

metrics = [metrics_faith, metrics_robust, metrics_complex]

results = arr_ranking[0, :, :, :]  # first modality: datasets, xai, eval


fig = make_subplots(
    rows=2,
    cols=4,
    horizontal_spacing=0.02,
    vertical_spacing=0.12,
    subplot_titles=["OC", "SC", "RA", "LA"],
    shared_yaxes=True,
    specs=[[{"type": "scatterpolar"} for _ in range(4)] for _ in range(2)],
)

# Row 1 shows metric columns 0-10, row 2 shows metric columns 11-16.
for eval_idx, (start, stop) in enumerate([(0, 11), (11, 17)]):
    metric_names = metrics[eval_idx]
    # Repeat the first label to close the polygon, without mutating the
    # module-level metric lists.
    theta = metric_names + metric_names[:1]
    for idx, xai in enumerate(methods):
        ranks = results[0, xai, start:stop].tolist()
        # Mean over the actual metrics only, taken before the first value is
        # repeated to close the polygon.
        mean_rank = np.mean(ranks)
        r = ranks + ranks[:1]
        # Palette picked by method index; only its first entry is used below.
        color = (
            ["#3366CC", "#5F93FA", "#7685A5"]
            if xai < 14
            else ["#FF9900", "#FABC5F", "#A59276"]
        )

        # Black circle at the mean rank, one point per angular position.
        fig.add_trace(
            go.Scatterpolar(
                r=[mean_rank] * len(theta),
                theta=theta,
                fill="none",
                mode="lines",
                line=dict(color="black", width=1.5),
                showlegend=False,
            ),
            col=idx + 1,
            row=eval_idx + 1,
        )

        fig.add_trace(
            go.Scatterpolar(
                r=r,
                theta=theta,
                fill="toself",
                marker=dict(color=color[0], size=8),
                showlegend=False,
            ),
            col=idx + 1,
            row=eval_idx + 1,
        )


fig.update_polars(  # radialaxis_angle=90,
    radialaxis=dict(tickangle=0, nticks=5, range=[1, 17], showline=False),
    angularaxis=dict(
        linewidth=1,
        showline=True,
        linecolor="grey",
    ),
)

fig.update_polars(angularaxis=dict(rotation=90), row=2)

# Lift the four column titles slightly above the first row.
for i in range(4):
    fig.layout.annotations[i].update(y=1.05)

fig.update_layout(
    template="plotly_white",
    width=1650,
    height=600,
    polar=dict(
        radialaxis=dict(visible=True),
    ),
    font=dict(family="Helvetica", color="#000000", size=14),
)

# Resolve the output folder from the working directory, the same way the
# score files are located, so the path keeps its separator when "src" is not
# part of the working directory.
figure_dir = os.path.join(os.getcwd(), "data", "figures")
os.makedirs(figure_dir, exist_ok=True)
fig.write_image(os.path.join(figure_dir, "meta_radar.png"), scale=3)
fig.show()

### Average ranking disagreement

In [None]:
from scipy.spatial import distance_matrix

def _mean_rank_distances(metric_slice):
    """Average pairwise rank distance between metrics, one matrix per model index.

    For every index of the second ``arr_ranking`` axis, the absolute rank
    difference between each pair of metrics in ``metric_slice`` is averaged
    over the three indices of the first axis and the first 14 XAI methods.

    Args:
        metric_slice: Slice selecting the metric columns to compare.

    Returns:
        List of three DataFrames (rounded to 2 decimals) in which only the
        lower triangle including the diagonal is kept; the rest is NaN.
    """
    frames = []
    for model in range(3):
        # A dedicated name instead of `list`, so the builtin stays usable.
        matrices = []
        for dataset in range(3):
            for method in range(14):
                # Column vector of one method's ranks over the selected metrics.
                column = arr_ranking[dataset, model, method, metric_slice][
                    :, np.newaxis
                ]
                matrices.append(distance_matrix(column, column))

        mean_dist = pd.DataFrame(np.round(np.mean(np.array(matrices), 0), 2))
        frames.append(
            mean_dist.where(np.tril(np.ones(mean_dist.shape)).astype(bool))
        )
    return frames


# NOTE(review): when the notebook runs top to bottom, `arr_ranking` is the
# pooled array built above, whose axes are commented as (modality, dataset,
# xai, eval). The `model` index used here then addresses its dataset axis -
# confirm that the per-model labels below match the intended array.
dist_faith_resnet50, dist_faith_effnetb0, dist_faith_vit = _mean_rank_distances(
    slice(0, 11)
)
dist_robust_resnet50, dist_robust_effnetb0, dist_robust_vit = _mean_rank_distances(
    slice(11, 17)
)

In [None]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

metrics_faith = [
    "FC",
    "FE",
    "MC",
    "PF",
    "RP",
    "INS",
    "DEL",
    "IROF",
    "ROAD",
    "SUF",
    "INF",
]

metrics_robust = [
    "LLE",
    "MS",
    "CON",
    "RIS",
    "ROS",
    "RRS",
]
metrics_complex = ["SP", "CP", "ECP"]

colors = px.colors.qualitative.G10

fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=[
        "ResNet50",
        "EfficientNetb0",
        "DeiT ViT",
        "ResNet50",
        "EfficientNetb0",
        "DeiT ViT",
    ],
    # column_widths=[0.7, 0.3],
)

# Row 1: faithfulness metrics, row 2: robustness metrics; one column per model.
heatmap_rows = [
    (metrics_faith, [dist_faith_resnet50, dist_faith_effnetb0, dist_faith_vit]),
    (metrics_robust, [dist_robust_resnet50, dist_robust_effnetb0, dist_robust_vit]),
]

for row, (metric_names, dist_frames) in enumerate(heatmap_rows, start=1):
    for col, dist_frame in enumerate(dist_frames, start=1):
        fig.add_trace(
            go.Heatmap(
                z=dist_frame,
                x=metric_names,
                y=metric_names,
                texttemplate="%{z}",
                colorscale="RdYlGn",
                reversescale=True,
                # Same colour range in every panel so the panels are comparable.
                zmin=0.5,
                zmax=7.1,
                colorbar=dict(ticks="outside", thickness=10),
            ),
            col=col,
            row=row,
        )

fig.update_yaxes(showgrid=False)
fig.update_yaxes(title="Faithfulness Metrics", row=1, col=1)
fig.update_yaxes(title="Robustness Metrics", row=2, col=1)

fig.update_layout(
    font=dict(family="Helvetica", color="#000000", size=13),
    template="plotly_white",
    height=900,
    width=1600,
    title_font=dict(family="Helvetica", color="#000000", size=14),
)

fig = left_align_facet_plot_titles(fig)

# Resolve the output folder from the working directory, the same way the
# score files are located, so the path keeps its separator when "src" is not
# part of the working directory.
figure_dir = os.path.join(os.getcwd(), "data", "figures")
os.makedirs(figure_dir, exist_ok=True)
fig.write_image(os.path.join(figure_dir, "meta_eval_dist.png"), scale=3)
fig.show()

In [None]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

colors = px.colors.qualitative.G10

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=["Faithfulness", "Robustness"],
    # column_widths=[0.7, 0.3],
)

# Only the first panel draws a colour bar, so both panels must share one
# colour range for that bar to describe both of them.
shared_zmin = 0.5
shared_zmax = 7.0

# Each panel shows the mean of the three per-model distance matrices.
panels = [
    (
        (dist_faith_resnet50 + dist_faith_effnetb0 + dist_faith_vit) / 3,
        metrics_faith,
        True,
    ),
    (
        (dist_robust_resnet50 + dist_robust_effnetb0 + dist_robust_vit) / 3,
        metrics_robust,
        False,
    ),
]

for col, (mean_dist, metric_names, show_scale) in enumerate(panels, start=1):
    fig.add_trace(
        go.Heatmap(
            z=np.round(mean_dist, 2),
            x=metric_names,
            y=metric_names,
            texttemplate="%{z}",
            colorscale="RdYlGn",
            reversescale=True,
            zmin=shared_zmin,
            zmax=shared_zmax,
            colorbar=dict(ticks="outside", thickness=10),
            showscale=show_scale,
        ),
        col=col,
        row=1,
    )

fig.update_yaxes(showgrid=False)
fig = left_align_facet_plot_titles(fig)

fig.update_layout(
    font=dict(family="Helvetica", color="#000000", size=13),
    template="plotly_white",
    height=500,
    width=1000,
    title_font=dict(family="Helvetica", color="#000000", size=14),
)

# Resolve the output folder from the working directory, the same way the
# score files are located, so the path keeps its separator when "src" is not
# part of the working directory.
figure_dir = os.path.join(os.getcwd(), "data", "figures")
os.makedirs(figure_dir, exist_ok=True)
fig.write_image(os.path.join(figure_dir, "meta_eval_dist2.png"), scale=3)
fig.show()