In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Experiment identifiers. DATASET and MODEL_NAME are part of the log-file names
# read by the loading cell; DATASET and NUM_EXPERIENCES are also embedded in the
# names of the saved figures.
DATASET = "TinyImageNet"
MODEL_NAME = "ResNet18"
NUM_EXPERIENCES = 10

# Colour assigned to each run by its position: a 12-colour palette repeated ten
# times, so positional indexing works for up to 120 runs.
valid_colors = 10 * [
    'green', 'red', 'cyan', 'magenta', 'black', 'purple',
    'orange', 'brown', 'gray', 'olive', 'indigo', 'turquoise',
]

In [None]:
# Runs to load. Each key is a short run id; each value is the pair
# (run name used inside the log-file name, directory that holds the log).
# Every strategy contributes three versions (v1..v3) stored in the same directory.
_RUN_FAMILIES = (
    # (key prefix, strategy name, log-directory suffix)
    ("j", "JointTraining", "J2"),
    ("d", "DER", "D2"),
    ("r", "Replay", "R2"),
    ("n", "Naive", "N2"),
)

run2name = {
    f"{prefix}_nst_npp_v{version}": (
        f"{strategy}_NoSelfTraining_NoPostProcessing{version}",
        f"./logs/TinyImageNet_{suffix}",
    )
    for prefix, strategy, suffix in _RUN_FAMILIES
    for version in (1, 2, 3)
}

In [None]:
# Display label for every known run id. Runs that differ only in their version
# suffix share one label, so they are grouped together in the summary statistics.
# Each family below expands to three consecutive versions starting at
# `first_version` (the "*_ts*" families are numbered v0..v2, all others v1..v3).
_LABEL_FAMILIES = (
    # (run-id stem, label, first_version)
    ("j_nst_npp", "JointTraining", 1),
    ("j_st_npp_00075", "JointTraining_ST_0.0075", 1),
    ("j_pp_vs", "JointTraining_VS", 1),
    ("j_pp_ms", "JointTraining_MS", 1),
    ("j_pp_ts", "JointTraining_TS", 0),

    ("d_nst_npp", "DER", 1),

    ("r_nst_npp", "Replay", 1),
    ("r_st_npp_00025", "Replay_ST_0.0025", 1),
    ("r_pp_vs", "Replay_VS", 1),
    ("r_pp_ms", "Replay_MS", 1),
    ("r_pp_ts", "Replay_TS", 0),
    ("r_pp_vs_md", "Replay_VS_MD", 1),
    ("r_pp_ms_md", "Replay_MS_MD", 1),
    ("r_pp_ts_md", "Replay_TS_MD", 0),

    ("n_nst_npp", "Naive", 1),
    ("n_st_npp_001", "Naive_ST_0.01", 1),
    ("n_st_npp_0005", "Naive_ST_0.005", 1),
    ("n_pp_vs", "Naive_VS", 1),
    ("n_pp_ms", "Naive_MS", 1),
    ("n_pp_ts", "Naive_TS", 0),
    ("n_pp_vs_md", "Naive_VS_MD", 1),
    ("n_pp_ms_md", "Naive_MS_MD", 1),
    ("n_pp_ts_md", "Naive_TS_MD", 0),
)

run2label = {
    f"{stem}_v{first_version + offset}": label
    for stem, label, first_version in _LABEL_FAMILIES
    for offset in range(3)
}

In [None]:
# Load every run's pickled evaluation log and collect, per run id:
#   running_accuracy / running_ece : list of (run id, running-average series)
#   final_accuracy / final_ece     : list of (run id, last value of that series)
#   ece_hist_vals                  : list of (run id, [(mean, std) per bin]) built
#                                    from the curves stored in the ECE-histogram
#                                    figures of the last log entry
# `bins` keeps the x positions of the first curve encountered; the same x
# positions are then used for every run.
running_accuracy = []
running_ece = []
final_accuracy = []
final_ece = []
bins = None
ece_hist_vals = []


def _running_average(data, metric_str, scale=1):
    """Return the running average of a per-experience metric.

    Entry ``i`` of the result is the mean of ``data[i][metric_str + "000"]`` up to
    ``data[i][metric_str + f"{i:03d}"]`` (the experiences trained so far),
    multiplied by ``scale``.

    A series shorter than ``NUM_EXPERIENCES`` is tiled ``NUM_EXPERIENCES`` times
    (the JointTraining case, where the log has a single entry).
    NOTE(review): the tiling yields exactly ``NUM_EXPERIENCES`` points only when
    the log has one entry; any other short log produces a series whose length
    no longer matches the x axis of the running plots.
    """
    series = []
    for i in range(len(data)):
        exp_dict = data[i]
        total = sum(exp_dict[metric_str + f"{j:03d}"] for j in range(i + 1))
        series.append((total / (i + 1)) * scale)
    if len(series) < NUM_EXPERIENCES:
        series = series * NUM_EXPERIENCES
    return series


for k, (name, path) in run2name.items():
    print(f">> {name} <<")
    # SECURITY: pickle.load can execute arbitrary code while deserialising;
    # only open log files that come from a trusted source.
    with open(f"{path}/{DATASET}_{MODEL_NAME}_{name}_dict", "rb") as file:
        data = pickle.load(file)
    # The file is no longer needed once the log is in memory.

    # ---- ACCURACY ----
    acc_series = _running_average(
        data, "Top1_Acc_Exp/eval_phase/test_stream/Task000/Exp"
    )
    running_accuracy.append((k, acc_series))
    final_accuracy.append((k, acc_series[-1]))

    # ---- ECE (multiplied by 100) ----
    ece_series = _running_average(
        data, "ECE_Exp/eval_phase/test_stream/Task000/Exp", scale=100
    )
    running_ece.append((k, ece_series))
    final_ece.append((k, ece_series[-1]))

    # ---- ECE HISTOGRAMS ----
    # Read the figures stored in the last log entry (after the last experience)
    # and keep, for every axes, the y data of its last line only.
    metric_str = "ExpECEHistogram/eval_phase/test_stream/Exp"
    last_exp_dict = data[-1]
    curves = []
    for j in range(NUM_EXPERIENCES):
        fig = last_exp_dict[metric_str + f"{j:03d}"]
        for ax in fig.get_axes():
            for line in ax.get_lines()[-1:]:
                if bins is None:
                    bins = line.get_xdata()
                curves.append(line.get_ydata())

    # Mean and standard deviation of each bin across all collected curves.
    bin_vals = []
    for b in range(len(bins)):
        column = [curve[b] for curve in curves]
        bin_vals.append((np.mean(column), np.std(column)))
    ece_hist_vals.append((k, bin_vals))


In [None]:
# PLOT 1: Average accuracy on all experiences after training on exp j

# One line per run, coloured by the run's position in `running_accuracy`.
fig, ax = plt.subplots(figsize=(7, 6))
x_axis = list(range(1, NUM_EXPERIENCES+1))
for idx, (run_key, series) in enumerate(running_accuracy):
    ax.plot(x_axis, series, label=run2label[run_key], color=valid_colors[idx])
ax.set_title('Average Experience Accuracy')
ax.set_xlabel('#Trained Experience')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)
ax.set_xlim(1, NUM_EXPERIENCES)
ax.set_xticks(x_axis)
ax.set_xticklabels(x_axis)
ax.legend(loc='upper right', fontsize='small', ncol=2)
# The figure is written under ./imgs with a lower-cased metric name.
metric_slug = "Average_Experience_Accuracy".lower()
fig.savefig(f'./imgs/{DATASET}_{NUM_EXPERIENCES}_{metric_slug}.png', dpi=400)
plt.show()

In [None]:
# PLOT 2: Average ece on all experiences after training on exp j

# One line per run, coloured by the run's position in `running_ece`.
fig, ax = plt.subplots(figsize=(7, 6))
x_axis = list(range(1, NUM_EXPERIENCES+1))
for idx, (run_key, series) in enumerate(running_ece):
    ax.plot(x_axis, series, label=run2label[run_key], color=valid_colors[idx])
ax.set_title('Average Experience ECE')
ax.set_xlabel('#Trained Experience')
ax.set_ylabel('ECE')
# No y-limits are set here; the y axis is left to matplotlib's autoscaling.
ax.set_xlim(1, NUM_EXPERIENCES)
ax.set_xticks(x_axis)
ax.set_xticklabels(x_axis)
ax.legend(loc='upper right', fontsize='small', ncol=2)
# The figure is written under ./imgs with a lower-cased metric name.
metric_slug = "Average_Experience_ECE".lower()
fig.savefig(f'./imgs/{DATASET}_{NUM_EXPERIENCES}_{metric_slug}.png', dpi=400)
plt.show()

In [None]:
# TABLE : average accuracy/ece on all experiences at the end of training

# One row per run: display label, final accuracy multiplied by 100 and rounded
# to 2 decimals, final ECE rounded to 4 decimals. `final_accuracy` and
# `final_ece` are walked in parallel, so they must list the runs in the same order.
table_data = [
    (run2label[run_key], round(acc * 100, 2), round(ece, 4))
    for (run_key, acc), (_ece_key, ece) in zip(final_accuracy, final_ece)
]

dt = pd.DataFrame(table_data, columns=["RunName", "Accuracy", "ECE"])
print(dt)

In [None]:
# Calculate mean and standard deviation for each run
# Rows of `dt` that share a RunName are aggregated together.
# (pandas' GroupBy.std defaults to the sample standard deviation, ddof=1.)
runs_grouped = dt.groupby('RunName')

mean_accuracy_per_run = runs_grouped['Accuracy'].mean()
std_accuracy_per_run = runs_grouped['Accuracy'].std()

mean_ece_per_run = runs_grouped['ECE'].mean()
std_ece_per_run = runs_grouped['ECE'].std()

# Print the results
for heading, stats in (
    ("Mean Accuracy per Run:\n", mean_accuracy_per_run),
    ("\nStandard Deviation of Accuracy per Run:\n", std_accuracy_per_run),
    ("\nMean ECE per Run:\n", mean_ece_per_run),
    ("\nStandard Deviation of ECE per Run:\n", std_ece_per_run),
):
    print(heading, stats)

In [None]:
# HISTOGRAM : avg/std across all experiences at the end of training

# One panel per run, four panels per row. The grid grows with the number of
# loaded runs so that more than 12 runs no longer index past the last axes;
# with 12 runs or fewer it is the original 3x4 grid with figsize (12, 8).
n_cols = 4
n_rows = max(3, -(-len(ece_hist_vals) // n_cols))  # ceil division, at least 3 rows
fig, axs = plt.subplots(n_rows, n_cols, figsize=(12, 8 * n_rows / 3))
axs = axs.flatten()
for i, (name, vals) in enumerate(ece_hist_vals):
    # `vals` is a list of (mean, std) pairs, one per bin.
    bin_means = [e[0] for e in vals]
    bin_stds = [e[1] for e in vals]
    panel = axs[i]
    # Diagonal reference line, labelled 'ideal'.
    panel.plot([0, 1], [0, 1], '--', label='ideal')
    # Per-bin mean with +/- one standard deviation as error bars.
    panel.errorbar(bins, bin_means, yerr=bin_stds, marker="o", linestyle="--", capsize=3, capthick=1, color=valid_colors[i])
    panel.set_ylim(-0.05, 1)
    panel.set_xlim(0, 1)
    panel.set_ylabel("Accuracy")
    panel.set_xlabel("Confidence")
    panel.set_title(run2label[name])
plt.tight_layout()
plt.savefig(f'./imgs/{DATASET}_{NUM_EXPERIENCES}_avg_std_calibration.png', dpi=400)
plt.show()