In [1]:
import numpy as np
import pandas as pd
import os
import glob
import seaborn as sns
import matplotlib.pyplot as plt
# Show up to 200 characters per cell before pandas truncates the text.
pd.set_option("display.max_colwidth", 200)
# Show up to 500 rows before pandas truncates a displayed frame.
# The full option name is used: abbreviated keys are resolved by pattern matching
# and stop working as soon as they match more than one option.
pd.set_option("display.max_rows", 500)

In [17]:
def plot_scores(score_df, metric, score_type, title="", save_folder="", col_order=None, hue_order=None):
    """Draw one faceted bar chart of scores and display it.

    Bars are placed per "Domain" on the x axis, coloured by "alpha", and split into
    one panel (column) per "Data mul factor". Error bars use ``ci="sd"``.

    Parameters
    ----------
    score_df : pandas.DataFrame
        Must provide the columns "Domain", "Score", "alpha" and "Data mul factor".
    metric : str
        "R2" or "MAE" select the preset y-limits and reference lines below; any
        other value keeps the automatically chosen limits.
    score_type : str
        Only consulted when ``metric == "R2"``: "Out-of-sample", "In-sample" and
        "In-sample (original)" each have a preset.
    title : str
        Figure-level title.
    save_folder : str
        Accepted for interface compatibility; the function currently only shows
        the figure and does not write it to disk.
    col_order, hue_order : list or None
        Forwarded unchanged to ``sns.catplot``.
    """
    # NOTE(review): newer seaborn releases spell ``ci="sd"`` as ``errorbar="sd"``;
    # confirm the installed version before switching.
    g = sns.catplot(x="Domain", y="Score", hue="alpha", col="Data mul factor", col_order=col_order,
                    data=score_df, ci="sd", kind="bar", hue_order=hue_order)

    g.set_axis_labels("", "Score (Mean and Standard Deviation across 5 CV folds)")

    for ax in g.fig.axes:
        # Rotate the domain names so long labels do not overlap.
        ax.tick_params(axis="x", labelrotation=65)
        # Zero baseline on every panel.
        ax.axhline(0, color="black")

    g.fig.suptitle(title, y=1.08, fontsize=30)

    # Preset y-range and extra horizontal reference lines per metric / score type.
    if metric == "R2" and score_type == "Out-of-sample":
        ylim, reference_levels = (-1, 0.2), ()
    elif metric == "R2" and score_type in ("In-sample", "In-sample (original)"):
        ylim, reference_levels = (-0.05, 1.0), (0.1, 0.2)
    elif metric == "MAE":
        ylim, reference_levels = (0.0, 1.6), (0.6,)
    else:
        ylim, reference_levels = None, ()

    if ylim is not None:
        g.set(ylim=ylim)
    for ax in g.fig.axes:
        for level in reference_levels:
            ax.axhline(level, color="black")

    plt.show()


def plot_all_scores(score_df, title_prefix="", save_folder="", col_order=None, hue_order=None):
    """Call ``plot_scores`` once for every (metric, score type, model) combination.

    Metrics and models are visited in their order of first appearance in
    ``score_df``; score types are visited in sorted order. Each figure is titled
    ``title_prefix + model + " - " + metric + " - " + score_type``.

    For the "In-sample (original)" score type, the rows whose "alpha" is
    "No mixup" are dropped and the panels are fixed to ["5x", "10x"]; every other
    score type uses the caller's ``col_order``. ``save_folder`` and ``hue_order``
    are passed through to ``plot_scores`` unchanged.
    """
    all_metrics = score_df["Metric"].unique()
    all_score_types = np.sort(score_df["Score type"].unique())
    all_models = score_df["Model"].unique()

    for metric in all_metrics:
        metric_rows = score_df["Metric"] == metric

        for score_type in all_score_types:
            score_type_rows = score_df["Score type"] == score_type
            drop_no_mixup = score_type == "In-sample (original)"

            for model in all_models:
                subset = score_df[metric_rows & score_type_rows & (score_df["Model"] == model)]

                panel_order = col_order
                if drop_no_mixup:
                    subset = subset[subset["alpha"] != "No mixup"]
                    panel_order = ["5x", "10x"]

                title = title_prefix + model + " - " + metric + " - " + score_type

                plot_scores(subset, metric, score_type, title, save_folder,
                            col_order=panel_order, hue_order=hue_order)

In [5]:
# Utility to load results.
# NOTE(review): the default below is an absolute path on the author's machine;
# pass ``base_dir`` to load_dfs to read results from another location.
_DEFAULT_RESULTS_DIR = "/Users/hasnainmamdani/Academics/McGill/thesis/stroke-impairment-analysis/regression/results/"


def load_dfs(direc, regex, base_dir=_DEFAULT_RESULTS_DIR):
    """Read every HDF5 file matching a glob pattern and split them into two lists.

    Parameters
    ----------
    direc : str
        Sub-directory of ``base_dir`` to search ("" searches ``base_dir`` itself).
    regex : str
        Glob pattern (despite the name, it is handed to ``glob.glob``) matched
        against the entries of that directory.
    base_dir : str
        Root results directory. Defaults to the original hard-coded location.

    Returns
    -------
    (res, bps) : tuple of lists of DataFrames
        ``bps`` holds the frames read from files whose name contains
        "best-params-"; ``res`` holds all other matched files. Both follow the
        sorted order of the file paths.

    Each file is read with key 'p'. For every best-params file the unique
    "Data mul factor" and "alpha" values are printed, and the two list lengths
    are printed at the end.
    """
    path = os.path.join(base_dir, direc.lstrip("/"))
    filenames = sorted(glob.glob(os.path.join(path, regex)))

    res = []
    bps = []
    for fn in filenames:
        frame = pd.read_hdf(fn, key='p', mode='r')
        # Classify on the file name only, so a directory that happens to contain
        # "best-params-" cannot send every file into ``bps``.
        if "best-params-" in os.path.basename(fn):
            print(frame["Data mul factor"].unique(), frame["alpha"].unique())
            bps.append(frame)
        else:
            res.append(frame)

    print(len(res), len(bps))
    return res, bps

## 1- Multitask Ridge

In [66]:
# Persist the combined Multitask Ridge results and best-params frames to HDF5 (key 'p').
# NOTE(review): in this notebook `mridge` / `mbpridge` are assigned by the `pd.concat`
# cell further down, so on a fresh top-to-bottom run this cell raises NameError.
# Run it after that cell, or re-enable the reads below to load the saved files instead.
# mridge = pd.read_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/results_atlas_llm_multioutput_mixup_before_log_mridge_all.h5', key='p', mode='r')
# mbpridge = pd.read_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/best_params_atlas_llm_multioutput_mixup_before_log_mridge_all.h5', key='p', mode='r')

mridge.to_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/results_atlas_llm_multioutput_mixup_before_log_mridge_all.h5', key='p', mode='w')
mbpridge.to_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/best_params_atlas_llm_multioutput_mixup_before_log_mridge_all.h5', key='p', mode='w')

In [61]:
# Read every file matching the Multitask Ridge pattern: results frames first, best-params frames second.
mridlist, mbpridlist = load_dfs("", "*multioutput*mridge*")

['No mixup' '5x' '10x'] ['No mixup' 0.01 0.1]
['5x' '10x'] [0.3 1. ]
2 2


In [64]:
# mbpridge

In [62]:
# Stack the per-file frames into one results frame and one best-params frame, with a fresh 0..n-1 index.
mridge = pd.concat(mridlist, ignore_index=True)
mbpridge = pd.concat(mbpridlist, ignore_index=True)

In [4]:
# plot_all_scores(mridge, col_order=["No mixup", "5x", "10x"], hue_order=["No mixup", 0.01, 0.1, 0.3, 1.0])

## 2- PLS

In [3]:
# Load the saved combined PLS results and best-params frames (HDF5 key 'p').
# NOTE(review): the `pd.concat` cell further down rebinds `pls` / `bppls` from the
# load_dfs output, replacing what is read here -- confirm which source is intended.
pls = pd.read_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/results_atlas_llm_multioutput_mixup_before_log_pls_all.h5', key='p', mode='r')
bppls = pd.read_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/best_params_atlas_llm_multioutput_mixup_before_log_pls_all.h5', key='p', mode='r')

# Disabled: write the combined frames back to the same files.
# pls.to_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/results_atlas_llm_multioutput_mixup_before_log_pls_all.h5', key='p', mode='w')
# bppls.to_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/best_params_atlas_llm_multioutput_mixup_before_log_pls_all.h5', key='p', mode='w')


In [6]:
# Read every file matching the PLS pattern: results frames first, best-params frames second.
plslist, bpplslist = load_dfs("", "*multioutput*pls*")

['No mixup' '5x'] ['No mixup' 0.01]
1 1


In [8]:
# Stack the per-file PLS frames, with a fresh 0..n-1 index.
# NOTE(review): this rebinds `pls` / `bppls`, which the read cell above also assigns.
pls = pd.concat(plslist, ignore_index=True)
bppls = pd.concat(bpplslist, ignore_index=True)

In [9]:
bppls

Unnamed: 0,Data mul factor,alpha,Model,Fold,Best params
0,No mixup,No mixup,PLS,1,{'n_components': 1}
1,No mixup,No mixup,PLS,2,{'n_components': 1}
2,No mixup,No mixup,PLS,3,{'n_components': 1}
3,No mixup,No mixup,PLS,4,{'n_components': 2}
4,No mixup,No mixup,PLS,5,{'n_components': 1}
5,5x,0.01,PLS,1,{'n_components': 6}
6,5x,0.01,PLS,2,{'n_components': 6}
7,5x,0.01,PLS,3,{'n_components': 6}
8,5x,0.01,PLS,4,{'n_components': 6}
9,5x,0.01,PLS,5,{'n_components': 6}


In [11]:
# plot_all_scores(pls, col_order=["No mixup", "5x", "10x"], hue_order=["No mixup", 0.01, 0.1, 0.3, 1.0])

## 3- CCA

In [5]:
# Load the saved combined CCA results and best-params frames (HDF5 key 'p').
# NOTE(review): the `pd.concat` cell further down rebinds `cca` / `bpcca` from the
# load_dfs output, replacing what is read here -- confirm which source is intended.
cca = pd.read_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/results_atlas_llm_multioutput_mixup_before_log_cca_all.h5', key='p', mode='r')
bpcca = pd.read_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/best_params_atlas_llm_multioutput_mixup_before_log_cca_all.h5', key='p', mode='r')

# Disabled: write the combined frames back to the same files.
# cca.to_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/results_atlas_llm_multioutput_mixup_before_log_cca_all.h5', key='p', mode='w')
# bpcca.to_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/best_params_atlas_llm_multioutput_mixup_before_log_cca_all.h5', key='p', mode='w')


In [96]:
# Read every file matching the CCA pattern: results frames first, best-params frames second.
ccalist, bpccalist = load_dfs("", "*multioutput*cca*")

['No mixup' '5x' '10x'] ['No mixup' 0.01 0.1 0.3 1.0]
1 1


In [12]:
# bpcca

In [99]:
# Stack the per-file CCA frames, with a fresh 0..n-1 index.
# NOTE(review): this rebinds `cca` / `bpcca`, which the read cell above also assigns.
cca = pd.concat(ccalist, ignore_index=True)
bpcca = pd.concat(bpccalist, ignore_index=True)

In [13]:
# plot_all_scores(cca, col_order=["No mixup", "5x", "10x"], hue_order=["No mixup", 0.01, 0.1, 0.3, 1.0])

## 4- Multioutput Random Forest

In [9]:
# Load the saved combined Random Forest results and best-params frames (HDF5 key 'p').
# NOTE(review): the `pd.concat` cell further down rebinds `rf` / `bprf` from the
# load_dfs output, replacing what is read here -- confirm which source is intended.
rf = pd.read_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/results_atlas_llm_multioutput_mixup_before_log_rf_all.h5', key='p', mode='r')
bprf = pd.read_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/best_params_atlas_llm_multioutput_mixup_before_log_rf_all.h5', key='p', mode='r')

# Disabled: write the combined frames back to the same files.
# rf.to_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/results_atlas_llm_multioutput_mixup_before_log_rf_all.h5', key='p', mode='w')
# bprf.to_hdf('results/multioutput/atlas-llm-with-wm-mixup-before-log/best_params_atlas_llm_multioutput_mixup_before_log_rf_all.h5', key='p', mode='w')

In [67]:
# Read every file matching the Random Forest pattern: results frames first, best-params frames second.
rflist, bprflist = load_dfs("","*multioutput*rf*")

['10x'] [0.01]
['10x'] [0.1]
['10x'] [0.3]
['10x'] [1.]
['No mixup' '5x'] ['No mixup' 0.01 0.1]
['5x'] [0.3 1. ]
6 6


In [15]:
# bprf

In [68]:
# Stack the per-file Random Forest frames, with a fresh 0..n-1 index.
# NOTE(review): this rebinds `rf` / `bprf`, which the read cell above also assigns.
rf = pd.concat(rflist, ignore_index=True)
bprf = pd.concat(bprflist, ignore_index=True)

In [16]:
# plot_all_scores(rf, col_order=["No mixup", "5x", "10x"], hue_order=["No mixup", 0.01, 0.1, 0.3, 1.0])