In [None]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
import os
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})


def decode_feature_string(feature_string):
    """Build a short multi-line feature label from an encoded run name.

    The text after the first "[" is split on "-". Field 0 is ignored and
    fields 1-5 are read, in order, as: embedding method, merge-features flag,
    add-degree flag, compute-node-embeddings flag, time-series feature set.

    Parameters
    ----------
    feature_string : str
        Run name containing a "[" followed by at least six "-"-separated
        fields.

    Returns
    -------
    str
        The main text and the degree marker joined by a newline. The main
        text is the time-series feature set, the embedding method, or both
        (on separate lines), depending on the two flags. The degree marker is
        "D" for "True", empty for "False", and the raw field otherwise.

    Raises
    ------
    IndexError
        If the string has no "[" or fewer than six fields after it.
    ValueError
        If the compute-node-embeddings flag, or the merge-features flag when
        it is consulted, is neither "True" nor "False".
    """
    fields = feature_string.split("[")[1].split("-")
    embedding_method = fields[1]
    merge_features = fields[2]
    add_degree = fields[3]
    compute_node_embeddings = fields[4]
    time_series_feature_set = fields[5]

    # "empty" is the placeholder for "no time-series features".
    if time_series_feature_set == "empty":
        time_series_feature_set = ""

    if add_degree == "False":
        add_degree = ""
    elif add_degree == "True":
        add_degree = "D"

    if compute_node_embeddings == "True":
        if merge_features == "True":
            main_text = f"{time_series_feature_set}\n{embedding_method}"
        elif merge_features == "False":
            main_text = embedding_method
        else:
            # Previously fell through and crashed later with UnboundLocalError.
            raise ValueError(
                f"merge_features flag must be 'True' or 'False', got {merge_features!r} "
                f"in {feature_string!r}"
            )
    elif compute_node_embeddings == "False":
        main_text = time_series_feature_set
    else:
        raise ValueError(
            f"compute_node_embeddings flag must be 'True' or 'False', "
            f"got {compute_node_embeddings!r} in {feature_string!r}"
        )

    return f"{main_text}\n{add_degree}"


def nlargest_and_plot(dftemp, metric, title_ads, save=False, tail=False, file_folder="hcp_rs_606_results_vis"):
    """Scatter-plot one metric against the run labels, sorted ascending.

    Parameters
    ----------
    dftemp : pandas.DataFrame
        Must contain an "x_names" column and every column in the metric list
        below. The caller's frame is not modified.
    metric : str
        Column to sort by and plot on the y axis.
    title_ads : str
        Text appended to the figure title and to the saved file name.
    save : bool, default False
        If true, write the figure as a JPG into ``file_folder`` and close it;
        otherwise show it.
    tail : int or False, default False
        If truthy, keep only the last ``tail`` rows of the ascending sort,
        i.e. the highest values of ``metric``.
    file_folder : str
        Output directory, created if it does not exist.
    """
    metrics_list = ['Accuracy', 'Balanced_accuracy_score', 'F1macro', 'F1micro',
                    'F1weighted', 'MCC', 'precision', 'recall_score']
    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is the same dtype.
    # ``astype`` returns a new frame, so group slices passed in are not written to.
    dftemp = dftemp.astype({name: float for name in metrics_list})

    nsmallest_metric = dftemp[[metric, "x_names"]].nsmallest(len(dftemp), metric)
    if tail:
        nsmallest_metric = nsmallest_metric.tail(tail)
    print(nsmallest_metric)

    fig = plt.figure(figsize=(20, 9), dpi=150)
    plt.scatter(nsmallest_metric["x_names"], nsmallest_metric[metric])
    fig.suptitle(f"{metric}, {title_ads}")

    if save:
        os.makedirs(file_folder, exist_ok=True)
        save_name = f"./{file_folder}/{metric}_{title_ads}.jpg"
        save_name = save_name.replace(",", "_")
        save_name = save_name.replace(" ", "_")
        plt.savefig(save_name)
        # This function is called in loops; without closing, every saved
        # figure stays alive in memory until the cell finishes.
        plt.close(fig)
    else:
        plt.show()

In [None]:
# LOAD DATASET
# Choose the results folder by leaving exactly one assignment uncommented.
# file_folder = "hcp_17_51"
# file_folder = "mutag"
# file_folder = "uj_gpu"
file_folder = "hcp_rs_606"
# file_folder = 'hcp_17_49'
# file_folder = "hcp_17_ex"

file_folder = f"./training_visualizations/{file_folder}"

file_list = [f"{file_folder}/{f}" for f in listdir(file_folder) if isfile(join(file_folder, f))]

file_ident = "metric_results"

metric_results_file_list = [x for x in file_list if file_ident in x]
if not metric_results_file_list:
    # Fail here with a clear message instead of an IndexError further down.
    raise FileNotFoundError(f"No files containing '{file_ident}' found in {file_folder}")


# CREATE A SINGLE DF WITH ALL RESULTS
# Collect one single-row frame per file and concatenate once; concatenating
# inside the loop re-copies everything accumulated so far on each iteration.
folder_name = file_folder.split("/")[-1]
result_frames = []
for file in metric_results_file_list:
    df = pd.read_excel(file).T
    df.columns = df.loc["Unnamed: 0"]
    df = df.drop("Unnamed: 0", axis=0)
    # Run name = file name without the leading "<folder>_" prefix and without
    # the trailing identifier plus six more characters
    # (presumably "_" and ".xlsx" - TODO confirm against the writer).
    df.index = [file.split("/")[-1][len(folder_name) + 1:-len(file_ident) - 6]]
    result_frames.append(df)
dffinal = pd.concat(result_frames, axis=0)

# DECODE VALUES OF PARAMETERS
run_names = dffinal.index.tolist()
name_parts = [run_name.split("_") for run_name in run_names]

ifl = 0
dffinal["Threshold"] = [parts[ifl] for parts in name_parts]
# When the first run name contains "empty", the remaining positional fields
# are read one position further to the right.
if run_names[0].find("empty") != -1:
    ifl += 1

dffinal["Hidden Channels"] = [parts[ifl + 2] for parts in name_parts]

dffinal["Features"] = [decode_feature_string(run_name) for run_name in run_names]
dffinal["Model"] = [parts[ifl + 6] for parts in name_parts]
dffinal["Graph type"] = [parts[ifl + 8] for parts in name_parts]
dffinal["Batch size"] = [parts[ifl + 9] for parts in name_parts]

name_cols = ["Threshold", "Hidden Channels", "Features", "Model", "Batch size", "Graph type"]
# One pass over the rows; the old expression rebuilt the full value matrix per
# row and returned wrong rows if index labels were duplicated.
dffinal["x_names"] = ["\n".join(row) for row in dffinal[name_cols].values.tolist()]


In [None]:
# PLOT N BEST MODELS for every (Features, Model, Batch size) combination
n_best = 20
metric = "MCC"
metrics_list = ['Accuracy', 'Balanced_accuracy_score', 'F1macro', 'F1micro',
                'F1weighted', 'MCC', 'precision', 'recall_score']

# A single multi-key groupby visits the same groups, in the same sorted order,
# as three nested single-key groupbys.
for (name1, name2, name3), group3 in dffinal.groupby(["Features", "Model", "Batch size"]):
    print(group3)
    # ``np.float`` was removed in NumPy 1.24. ``astype`` returns a new frame,
    # so nothing is assigned into a groupby slice. ``nlargest`` needs numeric
    # columns, hence the cast before ranking.
    group3 = group3.astype({name: float for name in metrics_list})
    # Feature labels contain newlines; flatten them for the title / file name.
    local_name1 = name1.replace('\n', ' ')
    nlargest_and_plot(dftemp=group3.nlargest(n_best, metric), metric=metric,
                      title_ads=f"{local_name1, name2, name3}", save=True,
                      )


In [None]:
# a function to select only a part of the results
def show_only(x):
    """Return True if ``x`` contains both "full" and "Feather", else False.

    Always returns a real bool. Previously a name containing "full" but not
    "Feather" fell through and returned None.
    """
    # Alternative filter used in earlier experiments: "_GCNse_" in x
    return "full" in x and "Feather" in x

In [None]:
# dffinal[dffinal.index.map(lambda x: show_only(x)) == True].sort_values(by="MCC")

In [None]:
# PLOT IN GROUPS
name_cols = ["Threshold", "Hidden Channels", "Features", "Model", "Batch size", "Graph type"]
for col in name_cols:
    for name, group in dffinal.groupby(col):
        # Put the group value in the title. Without it every group of a column
        # was saved under the same file name, so only the last one survived.
        # Feature labels contain newlines, which are flattened here.
        group_label = str(name).replace("\n", " ")
        nlargest_and_plot(dftemp=group, metric="MCC", tail=20, save=True,
                          title_ads=f"grouped by {col}, {group_label}")

In [None]:
# PLOT N BEST MODELS
n_best = 20
metrics_list = ['Accuracy', 'Balanced_accuracy_score', 'F1macro', 'F1micro',
                'F1weighted', 'MCC', 'precision', 'recall_score']
# Cast once, outside the loop. ``np.float`` was removed in NumPy 1.24; the
# builtin ``float`` is the same dtype. ``nlargest`` needs numeric columns.
dffinal[metrics_list] = dffinal[metrics_list].astype(float)
for metric in metrics_list:
    nlargest_and_plot(dftemp=dffinal.nlargest(n_best, metric), metric=metric, save=True, title_ads=f"{n_best} best models")

In [None]:
# Bundle the saved-figure folder (the default ``file_folder`` of
# ``nlargest_and_plot``) into a zip archive. Runs the external ``zip``
# command through the shell, so it must be available on PATH.
!zip -r hcp_rs_606_results_vis.zip hcp_rs_606_results_vis