In [28]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import json

In [29]:
# Experiment grid. `j` and `r` are the values that the plotting cell further
# down labels "Participant Size" and "Healthy Ratio"; `m` indexes the dataset
# variants generated for each (j, r) combination.
j_values = [50, 200, 500]
r_values = [0.1, 0.25, 0.5, 0.75, 0.9]
m_values = range(50)  # From 0 to 49 (inclusive)
temp_json_results_dir = 'temp_json_results/soft_kmeans'

# Errors that mean "this result file is missing or malformed":
#   OSError     -> file missing / unreadable
#   ValueError  -> invalid JSON or undecodable bytes (json.JSONDecodeError and
#                  UnicodeDecodeError are both ValueError subclasses)
#   the rest    -> JSON parsed but does not have the expected nested shape
# Catching these explicitly (instead of a bare `except:`) keeps the
# best-effort behaviour without swallowing KeyboardInterrupt / SystemExit.
unreadable_result_errors = (
    OSError, ValueError, AttributeError, IndexError, KeyError, TypeError
)

dic = {}
# Run metadata stored next to the results. n_iter and n_biomarkers are recorded
# here by hand; they are not read from the per-dataset files.
dic['param'] = {
    # Derived from m_values so the metadata cannot drift from the loop below.
    "num_of_datasets_per_combination": len(m_values),
    "n_iter": 5000,
    "n_biomarkers": 10
}
not_available = 0
for j in j_values:
    for r in r_values:
        # Key format "<int(j*r)>|<j>"; the plotting function rebuilds the same key.
        combstr = f"{int(j*r)}|{j}"
        if combstr not in dic:
            dic[combstr] = []
        for m in m_values:
            try:
                with open(f"{temp_json_results_dir}/temp_results_{j}_{r}_{m}.json") as f:
                    d = json.load(f)
                # The value of interest is the first element of the first entry in the file.
                tau = list(d.values())[0][0]
            except unreadable_result_errors as err:
                # Keep the slot (as NaN) so index m still lines up with dataset m,
                # and report which file failed and why.
                not_available += 1
                print(f"{combstr}_{m}: {type(err).__name__}: {err}")
                tau = np.nan
            dic[combstr].append(tau)

print(f"not available: {not_available}")
# NOTE: json.dump writes missing values as the bare token NaN (Python's default
# allow_nan=True). Python's json.load reads it back, but strict JSON parsers may not.
with open('results/results.json', "w") as file:
    json.dump(dic, file, indent=4)

not available: 0


In [30]:
def plot_tau_synthetic(
        tau_file,
        ns,
        rs,
        num_of_datasets_per_combination,
        plot_name = 'violin_plot',
        method = "Soft K-Means"
    ):
    """Plot the distribution of Kendall's tau per (participant size, healthy ratio).

    Reads the aggregated results JSON, builds one row per dataset variant and
    draws a grouped box plot or violin plot, which is saved to
    ``results/<plot_name>.png``. Nothing is shown inline and nothing is returned.

    Parameters
    ----------
    tau_file : str
        Path to a JSON file with a ``'param'`` entry (run metadata) and one
        list of tau values under each ``"<int(n*r)>|<n>"`` key.
    ns : iterable of int
        Participant sizes to plot (x axis).
    rs : iterable of float
        Healthy ratios to plot (hue); shown as percentages in the legend.
    num_of_datasets_per_combination : int
        How many tau values to read from each combination's list.
    plot_name : str, default 'violin_plot'
        Output file stem. A box plot is drawn if it contains ``'boxplot'``,
        otherwise a violin plot.
    method : str, default "Soft K-Means"
        Method name shown in the title and in the caption.

    Raises
    ------
    KeyError
        If ``'param'``, one of its required fields, or a requested
        combination key is missing from the file.
    IndexError
        If a combination holds fewer than ``num_of_datasets_per_combination``
        values.
    """
    with open(tau_file) as f:
        data = json.load(f)
    param = data['param']
    print(param)

    # One record per (size, ratio, dataset variant).
    records = [
        {
            "n": f"$J={n}$",
            # Round before converting: r*100 can land just below an integer in
            # floating point (0.57*100 == 56.99999999999999), which a plain
            # int() would truncate to the wrong percentage.
            "r": f"{int(round(r*100))}%",
            # The key must keep int(n*r) to match how the results file was written.
            "tau": data[f"{int(n*r)}|{n}"][m],
        }
        for n in ns
        for r in rs
        for m in range(num_of_datasets_per_combination)
    ]
    df = pd.DataFrame(records)

    fig, ax = plt.subplots(figsize=(12, 8))

    # Box plot when requested through the plot name, violin plot otherwise.
    if 'boxplot' in plot_name:
        sns.boxplot(data=df, x="n", y="tau", hue="r", palette="bright", ax=ax)
    else:
        sns.violinplot(data=df, x="n", y="tau", hue="r", palette="bright",
                       dodge=True, alpha=0.6, linewidth=0, ax=ax)

    # Fixed y-range so plots from different runs are directly comparable.
    ax.set_ylim(-0.5, 1)
    ax.set_xlabel("Participant Size", fontsize=14)
    ax.set_ylabel("Kendall's Tau", fontsize=14)
    ax.set_title(f"Kendall's Tau values across different combinations in synthetic data ({method})", fontsize=16)

    # Place the legend outside the axes, to the right.
    ax.legend(title="Healthy Ratio", title_fontsize='13', fontsize='10',
              bbox_to_anchor=(1.05, 1), loc='upper left')

    print(param['num_of_datasets_per_combination'])
    print(param['n_iter'])

    # Multi-line caption below the axes. It is built from the method argument
    # and the stored run metadata so it cannot contradict the title; the
    # biomarker count falls back to 10 for files without that field.
    n_biomarkers = param.get('n_biomarkers', 10)
    caption_text = (
        "Notes:\n"
        "\n"
        "This figure shows Kendall's Tau for different combinations of participant size and healthy ratio.\n"
        f"Each combination has {param['num_of_datasets_per_combination']} variants of datasets.\n"
        f"The results are derived from our own implementation of {method} based on synthetic data with {n_biomarkers} biomarkers.\n"
        f"Number of iterations is {param['n_iter']}."
    )
    fig.text(
        0.05, -0.01, caption_text, ha='left', va='top',
        fontsize=12, wrap=True
    )

    fig.tight_layout()
    # bbox_inches='tight' expands the saved area so the outside legend and the
    # below-axes caption are not cropped.
    fig.savefig(f'results/{plot_name}.png', bbox_inches='tight')
    # Close this figure explicitly so repeated calls do not accumulate open figures.
    plt.close(fig)

In [31]:
# Inputs for the plotting calls in the following cells.
# Alternative input kept for reference: 'results/cp_results.json'.
num_of_datasets_per_combination = 50
ns, rs = [50, 200, 500], [0.1, 0.25, 0.5, 0.75, 0.9]
tau_file = "results/results.json"

In [32]:
# Box-plot rendering; the function saves it as results/boxplot.png.
plot_tau_synthetic(
    tau_file=tau_file,
    ns=ns,
    rs=rs,
    num_of_datasets_per_combination=num_of_datasets_per_combination,
    plot_name='boxplot',
)

{'num_of_datasets_per_combination': 50, 'n_iter': 5000, 'n_biomarkers': 10}
50
5000


In [33]:
# Default plot_name ('violin_plot') selects the violin rendering,
# saved as results/violin_plot.png.
plot_tau_synthetic(
    tau_file=tau_file,
    ns=ns,
    rs=rs,
    num_of_datasets_per_combination=num_of_datasets_per_combination,
)

{'num_of_datasets_per_combination': 50, 'n_iter': 5000, 'n_biomarkers': 10}
50
5000
