In [None]:
from pathlib import Path
import json
import pandas as pd

# Root directory of the reports; each sub-folder is read as one model run.
path = Path('../../reports')

# iterdir() yields entries in arbitrary order, so sort to get the same row
# order (and therefore the same plot order) on every run.
folders = sorted(x for x in path.iterdir() if x.is_dir())

# Each folder holds several JSON report files (e.g. "synthetic.json").
# Build one dataframe row per report, keeping only the flattened
# "metrics" and "data stats" sections of the file.
frames = []
for folder in folders:
    # The model name is the folder name up to the first underscore.
    model = folder.name.split("_")[0]
    # Only *.json files are reports; anything else in the folder is skipped
    # instead of being handed to json.load.
    files = sorted(
        x for x in folder.iterdir()
        if x.is_file() and x.suffix.lower() == ".json"
    )
    for file in files:
        with open(file, encoding="utf-8") as json_file:
            report = json.load(json_file)
        sections = {k: v for k, v in report.items() if k in ['metrics', 'data stats']}
        # Nested keys become dotted column names such as "metrics.<name>".
        row = pd.json_normalize(sections)
        # The file stem (e.g. "synthetic") is stored as the report type.
        row['type'] = file.name.split(".")[0]
        row["model"] = model
        frames.append(row)

# Concatenate once at the end instead of growing the frame inside the loop;
# fall back to an empty frame when no report was found.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df


In [3]:

# Intended: normalize the column 'metrics.avg min matching distance' by the
# column 'data stats.graph stats.avg number of nodes'.
# NOTE(review): the statement below assigns the column to itself, so the values
# are left unchanged and no normalization is applied — confirm whether a
# division by the node-count column is still wanted here.
df['metrics.avg min matching distance'] = df['metrics.avg min matching distance']

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Split the loaded reports: rows of type 'synthetic' are plotted in this cell,
# all remaining rows stay in `df` for the cells below.
# NOTE: `df` is rebound here, so re-running this cell without re-running the
# loading cell leaves `df_synthetic` empty.
is_synthetic = df['type'].isin(['synthetic'])
# .copy() makes an independent frame, so adding the 'size' column below does
# not write into a slice of `df` (avoids SettingWithCopyWarning).
df_synthetic = df[is_synthetic].copy()
df = df[~is_synthetic]

metrics = ["avg min matching distance", "avg F1 score", "False positive rate", "True positive rate"]

# Label every row small / medium / large from its model name; names that
# contain neither 'small' nor 'medium' are labelled 'large'.
df_synthetic['size'] = df_synthetic['model'].apply(
    lambda x: 'small' if 'small' in x else ('medium' if 'medium' in x else 'large')
)

# One figure per (size, metric) combination.
for size in df_synthetic['size'].unique():

    df_synthetic_size = df_synthetic[df_synthetic['size'] == size].copy()

    # Put models containing "unsup" on the left. sorted() is ascending and
    # False < True, so the key has to be False for the "unsup" models.
    order = sorted(df_synthetic_size['model'].unique(), key=lambda x: 'unsup' not in x)
    # The categorical dtype carries this order over to the hue levels.
    df_synthetic_size['model'] = pd.Categorical(df_synthetic_size['model'], order)
    df_synthetic_size = df_synthetic_size.sort_values('model')

    for metric in metrics:
        g = sns.catplot(
            data=df_synthetic_size, kind="bar",
            x="size", y=f"metrics.{metric}", hue="model",
            errorbar="sd", palette="dark", alpha=.6, height=6,
        )

        g.despine(left=True)
        g.set_axis_labels("", metric)
        plt.title(f"{metric}")
        plt.show()
    


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

metrics = ["avg min matching distance", "avg F1 score", "False positive rate", "True positive rate"]

# Drop the "GCNR-hard" model. .copy() gives an independent frame so the
# column assignment below does not write into a slice of the previous `df`.
df = df[df["model"] != "GCNR-hard"].copy()

# The model order does not depend on the metric, so it is fixed once here
# rather than inside the plotting loop.
# Tuples compare element by element and False < True, so names without any
# keyword come first, then names containing only 'unsup', then 'small',
# 'medium' and finally 'hard'.
order = sorted(
    df['model'].unique(),
    key=lambda x: ('hard' in x, 'medium' in x, 'small' in x, 'unsup' in x),
)
# The categorical dtype carries this order over to the hue levels.
df['model'] = pd.Categorical(df['model'], order)
df = df.sort_values('model')

# One bar chart per metric: report type on the x axis, one bar per model.
for metric in metrics:
    g = sns.catplot(
        data=df, kind="bar",
        x="type", y=f"metrics.{metric}", hue="model",
        errorbar="sd", palette="dark", alpha=.6, height=6
    )
    g.despine(left=True)
    g.set_axis_labels("", metric)
    plt.title(f"{metric}")
    # Anchor the y axis at zero and leave the upper limit automatic.
    plt.ylim(0, None)
    plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Exclude the "unsup" and "GCNR-hard" models from the plots below.
df = df[~df["model"].isin(["unsup", "GCNR-hard"])]

# When 'model' is categorical, the filtered-out models are still listed as
# categories. seaborn takes its hue levels from the categories, so drop the
# unused ones to avoid empty bar slots and legend entries.
if isinstance(df["model"].dtype, pd.CategoricalDtype):
    df = df.assign(model=df["model"].cat.remove_unused_categories())

metrics2 = ["avg rank of source", "roc score"]

# One bar chart per metric: report type on the x axis, one bar per model.
for metric in metrics2:
    g = sns.catplot(
        data=df, kind="bar",
        x="type", y=f"metrics.{metric}", hue="model",
        errorbar="sd", palette="dark", alpha=.6, height=6
    )
    g.despine(left=True)
    g.set_axis_labels("", metric)
    plt.title(f"{metric}")
    plt.show()