## Validation Data

In [None]:
import os
import pandas as pd
import torch
from pathlib import Path
from tqdm import tqdm
import glob

# Real-world validation graphs to analyse.
names = ["karate", "airports", "facebook", "actor", "wiki"]

# Every dataset is read relative to the same root so that the per-graph
# metrics and the per-file settings are guaranteed to come from one directory.
data_root = Path("../data")

# NOTE: torch.load unpickles arbitrary Python objects; only point this
# notebook at data files you trust.

# Metrics of each real-world graph, taken from its first raw file (0.pt) and
# reused for every file of that graph below.
real_world_metrics = {}
for name in tqdm(names):
    if name == "synthetic":
        continue
    data = torch.load(data_root / "validation" / name / "raw" / "0.pt")
    real_world_metrics[name] = data.metrics

# One metrics dict and one settings dict per loaded file, in matching order
# (the two frames are joined positionally further down).
metrics = []
settings = []

for name in tqdm(names):
    data_dir = data_root / "validation" / name / "raw"
    # Sorted so the row order does not depend on the file system.
    data_files = sorted(glob.glob(f"{data_dir}/*.pt"))

    for data_file in tqdm(data_files):
        data = torch.load(data_file)
        if name == "synthetic":
            metrics.append(data.metrics)
        else:
            metrics.append(real_world_metrics[name])
            # Tag real-world samples with the dataset they belong to.
            data.settings["graph_type"] = name
        settings.append(data.settings)

# Add the synthetic training samples; their settings are stored unchanged.
training_data_files = sorted(glob.glob(f"{data_root / 'training' / 'synthetic' / 'raw'}/*.pt"))
for data_file in tqdm(training_data_files):
    data = torch.load(data_file)
    metrics.append(data.metrics)
    settings.append(data.settings)

# Persist both tables and read them back from disk.
df_metrics = pd.DataFrame(metrics)
df_settings = pd.DataFrame(settings)
df_metrics.to_csv("metrics.csv")
df_settings.to_csv("settings.csv")

df_settings = pd.read_csv("settings.csv")
df_metrics = pd.read_csv("metrics.csv")
# The saved index comes back as "Unnamed: 0" in both frames; it is dropped
# from the metrics frame only, so the join below has no overlapping column.
df_metrics = df_metrics.drop(columns=["Unnamed: 0"])
rw_metrics = df_metrics.join(df_settings)

rw_metrics["percentages_infected"] = rw_metrics["currently_infected"] / rw_metrics["n_nodes"]
rw_metrics


In [None]:
# Print the shape of the `x` attribute of every processed training sample.
processed_dir = Path("../data/training/synthetic/processed")
training_data_files = glob.glob(f"{processed_dir}/*.pt")
for data_file in tqdm(training_data_files):
    data = torch.load(data_file)
    feature_shape = data.x.shape
    print(feature_shape)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Set the style
sns.set(style="whitegrid", font_scale=1.2)

# Readable legend labels, keyed by the raw graph_type values.
rename_dict = {
    "barabasi_albert":"Barabasi-Albert",
    "watts_strogatz":"Watts-Strogatz",
    "actor":"Actor",
    "airports":"Airports",
    "facebook":"Facebook",
    "karate":"Karate",
    "wiki":"Wiki",
    "real":"Real-World"
}
# Reverse lookup: at the end of this cell graph_type is replaced by its display
# name, so on a re-run the values must be mapped back before they are compared
# with the raw synthetic type names (otherwise every row would become "real").
raw_names = {display: raw for raw, display in rename_dict.items()}
synthetic_types = ("barabasi_albert", "watts_strogatz")
raw_graph_type = rw_metrics["graph_type"].map(lambda x: raw_names.get(x, x))

# synthetic_or_real keeps the name of the synthetic generator and collapses
# every other graph type into "real"; sorting_column orders the rows
# Barabasi-Albert, then Watts-Strogatz, then the real-world graphs.
rw_metrics["synthetic_or_real"] = raw_graph_type.apply(lambda x: x if x in synthetic_types else "real")
rw_metrics["sorting_column"] = raw_graph_type.apply(lambda x: synthetic_types.index(x) if x in synthetic_types else 2)
rw_metrics = rw_metrics.sort_values(by=["sorting_column"], ascending=True)
print(rw_metrics["graph_type"].unique())
print(rw_metrics["synthetic_or_real"].unique())

# One colour per graph type; the marker area scales with the edge count
# relative to the largest graph.
g1 = sns.JointGrid(
    data=rw_metrics,
    x="average_clustering_coefficient",
    y="average_shortest_path_length",
    hue="graph_type",
)
g1.plot_joint(sns.scatterplot, edgecolor="black", linewidth=0.5, s=rw_metrics["n_edges"]/rw_metrics["n_edges"].max() * 700, alpha=.5)

# The marginal histograms only use the synthetic graphs.
rw_metrics_filtered = rw_metrics[rw_metrics["synthetic_or_real"] != "real"]
g1.plot_marginals(sns.histplot, data=rw_metrics_filtered, hue="synthetic_or_real")

# Customize the plot
g1.ax_marg_x.set_axis_off()  # Hide x-axis marginal plot
g1.ax_marg_y.set_axis_off()  # Hide y-axis marginal plot

# Set axis labels
g1.set_axis_labels("Clustering Coefficient", "Average Shortest Path Length", labelpad=15)

# Rebuild the legend in the top-left corner with readable entry names. The
# labels are renamed on the legend itself because the scatter artists were
# created with the raw graph_type values; unknown labels are kept unchanged.
handles, labels = g1.ax_joint.get_legend_handles_labels()
g1.ax_joint.legend(
    handles,
    [rename_dict.get(label, label) for label in labels],
    title='Graph Type',
    loc="upper left",
)

# Store the display names in the frame as well so that later cells plotting
# hue="graph_type" show the readable names. `.get` keeps values without a
# mapping (and values that were already renamed) instead of raising KeyError.
rw_metrics["graph_type"] = rw_metrics["graph_type"].map(lambda x: rename_dict.get(x, x))

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Set the style
sns.set(style="whitegrid", font_scale=1.2)

# Clustering coefficient against graph size, one colour per graph type, with
# histograms of both variables on the margins.
g = sns.JointGrid(
    data=rw_metrics,
    x="average_clustering_coefficient",
    y="n_nodes",
    hue="graph_type",
)
g.plot(sns.scatterplot, sns.histplot)

# Customize the plot
g.ax_marg_x.set_axis_off()  # Hide x-axis marginal plot
g.ax_marg_y.set_axis_off()  # Hide y-axis marginal plot

# The y axis shows n_nodes, so it is labelled as the node count.
g.set_axis_labels("Clustering Coefficient", "Amount of Nodes", labelpad=15)
# Place the legend outside the joint axes, to the right of the plot.
sns.move_legend(g.ax_joint, "lower right", title='Graph Type', bbox_to_anchor=(1.8, 1))

# Show the plot
plt.tight_layout()
plt.show()

## Training Data

In [None]:
# iterate over every data object in the data/training/synthetic/raw folder
import os
import pandas as pd
import torch
from pathlib import Path
from tqdm import tqdm
import glob

data_dir = Path("../data/training/synthetic/raw")
# Sorted so the row order does not depend on the file system.
data_files = sorted(glob.glob(f"{data_dir}/*.pt"))
if not data_files:
    # Fail early with a clear message instead of erroring later on an empty CSV.
    raise FileNotFoundError(f"no .pt files found in {data_dir}")

# One metrics dict and one settings dict per loaded file, in matching order
# (the two frames are joined positionally further down).
metrics = []
settings = []

# NOTE: torch.load unpickles arbitrary Python objects; only load trusted files.
for data_file in tqdm(data_files):
    data = torch.load(data_file)
    metrics.append(data.metrics)
    settings.append(data.settings)

# Persist both tables and read them back from disk.
df_metrics = pd.DataFrame(metrics)
df_settings = pd.DataFrame(settings)
df_metrics.to_csv("metrics.csv")
df_settings.to_csv("settings.csv")

df_settings = pd.read_csv("settings.csv")
df_metrics = pd.read_csv("metrics.csv")
# The saved index comes back as "Unnamed: 0" in both frames; it is dropped
# from the metrics frame only, so the join below has no overlapping column.
df_metrics = df_metrics.drop(columns=["Unnamed: 0"])
df_metrics = df_metrics.join(df_settings)

df_metrics["percentages_infected"] = df_metrics["currently_infected"] / df_metrics["n_nodes"]
df_metrics


In [None]:
# Show every column name of the joined metrics/settings frame.
print(df_metrics.columns.tolist())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Set the style
sns.set(style="whitegrid", font_scale=1.2)

# Average shortest path length against graph size, one colour per value of
# `neighbours`, with histograms of both variables on the margins.
g1 = sns.JointGrid(
    data=df_metrics,
    x="average_shortest_path_length",
    y="n_nodes",
    hue="neighbours",
)
g1.plot(sns.scatterplot, sns.histplot)

# rotate the x-axis labels
g1.ax_joint.tick_params(axis='x', rotation=45)

# Customize the plot
g1.ax_marg_x.set_axis_off()  # Hide x-axis marginal plot
g1.ax_marg_y.set_axis_off()  # Hide y-axis marginal plot

# Labels name the plotted columns: x is the average shortest path length and
# the colours encode `neighbours`.
g1.set_axis_labels("Average Shortest Path Length", "Amount of Nodes", labelpad=15)
# Place the legend outside the joint axes, to the right of the plot.
sns.move_legend(g1.ax_joint, "lower right", title='Neighbours', bbox_to_anchor=(1.8, 1))

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# display the average_shortest_path_length as a boxplot with annotations for the different values of neighbours
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Apply the shared seaborn theme
sns.set(style="whitegrid", font_scale=1.2)

# Appearance options for the boxes; outliers are not drawn (showfliers=False).
box_style = dict(
    palette="Set3",
    linewidth=2.5,
    showfliers=False,
    width=0.5,
    dodge=False,
)

# One box per `neighbours` value; hue repeats the x variable so each box gets
# its own colour and legend entry.
g1 = sns.boxplot(
    data=df_metrics,
    x="neighbours",
    y="average_shortest_path_length",
    hue="neighbours",
    **box_style,
)

# Axis labels and title in a single call
g1.set(
    xlabel="Neighbours",
    ylabel="Average Shortest Path Length",
    title="Average Shortest Path Length for different values of neighbours",
)
g1.legend(loc="upper right", title="Neighbours")

# Render the figure
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Set the style
sns.set(style="whitegrid", font_scale=1.2)

# Draw one joint plot per prob_reconnect bin. The first bin starts at 0 and
# ends at 0.11; the following bins are 0.02 wide. Bins are half-open
# [last, prob_reconnect) so a sample on a boundary is plotted only once.
last = 0
for prob_reconnect in np.arange(0.11, 0.21, 0.02):
    print(f"range: {last} - {prob_reconnect}")
    in_bin = (df_metrics["prob_reconnect"] >= last) & (df_metrics["prob_reconnect"] < prob_reconnect)
    df = df_metrics[in_bin]

    if df.empty:
        # Without data there is no legend to move, so skip the figure.
        print("no samples in this range, skipping plot")
    else:
        # Average shortest path length against graph size, coloured by `neighbours`.
        g1 = sns.JointGrid(
            data=df,
            x="average_shortest_path_length",
            y="n_nodes",
            hue="neighbours",
        )
        g1.plot(sns.scatterplot, sns.histplot)

        # Customize the plot
        g1.ax_marg_x.set_axis_off()  # Hide x-axis marginal plot
        g1.ax_marg_y.set_axis_off()  # Hide y-axis marginal plot

        # Labels name the plotted columns.
        g1.set_axis_labels("Average Shortest Path Length", "Amount of Nodes", labelpad=15)
        sns.move_legend(g1.ax_joint, "lower right", title='Neighbours', bbox_to_anchor=(1.8, 1))

        # Show the plot, then release the figure so the loop does not pile up open figures.
        plt.tight_layout()
        plt.show()
        plt.close(g1.ax_joint.figure)
    last = prob_reconnect



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Set the style
sns.set(style="whitegrid", font_scale=1.2)

# Draw one joint plot per prob_reconnect bin. The first bin starts at 0 and
# ends at 0.11; the following bins are 0.02 wide. Bins are half-open
# [last, prob_reconnect) so a sample on a boundary is plotted only once.
last = 0
for prob_reconnect in np.arange(0.11, 0.21, 0.02):
    print(f"range: {last} - {prob_reconnect}")
    in_bin = (df_metrics["prob_reconnect"] >= last) & (df_metrics["prob_reconnect"] < prob_reconnect)
    df = df_metrics[in_bin]

    if df.empty:
        # Without data there is no legend to move, so skip the figure.
        print("no samples in this range, skipping plot")
    else:
        # Clustering coefficient against graph size, coloured by `neighbours`.
        g1 = sns.JointGrid(
            data=df,
            x="average_clustering_coefficient",
            y="n_nodes",
            hue="neighbours",
        )
        g1.plot(sns.scatterplot, sns.histplot)

        # Customize the plot
        g1.ax_marg_x.set_axis_off()  # Hide x-axis marginal plot
        g1.ax_marg_y.set_axis_off()  # Hide y-axis marginal plot

        # The colours encode `neighbours`, so the legend is titled accordingly.
        g1.set_axis_labels("Clustering Coefficient", "Amount of Nodes", labelpad=15)
        sns.move_legend(g1.ax_joint, "lower right", title='Neighbours', bbox_to_anchor=(1.8, 1))

        # Show the plot, then release the figure so the loop does not pile up open figures.
        plt.tight_layout()
        plt.show()
        plt.close(g1.ax_joint.figure)
    last = prob_reconnect

