# Packages Loading


In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pingouin as pg
import yaml
from tqdm import tqdm

from analysis.dataset.june_dataset import JuNEDataset
from analysis.metrics.metrics_graph import GraphMetrics
from analysis.metrics.utils.graph_tools import dataframe_to_graphviz


def read_config(config_path: Path = Path("data_config.yaml")) -> dict:
    """Load the notebook's YAML configuration file.

    Args:
        config_path: Location of the YAML file to read.

    Returns:
        The parsed YAML document. An empty dict is returned when the file is
        empty or when it cannot be parsed as YAML.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    import warnings

    with config_path.open("r") as stream:
        try:
            loaded = yaml.safe_load(stream)
        except yaml.YAMLError as err:
            # The best-effort fallback to an empty config is kept, but the
            # failure is made visible instead of being swallowed silently.
            warnings.warn(f"Could not parse {config_path}: {err}", stacklevel=2)
            return {}

    # safe_load yields None for an empty document; callers use `.get(...)` on
    # the result, so normalise that case to an empty dict.
    return loaded or {}


# Notebook-wide settings, read from the default path `data_config.yaml`.
# The cells below look up file locations in it via `config.get(...)`.
config = read_config()


# Read Data


In [None]:
# Load the action table and the label mapping from the configured locations
# (both use their first CSV column as the index), then join them on
# `action_id` with pandas' default inner join.
dataset_path = config.get("dataset_path")
df_hack = pd.read_csv(dataset_path, index_col=0)

label_mapping_path = config.get("label_mapping_path")
df_labels = pd.read_csv(label_mapping_path, index_col=0)

df_hack = pd.merge(df_hack, df_labels, on='action_id')

df_hack.head()


In [None]:
# Wrap the merged table in the project's dataset class.
june = JuNEDataset(df_hack)
# NOTE(review): presumably prepares `june.df` in place — confirm in JuNEDataset.
june.prepare_dataset()
june.df.head()


# Initialize Metrics Processor


In [None]:
# Metrics calculator reused by every cell below (`calculate_kernel_metrics`,
# `calculate_metrics`, `graph_metrics_mapping`).
processor = GraphMetrics()


# Metrics Calculation


In [None]:
# Take the rows of one kernel — the first group key — as a small sample for
# trying out the per-kernel metric computation.
grouped = june.df.groupby("kernel_id")
kernel_ids = list(grouped.groups.keys())
first_kernel_id = kernel_ids[0]
df_kernel = grouped.get_group(first_kernel_id)
df_kernel.head()


In [None]:
# Metrics for the single sample kernel selected above.
# `graph_metrics` is overwritten by the full-dataset computation in a later cell.
graph_metrics = processor.calculate_kernel_metrics(df_kernel)
graph_metrics.head()


In [None]:
# Metrics for the whole dataset; this replaces the single-kernel result above.
# NOTE(review): the later merge on `kernel_id` assumes the result carries a
# `kernel_id` column — confirm in GraphMetrics.calculate_metrics.
graph_metrics = processor.calculate_metrics(june.df)
graph_metrics.head()


In [None]:
# Kernel-level metadata: keep the last row seen for every
# (user_id, kernel_id, expert) combination, then attach it to the metrics.
meta_columns = ['user_id', 'kernel_id', 'expert', 'task']
dedup_keys = ['user_id', 'kernel_id', 'expert']
df_tmp = june.df[meta_columns].drop_duplicates(subset=dedup_keys, keep='last')

graph_metrics_merged = graph_metrics.merge(df_tmp, on='kernel_id')
graph_metrics_merged.head()


In [None]:
# Mean of every graph metric per (task, expert) group, rounded to two
# decimals and rendered as fixed-width strings for the report table.
metrics = list(processor.graph_metrics_mapping.keys())
group_means = graph_metrics_merged.groupby(['task', 'expert'])[metrics].mean()
graph_table = group_means.round(2).applymap('{:.2f}'.format)
graph_table


In [None]:
# Human-readable labels for the table. `np.nan` is used instead of the
# `np.NaN` alias, which was removed in NumPy 2.0.
pretty_lang = {'task1': 'DS', 'task2': 'ML', np.nan: 'All',
               False: 'Student', True: 'Professional'}

new_names = ['Task', 'Level of expertise', 'Modularity', 'Average degree', 'Average clustering coef.']


def _format_european(value: float) -> str:
    """Format with two decimals, '.' as thousands separator and ',' as decimal mark."""
    return "{:0_.2f}".format(value).replace('.', ',').replace('_', '.')


# Turn the (task, expert) index back into columns so all columns can be renamed.
graph_table = graph_table.reset_index()
graph_table.columns = new_names
graph_table = (
    graph_table
    .replace(pretty_lang)
    .set_index(['Task', 'Level of expertise'])
    .astype(float)  # the metric columns hold formatted strings at this point
    .applymap(_format_european)
)
print(graph_table.to_latex(escape=True))

# print(graph_table.to_latex(escape=True, multirow=True))


In [None]:
# `graph_table` was already relabelled, indexed by (Task, Level of expertise)
# and formatted as strings in the previous cell. Re-applying that pipeline here
# would fail in `set_index`, because the keys are index levels now rather than
# columns, so the finished table is simply displayed.
graph_table

In [None]:
# Two-factor ANOVA of modularity across expertise level and task.
# NOTE(review): `.anova` is not a pandas built-in; presumably it is the
# DataFrame method registered by the `pingouin` import — confirm.
graph_metrics_merged.anova(dv="modularity", between=["expert", "task"]).round(3)


In [None]:
# Two-factor ANOVA of average degree across expertise level and task.
# NOTE(review): `.anova` is not a pandas built-in; presumably it is the
# DataFrame method registered by the `pingouin` import — confirm.
graph_metrics_merged.anova(dv="average_degree", between=["expert", "task"]).round(3)


In [None]:
# Two-factor ANOVA of average clustering across expertise level and task.
# NOTE(review): `.anova` is not a pandas built-in; presumably it is the
# DataFrame method registered by the `pingouin` import — confirm.
graph_metrics_merged.anova(dv="average_clustering", between=["expert", "task"]).round(3)



# Display graph

In [None]:
# Render one graph image per (user_id, kernel_id) pair and keep the graphviz
# objects so they can be displayed inline afterwards.
graphs = {}
grouped = june.df.groupby(['user_id', 'kernel_id'])

for (user_id, kernel_id), g in tqdm(grouped):
    gv = dataframe_to_graphviz(g)
    gv.attr(rankdir='LR', size='100,100')
    # The file name includes kernel_id as well as user_id: groups are keyed by
    # both, so a name built from user_id alone would let a user's later kernels
    # overwrite the images of earlier ones.
    gv.render(
        directory='figures/graphs',
        format='png',
        filename=f"graph_{user_id}_{kernel_id}",
        cleanup=True,
    )
    graphs[(user_id, kernel_id)] = gv


In [None]:
# Show the first rendered graph inline as a quick visual check.
list(graphs.values())[0]


In [None]:
grouped = june.df.groupby("kernel_id")

# True: read previously computed evolution metrics from the configured CSV.
# False: recompute them from the dataset and write them to disk.
load = True
if not load:
    evolution_dfs = []
    for kernel_id, g in grouped:
        # Metrics of the kernel's first i rows, sampled every 10th row.
        snapshots = [
            processor.calculate_metrics(g.iloc[:i], progress=False)
            for i in tqdm(range(1, g.shape[0], 10))
        ]
        if not snapshots:
            # A one-row kernel yields an empty range; pd.concat would raise
            # ValueError on the empty list, so such kernels are skipped.
            continue
        # After concatenation with a fresh index, the row position is the
        # snapshot counter; expose it as the `state_num` column.
        evolution_dfs.append(
            pd.concat(snapshots, axis=0, ignore_index=True)
            .reset_index()
            .rename({"index": "state_num"}, axis=1)
        )

    all_evolutions = pd.concat(evolution_dfs, axis=0, ignore_index=True)
    # NOTE(review): results are written to a fixed path, but the load branch
    # reads `graph_evolution_path` from the config — confirm that both point
    # to the same file.
    all_evolutions.to_csv("../data/graph_evolution_distill.csv")
else:
    all_evolutions = pd.read_csv(config.get("graph_evolution_path"), index_col=0)

all_evolutions.head()


In [None]:
def continuous_transform(x, y, size: int = 1000, normalize: bool = True) -> tuple[np.ndarray, np.ndarray]:
    """Resample a curve onto an evenly spaced grid of ``size`` points.

    The samples in ``y`` are treated as evenly spaced between 0 and the upper
    bound of the grid. NaN samples are replaced by 0 before interpolation,
    and a constant ``1e-5`` is added to every interpolated value.

    Args:
        x: Original positions. Only their maximum is used, and only when
            ``normalize`` is False; otherwise ``x`` is ignored.
        y: Sample values of the curve.
        size: Number of points in the output grid.
        normalize: If True the grid spans ``[0, 1]``; otherwise it spans
            ``[0, max(x)]``.

    Returns:
        A ``(grid, values)`` pair of arrays, each of length ``size``.
    """
    upper = np.max(x) if not normalize else 1
    grid = np.linspace(0, upper, size)
    sample_positions = np.linspace(0, upper, len(y))

    raw = np.asarray(y)
    # np.where builds a new array, so the caller's data is never modified.
    cleaned = np.where(np.isnan(raw), 0, raw)

    return grid, np.interp(grid, sample_positions, cleaned) + 1e-5


In [None]:
import seaborn as sns

# Call seaborn's `reset_orig` before the plotting cells below.
# NOTE(review): presumably this restores matplotlib's original rc parameters —
# confirm against the seaborn documentation.
sns.reset_orig()


In [None]:
# The stylesheet is referenced by URL, so this cell needs network access.
plt.style.use('https://github.com/klieret/simple-science-style/raw/main/stylesheets/sss1.mplstyle')
# Override the font family for all following figures.
plt.rcParams["font.family"] = "Times New Roman"


In [None]:
# One panel per metric: every kernel's evolution curve in translucent black,
# with the mean over all kernels drawn on top in red.
fig, axs = plt.subplots(1, 3, figsize=(10, 2))

for i, metric in enumerate(metrics):
    ax = axs[i]
    curves = []
    for kernel_id, g in all_evolutions.groupby("kernel_id"):
        # Resample each kernel's curve onto a common grid so that curves of
        # different lengths can be averaged point-wise.
        x, y = continuous_transform(
            np.linspace(0, 1, g[metric].shape[0]),
            g[metric]
        )
        curves.append(y)
        ax.plot(x, y, alpha=0.2, color='k')

    ax.set_title(metric, fontsize=14)
    ax.plot(x, np.mean(curves, axis=0), color='firebrick', lw=4)

axs[1].set_xlabel("Normalized time", fontsize=14)
# Only the middle panel uses a logarithmic y axis.
axs[1].set_yscale("log")

# savefig does not create missing directories, so make sure the target exists.
output_path = Path("figures/evolution/graph_metrics.pdf")
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Kernel-level metadata: keep the last row seen for every
# (user_id, kernel_id, expert) combination, then attach it to the evolution table.
meta_columns = ['user_id', 'kernel_id', 'expert', 'task']
dedup_keys = ['user_id', 'kernel_id', 'expert']
df_tmp = june.df[meta_columns].drop_duplicates(subset=dedup_keys, keep='last')

graph_evolution_merged = all_evolutions.merge(df_tmp, on='kernel_id')
graph_evolution_merged.head()


In [None]:
# `df_tmp` is an alias, not a copy: the new column is added to
# `graph_evolution_merged` as well.
df_tmp = graph_evolution_merged

# Scale each snapshot counter by the largest counter of its kernel.
last_state = df_tmp.groupby('kernel_id')['state_num'].transform('max')
df_tmp['normalized_state'] = df_tmp['state_num'].div(last_state)

# Correlation matrix per (task, expert) group; only the row that correlates
# `normalized_state` with each metric is of interest.
corr_columns = ['normalized_state', *metrics]
df_corr = df_tmp.groupby(['task', 'expert'])[corr_columns].corr()

is_state_row = df_corr.index.get_level_values(-1) == 'normalized_state'
table = df_corr.loc[is_state_row, metrics].droplevel(2)
table.round(3)


In [None]:
# Emit the correlation table as LaTeX with two-decimal string formatting.
print(table.round(2).applymap('{:.2f}'.format).to_latex(escape=True, multirow=True))


In [None]:
# List every (metric, task, expert) combination whose correlation with the
# normalised state has a p-value above `alpha`.
alpha = 0.05
for task in ['task1', 'task2']:
    for expert in [True, False]:
        # The row filter depends only on the group, so build it once per group.
        in_group = (df_tmp.task == task) & (df_tmp.expert == expert)
        for metric in metrics:
            df_sample = df_tmp.loc[in_group, ['normalized_state', metric]]
            cor = pg.corr(df_sample.normalized_state, df_sample[metric])
            p_value = cor['p-val'].iloc[0]
            if p_value > alpha:
                r_value = cor['r'].iloc[0]
                print(metric, task, expert, r_value.round(2), p_value.round(2))
