In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# local `analysis` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from matplotlib import pyplot as plt

from analysis.dataset.june_dataset import JuNEDataset
from analysis.metrics.metrics_cells import CellsMetrics


def read_config(config_path: Path = Path("data_config.yaml")) -> dict:
    """Load the notebook configuration from a YAML file.

    Args:
        config_path: Location of the YAML file. Defaults to ``data_config.yaml``,
            resolved relative to the current working directory.

    Returns:
        The parsed content of the file. An empty dict is returned when the
        file is empty or when it cannot be parsed as YAML; in the latter case
        a warning is emitted so the failure is not silent.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    import warnings

    with config_path.open("r") as stream:
        try:
            loaded = yaml.safe_load(stream)
        except yaml.YAMLError as error:
            # Keep the best-effort fallback, but tell the user why the config is empty.
            warnings.warn(f"Could not parse {config_path}: {error}; using an empty config.")
            return {}

    # `yaml.safe_load` yields None for an empty document; callers expect a dict.
    return loaded if loaded is not None else {}

# Load the configuration from the default `data_config.yaml` in the working directory.
config = read_config()


In [None]:
# Paths of the raw log and of the label mapping come from the YAML config.
dataset_path = config.get("dataset_path")
label_mapping_path = config.get("label_mapping_path")

df_hack = pd.read_csv(dataset_path, index_col=0)
df_labels = pd.read_csv(label_mapping_path, index_col=0)

# Attach the labels to every logged row via the shared `action_id` column
# (pandas' default inner join).
df_hack = pd.merge(df_hack, df_labels, on='action_id')

df_hack.head()


In [None]:
# Wrap the merged log in the project's JuNEDataset and run its preparation step
# (what `prepare_dataset` does is defined in `analysis.dataset.june_dataset`).
june = JuNEDataset(df_hack)
june.prepare_dataset()
# Preview the frame exposed by the dataset object after preparation.
june.df.head()


In [None]:
# Build the evolution dataframe used by all later cells; its exact schema is
# defined by `JuNEDataset.to_evolution_dataframe` (not visible in this notebook).
evolution_df = june.to_evolution_dataframe()
evolution_df


In [None]:
# Sanity check: (rows, columns) of the evolution dataframe.
evolution_df.shape


In [None]:
# Boolean row selector keeping only the event types analysed below.
tracked_events = ["execute", "create", "delete", "rendered"]
mask = evolution_df.event.isin(tracked_events)

evolution_df[mask].head()


In [None]:
processor = CellsMetrics()

# Relabel every selected event as 'execute' before computing the metrics.
# `.assign` builds a new frame, so nothing is written into a slice of
# `evolution_df` and pandas does not raise SettingWithCopyWarning.
df_tmp = evolution_df[mask].assign(event='execute')

# NOTE(review): `.iloc[:]` selects all rows; it looks like a leftover from
# sub-sampling during development and is kept unchanged here.
df_analysis = processor.calculate_cell_metrics(df_tmp.iloc[:])
df_analysis.head()


In [None]:
def continuous_transform(x, y, size: int = 1000, normalize: bool = True) -> tuple[np.ndarray, np.ndarray]:
    """Resample a sequence of values onto an evenly spaced grid of ``size`` points.

    The values in ``y`` are treated as evenly spaced samples over ``[0, upper]``,
    where ``upper`` is 1 when ``normalize`` is true and ``max(x)`` otherwise.
    ``x`` is only consulted for that maximum. NaN entries of ``y`` are replaced
    by 0 before linear interpolation, and a constant ``1e-5`` is added to the
    interpolated values (presumably to keep them strictly positive -- confirm).

    Args:
        x: Original positions; only their maximum is used, and only when
            ``normalize`` is false.
        y: Values to resample; the input is copied, not modified.
        size: Number of points in the output grid.
        normalize: Whether the output grid spans ``[0, 1]`` instead of ``[0, max(x)]``.

    Returns:
        A tuple ``(grid, values)`` of two arrays of length ``size``.
    """
    if normalize:
        upper = 1
    else:
        upper = np.max(x)

    source_grid = np.linspace(0, upper, len(y))
    target_grid = np.linspace(0, upper, size)

    # Work on a copy so the caller's data is left untouched.
    values = np.array(y)
    values[np.isnan(values)] = 0

    resampled = np.interp(target_grid, source_grid, values) + 1e-5
    return target_grid, resampled


In [None]:
import seaborn as sns
# Restore matplotlib's rc parameters to their original values so that seaborn
# styling does not leak into the figures below.
sns.reset_orig()


In [None]:
# The stylesheet is fetched from GitHub at run time, so this cell needs network access.
plt.style.use('https://github.com/klieret/simple-science-style/raw/main/stylesheets/sss1.mplstyle')
plt.rcParams["font.family"] = "Times New Roman"

# One panel per task: a faint line per notebook plus the bold mean curve.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))
metric_name = 'ccn'
aggregation_function = 'sum'
task_ax_mapping = {'task1': ax1, 'task2': ax2}
mean_curve = {'task1': [], 'task2': []}

# Keep only rows whose `comments` value is at or below the 95th percentile.
# `df` is reused by the summary-table cell further down.
df = df_analysis
df = df[df.comments <= df.comments.quantile(0.95)]

size_threshold = 200  # (task, kernel) groups with fewer rows are not plotted
mn, mx = np.inf, -np.inf
for (task, kernel_id), g in df.groupby(['task', 'kernel_id']):
    if g.shape[0] < size_threshold:
        continue
    events_count = g.state_num.unique().shape[0]
    if metric_name == "cells_num":
        # Number of cells per state; groups whose first three states already
        # contain more than 10 cells are skipped.
        agg_fun_values = g.groupby("state_num").cell_index.agg('count')
        if agg_fun_values.iloc[:3].max() > 10:
            continue
    else:
        agg_fun_values = g.groupby("state_num")[metric_name].agg(aggregation_function)

    # Resample every notebook onto the same normalized [0, 1] grid so curves
    # of different lengths can be averaged point-wise.
    x, y = continuous_transform(
        np.arange(events_count), agg_fun_values,
        normalize=True
    )
    mean_curve[task].append(y)

    # Track the global value range to give both panels identical y-limits.
    mn = min(mn, y.min())
    mx = max(mx, y.max())

    task_ax_mapping[task].plot(x, y, color='k', alpha=0.2)

for task, curves in mean_curve.items():
    if not curves:
        # No notebook of this task passed the filters: nothing to average.
        continue
    y = np.mean(curves, axis=0)
    x = np.linspace(0, 1, len(y))

    # Dashed reference line at the final value of the mean curve.
    task_ax_mapping[task].axhline(y[-1], color='k', ls=(0, (5, 5)))
    task_ax_mapping[task].plot(x, y, color='firebrick', lw=3)

for task, ax in task_ax_mapping.items():
    if np.isfinite(mn) and np.isfinite(mx):
        # The limits stay infinite when no curve was drawn; set_ylim rejects that.
        ax.set_ylim(mn * 0.9, mx * 1.1)
    ax.set_xlim(0, 1)
    ax.set_xlabel('Normalized Time', fontsize=14)
    ax.set_ylabel(f"{metric_name} ({aggregation_function})", fontsize=14)
    ax.set_title(task.replace("_", " "), fontsize=14)
    ax.grid(False)


# The panels share the y-range, so the right one needs no label or tick labels.
ax2.set_ylabel(None)
ax2.set_yticklabels([])
plt.tight_layout()

# Create the output directory first so saving works on a fresh checkout.
figure_path = Path("figures/evolution") / f"{metric_name}_{aggregation_function}_tasks.pdf"
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()


In [None]:
import itertools

# Per kernel and state: mean and sum of each metric. `df` is the frame filtered
# on the `comments` 95th percentile in the plotting cell above. The result has
# two-level columns of the form (metric, aggregation).
df_tmp = df.groupby(['kernel_id', 'state_num'])[['sloc', 'ccn', 'objects']] \
    .agg(['mean', 'sum']).reset_index()

# Keep, for every kernel, only the row with the largest `state_num`, then index
# by kernel so the assignments below align on `kernel_id`.
df_tmp = df_tmp.loc[df_tmp.groupby('kernel_id').state_num.idxmax()].set_index("kernel_id")
# Attach each kernel's task (aligned on the `kernel_id` index).
df_tmp['task'] = df_analysis[['task', 'kernel_id']] \
    .drop_duplicates(keep='last').set_index('kernel_id')['task']

# Attach each kernel's `expert` flag, taken from the prepared dataset.
df_tmp['expert'] = june.df[['expert', 'kernel_id']] \
    .drop_duplicates(keep='last').set_index('kernel_id')['expert']

df_tmp = df_tmp.reset_index()

# From the `describe()` output keep only the 'mean' statistic of every
# (metric, aggregation) pair, grouped by task and expertise.
cols = list(itertools.product(['sloc', 'ccn', 'objects'], ['mean', 'sum'], ['mean']))
table = df_tmp.drop('state_num', axis=1).groupby(["task", "expert"]).describe().loc[:, cols]
table.round(2)


In [None]:
# Format every cell with two decimals and emit the table as LaTeX source.
formatted_table = table.round(2).applymap('{:.2f}'.format)
print(formatted_table.to_latex(escape=True, multirow=True))


In [None]:
# Metrics and aggregations to correlate with normalized time. Naming them once
# lets the metric columns be selected by name rather than by position.
metric_columns = ['sloc', 'ccn', 'objects']
aggregations = ['mean', 'sum']

# Per kernel and state: mean and sum of each metric, flattened to
# single-level column names such as 'sloc_mean'.
df_tmp = df_analysis.groupby(['kernel_id', 'state_num'])[metric_columns] \
    .agg(aggregations).reset_index()
df_tmp.columns = ['_'.join(col).strip("_") for col in df_tmp.columns.values]

# Attach the task of every kernel.
kernel_tasks = df_analysis[['task', 'kernel_id']] \
    .drop_duplicates(keep='last')[['kernel_id', 'task']]
df_tmp = pd.merge(df_tmp, kernel_tasks, on='kernel_id')

# Attach the `expert` flag of every kernel.
kernel_experts = june.df[['expert', 'kernel_id']] \
    .drop_duplicates(keep='last')[['kernel_id', 'expert']]
df_tmp = pd.merge(df_tmp, kernel_experts, on='kernel_id')

# Position of each state relative to the kernel's last state.
max_values = df_tmp.groupby('kernel_id')['state_num'].transform('max')
df_tmp['normalized_state'] = df_tmp['state_num'] / max_values

# Same names and order as the flattened aggregation columns above.
metrics_list = [f"{metric}_{agg}" for metric in metric_columns for agg in aggregations]

# Correlation matrices per (task, expert); keep only the rows that correlate
# `normalized_state` with each metric.
df_corr = df_tmp.groupby(['task', 'expert'])[['normalized_state', *metrics_list]].corr()
indices = [i for i in df_corr.index if i[-1] == 'normalized_state']

table = df_corr.loc[indices][metrics_list]
table.round(3)


In [None]:
# Turn flat names such as 'sloc_mean' back into two-level (metric, aggregation) columns.
split_names = [tuple(name.split('_')) for name in table.columns.to_list()]
table.columns = pd.MultiIndex.from_tuples(split_names)

# Move the index levels into columns and drop the unnamed third level,
# which pandas names 'level_2' on reset.
table = table.reset_index().drop(columns='level_2')

In [None]:
# Human-readable labels for the task and expertise values.
# `np.nan` is used instead of `np.NaN`: the latter alias was removed in NumPy 2.0.
pretty_lang = {'task1': 'DS', 'task2': 'ML', np.nan: 'All',
               False: 'Student', True: 'Professional'}
new_names = ['Task','Level of expertise', 'SLOC', 'SLOC', 'CCN', 'CCN', 'N of objects', 'N of objects']

# Replace the first column level with the display names, keeping the second
# level (the aggregation) as it is.
table.columns = pd.MultiIndex.from_tuples(
    [(new_names[i], column[1]) for i, column in enumerate(table.columns.to_list())]
)

# Two decimals with '.' as thousands separator and ',' as decimal mark:
# format with '_' grouping first, then swap the separators.
table = (
    table.replace(pretty_lang)
    .set_index(['Task', 'Level of expertise'])
    .applymap(lambda x: str.format("{:0_.2f}", x).replace('.', ',').replace('_', '.'))
)
print(table.to_latex(escape=True, multirow=True))


In [None]:
import pingouin as pg

alpha = 0.05

# Print every (metric, task, expertise) combination whose correlation with
# normalized time has a p-value above `alpha`.
for task in ['task1', 'task2']:
    for expert in [True, False]:
        group_rows = (df_tmp.task == task) & (df_tmp.expert == expert)
        for metric in metrics_list:
            df_sample = df_tmp[group_rows][['normalized_state', metric]]
            cor = pg.corr(df_sample.normalized_state, df_sample[metric])
            r_value = cor['r'].iloc[0]
            p_value = cor['p-val'].iloc[0]
            if p_value > alpha:
                print(metric, task, expert, r_value.round(2), p_value.round(2))
