In [None]:
%load_ext autoreload
%autoreload 2

from itertools import product
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pingouin as pg
import seaborn as sns
import yaml

from dataset.june_dataset import JuNEDataset
from metrics.metrics_time import TimeMetrics


def read_config(config_path: Path = Path("data_config.yaml")) -> dict:
    """Load the notebook configuration from a YAML file.

    Args:
        config_path: Location of the YAML file. Defaults to
            ``data_config.yaml`` relative to the working directory.

    Returns:
        The parsed YAML content (expected to be a mapping). An empty dict is
        returned when the file is empty or cannot be parsed, so callers can
        always use ``.get`` on the result.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    import warnings  # local import keeps the block self-contained

    with config_path.open("r") as stream:
        try:
            loaded = yaml.safe_load(stream)
        except yaml.YAMLError as err:
            # Keep the best-effort fallback, but make the failure visible
            # instead of silently handing back an empty configuration.
            warnings.warn(f"Could not parse {config_path}: {err}; using an empty config.")
            return {}
    # safe_load returns None for an empty document; normalise it to a dict.
    return loaded or {}


# Notebook-wide configuration, read from the default path (data_config.yaml).
config = read_config()


In [None]:
# Read the raw dataset and the label mapping from the paths given in the config.
events_raw = pd.read_csv(config.get("dataset_path"), index_col=0)
df_labels = pd.read_csv(config.get("label_mapping_path"), index_col=0)

# Attach the labels to every row that has a matching action_id (default inner join).
df_hack = events_raw.merge(df_labels, on='action_id')

df_hack.head()


In [None]:
# Wrap the merged frame in the project's dataset class and run its preparation step.
# NOTE(review): what prepare_dataset() changes is not visible here — see dataset.june_dataset.
june = JuNEDataset(df_hack)
june.prepare_dataset()
june.df.head()


In [None]:
# Compute the time metrics from the prepared dataset.
TM = TimeMetrics()
metrics = TM.calculate_metrics(june.df)

# One row per distinct (kernel_id, task, expert) combination; merged onto the
# per-kernel aggregates further down.
kernel_mapping = metrics.loc[:, ['kernel_id', 'task', 'expert']].drop_duplicates()

# The preview must be the last expression: Jupyter only renders the final
# expression of a cell, so a mid-cell `metrics.head()` shows nothing.
metrics.head()


# Action time

In [None]:
# Summary statistics of the time to the next action for each (task, expert) group.
metrics.groupby(['task', 'expert'])['next_action_time'].describe()


In [None]:
# describe() output for each (task, expert) group; shown in a later cell and
# then overwritten when the LaTeX table is assembled.
test  = metrics.groupby(['task', 'expert']).describe()


In [None]:
# Keep only rows whose next_action_time lies below the 95th percentile.
tr = metrics['next_action_time'].quantile(.95)
# TODO recalculate execute time
at_df = metrics[metrics['next_action_time'] < tr]


In [None]:
# (column, statistic) pairs to keep from the describe() output.
cols = [
    (column, stat)
    for column in ['execution_time_sec', 'next_action_time']
    for stat in ['mean', 'std']
]

# The same summary at three granularities: task x expert, expert only, task only.
desc_stats_all, desc_stats_exp, desc_stats_task = (
    at_df.groupby(keys).describe().loc[:, cols]
    for keys in (['task', 'expert'], ['expert'], ['task'])
)


In [None]:
# Two-way ANOVA of next_action_time with task and expert as between-subject factors.
pg.anova(
    data=at_df,
    dv='next_action_time',
    between=['task', 'expert'],
)


In [None]:

# Default figure size (width, height in inches) for the plots drawn after this call.
sns.set(rc={'figure.figsize':(13.7,6.27)})
def plot_box_time(df, var='next_action_time'):
    """Draw a box plot of ``var`` per ``cell_label``, ordered by ascending mean.

    Args:
        df: DataFrame with a ``cell_label`` column and the column named by ``var``.
        var: Name of the column plotted on the y axis.

    Returns:
        None. The plot is drawn through seaborn/pyplot on the current figure.
    """
    # Boxes are ordered by the per-label mean of ``var``, smallest first.
    label_order = df.groupby(by=["cell_label"])[var].mean().sort_values().index

    # ``data`` is passed by keyword: a positional data argument is only
    # accepted by seaborn >= 0.12, while the keyword form also works on
    # older releases.
    sns.boxplot(data=df, x='cell_label', y=var, order=label_order)

    # Rotate the x tick labels to vertical.
    plt.xticks(rotation=90)


In [None]:
# next_action_time per cell label, all tasks together.
plot_box_time(at_df)

In [None]:
# next_action_time per cell label, restricted to task2.
plot_box_time(at_df.loc[at_df['task'] == 'task2'])

In [None]:
# next_action_time per cell label, restricted to task1.
plot_box_time(at_df.loc[at_df['task'] == 'task1'])

# Execution time

In [None]:
# TODO: deal with missing execs.
# The commented-out line below counts the 'execute' events whose state_time_dt is NaN.
# metrics.loc[metrics.event == 'execute', ['kernel_id','expert','task', 'state_time_dt']].state_time_dt.isna().sum()

In [None]:
# Number of distinct cell_index values among 'execute' events vs. the number of 'execute' events.
exec_events = metrics[metrics.event == 'execute']
exec_events['cell_index'].nunique(), len(exec_events)

In [None]:
def _execute_times(kernel_rows):
    """Return the execution_time_sec values of the 'execute' events in one kernel's rows."""
    return kernel_rows.loc[kernel_rows.event == 'execute', 'execution_time_sec']


per_kernel = metrics.groupby('kernel_id')

# Per-kernel aggregates. Sums and means are divided by 60
# (seconds -> minutes for execution_time_sec); the count is left as is.
sum_exec = per_kernel.apply(lambda rows: _execute_times(rows).sum()).to_frame('exec_time_sum') / 60
sum_exec_count = per_kernel.apply(lambda rows: _execute_times(rows).count()).to_frame('exec_time_count')
sum_exec_mean = per_kernel.apply(lambda rows: _execute_times(rows).mean()).to_frame('exec_time_mean') / 60
sum_all = per_kernel.next_action_time.sum().to_frame('overall_time') / 60

# Attach every aggregate to the kernel -> (task, expert) mapping, one merge at a time.
kernel_time_df = kernel_mapping
for kernel_stats in (sum_exec, sum_exec_mean, sum_all, sum_exec_count):
    kernel_time_df = kernel_time_df.merge(kernel_stats, on='kernel_id')

# Share of the overall time spent executing, and execution time per execution.
kernel_time_df['percent'] = kernel_time_df['exec_time_sum'] / kernel_time_df['overall_time'] * 100
kernel_time_df['exec_time_n'] = kernel_time_df['exec_time_sum'] / kernel_time_df['exec_time_count']

# describe() of the per-kernel table at three granularities.
exec_comp = kernel_time_df.groupby(['task', 'expert']).describe()
exec_comp_task = kernel_time_df.groupby('task').describe()
exec_comp_exp = kernel_time_df.groupby('expert').describe()

# Mean and standard deviation of the three execution measures per (task, expert).
cols = [
    (column, stat)
    for column in ['exec_time_sum', 'percent', 'exec_time_n']
    for stat in ['mean', 'std']
]
exec_stats = exec_comp.loc[:, cols]
exec_stats


In [None]:
# Show the grouped describe() table stored in `test` (on a top-to-bottom run).
test


In [None]:
# TODO: Refactor normal names
# Stack the action-time statistics: per (task, expert), then per task, then per expert.
test = pd.concat([desc_stats_all.reset_index(), desc_stats_task.reset_index(), desc_stats_exp.reset_index()])
test2 = pd.concat([exec_comp.reset_index(), exec_comp_task.reset_index(), exec_comp_exp.reset_index()])
# Append the execution-share columns. Both frames are stacked in the same
# all/task/expert order; this assumes both groupings yield the same groups.
test = pd.concat([test, test2[[('percent', 'mean'), ('percent', 'std')]]], axis=1)

# Display labels. The NaN key labels rows where a grouping key is absent
# (the per-task and per-expert summaries). `np.nan` is used because the
# `np.NaN` alias was removed in NumPy 2.0.
pretty_lang = {'task1': 'DS', 'task2': 'ML', np.nan: 'All',
               False: 'Student', True: 'Professional'}
# Top-level names, applied positionally to the eight columns:
# task, expert, then (mean, std) for each of the three measures.
new_names = ['Task', 'Level of expertise', 'Execution time (sec)', 'Execution time (sec)',
             'Time between actions (sec)', 'Time between actions (sec)',
             '% of total time', '% of total time']
test.columns = pd.MultiIndex.from_tuples(
    [(new_names[i], column[1]) for i, column in enumerate(test.columns.to_list())]
)
# Format numbers with '.' as thousands separator and ',' as decimal mark.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; kept here so the cell also runs on older pandas.
test = (
    test.replace(pretty_lang)
    .set_index(['Task', 'Level of expertise'])
    .applymap(lambda x: str.format("{:0_.2f}", x).replace('.', ',').replace('_', '.'))
)
# Move each per-task summary row directly after that task's two expertise rows.
# Assumes exactly two tasks and two expertise levels (eight rows in total).
test = test.iloc[[0, 1, 4, 2, 3, 5, 6, 7]]
print(test.to_latex(escape=True))

In [None]:
# 'execute' events with complete kernel, expertise, task and execution-time values.
execs = (
    metrics
    .loc[metrics.event == 'execute', ['kernel_id', 'expert', 'task', 'execution_time_sec']]
    .dropna()
)

In [None]:
# Two-way ANOVA of execution_time_sec with expert and task as between-subject factors.
pg.anova(data=execs, dv="execution_time_sec", between=["expert", "task"]).round(3)

In [None]:
# t-test on the number of executions per kernel for task1: non-experts vs. experts.
on_task1 = kernel_time_df.task == 'task1'
is_expert = kernel_time_df.expert
t1_execs_s = kernel_time_df.loc[~is_expert & on_task1, 'exec_time_count']
t1_execs_e = kernel_time_df.loc[is_expert & on_task1, 'exec_time_count']
pg.ttest(t1_execs_s, t1_execs_e).round(2)

In [None]:
# t-test on the execution time per execution for task1: non-experts vs. experts.
on_task1 = kernel_time_df.task == 'task1'
is_expert = kernel_time_df.expert
t1_execs_s = kernel_time_df.loc[~is_expert & on_task1, 'exec_time_n']
t1_execs_e = kernel_time_df.loc[is_expert & on_task1, 'exec_time_n']
pg.ttest(t1_execs_s, t1_execs_e).round(2)

In [None]:
# t-test on the execution time per execution for task2: non-experts vs. experts.
# NOTE: the variables keep their `t1_` prefix although they hold task2 data.
on_task2 = kernel_time_df.task == 'task2'
is_expert = kernel_time_df.expert
t1_execs_s = kernel_time_df.loc[~is_expert & on_task2, 'exec_time_n']
t1_execs_e = kernel_time_df.loc[is_expert & on_task2, 'exec_time_n']
pg.ttest(t1_execs_s, t1_execs_e).round(2)

In [None]:
# Execution time per cell label for rows with execution_time_sec below 1.
plot_box_time(metrics.loc[metrics['execution_time_sec'] < 1], var='execution_time_sec')

In [None]:
# Execution time per cell label for rows with execution_time_sec above 10.
plot_box_time(metrics.loc[metrics['execution_time_sec'] > 10], var='execution_time_sec')

In [None]:
# Length of each cell_source value; adds the scr_len column to `metrics` in place.
# NOTE(review): assumes cell_source holds strings — confirm.
metrics['scr_len'] = metrics.cell_source.str.len()

In [None]:
# Joint distribution of execution time and source length for 'execute' events, coloured by task.
executed_cells = metrics[metrics.event == 'execute']
sns.jointplot(data=executed_cells, x='execution_time_sec', y='scr_len', hue='task');

In [None]:
# Correlation between execution time and source length over 'execute' events.
executed_cells = metrics[metrics.event == 'execute']
pg.corr(executed_cells.execution_time_sec, executed_cells.scr_len)

In [None]:
# plot_box_time(metrics[metrics.execution_time_sec < 10], var='execution_time_sec')