In [None]:
%load_ext autoreload
%autoreload 2

from itertools import product
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pingouin as pg
import seaborn as sns
import yaml

from dataset.june_dataset import JuNEDataset
from metrics.metrics_time import TimeMetrics


def read_config(config_path: Path = Path("data_config.yaml")) -> dict:
    """Load the notebook configuration from a YAML file.

    Args:
        config_path: Location of the YAML file; defaults to
            ``data_config.yaml`` in the current working directory.

    Returns:
        The parsed content of the file, or an empty dict when the file is
        empty or is not valid YAML.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
    """
    import warnings  # local import: only used on the malformed-file path

    with config_path.open("r") as stream:
        try:
            # safe_load returns None for an empty document; normalise it to {}
            # so callers can always call ``config.get(...)``.
            return yaml.safe_load(stream) or {}
        except yaml.YAMLError as err:
            # Keep the best-effort fallback, but do not hide the reason.
            warnings.warn(f"Could not parse {config_path}: {err}; using an empty config.")
            return {}


# Loaded once here; the data-loading cell reads its file paths from this mapping.
config = read_config()


In [None]:
# Index with [] rather than .get(): a missing key should fail right here with a
# KeyError naming it, not later inside read_csv with a None path.
df_hack_raw = pd.read_csv(config["dataset_path"], index_col=0)
df_labels = pd.read_csv(config["label_mapping_path"], index_col=0)

# merge() defaults to an inner join, so rows whose action_id has no entry in
# the label mapping are dropped.
df_hack = df_hack_raw.merge(df_labels, on='action_id')

df_hack.head()


In [None]:
# Wrap the merged frame in the project's dataset class. The return value of
# prepare_dataset() is not used; the result is read back from `june.df`.
june = JuNEDataset(df_hack)
june.prepare_dataset()
june.df.head()


In [None]:
# Compute the timing metrics from the prepared dataset.
TM = TimeMetrics()
metrics = TM.calculate_metrics(june.df)

# Distinct (kernel_id, task, expert) combinations found in the metrics.
kernel_mapping = metrics.loc[:, ['kernel_id', 'task', 'expert']].drop_duplicates()

# Keep the preview as the last expression of the cell: only the last
# expression is rendered, so a `.head()` in the middle displays nothing.
metrics.head()


In [None]:
# Total number of items across all entries of TM.unfinished.
sum(map(len, TM.unfinished))

# Action time

In [None]:
# Cut-off: 95th percentile of the time between actions.
tr = metrics['next_action_time'].quantile(0.95)
# TODO recalculate execute time
# Keep only the rows strictly below the cut-off.
at_df = metrics[metrics['next_action_time'] < tr]


In [None]:
def calc_description_stats(df, metrics=('percent',), stats=('mean', 'std')):
    """Summarise metric columns per task/expert group, per expert and per task.

    Args:
        df: Frame containing the ``task`` and ``expert`` grouping columns and
            every column named in ``metrics``.
        metrics: Names of the columns to summarise.
        stats: Names of the ``describe()`` statistics to keep
            (e.g. ``'mean'``, ``'std'``, ``'count'``, ``'50%'``).

    Returns:
        The three summaries stacked with ``pd.concat``: first grouped by
        ``(task, expert)``, then by ``expert`` only, then by ``task`` only.
        The value columns are the ``(metric, stat)`` pairs; a grouping column
        that does not apply to a row is NaN.

    Raises:
        KeyError: If a requested metric column or statistic is not available.
    """
    metric_cols = list(metrics)
    cols = list(product(metric_cols, stats))

    def _describe_by(keys):
        # Describe only the requested columns instead of the whole frame:
        # same numbers, without summarising every unrelated column first.
        return df.groupby(keys)[metric_cols].describe().loc[:, cols].reset_index()

    split = _describe_by(['task', 'expert'])
    expert = _describe_by(['expert'])
    task = _describe_by(['task'])
    return pd.concat([split, expert, task])



In [None]:

# Per-kernel sum of execution_time_sec over `execute` events, divided by 60.
# `where` blanks out the non-execute rows, so a kernel without any execute
# event still gets a 0 rather than disappearing from the result.
sum_exec = (
    metrics.execution_time_sec
    .where(metrics.event == 'execute')
    .groupby(metrics.kernel_id)
    .sum()
    .to_frame('exec_time_sum') / 60
)
# Per-kernel sum of next_action_time over all events, divided by 60.
sum_all = metrics.groupby('kernel_id').next_action_time.sum().to_frame('overall_time') / 60

# Share of each kernel's total time spent executing, with the kernel's task and
# expertise attached through the already de-duplicated kernel mapping.
percent = (
    (sum_exec.exec_time_sum / sum_all.overall_time * 100)
    .to_frame('percent')
    .reset_index()
    .merge(kernel_mapping, on='kernel_id', how='left')
)

a = calc_description_stats(at_df, ['next_action_time'], ['mean', 'std'])
b = calc_description_stats(metrics, ['execution_time_sec'], ['mean', 'std'])
c = calc_description_stats(percent, ['percent'], ['mean', 'std'])

merged_stats = a.merge(b, on=['task', 'expert']).merge(c, on=['task', 'expert'])

# Grand-total row: no task, no expertise level, statistics over all rows.
# (Named `overall_row` so the builtin `all` is not shadowed for later cells.)
overall_row = [[np.nan, np.nan, at_df.next_action_time.mean(), at_df.next_action_time.std(),
                metrics.execution_time_sec.mean(), metrics.execution_time_sec.std(),
                percent.percent.mean(), percent.percent.std()]]

merged_stats = pd.concat(
    [merged_stats, pd.DataFrame(overall_row, columns=merged_stats.columns)],
    axis=0,
).reset_index(drop=True)

# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
pretty_lang = {'task1': 'DA', 'task2': 'ML', np.nan: 'All',
               False: 'Student', True: 'Professional'}
new_names = ['Task', 'Level of expertise',
             'Time between actions (sec)', 'Time between actions (sec)',
             'Execution time (sec)', 'Execution time (sec)',
             '% of total time', '% of total time']
# Rename the first column level by position; keep the statistic name level.
merged_stats.columns = pd.MultiIndex.from_tuples(
    [(new_name, column[1]) for new_name, column in zip(new_names, merged_stats.columns.to_list())]
)

# NOTE(review): replace() is applied to the whole frame, so a NaN statistic
# would also turn into 'All' and then fail in the number formatter below.
# Formatting: two decimals with '_' as thousands separator, then '_' -> '.'.
# Column-wise Series.map is used instead of the deprecated DataFrame.applymap.
merged_stats = (
    merged_stats
    .replace(pretty_lang)
    .set_index(['Task', 'Level of expertise'])
    .apply(lambda col: col.map(lambda x: str.format("{:0_.2f}", x).replace('_', '.')))
)
merged_stats = merged_stats.sort_index(ascending=False)

In [None]:
# Emit the summary table as LaTeX source; print() shows the raw text with real
# line breaks instead of the quoted string repr.
print(merged_stats.to_latex(escape=True))

In [None]:
# Equality-of-variance test for next_action_time across the `expert` groups.
pg.homoscedasticity(data=at_df, dv= 'next_action_time', group='expert')

In [None]:
# ANOVA of next_action_time with `task` and `expert` as between-subject factors.
pg.anova(data=at_df, dv= 'next_action_time', between=['task', 'expert'])


In [None]:
# Mann-Whitney U on next_action_time: task1 rows vs. all other rows.
pg.mwu(at_df.loc[at_df.task == 'task1', 'next_action_time'], at_df.loc[at_df.task != 'task1', 'next_action_time'])

In [None]:
# Mann-Whitney U on next_action_time: rows where `expert` is True vs. False.
pg.mwu(at_df.loc[at_df.expert, 'next_action_time'], at_df.loc[~at_df.expert , 'next_action_time'])

In [None]:
# Default figure size (width, height) for the plots drawn below.
sns.set(rc={'figure.figsize':(13.7,6.27)})
def plot_box_time(df, var='next_action_time', label='cell_label'):
    """Draw a box plot of ``var`` per ``label`` category, ordered by mean.

    Categories are placed left to right by increasing mean of ``var`` and the
    x tick labels are rotated by 90 degrees.

    Args:
        df: Frame holding the ``var`` and ``label`` columns.
        var: Name of the column plotted on the y axis.
        label: Name of the column whose values form the x axis categories.
    """
    # Mean of `var` per category, sorted ascending, gives the box order.
    ordered_labels = df.groupby(by=[label])[var].mean().sort_values().index

    sns.boxplot(df, x=label, y=var, order=ordered_labels)
    plt.xticks(rotation=90)


In [None]:
# Time between actions per matched_label, all tasks together.
plot_box_time(at_df, label='matched_label')

In [None]:
# Time between actions per cell_label (the function's default grouping column).
plot_box_time(at_df)

In [None]:
# Time between actions per matched_label, task2 rows only.
plot_box_time(at_df.loc[at_df['task'] == 'task2'], label='matched_label')

In [None]:
# Time between actions per matched_label, task1 rows only.
plot_box_time(at_df.loc[at_df['task'] == 'task1'], label='matched_label')

# Execution time

In [None]:
# TODO:
# deal with missing execs
# metrics.loc[metrics.event == 'execute', ['kernel_id','expert','task', 'state_time_dt']].state_time_dt.isna().sum()

In [None]:
# (distinct cell_index values among execute events, number of execute events)
execute_events = metrics[metrics.event == 'execute']
execute_events.cell_index.nunique(), len(execute_events)

In [None]:
# Execute events only, reduced to the columns needed for the tests below.
# dropna() removes a row if ANY of the four selected columns is missing.
execs = metrics.loc[metrics.event == 'execute', ['kernel_id','expert','task', 'execution_time_sec']].dropna()

In [None]:
# ANOVA of execution time with `expert` and `task` as between-subject factors,
# written in the same function-call form as the other pingouin tests here.
pg.anova(data=execs, dv="execution_time_sec", between=["expert", "task"]).round(3)

In [None]:
# Pairwise Mann-Whitney U tests on execution time.
# Each entry: (description, mask of the first sample, mask of the second sample).
expert_mask = execs.expert
task1_mask = execs.task == 'task1'
task2_mask = execs.task == 'task2'

mwu_comparisons = [
    ('task1: students vs experts', ~expert_mask & task1_mask, expert_mask & task1_mask),
    ('task2: students vs experts', ~expert_mask & task2_mask, expert_mask & task2_mask),
    ('experts: task1 vs task2', expert_mask & task1_mask, expert_mask & task2_mask),
    ('students: task1 vs task2', ~expert_mask & task1_mask, ~expert_mask & task2_mask),
    ('all: task1 vs task2', task1_mask, task2_mask),
    ('all: experts vs students', expert_mask, ~expert_mask),
]

# The loop variables are deliberately the notebook-level names: after the loop
# `t1_execs_s`, `t1_execs_e` and `res` hold the LAST comparison (experts vs
# students), which the following cells read.
for comparison_name, first_mask, second_mask in mwu_comparisons:
    t1_execs_s = execs.loc[first_mask, 'execution_time_sec']
    t1_execs_e = execs.loc[second_mask, 'execution_time_sec']
    res = pg.mwu(t1_execs_s, t1_execs_e)

    # Compare the unrounded p-value: rounding to two decimals first would turn
    # e.g. 0.054 into 0.05 and wrongly treat it as significant. `.iloc[0]` is
    # used because the result row is not labelled 0.
    p_value = res['p-val'].iloc[0]
    if p_value > .05:
        # Flag every non-significant comparison, and say which one it is.
        print(f'!! {comparison_name}: p = {p_value:.3f}')

    # Rounded only for display, as before.
    res = res.round(2)


In [None]:
# Mean of the first sample left over from the last comparison of the previous
# cell (the rows where `expert` is True, despite the `_s` suffix).
t1_execs_s.mean()

In [None]:
# Mean of the second sample left over from the last comparison of the cell
# with the pairwise tests (the rows where `expert` is False).
t1_execs_e.mean()

In [None]:
# Repeat the task1 vs. task2 comparison over all execute events, this time
# keeping the result unrounded. Note that this rebinds t1_execs_s / t1_execs_e.
t1_execs_s = execs.loc[execs.task == 'task1', 'execution_time_sec']
t1_execs_e = execs.loc[execs.task == 'task2', 'execution_time_sec']
res = pg.mwu(t1_execs_s, t1_execs_e)

In [None]:
# Display the Mann-Whitney result table held in `res`.
res

In [None]:
# Execution time per cell_label, restricted to rows with execution_time_sec > 10.
# NOTE(review): the threshold 10 is unexplained; presumably seconds, going by the column name.
plot_box_time(metrics[metrics.execution_time_sec > 10], var='execution_time_sec')

In [None]:
# Length of each cell's source string, stored as a new column on `metrics`
# (this modifies the shared frame in place).
metrics['scr_len'] = metrics.cell_source.str.len()

In [None]:
# Execution time against source length for execute events, coloured by task.
# The trailing ';' suppresses the textual repr of the returned plot object.
sns.jointplot(metrics[metrics.event == 'execute'], x='execution_time_sec', y='scr_len', hue='task');

In [None]:
# Correlation between execution time and source length over execute events.
executed = metrics[metrics.event == 'execute']
pg.corr(executed.execution_time_sec, executed.scr_len)