# Data Preparation

In [None]:
import pickle
from pathlib import Path

import pandas as pd
from IPython.display import display, Markdown
import plotnine as pn
from scipy.stats import kruskal

from clustering import CKMeans
from experiment import (linear_drift, constant_diffusion, gbm_drift_and_diffusion, oup_drift, trigonometric_drift,
                        multivariate_drift, multivariate_diffusion)


def refactor(experiment_result: dict):
    """Derive descriptive columns from one raw experiment result.

    :param experiment_result: mapping with at least a ``"series_name"`` string
        and an ``'alg_model'`` object exposing ``.func`` and ``.keywords``
        (e.g. a ``functools.partial``).
    :return: dict with the keys ``simulation_mode``, ``ts_mode``,
        ``simulation_iteration``, ``proxy``, ``length_type`` and ``algorithm``,
        followed by the keyword arguments stored on the algorithm model.
    """
    series_name = experiment_result["series_name"]
    model = experiment_result['alg_model']

    # The last space-separated token is the iteration; everything before it
    # names the simulation.
    *mode_tokens, iteration = series_name.split(' ')
    simulation_mode = ' '.join(mode_tokens)

    if 'variable' in series_name.lower():
        length_type = "variable"
    else:
        length_type = 'fixed'

    # Anything that is not a CKMeans subclass is labelled as DBSCAN.
    if issubclass(model.func, CKMeans):
        algorithm = "K-Means"
    else:
        algorithm = "DBSCAN"

    refactored_results = {
        "simulation_mode": simulation_mode,
        "ts_mode": simulation_mode.replace(" variable length", ''),
        "simulation_iteration": iteration,
        "proxy": series_name.replace(" variable length", ''),
        'length_type': length_type,
        "algorithm": algorithm,
    }
    # Model keyword arguments come last, so they win on a key clash.
    refactored_results.update(model.keywords)
    return refactored_results


def try_get_gamma(gamma: dict[str, float]):
    """Extract the ``'gamma'`` entry from a keyword-argument mapping.

    :param gamma: normally a mapping of keyword arguments; values that cannot
        be indexed by a string key (``None``, ``NaN``, plain numbers) are
        accepted as well.
    :return: ``gamma['gamma']`` when the key is present, ``None`` when the
        mapping has no ``'gamma'`` entry, and the argument itself when it is
        not subscriptable by key.
    """
    try:
        return gamma['gamma']
    except KeyError:
        # A mapping without a gamma entry simply has no gamma; previously this
        # propagated as an unhandled KeyError.
        return None
    except TypeError:
        # Not a mapping (e.g. None or NaN): hand the value back untouched.
        return gamma


def refactor_distance_measure(frame_section: pd.DataFrame):
    """Build one human-readable distance-measure label per row.

    :param frame_section: frame with the columns ``distance_measure``,
        ``distance_base``, ``gamma`` and ``distance_base_kwargs``.
    :return: list of labels, one per row. The label is the distance measure
        (falling back to the distance base when the measure is null), suffixed
        with ``", gamma=<value rounded to one decimal>"`` when a gamma is
        available either directly or through the base keyword arguments.
    """
    rows = zip(frame_section['distance_measure'].tolist(),
               frame_section['distance_base'].tolist(),
               frame_section['gamma'].tolist(),
               frame_section['distance_base_kwargs'])

    labels = []
    for measure, base, gamma_value, base_kwargs in rows:
        name = base if pd.isnull(measure) else measure
        # Fall back to the gamma stored in the distance-base keyword arguments.
        if pd.isnull(gamma_value):
            gamma_value = try_get_gamma(base_kwargs)
        if pd.isnull(gamma_value):
            labels.append(name)
        else:
            labels.append(f'{name}, gamma={round(float(gamma_value), 1)}')
    return labels


# Collect the per-series results of every pickled experiment, each merged with
# the descriptive columns derived by ``refactor`` (raw keys win on a clash).
# NOTE(security): ``pickle.load`` can execute arbitrary code; only load
# ``.experiment`` files that come from a trusted source.
results = []
# ``Path.glob`` yields paths in arbitrary, filesystem-dependent order; sorting
# keeps the row order (and everything derived from it) reproducible.
for p in sorted(Path("./experiments-results").glob("*.experiment")):
    with open(p, "rb") as file:
        experiment = pickle.load(file)
    # The file is no longer needed once the experiment has been unpickled.
    results.extend({**refactor(r), **r} for r in experiment.results)

# Wide table: one row per experiment result, with the algorithm as a string and
# a readable distance-measure label; bookkeeping columns are removed.
# noinspection PyTypeChecker
data_for_stats = pd.DataFrame(results)
data_for_stats = data_for_stats.assign(
    algorithm=data_for_stats['algorithm'].apply(str),
    distance_measure=refactor_distance_measure(data_for_stats),
)
data_for_stats = data_for_stats.drop(columns=[
    'iterations', 'gamma', 'min_pts', 'state', 'results', 'alg_model',
    'distance_base_kwargs', 'epsilon', 'distance_base',
])

# Long table: every remaining column becomes a (PerformanceMeasure, value) pair.
data = data_for_stats.melt(
    id_vars=['simulation_mode', 'ts_mode', 'simulation_iteration', 'proxy',
             'length_type', 'algorithm', 'distance_measure', 'series_name'],
    var_name="PerformanceMeasure",
)

# Distinct distance-measure labels and simulation modes found in the results.
distance_measures = data['distance_measure'].unique().tolist()
simulations = data['simulation_mode'].unique().tolist()

# Score columns of ``data_for_stats`` that the statistical tests iterate over.
performance_measures = [
    'Rand Index',
    'Adjusted Rand Index',
    'Adjusted Mutual Info Score',
    'Normalized Mutual Info Score',
    'Homogeneity Score',
    'Completeness Score',
    'V Measure',
]

In [None]:
# Show the wide table: one row per experiment result, one column per score.
data_for_stats

In [None]:
# Show the long-format table produced by ``melt`` (PerformanceMeasure / value pairs).
data

# Results summary per Algorithm

In [None]:
# Aggregate the scores per algorithm and performance measure
# (``pivot_table`` aggregates with its default, the mean).
(data
 .pivot_table(values='value',
              index=['algorithm'],
              columns=['PerformanceMeasure']))

In [None]:
# Violin plot of the score distribution per algorithm; all performance
# measures are pooled on the same axis.
# noinspection PyTypeChecker
(pn.ggplot(data, pn.aes(x='factor(algorithm)', y='value')) +
 pn.geom_violin() +
 pn.theme(axis_text_x=pn.element_text(rotation=90, hjust=1)))

In [None]:
# Kruskal-Wallis test per performance measure: K-Means rows against all
# remaining rows. The table is sorted with the largest p-value first.
comparisons = []
for measure in performance_measures:
    k_means_scores = data_for_stats.query("algorithm=='K-Means'")[measure]
    other_scores = data_for_stats.query("algorithm!='K-Means'")[measure]
    kruskal_result = kruskal(k_means_scores, other_scores)
    comparisons.append({
        "measure": measure,
        "statistic": kruskal_result.statistic,
        "p-value": kruskal_result.pvalue,
    })
pd.DataFrame(comparisons).sort_values('p-value', ascending=False)

In [None]:
# Aggregate the scores per distance measure and performance measure
# (``pivot_table`` aggregates with its default, the mean).
(data
 .pivot_table(values='value',
              index=['distance_measure'],
              columns=['PerformanceMeasure']))

In [None]:
# Violin plot of the pooled scores per distance measure, one panel per algorithm.
# noinspection PyTypeChecker
(pn.ggplot(data, pn.aes(x='factor(distance_measure)', y='value')) +
 pn.geom_violin() +
 pn.facet_wrap('algorithm') +
 pn.theme(axis_text_x=pn.element_text(rotation=90, hjust=1)))

In [None]:
# Aggregated scores per (algorithm, distance measure); the display limits are
# raised only inside this block so the table is not truncated.
with pd.option_context('display.max_rows', 100, 'display.max_columns', 10):
    display(data
            .pivot_table(values='value',
                         index=['algorithm', 'distance_measure'],
                         columns=['PerformanceMeasure']))



In [None]:
# Kruskal-Wallis test per performance measure with one sample per distance
# measure. The table is sorted with the largest p-value first.
comparisons = []
for measure in performance_measures:
    samples = [data_for_stats.query("distance_measure==@d")[measure] for d in distance_measures]
    kruskal_result = kruskal(*samples)
    comparisons.append({
        "measure": measure,
        "statistic": kruskal_result.statistic,
        "p-value": kruskal_result.pvalue,
    })
pd.DataFrame(comparisons).sort_values('p-value', ascending=False)

In [None]:
# List the distinct distance-measure labels compared above.
distance_measures

## K-Means

In [None]:
# Long-format rows that belong to the K-Means runs only.
k_means_data: pd.DataFrame = data.query("algorithm == 'K-Means'")

In [None]:
# K-Means: aggregated scores per distance measure (``pivot_table`` default, the mean).
(k_means_data
 .pivot_table(values='value',
              index=['distance_measure'],
              columns=['PerformanceMeasure']))

In [None]:
# K-Means: violin plot of the pooled scores per distance measure.
# noinspection PyTypeChecker
(pn.ggplot(k_means_data, pn.aes(x='factor(distance_measure)', y='value')) +
 pn.geom_violin() +
 pn.theme(axis_text_x=pn.element_text(rotation=90, hjust=1)) +
 pn.ggtitle("K-Means"))

In [None]:
# K-Means: aggregated scores per (distance measure, simulation mode).
(k_means_data
 .pivot_table(values='value',
              index=['distance_measure', 'simulation_mode'],
              columns=['PerformanceMeasure']))

In [None]:
# K-Means: violin plot per distance measure, one panel per simulation mode;
# the figure is written to disk and shown inline.
# Saving does not create missing directories, so make sure the target exists.
Path("./plots").mkdir(parents=True, exist_ok=True)

# noinspection PyTypeChecker
p = (pn.ggplot(k_means_data, pn.aes(x='factor(distance_measure)', y='value')) +
     pn.geom_violin() +
     pn.facet_wrap('simulation_mode') +
     pn.theme(axis_text_x=pn.element_text(rotation=90, hjust=1), figure_size=(13, 7)) +
     pn.ggtitle("Results for K-Means"))
p.save("./plots/K-Means.png")
display(p)

## DBSCAN

In [None]:
# Long-format rows of every run that is not K-Means (``refactor`` labels those "DBSCAN").
dbscan_data: pd.DataFrame = data.query("algorithm != 'K-Means'")

In [None]:
# DBSCAN: aggregated scores per distance measure (``pivot_table`` default, the mean).
(dbscan_data
 .pivot_table(values='value',
              index=['distance_measure'],
              columns=['PerformanceMeasure']))

In [None]:
# DBSCAN: violin plot of the pooled scores per distance measure.
# noinspection PyTypeChecker
(pn.ggplot(dbscan_data, pn.aes(x='factor(distance_measure)', y='value')) +
 pn.geom_violin() +
 pn.theme(axis_text_x=pn.element_text(rotation=90, hjust=1)) +
 pn.ggtitle("DBSCAN"))

In [None]:
# DBSCAN: aggregated scores per (distance measure, simulation mode).
(dbscan_data
 .pivot_table(values='value',
              index=['distance_measure', 'simulation_mode'],
              columns=['PerformanceMeasure']))

In [None]:
# DBSCAN: violin plot per distance measure, one panel per simulation mode;
# the figure is written to disk and rendered as the cell result.
# Saving does not create missing directories, so make sure the target exists.
Path("plots").mkdir(parents=True, exist_ok=True)

# noinspection PyTypeChecker
p = (pn.ggplot(dbscan_data, pn.aes(x='factor(distance_measure)', y='value')) +
     pn.geom_violin() +
     pn.facet_wrap('simulation_mode') +
     pn.theme(axis_text_x=pn.element_text(rotation=90, hjust=1), figure_size=(14, 8)) +
     pn.ggtitle("Results for DBSCAN"))
p.save("plots/DBSCAN.png")

p

# Summary of all results

In [None]:
# Show the wide-table rows of every run that is not K-Means.
data_for_stats.query("algorithm != 'K-Means'")

In [None]:
# For every (algorithm, length type) pair: run a Kruskal-Wallis test across the
# distance measures for each time-series mode and performance measure, show
# the resulting table and keep it in ``keeper`` under "<algorithm>_<length type>".
keeper = {}
algorithms = data_for_stats.algorithm.unique().tolist()
ts_modes = data_for_stats.ts_mode.unique().tolist()
for alg in algorithms:
    for l_type in ('variable', 'fixed'):
        comparisons = []
        for s in ts_modes:
            subset = data_for_stats.query(
                "algorithm==@alg & "
                "length_type==@l_type &"
                "ts_mode==@s")
            for measure in performance_measures:
                # One column per distance measure, one row per series; rows
                # with a missing score for any distance measure are dropped.
                temp = subset.pivot(columns='distance_measure',
                                    index='series_name',
                                    values=measure)
                kruskal_result = kruskal(*temp.dropna().T.values.tolist())
                comparisons.append({
                    "algorithm": alg,
                    "simulation mode": s,
                    "measure": measure,
                    "statistic": kruskal_result.statistic,
                    "p-value": kruskal_result.pvalue,
                })
        display(Markdown(f"# {alg}, {l_type}"))
        display(pd.DataFrame(comparisons).sort_values('p-value', ascending=True))
        keeper[f"{alg}_{l_type}"] = comparisons
# Reset the scratch list once all tables are stored.
comparisons = []

In [None]:
# For every algorithm, distance measure and time-series mode: Kruskal-Wallis
# test between the length types, per performance measure. Rows are paired
# through ``proxy`` (the series name without " variable length").
comparisons = []
algorithms = data_for_stats.algorithm.unique().tolist()
ts_modes = data_for_stats.ts_mode.unique().tolist()
for alg in algorithms:
    for d in distance_measures:
        for s in ts_modes:
            subset = data_for_stats.query("algorithm==@alg & "
                                          "ts_mode==@s &"
                                          "distance_measure==@d"
                                          )
            for measure in performance_measures:
                # One column per length type; proxies lacking either one are dropped.
                temp = subset.pivot(columns='length_type',
                                    index='proxy',
                                    values=measure)
                kruskal_result = kruskal(*temp.dropna().T.values.tolist())
                comparisons.append({
                    "algorithm": alg,
                    'distance_measure': d,
                    "simulation mode": s,
                    "measure": measure,
                    "statistic": kruskal_result.statistic,
                    "p-value": kruskal_result.pvalue,
                })
pd.DataFrame(comparisons).sort_values(by='p-value', ascending=True)

In [None]:
# DBSCAN: one violin plot per performance measure (distance measure on the
# x axis, one panel per simulation mode); each figure is saved and shown.
# Saving does not create missing directories, so make sure the target exists.
Path("./plots").mkdir(parents=True, exist_ok=True)

# noinspection PyTypeChecker
for measure in performance_measures:
    p = (pn.ggplot(data.query("algorithm == 'DBSCAN' & "
                              "PerformanceMeasure==@measure"),
                   pn.aes(x='factor(distance_measure)', y='value')) +
         pn.geom_violin() +
         pn.facet_wrap('simulation_mode') +
         pn.theme(axis_text_x=pn.element_text(rotation=90, hjust=1),
                  figure_size=(14, 8)) +
         pn.ggtitle(f"Results for DBSCAN; {measure=}"))
    # Spaces in the measure name are replaced to keep the file name simple.
    p.save(f"./plots/dbscan-{measure.replace(' ', '_')}.png")
    display(p)

In [None]:
# K-Means: one violin plot per performance measure (distance measure on the
# x axis, one panel per simulation mode); each figure is saved and shown.
# Saving does not create missing directories, so make sure the target exists.
Path("./plots").mkdir(parents=True, exist_ok=True)

# noinspection PyTypeChecker
for measure in performance_measures:
    p = (pn.ggplot(data.query("algorithm == 'K-Means' & "
                              "PerformanceMeasure==@measure"),
                   pn.aes(x='factor(distance_measure)', y='value')) +
         pn.geom_violin() +
         pn.facet_wrap('simulation_mode') +
         pn.theme(axis_text_x=pn.element_text(rotation=90, hjust=1),
                  figure_size=(14, 8)) +
         pn.ggtitle(f"Results for K-Means; {measure=}"))
    # Spaces in the measure name are replaced to keep the file name simple.
    p.save(f"./plots/k_means-{measure.replace(' ', '_')}.png")
    display(p)

In [None]:
# Export every stored comparison table as a LaTeX file, sorted by p-value.
# ``open`` does not create missing directories, so make sure the target exists.
Path("./tables").mkdir(parents=True, exist_ok=True)
for k, v in keeper.items():
    with open(f"./tables/{k}.tex", 'w') as file:
        # ``to_latex`` returns a single string: ``write`` emits it in one call,
        # whereas ``writelines`` would iterate it character by character.
        file.write(pd.DataFrame(v).sort_values('p-value', ascending=True).to_latex(index=False))

In [None]:
# Show the last rows (largest p-values) of the DBSCAN / variable-length table.
pd.DataFrame(keeper['DBSCAN_variable']).sort_values('p-value', ascending=True).tail()