In [None]:
from pathlib import Path
import numpy as np
import importlib
from matplotlib import pyplot as plt
import humanize
import sys
import os, psutil
import time
import shutil
import pandas as pd
import seaborn as sns
import warnings
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import logging

import astrocast.reduction as red
import astrocast.clustering as clust
import astrocast.analysis as ana
import astrocast.autoencoders as AE
import astrocast.helper as helper

# Re-import the astrocast submodules so that edits made to the package on disk
# are picked up without restarting the kernel. Each module is listed exactly
# once: reloading `helper` a second time at the end would replace its objects
# after `ana` and `AE` had already been re-imported.
for pack in [red, clust, helper, ana, AE]:
    importlib.reload(pack)


# Generate example datasets

In [None]:
importlib.reload(helper)

# Baseline keyword arguments for the signal generator. Every population below
# starts from this dict and overrides or adds individual entries.
# (alternative that was tried: noise_amplitude = 0.0001)
def_gen = {"noise_amplitude": 0.001, "trace_length": (50, 50), "parameter_fluctuations": 0.01}

# identical: both groups use the very same baseline settings
ident = (def_gen, def_gen)

# big difference: second group changes `b` and `plateau_duration`
def_diff = {**def_gen, "b": 2, "plateau_duration": 6}
diff = (def_gen, def_diff)

# small difference: both groups fix signal_amplitude, only the first changes shape parameters
small_diff_1 = {**def_gen, "b": 1.5, "plateau_duration": 2, "signal_amplitude": 1}
small_diff_2 = {**def_gen, "signal_amplitude": 1}
small_diff = (small_diff_1, small_diff_2)

# triplet: three groups with distinct `b` / `plateau_duration`
trip_1 = {**def_gen, "b": 1, "plateau_duration": 1, "signal_amplitude": 1}
trip_2 = {**def_gen, "b": 0.8, "plateau_duration": 2, "signal_amplitude": 1}
trip_3 = {**def_gen, "b": 3, "plateau_duration": 5, "signal_amplitude": 1}
triplet = [trip_1, trip_2, trip_3]

# variable length: overrides trace_length and enables ragged traces;
# the second group additionally sets `leaky_k`
def_var_1 = {
    **def_gen,
    "trace_length": 60,
    "ragged_allowed": True,
    "signal_amplitude": None,
    "abort_amplitude": None,
}
def_var_2 = {**def_var_1, "leaky_k": 0.2}
var_length = (def_var_1, def_var_2)


In [None]:
importlib.reload(ana)

# keyword arguments handed to every DummyGenerator
def_dummy = dict(num_rows=1000)

# population type -> per-group signal generator settings
gen_params = {"ident": ident, "diff": diff, "small_diff": small_diff, "triplet": triplet, "var_length": var_length}

# styling shared by all example-trace plots
trace_style = dict(num_samples=4, by="group", alpha=.9, linestyle="--", )

# experiments[e_id] collects everything that belongs to one generated dataset:
# the event object ("eObj"), its population type and the Plotting helper ("plot").
experiments = {}
e_id = 0
for _ in range(3):  # three independent repetitions of every population type
    for population_name, group_settings in gen_params.items():

        entry = experiments[e_id] = {}

        # one SignalGenerator per group of this population type
        generators = [helper.SignalGenerator(**settings, ) for settings in group_settings]
        dummy = helper.DummyGenerator(generators=generators, **def_dummy)

        # generate the events and tag them with the population type
        eObj = dummy.get_events()
        eObj.name = population_name

        entry["eObj"] = eObj
        entry["population_type"] = eObj.name

        # plot a few example traces per group
        plot = ana.Plotting(eObj)
        _ = plot.plot_traces(figsize=(4, 2), title=f"Exp {e_id} ({population_name})", **trace_style)

        entry["plot"] = plot

        # next experiment id
        e_id += 1

In [None]:
# Display the event object stored for the first generated experiment.
experiments[0]["eObj"]

# Conditional Contrasts

## Reduction

### Feature Extraction

In [None]:
# Compute the feature-extraction embedding ("FExt") for every experiment.
for i, exp in experiments.items():
    eObj = exp["eObj"]

    # all extracted features as a float matrix
    # NOTE(review): dropna=True is passed through to all_features; if that removes
    # rows, the matrix may no longer line up with eObj.events.group - confirm.
    features = red.FeatureExtraction(eObj).all_features(dropna=True).values.astype(float)

    # create the per-experiment embedding store on first use, then register the result
    exp.setdefault("embeddings", {})["FExt"] = features


### CNN Autoencoder

In [None]:
# Train one CNN autoencoder per experiment and store its embedding under "CNN".
for i, exp in experiments.items():
    eObj = exp["eObj"]

    # create CNNAutoEncoder, sized from the length of the first trace
    # NOTE(review): this assumes all traces of the experiment share that length;
    # the "var_length" populations are generated with ragged_allowed=True - confirm.
    target_length = len(eObj.events.iloc[0].trace)
    cnn = AE.CNN_Autoencoder(target_length=target_length, use_cuda=True)

    # prepare data (X_test is returned by the split but not used here)
    data = np.array(eObj.events.trace.tolist())
    X_train, X_val, X_test = cnn.split_dataset(data=data)

    # train
    cnn.train_autoencoder(X_train=X_train, X_val=X_val, epochs=25)

    # embedding of the full dataset
    embedding = cnn.embed(data)
    embedding = embedding.astype(float)

    # Do not rely on another cell having created the "embeddings" dict:
    # create it on demand so this cell does not raise KeyError when run on its own.
    exp.setdefault("embeddings", {})["CNN"] = embedding

### RNN Autoencoder

In [None]:
# Train one recurrent autoencoder per experiment and store its embedding under "RNN".
for i, exp in experiments.items():
    eObj = exp["eObj"]

    # create data loader (X_test is returned but not used here)
    pdl = AE.PaddedDataLoader(data=eObj.events.trace)
    X_train, X_val, X_test = pdl.get_datasets(batch_size=16,
                                              val_size=0.1,
                                              test_size=0.05)
    # train RecurrentAutoEncoder
    tRAE = AE.TimeSeriesRnnAE(use_cuda=True)
    tRAE.train_epochs(dataloader_train=X_train,
                      dataloader_val=X_val,
                      num_epochs=10,
                      patience=10,
                      safe_after_epoch=None,
                      show_mode='progress'
                      )

    # embedding: unshuffled loader over all traces, so the output order follows eObj.events
    X = pdl.get_dataloader(data=eObj.events.trace, batch_size=16, shuffle=False)
    _, _, embedding, _ = tRAE.embedd(X)

    embedding = np.array(embedding).astype(float)

    # Do not rely on another cell having created the "embeddings" dict:
    # create it on demand so this cell does not raise KeyError when run on its own.
    exp.setdefault("embeddings", {})["RNN"] = embedding

## Classifier (Predict condition)

In [None]:
importlib.reload(clust)

# Column layout of the result table: identifying columns first, then one column
# per metric copied from the evaluation output.
metric_names = ['cm', 'accuracy', 'precision', 'recall', 'f1']
results = {k: [] for k in ['eid', 'dataset', 'embedding', 'data split'] + metric_names}

# Train and evaluate one classifier per (experiment, embedding) pair; the
# classifier predicts the group label of each event from its embedding.
for i, exp in experiments.items():
    eObj = exp["eObj"]

    for emb_name, embedding in exp['embeddings'].items():

        discr = clust.Discriminator(eObj)

        clf = discr.train_classifier(embedding=embedding, category_vector=eObj.events.group.tolist())
        res = discr.evaluate(show_plot=False, title=f"condition: {eObj.name} [{emb_name}]", figsize=(8, 4))

        # `res` maps a data split name to a dict holding the metrics listed above
        for split_name, split_scores in res.items():

            row = {'eid': i,
                   'dataset': exp['population_type'],
                   'embedding': emb_name,
                   'data split': split_name}
            row.update((metric, split_scores[metric]) for metric in metric_names)

            for column, value in row.items():
                results[column].append(value)

df = pd.DataFrame(results)
df

In [None]:
# Figure with one column per dataset: example traces on top, classifier accuracy below.
# Expects `df` to be the classifier result table (columns: dataset, embedding,
# data split, accuracy). The clustering-score cell reuses the name `df`, so this
# cell has to run before that one overwrites it.
plot_type = "barplot"  # one of "pointplot", "barplot", "violinplot"

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    datasets = df.dataset.unique()
    n_ds = len(datasets)
    # squeeze=False keeps `axx` two-dimensional even when there is a single
    # dataset, so the axx[row, col] indexing below always works
    fig, axx = plt.subplots(2, n_ds, figsize=(int(3 * n_ds), 8), squeeze=False)

    for n, ds in enumerate(datasets):

        # traces: reuse the Plotting helper of the first experiment of this population type
        plot = next((exp['plot'] for exp in experiments.values() if exp["population_type"] == ds), None)

        if plot is not None:
            _ = plot.plot_traces(num_samples=4, by="group", ax=axx[0, n], alpha=.9, linestyle="--")

        # accuracy
        data = df[df.dataset == ds]

        if plot_type == "pointplot":
            sns.pointplot(data=data, ax=axx[1, n], x="data split", y="accuracy", hue="embedding", dodge=True)

        elif plot_type == "barplot":
            sns.barplot(data=data, ax=axx[1, n], x="embedding", y="accuracy", hue="data split")

        elif plot_type == "violinplot":
            sns.violinplot(data=data, ax=axx[1, n], x="embedding", y="accuracy", hue="data split", split=True)
        else:
            # name the offending value so the error is actionable
            raise ValueError(f"unknown plot type: {plot_type!r}")

        # axis label
        axx[0, n].set_title(ds)

    # reference line
    # NOTE(review): 0.5 presumably marks chance level for two groups; the "triplet"
    # population is built from three generators - confirm the intended reference there.
    for ax in axx[1, :]:
        ax.set_ylim(0, 1.1)
        ax.axhline(0.5, linestyle="--", color="gray")

    # legends: drop them from all but the last trace panel; an axis without a
    # legend (e.g. nothing was drawn on it) is skipped instead of raising
    for ax in axx[0, :-1]:
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

    # move the legends of the last column outside the plotting area
    for ax in axx[:, -1]:
        if ax.get_legend() is not None:
            sns.move_legend(ax, loc="upper left", bbox_to_anchor=(1.04, 1))

fig.savefig("conditional_classifier.png", dpi=480)

## Hierarchical Clustering

In [None]:
importlib.reload(clust)

# Cluster the events of every experiment once per distance type and keep the
# barycenters and the cluster lookup table under experiments[i]["distance"].
link = clust.Linkage()
for i, exp in experiments.items():
    eObj = exp["eObj"]

    for correlation_type in ['pearson', 'dtw']:

        # request as many clusters as there are groups in the data
        num_groups = len(eObj.events.group.unique())

        barycenters, cluster_lookup_table = link.get_barycenters(
            eObj.events,
            cutoff=num_groups,
            criterion='maxclust',
            distance_type=correlation_type,
        )

        # report: population type, number of groups, number of distinct cluster labels
        num_clusters = len(np.unique(list(cluster_lookup_table.values())))
        print(exp["population_type"], num_groups, num_clusters)

        # create the per-experiment "distance" store on first use
        exp.setdefault("distance", {})[correlation_type] = {
            "barycenters": barycenters,
            "cluster_lookup_table": cluster_lookup_table,
        }

In [None]:
importlib.reload(clust)

# Column layout of the score table: identifying columns first, then one column
# per clustering score copied from compute_scores.
score_names = ['adjusted_mutual_info_score', 'adjusted_rand_score', 'homogeneity_score', 'rand_score']
results = {k: [] for k in ['eid', 'dataset', 'distance'] + score_names}

# Compare the cluster assignment of every (experiment, distance type) pair
# against the group labels of the events.
for i, exp in experiments.items():
    eObj = exp["eObj"]

    for corr_type, clustering in exp['distance'].items():

        true_labels = eObj.events.group.tolist()

        # the lookup table is indexed by event position 0..N-1; subtracting 1
        # shifts the cluster labels down by one
        # NOTE(review): presumably the cluster labels start at 1 - confirm.
        lookup = clustering['cluster_lookup_table']
        predicted_labels = [lookup[n] - 1 for n in range(len(true_labels))]

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            scores = clust.Discriminator.compute_scores(true_labels, predicted_labels, scoring="clustering")

            # NOTE(review): sklearn's signature is confusion_matrix(y_true, y_pred); here
            # the predicted labels come first, so rows are clusters and columns are
            # groups - confirm this orientation is intended.
            cm = confusion_matrix(predicted_labels, true_labels, normalize=None)
            clustering['cm'] = cm

            row = {'eid': i, 'dataset': exp['population_type'], 'distance': corr_type}
            row.update((name, scores[name]) for name in score_names)

            for column, value in row.items():
                results[column].append(value)

df = pd.DataFrame(results)
df

In [None]:
# Figure with one column per dataset: example traces on top, Rand score of the
# hierarchical clustering per distance type below.
# Expects `df` to be the clustering score table (columns: dataset, distance, rand_score).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    datasets = df.dataset.unique()
    n_ds = len(datasets)
    # squeeze=False keeps `axx` two-dimensional even when there is a single
    # dataset, so the axx[row, col] indexing below always works
    fig, axx = plt.subplots(2, n_ds, figsize=(int(3 * n_ds), 8), squeeze=False)

    for n, ds in enumerate(datasets):

        # traces: reuse the Plotting helper of the first experiment of this population type
        plot = next((exp['plot'] for exp in experiments.values() if exp["population_type"] == ds), None)

        if plot is not None:
            _ = plot.plot_traces(num_samples=4, by="group", ax=axx[0, n], alpha=.9, linestyle="--")

        # rand score
        data = df[df.dataset == ds]
        sns.barplot(data=data, ax=axx[1, n], x="distance", y="rand_score")

        # axis label
        axx[0, n].set_title(ds)

    # reference line
    # NOTE(review): 0.5 is drawn for every dataset regardless of its number of
    # groups - confirm that this is the intended reference for the Rand score.
    for ax in axx[1, :]:
        ax.set_ylim(0, 1.1)
        ax.axhline(0.5, linestyle="--", color="gray")

    # legends: drop them from all but the last trace panel; an axis without a
    # legend (e.g. nothing was drawn on it) is skipped instead of raising
    for ax in axx[0, :-1]:
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

    # move the remaining trace legend outside the plotting area
    last_trace_ax = axx[0, -1]
    if last_trace_ax.get_legend() is not None:
        sns.move_legend(last_trace_ax, loc="upper left", bbox_to_anchor=(1.04, 1))

fig.savefig("conditional_hierarchical.png", dpi=480)