## Setup

First, we set up the notebook by installing the required packages.

### Dependencies

In [1]:
pip install matplotlib scipy pandas numpy networkx seaborn scikit-learn cython

Note: you may need to restart the kernel to use updated packages.


### Imports

In [2]:
# base libraries
import importlib
import pandas as pd
import numpy as np

import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

import os

import networkx as nx

import time

from IPython.utils.io import capture_output # supress outputs

from sklearn.metrics.cluster import adjusted_rand_score

### Fix working directory

In [4]:
# Switch the working directory to the parent of the directory the kernel is
# currently in (presumably the project root, so the import below resolves).
#
# The target directory is computed only the first time this cell runs in a
# kernel session. Previously every re-run of the cell climbed one directory
# higher, eventually leaving the project tree and breaking project imports.
if "PROJECT_ROOT" not in globals():
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)
    PROJECT_ROOT = parent_dir
os.chdir(PROJECT_ROOT)

print("Current working directory:", os.getcwd())

from trcrpm.src import Hierarchical_TRCRP_Mixture, TRCRP_Mixture

Current working directory: /hdsi-collab-persistent/cephfs


### Helper Functions

1. **`run_model(data, num_chains = 8, p = 5, MCMC_steps=1000, hyperparam_steps=50, runtime = True)`**
    - **Returns**: Fitted model
        - The dataframe index must be reset to an integer index.
        - Choose a high number for `MCMC_steps` to allow for a burn-in period.
        - `p` is our lag window

2. **`post_dep(model, num_samples)`**
    - **Returns**: Pairwise posterior dependence matrices (cluster probabilities)
        - The array is of size `NUM_SAMPLES x NUM_CHAINS x LEN(DF) x LEN(DF)`

3. **`clustering(post_probs, threshold = 0.75)`**
    - **Returns**: Clusters
        - If time series `i` and `k` are dependent in at least a `threshold` fraction of the samples (e.g. `0.75` = 75%), they are clustered together.
        - Averages over `NUM_SAMPLES` and `NUM_CHAINS`.

4. **`return_ari(true_labels, predicted_clusters)`**
    - **Returns**: Adjusted Rand Index (ARI) if ground truth is available


In [5]:
import importlib

In [6]:
# Import the helper module, then reload it so that changes made to its source
# since the first import are picked up without restarting the kernel.
# The `experiments` package must be importable from the current working
# directory (or sys.path) for this cell to succeed.
import experiments.helper_functions as helpers
importlib.reload(helpers)

ModuleNotFoundError: No module named 'experiments'

## Data Generation (Sine Waves)

In [None]:
# Import the data-generation module and reload it to pick up source edits.
import experiments.data_generation as data_gen
importlib.reload(data_gen)

# Build the synthetic sine-wave dataset and preview its first rows.
# NOTE(review): random_state=42 presumably seeds the generator for
# reproducibility, and the next cell treats row 0 of `df` as the cluster
# labels — confirm both against generate_sine_wave_data.
df = data_gen.generate_sine_wave_data(frequency_noise=0.1, random_state=42, use_colors = True, 
                                      num_samples = 10, fixed_amplitude = 2, num_clusters = 2)
df.head()

## Run Analysis

In [None]:
# Row 0 of `df` is taken as the ground-truth labels; all remaining rows are
# the observations passed to the model.
# NOTE(review): the Helper Functions notes say run_model needs a reset integer
# index, but `data` keeps df's original index labels (starting at the second
# row) — confirm whether run_model resets the index itself.
data = df.iloc[1:]
labels = df.iloc[0].values

print(data.shape) # NUM_OBSERVATIONS x NUM_TIMESERIES

In [None]:
# Fit the model on `data`; only the data is passed, so every other run_model
# setting is left at its default.
model = helpers.run_model(data)

In [None]:
# Pairwise posterior dependence matrices; the second argument (20) is
# `num_samples` per the post_dep signature listed under Helper Functions.
dep_matrices = helpers.post_dep(model, 20)

In [None]:
# compare predicted and true clusters

In [None]:
# Turn the dependence matrices into clusters using a 0.75 threshold, then
# display the result.
predicted_clusters = helpers.clustering(dep_matrices, threshold = 0.75)
predicted_clusters

In [None]:
# Score the predicted clusters against the ground-truth labels (Adjusted Rand
# Index, per the return_ari description under Helper Functions).
helpers.return_ari(labels, predicted_clusters)

In [None]:
# Distinct ground-truth labels (np.unique returns them sorted).
unique_labels = np.unique(labels)
# Number of elements per cluster, by integer division of the series count by
# the number of distinct labels.
step = len(labels) // len(unique_labels)

# One list per distinct label, holding the positions in `labels` that carry
# that label, as plain Python ints.
clusters = [
    np.where(labels == cluster_label)[0].tolist()
    for cluster_label in unique_labels
]
clusters

## Plotting


In [None]:
# Import the plotting helpers and reload them to pick up source edits.
# NOTE(review): the other project modules are imported from the `experiments`
# package (e.g. experiments.helper_functions); confirm that
# `plotting_functions` is really importable as a top-level module from the
# working directory.
import plotting_functions as plot
importlib.reload(plot)

In [None]:
%%capture
# Simulate samples from the posterior; they are needed to build the
# confidence interval for the forecasted values.
# Output is suppressed with %%capture for now because of a bug inside
# .simulate: a print statement in cgpm.src.crosscat.sampling (simulate_row).
probes = model.dataset.index
numsamples = 10
samples = model.simulate(probes, model.variables, numsamples)

### Helper Functions 

In [None]:
# compare labels with predicted_labels
plot.plot_df(data, labels)
plot.plot_df(data, predicted_labels)

In [None]:
# `ax` was never created in any earlier cell, so this call raised NameError on
# a fresh kernel; the figure and axes are now created explicitly.
# NOTE(review): assumes plot_cluster_variables draws onto a single Matplotlib
# Axes, and that variables 1, 6, 26 and 41 exist in the fitted dataset —
# confirm against plotting_functions and the generated data.
fig, ax = plt.subplots()
plot.plot_cluster_variables(samples, [1, 6, 26, 41], ax, states_from_chain=2)


### Serialize 

In [None]:
# Export the fitted model as a metadata object; the following cells serialize
# it and rebuild a model from it.
metadata = model.to_metadata()

In [None]:
# NOTE(review): disabled draft of a JSON-serialization helper, kept for
# reference only. If it is revived: the OrderedDict branch can never run
# because OrderedDict is a dict subclass and is caught by the first isinstance
# check, and `json` must be imported before this cell.
# from collections import OrderedDict
# def convert_to_serializable(obj):
#     if isinstance(obj, dict):
#         return {key: convert_to_serializable(value) for key, value in obj.items()}
#     elif isinstance(obj, list):
#         return [convert_to_serializable(item) for item in obj]
#     elif isinstance(obj, tuple):
#         return tuple(convert_to_serializable(item) for item in obj)
#     elif isinstance(obj, set):
#         return list(convert_to_serializable(item) for item in obj)
#     elif isinstance(obj, OrderedDict):  # Handling OrderedDict specifically
#         return list(obj.items())
#     else:
#         return obj

# # Convert metadata to a serializable format
# serializable_metadata = convert_to_serializable(metadata)

# # Now you can serialize it
# json_metadata = json.dumps(serializable_metadata)
# print(json_metadata)

In [None]:
# metadata['engine']

In [None]:
import json
# Serialize the model metadata to a JSON string.
# NOTE(review): the variable name is misspelled (`json_metadaata`); rename it
# to `json_metadata` after checking for references in later cells.
json_metadaata = json.dumps(metadata)

In [None]:
# The 'factory' entry is unpacked as a (module name, attribute name) pair; the
# following cells use it to look up the class that rebuilds the model.
modname, attrname = metadata['factory']

In [None]:
# Import the `src` submodule of the package named in the metadata; the class
# is looked up on this module in a later cell.
module = importlib.import_module(modname + '.src') 

In [None]:
# Display the imported module object as a quick sanity check.
module

In [None]:
# Resolve the class named by the metadata's factory entry on the imported
# module.
klass = getattr(module, attrname)

In [None]:
model2 = klass.from_metadata(