# Examples of how to use the code for K-means clustering

In [None]:
from src.continuous_series import Cols
from src.stats import DailyTimeseries
from src.timeseries_kmeans_clustering import TimeSeriesKMeansClustering
from src.timeseries_kmeans_crossvalidation import TimeSeriesKMeansClusteringCrossValidation
from src.reshape_resampled_data_into_timeseries import ReshapeResampledDataIntoTimeseries
from src.configurations import Hourly, Configuration, GeneralisedCols
from src.read_preprocessed_df import ReadPreprocessedDataFrame

## Multivariate clustering

### Read the data - examples of different configurations given

In [None]:
# Data to read: placeholder, set this to the zip_id (as a string) you want to cluster
zip_id = ''

# Resampling rules; the alternative is src.stats.WeeklyTimeseries.
# DailyTimeseries clusters mean 'daily' time series that have 24 datapoints
# with at least a reading every 60min.
daily_ts = DailyTimeseries()

# How a value is aggregated if there is more than one value in a resampling interval;
# alternatives: std, min, max
col_to_cluster = Cols.Mean

# Read the hourly resampled data for the zip id
raw_df = ReadPreprocessedDataFrame(sampling=Hourly(), zip_id=zip_id).df

# Which variates to use; here mean iob, mean cob, mean bg
variates = Configuration.resampled_mean_columns()

# Helper that translates the resampled data into x_train and provides other convenience functions
translate = ReshapeResampledDataIntoTimeseries(raw_df, daily_ts, variates)

### Cluster the data

In [None]:
# number of clusters to split the time series into
n_cluster = 2

# x_train is a 3d numpy array of shape (n_ts, ts_length, dimensions)
x_train = translate.to_x_train()

# cluster the data using all three variates
km = TimeSeriesKMeansClustering(
    n_clusters=n_cluster,
    x_train=x_train,
    x_train_column_names=['IOB', 'COB', 'BG'],
    timeseries_description=daily_ts,
)

### Different visualisations of the clusters

In [None]:
# All three barycenters for each cluster in one plot -> n_cluster plots.
# Title, legend and overall labels are switched off here.
km.plot_barrycenters_of_different_cols_in_one_plot(col_to_cluster, show_title=False, show_legend=False,
                                                   show_overall_labels=False)

In [None]:
# Time series and barycenters in each cluster -> n_cluster plots
km.plot_clusters_in_grid(col_to_cluster)

In [None]:
# All barycenters for each dimension -> n_clusters x 3 plots
km.plot_barry_centers_in_one_plot(col_to_cluster)

### Find best number of clusters k

In [None]:
# plot the mean silhouette score for every k from 2 to 13 (inclusive)
ks = list(range(2, 14))
km.plot_mean_silhouette_score_for_k(ks)

In [None]:
# Elbow method: plot the sum of square distances for every second k from 2 to 48
ks = range(2, 50, 2)
km.plot_sum_of_square_distances_for_k(ks)

In [None]:
# Plot the silhouette blob for the given ks; this looks best with only 4 different ks
ks = [2, 8, 12, 16]
km.plot_silhouette_blob_for_k(ks=ks)

## Single variate Clustering
Reading the data is the same as above; only the way x_train is obtained for clustering differs.
Visualisation and finding the best k work the same way, too.


### Cluster the data

In [None]:
# which variate to cluster by; here IOB
ts_variate = GeneralisedCols.mean_iob.value

# x_train_sv only contains the variate that is used for the clustering
x_train_sv = translate.to_x_train(cols=[ts_variate])

# x_full is the same as x_train in the multivariate example; it is used to plot the
# time series of the other variates that were not used for the clustering
x_full = translate.to_x_train()

km_sv = TimeSeriesKMeansClustering(
    n_clusters=n_cluster,
    x_train=x_train_sv,
    x_train_column_names=[ts_variate],
    timeseries_description=daily_ts,
    x_full=x_full,
    x_full_column_names=["IOB", "COB", "BG"],
)

# Changing Distance Metric
By default, TimeSeriesKMeansClustering uses DTW as the distance measure. You can provide additional parameters, e.g. to specify a Sakoe-Chiba band; see the tslearn documentation for more details. While you can specify a different distance metric, at the moment the plots always show DBA barycenters, which might not be appropriate for your distance measure.

In [None]:
# distance measure to cluster with
metric = "dtw"

# extra parameters for the distance measure; here a Sakoe Chiba band of radius 2
distance_params = dict(global_constraint="sakoe_chiba", sakoe_chiba_radius=2)

# NOTE(review): the keyword is spelled `metric_prams` here - confirm this matches the
# parameter name declared by TimeSeriesKMeansClustering
km_sk = TimeSeriesKMeansClustering(
    n_clusters=n_cluster,
    x_train=x_train,
    x_train_column_names=["IOB", "COB", "BG"],
    timeseries_description=daily_ts,
    distance_metric=metric,
    metric_prams=distance_params,
)

# K-fold validation
To validate that your results are consistent even if you randomly drop a few time series, you can use the
TimeSeriesKMeansClusteringCrossValidation convenience class to run a k-fold-like clustering. Note that at the moment you cannot change all the defaults (distance metric, variates to cluster by). Supporting this would be simple to implement: the corresponding parameters just need to be exposed.

### Calculate the cluster dropping one fold each time

In [None]:
# this is for multivariate clustering
# Number of folds: 10 is the usual starting point. Adjust it so that n_ts / n_folds is a
# whole number for your data. (The original cell left the value empty, which is a SyntaxError.)
n_folds = 10
val = TimeSeriesKMeansClusteringCrossValidation(n_fold=n_folds, n_clusters=2, x_train=x_train,
                                                x_train_column_names=['IOB', 'COB', 'BG'], timeseries_description=daily_ts)

### Visualise the results for each of the n_fold clusters

In [None]:
# Visualise the barycenters of each model, i.e. of each of the n_fold clusterings
val.plot_barycenters_for_each_model()

### Calculate the silhouette score for each of the models

In [None]:
# Silhouette score for each of the models
sil_scores = val.silhouette_scores()