In [None]:
#Imports
from tslearn.clustering import TimeSeriesKMeans
import pandas as pd
import os
import numpy as np
from dtaidistance import dtw
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import davies_bouldin_score, silhouette_score

### Clustering for first 6 users

In [2]:
# Define the function to compute DTW Within-Cluster Variance
def compute_dtw_within_cluster_variance(data, labels, centroids):
    """Return the mean DTW distance between series and their cluster centroid.

    Despite the name, this is not a statistical variance: for every cluster
    the DTW distances from each member series to the cluster centroid are
    averaged, and the function returns the unweighted mean of those
    per-cluster averages (small and large clusters count equally).

    Parameters
    ----------
    data : array-like
        One series per entry along the first axis; each series is flattened
        to 1-D before the distance is computed.
    labels : array-like of int
        Cluster id of each series in ``data``; used to index ``centroids``.
    centroids : array-like
        Centroid series indexed by cluster id. Each centroid is flattened to
        1-D, so centroids with a trailing singleton axis are accepted.

    Returns
    -------
    float
        Mean over clusters of the mean member-to-centroid DTW distance.
    """
    data = np.asarray(data)
    # A plain list compared with `==` yields a single bool, which would
    # silently select no members; force element-wise comparison.
    labels = np.asarray(labels)

    per_cluster_mean_distance = []
    for cluster_id in np.unique(labels):
        members = data[labels == cluster_id]
        # The DTW routine works on 1-D series: drop any trailing singleton
        # axis from the centroid instead of passing a 2-D array.
        centroid = np.asarray(centroids[cluster_id], dtype=np.double).ravel()
        distances = [
            dtw.distance(np.asarray(series, dtype=np.double).ravel(), centroid)
            for series in members
        ]
        per_cluster_mean_distance.append(np.mean(distances))
    return np.mean(per_cluster_mean_distance)

# Get data (the project root is taken to be the parent of the working directory)
cwd = os.path.normpath(os.path.dirname(os.getcwd()))
df = pd.read_csv(
    os.path.join(cwd, 'data', '2feature_engineering_data', 'df_with_final_features.csv'),
    index_col='Date',
)
df.index = pd.to_datetime(df.index)
df = df.fillna(0)

# Only consider data for the first 6 users: columns named 'User<N>' with 1 <= N <= 6
columns_to_keep = [
    column_name
    for column_name in df.columns
    if column_name.startswith('User')
    and column_name[4:].isdigit()
    and 1 <= int(column_name[4:]) <= 6
]

# Chain reset_index() so a new frame is built instead of mutating a selection of df in place
filtered_df = df[columns_to_keep].reset_index()

# Preprocessing: keep March 2013 only, then lay the data out as one row per user
filtered_df = filtered_df[(filtered_df['Date'].dt.month == 3) & (filtered_df['Date'].dt.year == 2013)]
data_array = filtered_df.drop(columns='Date').to_numpy().T
# NOTE(review): MinMaxScaler rescales each column independently. Columns here are
# time steps (rows are users), so every time step is scaled across users rather
# than every user's series across time -- confirm this is the intended scaling.
data_array = MinMaxScaler().fit_transform(data_array)
data_array_2d = data_array.reshape(data_array.shape[0], -1)

# Specify the desired number of clusters
cluster_count = 2

kmeans = TimeSeriesKMeans(n_clusters=cluster_count, verbose=False, random_state=42, metric="dtw")
labels = kmeans.fit_predict(data_array)

# Write the labels to <project root>/evaluations, creating the directory if needed
output_dir = os.path.join(cwd, 'evaluations')
os.makedirs(output_dir, exist_ok=True)
np.savetxt(os.path.join(output_dir, f'clusters_KMeansNew{cluster_count}_dtw.csv'), labels, delimiter=",")
centroids = kmeans.cluster_centers_

# Compute DTW Within-Cluster Variance (mean member-to-centroid DTW distance)
dtw_variance = compute_dtw_within_cluster_variance(data_array, labels, centroids)

# Evaluate the clustering performance using all scores
# NOTE(review): these sklearn scores are called with their defaults, i.e. they
# measure Euclidean geometry on the raw vectors, while the clustering itself
# uses DTW -- interpret them with that mismatch in mind.
db_score = davies_bouldin_score(data_array, labels)
sil_score = silhouette_score(data_array, labels)
print(f"Davies-Bouldin Score: {db_score}, Silhouette Score: {sil_score}, DTW Variance: {dtw_variance}")

print("Final labels: ", labels)
print("Cluster centroids: ", centroids)


Davies-Bouldin Score: 1.4370608363900756, Silhouette Score: 0.12032298331837392, DTW Variance: 3.2519800324016126
Final labels:  [1 0 0 0 0 1]
Cluster centroids:  [[[0.3626918 ]
  [0.13862752]
  [0.13862752]
  ...
  [0.35665761]
  [0.41610647]
  [0.01775309]]

 [[0.62397984]
  [0.94274809]
  [0.94274809]
  ...
  [0.18917071]
  [0.84536082]
  [0.81232295]]]


### Clustering for 30 users

In [3]:
# Define the function to compute DTW Within-Cluster Variance
def compute_dtw_within_cluster_variance(data, labels, centroids):
    """Return the mean DTW distance between series and their cluster centroid.

    Despite the name, this is not a statistical variance: for every cluster
    the DTW distances from each member series to the cluster centroid are
    averaged, and the function returns the unweighted mean of those
    per-cluster averages (small and large clusters count equally).

    Parameters
    ----------
    data : array-like
        One series per entry along the first axis; each series is flattened
        to 1-D before the distance is computed.
    labels : array-like of int
        Cluster id of each series in ``data``; used to index ``centroids``.
    centroids : array-like
        Centroid series indexed by cluster id. Each centroid is flattened to
        1-D, so centroids with a trailing singleton axis are accepted.

    Returns
    -------
    float
        Mean over clusters of the mean member-to-centroid DTW distance.
    """
    data = np.asarray(data)
    # A plain list compared with `==` yields a single bool, which would
    # silently select no members; force element-wise comparison.
    labels = np.asarray(labels)

    per_cluster_mean_distance = []
    for cluster_id in np.unique(labels):
        members = data[labels == cluster_id]
        # The DTW routine works on 1-D series: drop any trailing singleton
        # axis from the centroid instead of passing a 2-D array.
        centroid = np.asarray(centroids[cluster_id], dtype=np.double).ravel()
        distances = [
            dtw.distance(np.asarray(series, dtype=np.double).ravel(), centroid)
            for series in members
        ]
        per_cluster_mean_distance.append(np.mean(distances))
    return np.mean(per_cluster_mean_distance)

# Get data (the project root is taken to be the parent of the working directory)
cwd = os.path.normpath(os.path.dirname(os.getcwd()))
df = pd.read_csv(
    os.path.join(cwd, 'data', '2feature_engineering_data', 'df_with_final_features.csv'),
    index_col='Date',
)
df.index = pd.to_datetime(df.index)
df = df.fillna(0)

# Only consider data for the first 30 users: columns named 'User<N>' with 1 <= N <= 30
columns_to_keep = [
    column_name
    for column_name in df.columns
    if column_name.startswith('User')
    and column_name[4:].isdigit()
    and 1 <= int(column_name[4:]) <= 30
]

# Chain reset_index() so a new frame is built instead of mutating a selection of df in place
filtered_df = df[columns_to_keep].reset_index()

# Preprocessing: keep March 2013 only, then lay the data out as one row per user
filtered_df = filtered_df[(filtered_df['Date'].dt.month == 3) & (filtered_df['Date'].dt.year == 2013)]
data_array = filtered_df.drop(columns='Date').to_numpy().T
# NOTE(review): MinMaxScaler rescales each column independently. Columns here are
# time steps (rows are users), so every time step is scaled across users rather
# than every user's series across time -- confirm this is the intended scaling.
data_array = MinMaxScaler().fit_transform(data_array)
data_array_2d = data_array.reshape(data_array.shape[0], -1)

# Specify the desired number of clusters
cluster_count = 6

kmeans = TimeSeriesKMeans(n_clusters=cluster_count, verbose=False, random_state=42, metric="dtw")
labels = kmeans.fit_predict(data_array)

# Write the labels to <project root>/evaluations, creating the directory if needed
output_dir = os.path.join(cwd, 'evaluations')
os.makedirs(output_dir, exist_ok=True)
np.savetxt(os.path.join(output_dir, f'clusters_KMeansNew{cluster_count}_dtw.csv'), labels, delimiter=",")
centroids = kmeans.cluster_centers_

# Compute DTW Within-Cluster Variance (mean member-to-centroid DTW distance)
dtw_variance = compute_dtw_within_cluster_variance(data_array, labels, centroids)

# Evaluate the clustering performance using all scores
# NOTE(review): these sklearn scores are called with their defaults, i.e. they
# measure Euclidean geometry on the raw vectors, while the clustering itself
# uses DTW -- interpret them with that mismatch in mind.
db_score = davies_bouldin_score(data_array, labels)
sil_score = silhouette_score(data_array, labels)
print(f"Davies-Bouldin Score: {db_score}, Silhouette Score: {sil_score}, DTW Variance: {dtw_variance}")

print("Final labels: ", labels)
print("Cluster centroids: ", centroids)


Davies-Bouldin Score: 1.8730964723651322, Silhouette Score: 0.042810982720721126, DTW Variance: 1.9605766620467513
Final labels:  [5 0 2 2 1 4 0 4 3 1 0 1 4 3 1 1 1 4 2 4 1 1 0 0 1 1 2 1 2 0]
Cluster centroids:  [[[0.04689709]
  [0.07786725]
  [0.14781002]
  ...
  [0.04848416]
  [0.12180112]
  [0.07022518]]

 [[0.01708401]
  [0.16148332]
  [0.42328509]
  ...
  [0.20050317]
  [0.58654335]
  [0.15516239]]

 [[0.30532011]
  [0.34342972]
  [0.61141321]
  ...
  [0.5464541 ]
  [0.5464541 ]
  [0.65168773]]

 [[0.38987891]
  [1.        ]
  [0.96701121]
  ...
  [0.40982287]
  [0.58616011]
  [0.51118421]]

 [[0.30768322]
  [0.443817  ]
  [0.43227624]
  ...
  [0.05876   ]
  [0.09146159]
  [0.17098031]]

 [[0.04418395]
  [0.30084746]
  [0.73170732]
  ...
  [0.13768116]
  [0.17774763]
  [0.99342105]]]
