In [None]:
import kmeans_ts_clustering
import pandas as pd
import numpy as np

In [None]:
# Load the growth data (point the path at your own copy of the file), then
# keep every column from the third one onward.
students = pd.read_csv('../../data_dictionary/growth.csv')
students = students.iloc[:, 2:]
# Preview the first record.
students.head(1)

In [None]:
# Number of rows in `students`, i.e. the number of complete records.
students.shape[0]

In [None]:
# Read the table that carries the existing group labels, keep only its first
# and last columns, and retain a single row per `id`.
# NOTE(review): this path starts at './' while the growth data is read from
# '../../' - confirm both resolve from the same working directory.
students_with_groups = pd.read_csv('./data_dictionary/trajgps.csv')
students_with_groups = students_with_groups.iloc[:, [0, -1]]
students_with_groups = students_with_groups.drop_duplicates(subset=['id'])
print(len(students_with_groups))
students_with_groups.head()

In [None]:
# Attach each student's existing group label with an inner join on `id`.
# Any `GROUP` column already present on `students` (e.g. left over from a
# previous run of this cell) is dropped first, so re-running the cell does
# not produce suffixed `GROUP_x` / `GROUP_y` columns.
students = pd.merge(
    students.drop(columns=['GROUP'], errors='ignore'),
    students_with_groups[['id', 'GROUP']],
    on='id',
    how='inner',
)
# Row count after the join, followed by a preview of the merged table.
print(len(students))
students.head()

In [None]:
# Build the BMI z-score time series for each student.
# `bmiz_time_series` comes from the project's kmeans_ts_clustering module; its
# result is stored in `bmiz` and previewed below.
bmiz = kmeans_ts_clustering.bmiz_time_series(students)
bmiz.head()

In [None]:
# Build the overweight/obese indicator time series for each student.
# `bmipbool` records, per time point, whether the BMI percentile is at or
# above the 85th (indicating overweight/obese); it is the outcome variable
# used for clustering in the cells that follow.
bmipbool = kmeans_ts_clustering.bmipbool_time_series(students)
bmipbool.head()

In [None]:
# Add columns of BMI z-score changes, raw BMI changes, and BMI percentage
# changes.
# NOTE(review): the return value is not assigned and later cells read the new
# columns from `students`, so this presumably mutates `students` in place -
# confirm against the helper.
kmeans_ts_clustering.add_changes_columns(students)

In [None]:
# Inspect the newly added columns.
# `chg_bmiz` holds each student's series of changes in BMI z-score: all rows,
# and the label range of columns from 'chg_bmiz_1' through 'chg_bmiz_10'.
chg_bmiz = students.loc[:, slice('chg_bmiz_1', 'chg_bmiz_10')]
chg_bmiz.head()

In [None]:
# `percent_chg_bmi` holds each student's series of percentage changes in raw
# BMI: all rows, columns 'percent_chg_bmi_1' through 'percent_chg_bmi_10'.
percent_chg_bmi = students.loc[:, slice('percent_chg_bmi_1', 'percent_chg_bmi_10')]
percent_chg_bmi.head()

In [None]:
# `chg_bmi` holds each student's series of changes in raw BMI: all rows,
# columns 'chg_bmi_1' through 'chg_bmi_10'.
chg_bmi = students.loc[:, slice('chg_bmi_1', 'chg_bmi_10')]
chg_bmi.head()

In [None]:
# The plotting function needs a list of rows rather than a 2-D DataFrame, so
# each series table built above is converted here. The conversions run in the
# same order as the names on the left-hand side.
(
    bmipbool_to_list,
    bmiz_to_list,
    chg_bmiz_to_list,
    percent_chg_bmi_to_list,
    chg_bmi_to_list,
) = [
    kmeans_ts_clustering.dataframe_to_list(frame)
    for frame in (bmipbool, bmiz, chg_bmiz, percent_chg_bmi, chg_bmi)
]


In [None]:
# Pick the number of clusters, using the overweight/obese indicator as the
# outcome variable and a fixed seed of 0.
bmipbool_k = kmeans_ts_clustering.choose_num_clusters(
    bmipbool.values,
    students,
    seed=0,
)
# Show the chosen k.
bmipbool_k

In [None]:
# Compare the 'euclidean' and 'dtw' distance metrics at the chosen k, with a
# fixed seed of 0.
kmeans_ts_clustering.compare_dist_metric(
    bmipbool_k,
    bmipbool,
    dist_metric=['euclidean', 'dtw'],
    seed=0,
)

In [None]:
# Cluster the overweight/obese indicator series into `bmipbool_k` groups with
# Euclidean distance and plot the result; the return value is kept in
# `bmipbool_labels` for the probability plot that follows.
# The seed is pinned to 0, the same seed used when choosing k and when
# comparing distance metrics, so the plot and labels are reproducible across
# kernel restarts.
bmipbool_labels = kmeans_ts_clustering.plot_kmeans_ts_clustering(
    bmipbool_to_list,
    "Whether Overweight/Obese",
    bmipbool_k,
    dist_metric='euclidean',
    seed=0,
)

In [None]:
# Plot the probability of being overweight/obese over time for the clusters
# generated above, using the chosen k (`bmipbool_k`) and the labels returned
# by the clustering plot (`bmipbool_labels`).
kmeans_ts_clustering.plot_prob_overweight_for_clusters(bmipbool_k, bmipbool_labels, students)

In [None]:
# Score the model under each distance metric ('euclidean' and 'dtw') using
# silhouette scores, passing seeds 0 through 99.
dist_metric_evals = kmeans_ts_clustering.tune_dist_metric(
    bmipbool,
    bmipbool_k,
    dist_metric=['euclidean', 'dtw'],
    seed=np.arange(100),
)

In [None]:
# Plot the mean silhouette score, with 95% confidence intervals, for each
# distance metric evaluated above, to see which metric performs best.
kmeans_ts_clustering.plot_dist_metric_tunning(dist_metric_evals)

In [None]:
# Score the model under different centroid-initialization methods, using
# Euclidean distance and passing seeds 0 through 99.
init_evals = kmeans_ts_clustering.tune_initialization(
    bmipbool,
    bmipbool_k,
    dist_metric='euclidean',
    seed=np.arange(100),
)

In [None]:
# Plot the mean silhouette score, with 95% confidence intervals, for each
# initialization method evaluated above, to see which method performs best.
kmeans_ts_clustering.plot_initialization_tunning(init_evals)

# The fine-tuned hyperparameters for K-means Time Series Clustering are:
### K = 5 (the number of clusters), distance metric = 'euclidean', and init = 'k-means++' (initialization method)

In [None]:
# Refit with the tuned settings (5 clusters, Euclidean distance) and a fixed
# seed of 99 for reproducibility; clusters are named from the module's
# TRAJGPS constant. The return value is kept in `cluster_labels`.
cluster_labels = kmeans_ts_clustering.plot_kmeans_ts_clustering(
    bmipbool_to_list,
    "Whether Overweight/Obese",
    num_clusters=5,
    dist_metric='euclidean',
    seed=99,
    cluster_names=kmeans_ts_clustering.TRAJGPS,
)

In [None]:
# Plot the probability of being overweight/obese over time for the five tuned
# clusters, labelled with the module's TRAJGPS names.
kmeans_ts_clustering.plot_prob_overweight_for_clusters(
    num_clusters=5,
    labels=cluster_labels,
    dataset=students,
    cluster_names=kmeans_ts_clustering.TRAJGPS,
)

In [None]:
# Calculate k-means group membership percentages.
# NOTE(review): only `students` is passed, so the k-means labels are
# presumably already stored on `students` by an earlier step - confirm.
kmeans_ts_clustering.cluster_membership_percent_kmeans(students)

In [None]:
# Map GBTM membership labels to k-means membership labels so the two
# groupings can be compared consistently.
# NOTE(review): the return value is not assigned, so this presumably updates
# `students` in place - confirm against the helper.
kmeans_ts_clustering.map_group_cluster(students)

In [None]:
# Calculate GBTM group membership percentages from `students`.
kmeans_ts_clustering.cluster_membership_percent_gbtm(students)

In [None]:
# Distances of observations from the k-means centroids, grouped by cluster.
# `dist_from_center_by_cluster` returns an indexable result; element 0 is the
# k-means part (element 1 is taken for GBTM in a later cell).
kmeans_dist_from_center_per_cluster = kmeans_ts_clustering.dist_from_center_by_cluster(students)[0]

In [None]:
# Mean deviation (Euclidean distance) from the centre for each k-means
# cluster, computed from the per-cluster distances above.
kmeans_mean_dist_per_cluster = kmeans_ts_clustering.kmeans_mean_dist_by_cluster(kmeans_dist_from_center_per_cluster)

In [None]:
# Size of each k-means cluster, derived from the per-cluster distances.
kmeans_cluster_size = kmeans_ts_clustering.kmeans_cluster_sizes(kmeans_dist_from_center_per_cluster)

In [None]:
# Standard deviation for each k-means cluster, computed from the per-cluster
# distances, their means, and the cluster sizes.
kmeans_std_per_cluster = kmeans_ts_clustering.kmeans_std_by_cluster(
    kmeans_dist_from_center_per_cluster,
    kmeans_mean_dist_per_cluster,
    kmeans_cluster_size,
)

In [None]:
# Overall and per-cluster accuracy for k-means, computed from the distance,
# mean, standard-deviation, and size summaries together with `students`.
kmeans_accuracy = kmeans_ts_clustering.kmeans_accuracy(
    kmeans_dist_from_center_per_cluster,
    kmeans_mean_dist_per_cluster,
    kmeans_std_per_cluster,
    kmeans_cluster_size,
    students,
)

In [None]:
# Distances of observations from the GBTM centroids, grouped by cluster.
# Element 1 of the result of `dist_from_center_by_cluster` is the GBTM part
# (element 0 is taken for k-means in an earlier cell).
gbtm_dist_from_center_per_cluster = kmeans_ts_clustering.dist_from_center_by_cluster(students)[1]

In [None]:
# Mean deviation (Euclidean distance) from the centre for each GBTM cluster,
# computed from the per-cluster distances above.
gbtm_mean_dist_per_cluster = kmeans_ts_clustering.gbtm_mean_dist_by_cluster(gbtm_dist_from_center_per_cluster)

In [None]:
# Size of each GBTM cluster, derived from the per-cluster distances.
gbtm_cluster_size = kmeans_ts_clustering.gbtm_cluster_sizes(gbtm_dist_from_center_per_cluster)

In [None]:
# Standard deviation for each GBTM cluster, computed from the per-cluster
# distances, their means, and the cluster sizes.
gbtm_std_per_cluster = kmeans_ts_clustering.gbtm_std_by_cluster(
    gbtm_dist_from_center_per_cluster,
    gbtm_mean_dist_per_cluster,
    gbtm_cluster_size,
)

In [None]:
# Overall and per-cluster accuracy for GBTM, computed from the distance,
# mean, standard-deviation, and size summaries together with `students`.
gbtm_accuracy = kmeans_ts_clustering.gbtm_accuracy(
    gbtm_dist_from_center_per_cluster,
    gbtm_mean_dist_per_cluster,
    gbtm_std_per_cluster,
    gbtm_cluster_size,
    students,
)

In [None]:
# Display the accuracy table built from the k-means and GBTM accuracy results
# computed in the preceding cells.
kmeans_ts_clustering.accuracy_table(kmeans_accuracy, gbtm_accuracy)

In [None]:
# Plot the trajectory groups of k-means vs. GBTM.
# `store_trajgps(students)` produces the input that `plot_refined_trajgps`
# draws; both helpers live in the kmeans_ts_clustering module.
kmeans_ts_clustering.plot_refined_trajgps(kmeans_ts_clustering.store_trajgps(students))