In [1]:
import pandas as pd
import json
import numpy as np
import os

In [None]:
#import feature vectors
X_save_scaled = "X_tr_scaled.csv"
X_train_scaled = pd.read_csv(X_save_scaled, index_col=0)

## Try MiniBatchKMeans with K=5

In [20]:
### using MiniBatchKMeans is faster and uses less memory than regular KMeans
### explanation at: https://algorithmicthoughts.wordpress.com/2013/07/26/machine-learning-mini-batch-k-means/

from sklearn.cluster import MiniBatchKMeans
K=5
kmeans = MiniBatchKMeans(n_clusters=K, batch_size=100)
kmeans.fit(X_train_scaled)

#Find clusters each row belongs to
cluster_labels=kmeans.labels_
cluster_labels_df=pd.DataFrame(cluster_labels)

#Get centroid positions for each cluster
centroids=kmeans.cluster_centers_
centroids_df=pd.DataFrame(centroids)

In [4]:
cluster_labels.shape

(4194,)

In [5]:
cluster_labels[0:4]

array([0, 4, 4, 4], dtype=int32)

In [6]:
centroids.shape

(5, 138)

In [7]:
cluster_labels_df.columns = ['Cluster']
print(cluster_labels_df.head())

   Cluster
0        0
1        4
2        4
3        4
4        4


### Append cluster labels to feature vectors

In [21]:
cluster_labels_series = pd.Series(cluster_labels, name='clusters')
result = pd.concat([cluster_labels_series, X_train_scaled], axis=1)
result.head()

#using cluster_labels_df gives same result:
#cluster_X_train_scaled_df = pd.concat([cluster_labels_df, X_train_scaled], axis=1)
#cluster_X_train_scaled_df.head()

Unnamed: 0,clusters,mean,std,max,min,mean_change_abs,mean_change_rate,abs_max,abs_min,std_first_50000,...,std_roll_mean_1000,max_roll_mean_1000,min_roll_mean_1000,q01_roll_mean_1000,q05_roll_mean_1000,q95_roll_mean_1000,q99_roll_mean_1000,av_change_abs_roll_mean_1000,av_change_rate_roll_mean_1000,abs_max_roll_mean_1000
0,0,1.42414,-0.170214,-0.218194,0.193218,-1.32642,-1.569265,-0.222567,0.0,0.052067,...,0.26847,-0.004742,0.178278,0.287332,0.965402,1.509153,0.885262,-0.6313,-1.832422,-0.004742
1,4,0.805716,0.004734,0.063936,-0.018037,0.002747,-1.040206,0.036797,0.0,0.153858,...,-0.141264,0.007341,-0.025387,0.622391,0.842747,0.522428,0.294357,-0.912054,-0.890022,0.007341
2,4,1.511155,0.049252,-0.086289,0.163039,-0.218781,0.949925,-0.101306,0.0,0.004241,...,0.085078,0.099556,0.245184,0.634878,1.207106,1.530919,0.88979,0.441128,0.639209,0.099556
3,0,1.494934,0.04395,0.12256,-0.187796,0.002747,-0.634909,0.097427,0.0,0.020852,...,0.083085,0.068076,0.105059,0.770151,1.160208,1.432972,0.815078,-0.949994,-1.097513,0.068076
4,4,1.520242,0.088495,-0.067969,0.08759,-0.108017,0.094279,-0.084464,0.0,-0.093104,...,-0.164151,0.138032,0.187535,1.040695,1.557034,1.393068,0.90111,0.595416,-0.465464,0.138032


### Save dataframe containing centroids for each cluster in CSV file

In [10]:
centroids_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,128,129,130,131,132,133,134,135,136,137
0,0.771107,-0.268435,-0.226011,0.238559,0.018441,0.075951,-0.22555,0.0,-0.246979,-0.219912,...,-0.075899,-0.057749,0.207522,0.470334,0.745,0.696714,0.388631,0.004522,0.076559,-0.057749
1,-1.102103,0.145864,0.142928,-0.164184,-0.060281,-0.13178,0.147034,0.0,0.140143,0.164009,...,-0.034561,-0.072284,-0.14627,-0.564439,-1.015255,-1.042111,-0.678334,-0.082073,-0.206403,-0.072284
2,0.308867,14.630244,12.708468,-12.632251,-0.307392,3.526626,12.426279,0.0,12.055426,6.738408,...,17.26633,16.465468,-15.920152,-14.648405,-6.207052,5.639617,14.179047,0.129011,2.400987,16.465468
3,-0.433466,-0.205392,-0.188017,0.190328,0.012463,0.000562,-0.186211,0.0,-0.182298,-0.167345,...,-0.088369,-0.145876,0.070936,-0.160095,-0.359982,-0.436225,-0.327053,0.046259,0.034987,-0.145876
4,0.619599,0.159901,0.128714,-0.107276,-0.001493,-0.070482,0.113729,0.0,0.188924,0.118497,...,-0.000323,0.081598,0.036394,0.323239,0.550815,0.599034,0.367991,0.052419,-0.049793,0.081598


In [11]:
centroids_df.columns=X_train_scaled.columns.values
centroids_df.head()

Unnamed: 0,mean,std,max,min,mean_change_abs,mean_change_rate,abs_max,abs_min,std_first_50000,std_last_50000,...,std_roll_mean_1000,max_roll_mean_1000,min_roll_mean_1000,q01_roll_mean_1000,q05_roll_mean_1000,q95_roll_mean_1000,q99_roll_mean_1000,av_change_abs_roll_mean_1000,av_change_rate_roll_mean_1000,abs_max_roll_mean_1000
0,0.771107,-0.268435,-0.226011,0.238559,0.018441,0.075951,-0.22555,0.0,-0.246979,-0.219912,...,-0.075899,-0.057749,0.207522,0.470334,0.745,0.696714,0.388631,0.004522,0.076559,-0.057749
1,-1.102103,0.145864,0.142928,-0.164184,-0.060281,-0.13178,0.147034,0.0,0.140143,0.164009,...,-0.034561,-0.072284,-0.14627,-0.564439,-1.015255,-1.042111,-0.678334,-0.082073,-0.206403,-0.072284
2,0.308867,14.630244,12.708468,-12.632251,-0.307392,3.526626,12.426279,0.0,12.055426,6.738408,...,17.26633,16.465468,-15.920152,-14.648405,-6.207052,5.639617,14.179047,0.129011,2.400987,16.465468
3,-0.433466,-0.205392,-0.188017,0.190328,0.012463,0.000562,-0.186211,0.0,-0.182298,-0.167345,...,-0.088369,-0.145876,0.070936,-0.160095,-0.359982,-0.436225,-0.327053,0.046259,0.034987,-0.145876
4,0.619599,0.159901,0.128714,-0.107276,-0.001493,-0.070482,0.113729,0.0,0.188924,0.118497,...,-0.000323,0.081598,0.036394,0.323239,0.550815,0.599034,0.367991,0.052419,-0.049793,0.081598


In [12]:
cluster_X_train_scaled_df.to_csv('KMeans-clusterLabels.csv', index=False)
centroids_df.to_csv('KMeans-centroids.csv', index=False)

## Determine distances of each row vector to each cluster centroid

In [13]:
# squared distance to cluster center
X_dist = kmeans.transform(X_train_scaled)**2
X_dist.shape

(4194, 5)

In [14]:
X_dist_df = pd.DataFrame(X_dist)
X_dist_df.head()

Unnamed: 0,0,1,2,3,4
0,68.602082,179.800386,13610.352848,112.953659,80.586628
1,45.362412,94.4603,13178.219807,60.491034,26.851937
2,103.026707,161.319004,12923.945127,137.479769,50.648234
3,57.894704,143.588088,13110.035159,100.840662,33.813174
4,68.912468,137.017878,12986.931488,106.032851,33.455538


In [15]:
cols=[]

for i in range(1,K+1):
    coli='Cluster'+str(i)
    cols.append(coli)
X_dist_df.columns=cols
X_dist_df.head()

Unnamed: 0,Cluster1,Cluster2,Cluster3,Cluster4,Cluster5
0,68.602082,179.800386,13610.352848,112.953659,80.586628
1,45.362412,94.4603,13178.219807,60.491034,26.851937
2,103.026707,161.319004,12923.945127,137.479769,50.648234
3,57.894704,143.588088,13110.035159,100.840662,33.813174
4,68.912468,137.017878,12986.931488,106.032851,33.455538


### Append the Cluster features to original feature vectors

In [16]:
cluster_X_train_scaled_df = pd.concat([X_dist_df, X_train_scaled], axis=1)
print(cluster_X_train_scaled_df.shape)

(4194, 143)


In [17]:
print(X_train_scaled.shape)

(4194, 138)


In [18]:
cluster_X_train_scaled_df.head()

Unnamed: 0,Cluster1,Cluster2,Cluster3,Cluster4,Cluster5,mean,std,max,min,mean_change_abs,...,std_roll_mean_1000,max_roll_mean_1000,min_roll_mean_1000,q01_roll_mean_1000,q05_roll_mean_1000,q95_roll_mean_1000,q99_roll_mean_1000,av_change_abs_roll_mean_1000,av_change_rate_roll_mean_1000,abs_max_roll_mean_1000
0,68.602082,179.800386,13610.352848,112.953659,80.586628,1.42414,-0.170214,-0.218194,0.193218,-1.32642,...,0.26847,-0.004742,0.178278,0.287332,0.965402,1.509153,0.885262,-0.6313,-1.832422,-0.004742
1,45.362412,94.4603,13178.219807,60.491034,26.851937,0.805716,0.004734,0.063936,-0.018037,0.002747,...,-0.141264,0.007341,-0.025387,0.622391,0.842747,0.522428,0.294357,-0.912054,-0.890022,0.007341
2,103.026707,161.319004,12923.945127,137.479769,50.648234,1.511155,0.049252,-0.086289,0.163039,-0.218781,...,0.085078,0.099556,0.245184,0.634878,1.207106,1.530919,0.88979,0.441128,0.639209,0.099556
3,57.894704,143.588088,13110.035159,100.840662,33.813174,1.494934,0.04395,0.12256,-0.187796,0.002747,...,0.083085,0.068076,0.105059,0.770151,1.160208,1.432972,0.815078,-0.949994,-1.097513,0.068076
4,68.912468,137.017878,12986.931488,106.032851,33.455538,1.520242,0.088495,-0.067969,0.08759,-0.108017,...,-0.164151,0.138032,0.187535,1.040695,1.557034,1.393068,0.90111,0.595416,-0.465464,0.138032


### perform box-cox on cluster features

In [19]:
from scipy.special import boxcox1p
cluster_X_train_scaled_boxcox_df = boxcox1p(cluster_X_train_scaled_df, .25)
cluster_X_train_scaled_boxcox_df.tail()

Unnamed: 0,Cluster1,Cluster2,Cluster3,Cluster4,Cluster5,mean,std,max,min,mean_change_abs,...,std_roll_mean_1000,max_roll_mean_1000,min_roll_mean_1000,q01_roll_mean_1000,q05_roll_mean_1000,q95_roll_mean_1000,q99_roll_mean_1000,av_change_abs_roll_mean_1000,av_change_rate_roll_mean_1000,abs_max_roll_mean_1000
4189,5.700415,8.432767,39.519507,4.51125,8.594247,-0.322513,-0.384364,-0.368704,0.297848,0.619007,...,-0.051135,-0.165811,0.101938,-0.031447,-0.269066,-0.328458,-0.276346,0.110208,0.142033,-0.165811
4190,6.442258,8.522524,39.495433,4.747096,9.026759,-0.49833,-0.354538,-0.339671,0.300886,-2.892724,...,-0.196137,-0.185091,0.07355,-0.177867,-0.356563,-0.631292,-0.549017,0.785211,0.666421,-0.185091
4191,4.948712,10.164577,39.709094,6.332237,9.270262,0.305771,-0.466787,-0.434115,0.363247,0.467884,...,-0.255423,-0.100695,0.184108,0.352422,0.44,0.150904,0.133565,0.542859,-1.068891,-0.100695
4192,5.801947,8.321436,39.487885,4.711226,8.338388,-0.230843,-0.316452,-0.283593,0.183913,-0.725809,...,-0.287705,-0.197245,0.193635,0.215222,0.062065,-0.371425,-0.319175,-0.040038,-0.025788,-0.197245
4193,5.255064,9.621167,39.667852,5.766534,9.144488,-0.004163,-0.440783,-0.428954,0.282561,0.386188,...,-0.262903,-0.159688,0.166348,0.18284,0.196477,-0.203623,-0.194867,-0.488671,0.775212,-0.159688
