## Creating a variable clustering algorithm that is similar to SAS varclus
1. PCA-based recursive decomposition
2. stopping criteria
3. plotting
4. OOP

In [5]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from decomposition.var_clus import VarClus, Cluster

In [6]:
# Fake data: `num_clusters` independent groups of mutually redundant features.
SEED = 42  # fixed seed so a fresh-kernel run regenerates the same data
n_features_per_cluster = 20
df_list = []
num_clusters = 5

for i in range(num_clusters):
    # One informative feature plus redundant ones, which scikit-learn builds as
    # linear combinations of the informative feature(s).
    cluster_features, label_df = make_classification(
        n_samples=int(1e4),
        n_features=n_features_per_cluster,
        n_informative=1,
        n_redundant=n_features_per_cluster - 1,
        n_clusters_per_class=1,
        random_state=SEED + i,  # distinct but reproducible draw per group
    )
    cluster_features = scale(cluster_features)

    # Make sure data is z-scored: assert per-column mean ~0 and std ~1.
    # (A bare expression here would be discarded inside the loop.)
    np.testing.assert_allclose(cluster_features.mean(axis=0), 0, atol=1e-8)
    np.testing.assert_allclose(cluster_features.std(axis=0), 1, atol=1e-8)

    # Column names encode the group index and the feature index: fs<i>_<j>.
    df_list.append(
        pd.DataFrame(
            cluster_features,
            columns=['fs{}_{}'.format(i, j) for j in range(n_features_per_cluster)],
        )
    )

# Side-by-side concatenation: one row per sample, one column per feature.
feature_df = pd.concat(df_list, axis=1)

In [7]:
# VarClus: build the clustering object used by the cells below.
# NOTE(review): `max_eigenvalue` and `n_split` are passed through as-is; their
# exact semantics live in decomposition.var_clus -- confirm there.
test_varclus = VarClus(
    max_eigenvalue=2,
    n_split=2,
)

In [8]:
# Run the decomposition on the synthetic feature frame. The call prints a
# progress log for every feature it assesses (see the output below).
# The return value is kept in `a`; it is not used by the cells visible here.
a = test_varclus.decompose(feature_df)

decomposing cluster with hash 1855743091441939594
phase #1: NCS
phase #2: Search
there are 0 remaining clusters
assessing feature fs0_0
current EV is 40.68117102515575, new EV is 40.46148347794313
there are 0 remaining clusters
assessing feature fs0_1
current EV is 40.68117102515575, new EV is 40.461483477943105
there are 0 remaining clusters
assessing feature fs0_10
current EV is 40.68117102515575, new EV is 40.46148347794312
there are 0 remaining clusters
assessing feature fs0_11
current EV is 40.68117102515575, new EV is 40.46148347794312
there are 0 remaining clusters
assessing feature fs0_12
current EV is 40.68117102515575, new EV is 40.46148347794313
there are 0 remaining clusters
assessing feature fs0_13
current EV is 40.68117102515575, new EV is 40.46148347794313
there are 0 remaining clusters
assessing feature fs0_14
current EV is 40.68117102515575, new EV is 40.461483477943105
there are 0 remaining clusters
assessing feature fs0_15
current EV is 40.68117102515575, new EV is 4

current EV is 40.68117102515575, new EV is 40.559575415483536
there are 0 remaining clusters
assessing feature fs1_19
current EV is 40.68117102515575, new EV is 40.55957541548352
there are 0 remaining clusters
assessing feature fs1_2
current EV is 40.68117102515575, new EV is 40.559575415483536
there are 0 remaining clusters
assessing feature fs1_3
current EV is 40.68117102515575, new EV is 40.55957541548352
there are 0 remaining clusters
assessing feature fs1_4
current EV is 40.68117102515575, new EV is 40.55957541548351
there are 0 remaining clusters
assessing feature fs1_5
current EV is 40.68117102515575, new EV is 40.55957541548354
there are 0 remaining clusters
assessing feature fs1_6
current EV is 40.68117102515575, new EV is 40.559575415483536
there are 0 remaining clusters
assessing feature fs1_7
current EV is 40.68117102515575, new EV is 40.559575415483536
there are 0 remaining clusters
assessing feature fs1_8
current EV is 40.68117102515575, new EV is 40.5595754154835
there a

current EV is 40.36237290956861, new EV is 39.36251958220424
there are 0 remaining clusters
assessing feature fs4_10
current EV is 40.36237290956861, new EV is 39.36251958220423
there are 0 remaining clusters
assessing feature fs4_11
current EV is 40.36237290956861, new EV is 39.36251958220423
there are 0 remaining clusters
assessing feature fs4_12
current EV is 40.36237290956861, new EV is 39.362519582204236
there are 0 remaining clusters
assessing feature fs4_13
current EV is 40.36237290956861, new EV is 39.362519582204236
there are 0 remaining clusters
assessing feature fs4_14
current EV is 40.36237290956861, new EV is 39.362519582204236
there are 0 remaining clusters
assessing feature fs4_15
current EV is 40.36237290956861, new EV is 39.36251958220425
there are 0 remaining clusters
assessing feature fs4_16
current EV is 40.36237290956861, new EV is 39.36251958220423
there are 0 remaining clusters
assessing feature fs4_17
current EV is 40.36237290956861, new EV is 39.36251958220422


current EV is 40.00400040004, new EV is 39.003951893863444
there are 0 remaining clusters
assessing feature fs1_19
current EV is 40.00400040004, new EV is 39.00395189386344
there are 0 remaining clusters
assessing feature fs1_2
current EV is 40.00400040004, new EV is 39.00395189386345
there are 0 remaining clusters
assessing feature fs1_3
current EV is 40.00400040004, new EV is 39.00395189386343
there are 0 remaining clusters
assessing feature fs1_4
current EV is 40.00400040004, new EV is 39.00395189386345
there are 0 remaining clusters
assessing feature fs1_5
current EV is 40.00400040004, new EV is 39.00395189386344
there are 0 remaining clusters
assessing feature fs1_6
current EV is 40.00400040004, new EV is 39.00395189386344
there are 0 remaining clusters
assessing feature fs1_7
current EV is 40.00400040004, new EV is 39.00395189386343
there are 0 remaining clusters
assessing feature fs1_8
current EV is 40.00400040004, new EV is 39.003951893863444
there are 0 remaining clusters
asse

In [14]:
# Inspect the PCA fitted on the first entry of the final cluster structure.
# NOTE(review): the nesting of `final_cluster_structure` is defined by VarClus;
# the [0][0][0] path is taken as-is -- confirm what each level represents.
first_cluster = test_varclus.final_cluster_structure[0][0][0]
first_cluster.pca.explained_variance_

array([  2.00020002e+01,   1.41428795e-32])