## Creating a variable clustering algorithm that is similar to SAS varclus
1. PCA-based recursive decomposition
2. stopping criteria
3. plotting
4. OOP

In [43]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from decomposition.var_clus import VarClus, Cluster

In [47]:
# IPython help syntax: shows the docstring of make_classification.
make_classification?

In [55]:
# Fake data: build `num_clusters` independent groups of features.
# Every group is generated with a single informative feature and the remaining
# features redundant, so features inside one group are expected to be strongly
# correlated with each other.
df_list = []
num_clusters = 5
features_per_cluster = 20
random_seed = 0  # base seed; each group adds its own offset so the groups differ

for i in range(num_clusters):
    block_features, label_df = make_classification(
        n_samples=int(1e4),
        n_features=features_per_cluster,
        n_informative=1,
        n_redundant=features_per_cluster - 1,
        n_clusters_per_class=1,
        random_state=random_seed + i,
    )
    block_features = scale(block_features)

    # Make sure data is z-scored: per-column mean ~0 and standard deviation ~1.
    assert np.allclose(block_features.mean(axis=0), 0.0, atol=1e-6)
    assert np.allclose(block_features.std(axis=0), 1.0, atol=1e-6)

    # Column names follow the pattern fs<group>_<feature>, e.g. fs0_0 ... fs0_19.
    df_list.append(
        pd.DataFrame(
            block_features,
            columns=['fs{}_{}'.format(i, j) for j in range(features_per_cluster)],
        )
    )

# Side-by-side concatenation: one wide table holding all groups.
feature_df = pd.concat(df_list, axis=1)

In [56]:
# VarClus
# Configure the variable-clustering object imported from decomposition.var_clus.
# NOTE(review): max_eigenvalue is presumably the eigenvalue threshold that decides
# whether a cluster gets split further, and n_split the number of child clusters
# per split — confirm against the VarClus implementation.
test_varclus = VarClus(max_eigenvalue=2, n_split=2)

In [57]:
# Run the decomposition on the synthetic feature table and keep the return value in `a`.
# NOTE(review): the saved output of this cell is a long progress log that ends in
# KeyboardInterrupt, i.e. this run was stopped by hand before it completed.
a = test_varclus.decompose(feature_df)

decomposing cluster with hash 6762036793146285556
phase #1: NCS
phase #2: Search
there are 0 remaining clusters
assessing feature fs0_2
current EV is 24.024107855542866, new EV is 25.02393954386799
feature fs0_2 was re-assigned
child_clusters[i] has 61 features and child_clusters[j] has 39 features
there are 0 remaining clusters
assessing feature fs0_3
current EV is 25.02393954386799, new EV is 26.023884071508405
feature fs0_3 was re-assigned
child_clusters[i] has 60 features and child_clusters[j] has 40 features
there are 0 remaining clusters
assessing feature fs0_5
current EV is 26.023884071508405, new EV is 27.02388102904962
feature fs0_5 was re-assigned
child_clusters[i] has 59 features and child_clusters[j] has 41 features
there are 0 remaining clusters
assessing feature fs0_6
current EV is 27.02388102904962, new EV is 28.02390724676838
feature fs0_6 was re-assigned
child_clusters[i] has 58 features and child_clusters[j] has 42 features
there are 0 remaining clusters
assessing fea

assessing feature fs1_19
current EV is 34.024299886315646, new EV is 34.02404913649041
there are 0 remaining clusters
assessing feature fs1_0
current EV is 34.024299886315646, new EV is 34.025097371833766
feature fs1_0 was re-assigned
child_clusters[i] has 47 features and child_clusters[j] has 53 features
there are 0 remaining clusters
assessing feature fs1_2
current EV is 34.025097371833766, new EV is 34.07625780071838
feature fs1_2 was re-assigned
child_clusters[i] has 46 features and child_clusters[j] has 54 features
there are 0 remaining clusters
assessing feature fs1_3
current EV is 34.07625780071838, new EV is 35.00987226817405
feature fs1_3 was re-assigned
child_clusters[i] has 45 features and child_clusters[j] has 55 features
there are 0 remaining clusters
assessing feature fs1_9
current EV is 35.00987226817405, new EV is 36.007255822274814
feature fs1_9 was re-assigned
child_clusters[i] has 44 features and child_clusters[j] has 56 features
there are 0 remaining clusters
assess

assessing feature fs0_15
current EV is 40.16855624259084, new EV is 39.16847990285187
there are 0 remaining clusters
assessing feature fs0_17
current EV is 40.16855624259084, new EV is 39.16847990285187
decomposing cluster with hash 5325827380228092502
phase #1: NCS
phase #2: Search
there are 0 remaining clusters
assessing feature fs2_3
current EV is 22.04442965538724, new EV is 23.022826229074518
feature fs2_3 was re-assigned
child_clusters[i] has 51 features and child_clusters[j] has 29 features
there are 0 remaining clusters
assessing feature fs2_5
current EV is 23.022826229074518, new EV is 24.02171055645406
feature fs2_5 was re-assigned
child_clusters[i] has 50 features and child_clusters[j] has 30 features
there are 0 remaining clusters
assessing feature fs2_6
current EV is 24.02171055645406, new EV is 25.02113907564499
feature fs2_6 was re-assigned
child_clusters[i] has 49 features and child_clusters[j] has 31 features
there are 0 remaining clusters
assessing feature fs2_7
curre

assessing feature fs1_12
current EV is 36.007230925476414, new EV is 37.00642402877521
feature fs1_12 was re-assigned
child_clusters[i] has 35 features and child_clusters[j] has 45 features
there are 0 remaining clusters
assessing feature fs1_13
current EV is 37.00642402877521, new EV is 38.00606541545143
feature fs1_13 was re-assigned
child_clusters[i] has 34 features and child_clusters[j] has 46 features
there are 0 remaining clusters
assessing feature fs1_14
current EV is 38.00606541545143, new EV is 39.00588728947713
feature fs1_14 was re-assigned
child_clusters[i] has 33 features and child_clusters[j] has 47 features
there are 0 remaining clusters
assessing feature fs1_18
current EV is 39.00588728947713, new EV is 40.00579989749306
feature fs1_18 was re-assigned
child_clusters[i] has 32 features and child_clusters[j] has 48 features
there are 0 remaining clusters
assessing feature fs2_0
current EV is 40.00579989749306, new EV is 39.005718622485645
there are 0 remaining clusters
as

assessing feature fs4_17
current EV is 26.018697753070462, new EV is 26.00329055658838
there are 0 remaining clusters
assessing feature fs4_18
current EV is 26.018697753070462, new EV is 26.003290556588354
there are 0 remaining clusters
assessing feature fs4_19
current EV is 26.018697753070462, new EV is 26.003290556588365
there are 0 remaining clusters
assessing feature fs1_0
current EV is 26.018697753070462, new EV is 27.018731950061778
feature fs1_0 was re-assigned
child_clusters[i] has 35 features and child_clusters[j] has 19 features
there are 0 remaining clusters
assessing feature fs1_2
current EV is 27.018731950061778, new EV is 28.018782733569804
feature fs1_2 was re-assigned
child_clusters[i] has 34 features and child_clusters[j] has 20 features
there are 0 remaining clusters
assessing feature fs1_3
current EV is 28.018782733569804, new EV is 29.018844535225625
feature fs1_3 was re-assigned
child_clusters[i] has 33 features and child_clusters[j] has 21 features
there are 0 rem

KeyboardInterrupt: 

In [74]:
# Hash of one cluster taken from the third nesting level of the final structure.
# The built-in hash() is the idiomatic way to invoke an object's __hash__.
hash(test_varclus.final_cluster_structure[0][0][0])

8682865062666898867

In [78]:
# Pick one cluster out of the nested final structure; the following cells walk
# through a single split of this cluster by hand.
cluster = test_varclus.final_cluster_structure[0][0][0]

In [79]:
# Manually reproduce one split of `cluster` into child clusters.

# Run the PCA only when the cluster does not carry one yet. The attribute is named
# `pca` (it is read as `.pca.explained_variance_` elsewhere in this notebook); the
# misspelling 'pac' never matches an attribute, which forces a re-run every time.
if not getattr(cluster, 'pca', False):
    cluster.run_pca()

# One column per entry of pca_corr; the row labels are the names that are passed
# on as `features` to the child clusters below.
corr_table = pd.concat(cluster.pca_corr, axis=1)
corr_max = corr_table.max(axis=1)
# Boolean table: True where a column holds the row-wise maximum.
cluster_membership = corr_table.eq(corr_max, axis=0)

# One child cluster per column, holding the rows flagged True in that column.
child_clusters = [
    Cluster(dataframe=cluster.dataframe,
            n_split=cluster.n_split,
            features=[feature for (feature, condition)
                      in cluster_membership[membership].to_dict().items()
                      if condition],
            parents=[cluster])
    for membership in cluster_membership
]

In [97]:
# Start the manual sorting experiment from a shallow copy, so that list-level
# changes to new_child_clusters leave the baseline list child_clusters untouched
# (the Cluster objects themselves are still shared between both lists).
new_child_clusters = list(child_clusters)

In [109]:
# Apply one nearest-component-sorting pass to the current child clusters. The call
# returns the clusters together with a flag (presumably "something was re-assigned"
# — confirm against VarClus); both are rebound here.
(new_child_clusters, change_flag) = VarClus.nearest_component_sorting_once(
    new_child_clusters
)

# Show the flag next to the feature lists of the first two child clusters.
change_flag, new_child_clusters[0].features, new_child_clusters[1].features

(True,
 ['fs4_0', 'fs4_4', 'fs4_5', 'fs4_10', 'fs4_11', 'fs4_16'],
 ['fs3_1',
  'fs3_2',
  'fs3_3',
  'fs3_4',
  'fs3_5',
  'fs3_6',
  'fs3_7',
  'fs3_8',
  'fs3_9',
  'fs3_10',
  'fs3_12',
  'fs3_15',
  'fs3_17',
  'fs3_19',
  'fs4_1',
  'fs4_2',
  'fs4_3',
  'fs4_6',
  'fs4_7',
  'fs4_8',
  'fs4_9',
  'fs4_12',
  'fs4_13',
  'fs4_14',
  'fs4_15',
  'fs4_17',
  'fs4_18',
  'fs4_19'])

In [114]:
# Check if clusters are unchanged.
# Each cluster is reduced to a sorted tuple of its feature names, so the comparison
# ignores both the order of the clusters and the order of features inside a cluster.
old_cluster_features = {
    tuple(sorted(child.features)) for child in child_clusters
}
new_cluster_features = {
    tuple(sorted(child.features)) for child in new_child_clusters
}
old_cluster_features == new_cluster_features

True

In [128]:


# Toy check of an order-insensitive comparison between two groupings:
# every inner list becomes a sorted tuple and the outer list becomes a set.
a = [['a', 'b'], ['c']]
b = [['c'], ['b', 'a']]

# sorted() returns a new list, so `a` and `b` keep their original element order
# (list.sort() would reorder the inner lists in place and returns None).
test_a = {tuple(sorted(item)) for item in a}
test_b = {tuple(sorted(item)) for item in b}

test_b

{('a', 'b'), ('c',)}

In [54]:
# Inspect the feature names held by the second entry of the final cluster structure.
test_varclus.final_cluster_structure[1].features

['fs1_0',
 'fs1_1',
 'fs1_3',
 'fs1_5',
 'fs1_9',
 'fs1_11',
 'fs1_14',
 'fs1_18',
 'fs1_19',
 'fs1_2',
 'fs1_4',
 'fs1_6',
 'fs1_7',
 'fs1_8',
 'fs1_10',
 'fs1_12',
 'fs1_13',
 'fs1_15',
 'fs1_16',
 'fs1_17']

In [None]:
# Show explained_variance_ of the PCA object stored on the first entry of the
# final cluster structure.
test_varclus.final_cluster_structure[0].pca.explained_variance_

In [None]:
# Print the explained variance of every cluster in the two-level final structure.
# Descriptive loop names keep the loop from overwriting the notebook-level
# variables `a` and `b` assigned in earlier cells.
for cluster_group in test_varclus.final_cluster_structure:
    for member in cluster_group:
        print(member.pca.explained_variance_)