## Creating a variable clustering algorithm that is similar to SAS varclus
1. PCA-based recursive decomposition
2. stopping criteria
3. plotting
4. OOP

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from decomposition.var_clus import VarClus, Cluster

In [2]:
# Fake data
# The seed is fixed so that "Restart & Run All" regenerates the same features;
# without it every run produces different clusters and the stored outputs
# further down cannot be reproduced.
N_SAMPLES = 10000
N_FEATURES = 50
SEED = 0

feature_array, label_df = make_classification(
    n_samples=N_SAMPLES, n_features=N_FEATURES, random_state=SEED)
feature_array = scale(feature_array)

# Make sure data is z-scored: every column must have mean 0 and std 1.
# The check is done per column (axis=0) and asserted, so a failure is loud
# instead of being a discarded mid-cell expression.
np.testing.assert_allclose(feature_array.mean(axis=0), 0, atol=1e-8)
np.testing.assert_allclose(feature_array.std(axis=0), 1, atol=1e-8)

# Wrap in a DataFrame so features can be addressed by name ('feature_0', ...).
feature_df = pd.DataFrame(
    feature_array,
    columns=['feature_' + str(i) for i in range(N_FEATURES)])

In [3]:
# VarClus instance used for the step-by-step walkthrough and the full
# decomposition below.
# NOTE(review): max_eigenvalue is presumably the stopping threshold on a
# cluster's second eigenvalue (cf. SAS VARCLUS MAXEIGEN) -- confirm against
# decomposition.var_clus.
test_varclus = VarClus(max_eigenvalue=2)

In [4]:
# Root cluster built from the full feature table (all 50 columns); the next
# cell splits it by hand to illustrate a single decomposition step.
cluster = Cluster(feature_df)

In [5]:
# Run the PCA on the root cluster and place the entries of `pca_corr` side by
# side, one column per entry (presumably each feature's correlation with one
# principal component -- confirm in decomposition.var_clus).
cluster.run_pca()
corr_table = pd.concat(cluster.pca_corr, axis=1)

# For every feature (row) find its largest value across the columns, then flag
# per column the rows that attain that maximum.
corr_max = corr_table.max(axis=1)
cluster_membership = corr_table.eq(corr_max, axis=0)

# Build one child cluster per column, holding exactly the features flagged
# for that column; each child records the root cluster as its parent.
child_clusters = [
    Cluster(
        dataframe=cluster.dataframe,
        n_split=cluster.n_split,
        features=cluster_membership.index[
            cluster_membership[component].values].tolist(),
        parents=[cluster],
    )
    for component in cluster_membership
]

In [6]:
# Show the child clusters together with the total number of features they hold.
# Summing over every child (rather than indexing [0] and [1]) keeps the count
# correct for any number of child clusters; it should equal the 50 input columns.
child_clusters, sum(len(child.features) for child in child_clusters)

([<decomposition.var_clus.Cluster at 0x2284b79fc88>,
  <decomposition.var_clus.Cluster at 0x2284b79ff28>],
 50)

In [7]:
# Features placed in the first child cluster by the initial split.
child_clusters[0].features

['feature_0',
 'feature_1',
 'feature_2',
 'feature_3',
 'feature_6',
 'feature_8',
 'feature_9',
 'feature_10',
 'feature_11',
 'feature_14',
 'feature_16',
 'feature_17',
 'feature_19',
 'feature_20',
 'feature_21',
 'feature_24',
 'feature_25',
 'feature_32',
 'feature_33',
 'feature_38',
 'feature_39',
 'feature_40',
 'feature_42',
 'feature_43',
 'feature_46',
 'feature_47',
 'feature_49']

In [8]:
# Refine the initial split with VarClus.nearest_component_sorting.
# The result is bound back to `child_clusters`, so this cell is not idempotent:
# re-running it passes the already-sorted clusters through the method again.
child_clusters = test_varclus.nearest_component_sorting(child_clusters)

In [9]:
# Re-check after nearest-component sorting: the child clusters and the total
# number of features they hold. Summing over every child (rather than indexing
# [0] and [1]) keeps the count correct for any number of child clusters; the
# total should still equal the 50 input columns.
child_clusters, sum(len(child.features) for child in child_clusters)

([<decomposition.var_clus.Cluster at 0x2284b79f208>,
  <decomposition.var_clus.Cluster at 0x2284b79f438>],
 50)

In [10]:
# Features of the first child cluster after nearest-component sorting.
child_clusters[0].features

['feature_0',
 'feature_1',
 'feature_3',
 'feature_6',
 'feature_9',
 'feature_10',
 'feature_11',
 'feature_14',
 'feature_16',
 'feature_17',
 'feature_19',
 'feature_24',
 'feature_25',
 'feature_32',
 'feature_33',
 'feature_39',
 'feature_43',
 'feature_49',
 'feature_12',
 'feature_23',
 'feature_28',
 'feature_29',
 'feature_34',
 'feature_35',
 'feature_37',
 'feature_41',
 'feature_44',
 'feature_48']

In [16]:
# Re-assign features between the child clusters with VarClus.reassign_features_pca;
# the method prints one line per feature it moves.
# The result is bound back to `child_clusters`, so re-running this cell feeds the
# previous result through the method again.
# NOTE(review): the stored outputs show the feature total dropping from 50 to 49
# after this step ('feature_45' is absent from the sorted listing further down).
# Either the method drops a feature or this cell was executed several times
# (the execution count jumps from 10 to 16) -- investigate.
child_clusters = test_varclus.reassign_features_pca(child_clusters)

feature feature_0 was re-assigned
feature feature_16 was re-assigned
feature feature_17 was re-assigned
feature feature_37 was re-assigned
feature feature_41 was re-assigned
feature feature_44 was re-assigned


In [18]:
# Re-check after PCA-based re-assignment: the child clusters and the total number
# of features they hold. Summing over every child (rather than indexing [0] and
# [1]) keeps the count correct for any number of child clusters.
# NOTE(review): the total is expected to equal the 50 input columns, but the
# stored output reads 49 -- one feature has gone missing by this point.
child_clusters, sum(len(child.features) for child in child_clusters)

([<decomposition.var_clus.Cluster at 0x2284b79f208>,
  <decomposition.var_clus.Cluster at 0x2284b8e4908>],
 49)

In [19]:
# Flatten the per-cluster feature lists into one sorted list to see which
# features are still assigned to some cluster. The sort is a plain string sort,
# so 'feature_10' comes before 'feature_2'.
# The loop variable is named `child` so it does not shadow (or, under Python 2
# scoping, overwrite) the notebook-level `cluster` object.
# NOTE(review): the stored output lists 49 names and 'feature_45' is absent.
all_features = sorted(
    feature for child in child_clusters for feature in child.features)
all_features

['feature_0',
 'feature_1',
 'feature_10',
 'feature_11',
 'feature_12',
 'feature_13',
 'feature_14',
 'feature_15',
 'feature_16',
 'feature_17',
 'feature_18',
 'feature_19',
 'feature_2',
 'feature_20',
 'feature_21',
 'feature_22',
 'feature_23',
 'feature_24',
 'feature_25',
 'feature_26',
 'feature_27',
 'feature_28',
 'feature_29',
 'feature_3',
 'feature_30',
 'feature_31',
 'feature_32',
 'feature_33',
 'feature_34',
 'feature_35',
 'feature_36',
 'feature_37',
 'feature_38',
 'feature_39',
 'feature_4',
 'feature_40',
 'feature_41',
 'feature_42',
 'feature_43',
 'feature_44',
 'feature_46',
 'feature_47',
 'feature_48',
 'feature_49',
 'feature_5',
 'feature_6',
 'feature_7',
 'feature_8',
 'feature_9']

In [13]:
# Run the full VarClus decomposition on the feature table; the following cells
# read the result back from `test_varclus.cluster`.
# NOTE(review): this cell's execution count (13) is lower than that of the cells
# preceding it in the notebook, i.e. it was run out of order -- verify with
# Restart & Run All.
test_varclus.decompose(feature_df)

In [20]:
# Cluster object stored on the VarClus instance (presumably the root cluster set
# by decompose() -- confirm in decomposition.var_clus).
test_varclus.cluster

<decomposition.var_clus.Cluster at 0x2284b8e44e0>

In [22]:
# Explained variance (eigenvalues) of the PCA fitted on that cluster.
# NOTE(review): the stored second value (about 1.62) is below max_eigenvalue=2;
# if max_eigenvalue is the split threshold, the root would not be split -- confirm.
test_varclus.cluster.pca.explained_variance_

array([ 2.39695906,  1.62000409])