## Creating a variable clustering algorithm that is similar to SAS varclus
1. PCA-based recursive decomposition
2. stopping criteria
3. plotting
4. OOP

In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from decomposition.var_clus import VarClus, Cluster

In [3]:
# Fake data: 10,000 samples x 50 features from a synthetic classification problem.
# random_state pins the draw so a restarted kernel reproduces the same data.
raw_features, label_df = make_classification(n_samples=int(1e4), n_features=50,
                                             random_state=0)

# z-score every column and wrap the result in a DataFrame with named features
feature_df = pd.DataFrame(scale(raw_features),
                          columns=['feature_' + str(i) for i in range(50)])

# Make sure data is z-scored: each column must have mean ~0 and std ~1.
# ddof=0 matches the population standard deviation that `scale` normalises by
# (pandas defaults to ddof=1). Asserting keeps the check from being silently
# discarded as a mid-cell expression.
assert np.allclose(feature_df.mean(axis=0), 0), "feature means are not ~0"
assert np.allclose(feature_df.std(axis=0, ddof=0), 1), "feature stds are not ~1"

In [13]:
# Some simple tests: build a Cluster directly, passing the feature DataFrame
# as its first positional argument.
test_clus = Cluster(feature_df)

In [10]:
# Inspect the type of `test_clus.dataframe.shape`; the recorded output is `tuple`.
# NOTE(review): the execution counts show this cell (In [10]) was run before the
# recorded run of the cell that defines `test_clus` (In [13]) — re-run in order.
type(test_clus.dataframe.shape)

tuple

In [4]:
# VarClus: instantiate with no arguments, so whatever defaults the class
# defines are used.
test_varclus = VarClus()

In [5]:
# Run the decomposition on the feature DataFrame.
# NOTE(review): the recorded run of this cell printed several Cluster objects and
# then failed with "TypeError: unhashable type: 'Index'". Presumably a pandas
# Index is being hashed somewhere inside `decomposition.var_clus` — confirm there.
test_varclus.decompose(feature_df)

<decomposition.var_clus.Cluster object at 0x000001E4C4883E80>
[<decomposition.var_clus.Cluster object at 0x000001E4C4883E80>]
<decomposition.var_clus.Cluster object at 0x000001E4C4883CC0>
[<decomposition.var_clus.Cluster object at 0x000001E4C4883CC0>]


TypeError: unhashable type: 'Index'

In [6]:
# Manually perform one split of a root cluster into child clusters.
cluster = Cluster(feature_df)

# Run PCA only when its correlations are not available yet. The guard keys on
# `pca_corr` because that is the attribute read immediately below; the previous
# lookup of 'pac' looks like a typo, in which case PCA was re-run every time.
if not getattr(cluster, 'pca_corr', None):
    cluster.run_pca()

# Rows are indexed by feature name (the index labels become `features` below);
# presumably there is one column per principal component — TODO confirm.
corr_table = pd.concat(cluster.pca_corr, axis=1)
corr_max = corr_table.max(axis=1)
# NOTE(review): the maximum is taken on the values as stored; if `pca_corr` holds
# signed correlations, strongly negative ones never win — confirm this is intended.
# Boolean mask: True where a column holds that row's maximum value.
cluster_membership = corr_table.apply(lambda x: x == corr_max)

# One child per column of the mask, owning the features whose row maximum falls
# in that column. Features are passed as a plain list of labels.
child_clusters = [
    Cluster(dataframe=cluster.dataframe,
            n_split=cluster.n_split,
            features=cluster_membership.index[cluster_membership[membership]].tolist(),
            parents=[cluster])
    for membership in cluster_membership
]

In [16]:
# Display the list of child clusters (one repr per Cluster object)
child_clusters

[<decomposition.var_clus.Cluster at 0x2d6d69cf2b0>,
 <decomposition.var_clus.Cluster at 0x2d6d69cf2e8>]

In [17]:
# Scaffold for reassigning features between child clusters: for every feature of
# every child, walk the sibling clusters and collect the ones left over.
# Nothing is done with `remaining_clusters` yet.
for child_cluster in child_clusters:
    # Siblings of the current cluster, kept in `child_clusters` order. A list
    # comprehension is used instead of a set difference so the order does not
    # depend on object hashes and stays the same from run to run.
    other_clusters = [c for c in child_clusters if c is not child_cluster]

    for feature in child_cluster.features:
        for other_cluster in other_clusters:
            # Siblings other than both the owning cluster and `other_cluster`
            remaining_clusters = [c for c in other_clusters
                                  if c is not other_cluster]

In [19]:
# Show the child clusters as a set, confirming Cluster objects are hashable
{*child_clusters}

{<decomposition.var_clus.Cluster at 0x1e4c513de48>,
 <decomposition.var_clus.Cluster at 0x1e4c513dcf8>}

In [35]:
# Keep a handle on the root cluster's feature collection for inspection
a = cluster.features

In [39]:
# List every column label of the feature DataFrame
feature_df.columns.tolist()

['feature_0',
 'feature_1',
 'feature_2',
 'feature_3',
 'feature_4',
 'feature_5',
 'feature_6',
 'feature_7',
 'feature_8',
 'feature_9',
 'feature_10',
 'feature_11',
 'feature_12',
 'feature_13',
 'feature_14',
 'feature_15',
 'feature_16',
 'feature_17',
 'feature_18',
 'feature_19',
 'feature_20',
 'feature_21',
 'feature_22',
 'feature_23',
 'feature_24',
 'feature_25',
 'feature_26',
 'feature_27',
 'feature_28',
 'feature_29',
 'feature_30',
 'feature_31',
 'feature_32',
 'feature_33',
 'feature_34',
 'feature_35',
 'feature_36',
 'feature_37',
 'feature_38',
 'feature_39',
 'feature_40',
 'feature_41',
 'feature_42',
 'feature_43',
 'feature_44',
 'feature_45',
 'feature_46',
 'feature_47',
 'feature_48',
 'feature_49']