## Creating a variable clustering algorithm that is similar to SAS varclus
1. PCA-based recursive decomposition
2. stopping criteria
3. plotting
4. OOP

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from decomposition.var_clus import VarClus, Cluster

In [2]:
# Fake data: a synthetic classification problem. The generator is seeded so
# that Restart & Run All reproduces exactly the same frame.
N_SAMPLES = 10000
N_FEATURES = 50
SEED = 0

raw_features, label_df = make_classification(
    n_samples=N_SAMPLES, n_features=N_FEATURES, random_state=SEED
)
scaled_features = scale(raw_features)

# Make sure data is z-scored. Both statistics are checked per column
# (axis=0) and asserted, so a failure stops the notebook instead of being
# a silently discarded mid-cell expression.
np.testing.assert_allclose(scaled_features.mean(axis=0), 0, atol=1e-8)
np.testing.assert_allclose(scaled_features.std(axis=0), 1, atol=1e-8)

# Wrap the scaled array in a DataFrame with named columns
# (feature_0 ... feature_<N_FEATURES - 1>).
feature_df = pd.DataFrame(
    scaled_features,
    columns=['feature_' + str(i) for i in range(N_FEATURES)]
)

In [None]:
# Some simple tests: wrap the whole feature frame in a single Cluster,
# leaving every other constructor argument at its default.
test_clus = Cluster(dataframe=feature_df)

In [None]:
# Run PCA on the test cluster, then display the types of its `pca_corr` and
# `pca_features` attributes (presumably populated by run_pca() -- confirm
# against the Cluster implementation).
test_clus.run_pca()
type(test_clus.pca_corr), type(test_clus.pca_features)

In [None]:
# Compare the result of return_all_leaves() with the cluster itself.
# NOTE(review): presumably a cluster that has not been split is its own only
# leaf; the displayed value depends on what return_all_leaves() returns and
# on how Cluster defines equality -- confirm.
test_clus.return_all_leaves() == test_clus

In [3]:
# VarClus: instantiate with no arguments, so its defaults apply.
test_varclus = VarClus()

In [4]:
# Decompose the full feature frame with VarClus.
# NOTE(review): the recorded output of this cell is
# "TypeError: unhashable type: 'list'", i.e. the call currently fails.
# The cause is not visible in this notebook -- investigate in
# decomposition.var_clus before relying on this cell.
test_varclus.decompose(feature_df)

TypeError: unhashable type: 'list'

In [None]:
# Logic check: build a parent cluster over the full feature frame and run
# PCA on it. The cells below read its `pca_corr`, `dataframe` and `n_split`.
# NOTE(review): the second positional argument (2) is presumably `n_split`
# -- confirm against Cluster's signature.
cluster = Cluster(feature_df, 2)
cluster.run_pca()

In [None]:
# Put the parent's `pca_corr` entries side by side, one column per entry.
corr_table = pd.concat(cluster.pca_corr, axis=1)

# Row-wise maximum, then a boolean mask flagging, for every row, the
# column(s) that attain that maximum.
corr_max = corr_table.max(axis=1)
cluster_membership = corr_table.apply(lambda column: column == corr_max)

# One child Cluster per mask column, holding the row labels flagged True in
# that column. Every child shares the parent's dataframe and n_split and
# records the parent in `parents`.
child_clusters = [
    Cluster(
        dataframe=cluster.dataframe,
        n_split=cluster.n_split,
        features=[
            feature
            for feature, is_member
            in cluster_membership[component].to_dict().items()
            if is_member
        ],
        parents=[cluster],
    )
    for component in cluster_membership
]

In [None]:
# Number of features held by each of the first two child clusters.
len(child_clusters[0].features), len(child_clusters[1].features)

In [None]:
# Shape of the dataframe carried by each of the first two child clusters.
# NOTE(review): presumably already restricted to the child's own features --
# verify against Cluster.
child_clusters[0].dataframe.shape, child_clusters[1].dataframe.shape

In [None]:
# Run PCA on every child cluster that does not yet carry a truthy `pca`
# attribute.
# The loop variable is deliberately `child`, not `cluster`: a top-level
# `for cluster in ...` would rebind the notebook-level parent `cluster`
# created in the logic-check cell, so re-running the membership cell after
# this one would silently split the last child instead of the parent.
for child in child_clusters:
    if not getattr(child, 'pca', False):
        child.run_pca()

In [None]:
# Type and shape of the first entry in the first child's `pca_features`.
type(child_clusters[0].pca_features[0]), child_clusters[0].pca_features[0].shape

In [None]:
# Stitch the children's dataframes together column-wise into one frame.
full_dataframe = pd.concat([child.dataframe for child in child_clusters], axis=1)

In [None]:
# Shape of the combined frame built from all child dataframes.
full_dataframe.shape

In [None]:
# Correlate every column of the combined frame with the first child's first
# PCA feature. The result is bound to `_`, so nothing is displayed.
# NOTE(review): `_` is also IPython's "last output" variable; assigning to it
# here may shadow that mechanism -- consider a descriptive name instead.
_ = full_dataframe.corrwith(child_clusters[0].pca_features[0])

In [None]:

# One column per child cluster: the combined frame dotted with that child's
# first PCA feature.
# NOTE(review): this rebinds `corr_table` (built earlier from
# cluster.pca_corr) and uses .dot where the previous cell uses .corrwith --
# confirm that a raw dot product, not a correlation, is what is wanted here.
corr_table = pd.concat(
    objs=[full_dataframe.dot(child.pca_features[0]) for child in child_clusters],
    axis=1,
)