## Creating a variable clustering algorithm that is similar to SAS varclus
1. PCA-based recursive decomposition
2. stopping criteria
3. plotting
4. OOP

In [5]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from decomposition.var_clus import VarClus, Cluster

In [6]:
# Fake data: a synthetic classification set, seeded so that a fresh
# "Restart & Run All" reproduces exactly the same features.
N_SAMPLES = int(1e4)
N_FEATURES = 50
SEED = 0

features_raw, label_df = make_classification(
    n_samples=N_SAMPLES, n_features=N_FEATURES, random_state=SEED
)
features_scaled = scale(features_raw)

# Make sure data is z-scored. The check is per column (axis=0) and is asserted,
# so it cannot be silently skipped the way a non-final bare expression is.
np.testing.assert_allclose(features_scaled.mean(axis=0), 0, atol=1e-8)
np.testing.assert_allclose(features_scaled.std(axis=0), 1, atol=1e-8)

feature_df = pd.DataFrame(
    features_scaled,
    columns=['feature_' + str(i) for i in range(N_FEATURES)],
)

In [7]:
# VarClus instance shared by the later cells, which call its
# nearest_component_sorting, reassign_features_pca and decompose methods.
# NOTE(review): max_eigenvalue=1 is presumably the eigenvalue threshold used as
# the stopping criterion for further splits — confirm in decomposition.var_clus.
test_varclus = VarClus(max_eigenvalue=1)

In [None]:
# Wrap the whole feature frame in a single Cluster; this is the parent that the
# following cell splits into child clusters by hand.
cluster = Cluster(feature_df)

In [None]:
# Run PCA on the parent cluster and place the entries of cluster.pca_corr side
# by side as columns (presumably one feature-correlation column per component —
# confirm against Cluster.run_pca).
cluster.run_pca()
corr_table = pd.concat(cluster.pca_corr, axis=1)

# For every row, flag the column(s) that hold that row's maximum value.
corr_max = corr_table.max(axis=1)
cluster_membership = corr_table.eq(corr_max, axis=0)

# Build one child Cluster per column; it owns the rows flagged True there and
# records the parent cluster it was split from.
child_clusters = [
    Cluster(
        dataframe=cluster.dataframe,
        n_split=cluster.n_split,
        features=[name for name, is_member in flags.to_dict().items() if is_member],
        parents=[cluster],
    )
    for _, flags in cluster_membership.items()
]

In [None]:
# Show the child clusters together with the total number of features assigned
# to them. The total is summed over every child, so it stays correct when the
# split produces a number of clusters other than two.
child_clusters, sum(len(child.features) for child in child_clusters)

In [None]:
# Features assigned to the first child cluster at this point in the notebook.
child_clusters[0].features

In [None]:
# Pass the child clusters through VarClus.nearest_component_sorting and keep
# the returned clusters under the same name.
# NOTE(review): this rebinds child_clusters to its own output, so re-running
# the cell feeds already-processed clusters back in; re-run the split cell
# first when a clean starting state is needed.
child_clusters = test_varclus.nearest_component_sorting(child_clusters)

In [None]:
# Show the child clusters together with the total number of features assigned
# to them. The total is summed over every child, so it stays correct when there
# are more or fewer than two clusters.
child_clusters, sum(len(child.features) for child in child_clusters)

In [None]:
# Features of the first child cluster as returned by the preceding
# nearest_component_sorting call.
child_clusters[0].features

In [None]:
# Pass the child clusters through VarClus.reassign_features_pca and keep the
# returned clusters under the same name.
# NOTE(review): like the sorting cell, this rebinds child_clusters to its own
# output, so the cell is not idempotent when re-run in isolation.
child_clusters = test_varclus.reassign_features_pca(child_clusters)

In [None]:
# Show the child clusters together with the total number of features assigned
# to them. The total is summed over every child, so it stays correct when there
# are more or fewer than two clusters.
child_clusters, sum(len(child.features) for child in child_clusters)

In [None]:
# Flatten the features of every child cluster into a single sorted list and
# display it.
all_features = sorted(
    feature
    for cluster in child_clusters
    for feature in cluster.features
)
all_features

In [8]:
# Run the full VarClus decomposition on the feature frame.
# NOTE(review): the saved output for this cell ends in a KeyboardInterrupt,
# i.e. this run was stopped by hand before decompose() returned.
test_varclus.decompose(feature_df)

assessing feature feature_6
current EV is 3.1092611360158604, new EV is 3.1618298369069135
feature feature_6 was re-assigned
assessing feature feature_9
current EV is 4.037885528495924, new EV is 4.038238736666199
feature feature_9 was re-assigned
assessing feature feature_13
current EV is 4.038580495934999, new EV is 4.03849296647833
assessing feature feature_20
current EV is 4.038580495934999, new EV is 4.03864537854021
feature feature_20 was re-assigned
assessing feature feature_24
current EV is 4.038609628720539, new EV is 4.038643900223173
feature feature_24 was re-assigned
assessing feature feature_32
current EV is 4.038622000856842, new EV is 4.038730595175929
feature feature_32 was re-assigned
assessing feature feature_33
current EV is 4.03871017719919, new EV is 4.038147338274191
assessing feature feature_46
current EV is 4.03871017719919, new EV is 4.035513195344655
assessing feature feature_0
current EV is 4.03871017719919, new EV is 4.038744194600207
feature feature_0 was r

KeyboardInterrupt: 

In [None]:
# Inspect the children of the cluster held on the VarClus instance.
# NOTE(review): presumably populated by decompose() — confirm; the saved
# decompose() run above was interrupted, so this may not reflect a full run.
test_varclus.cluster.children