## Creating a variable clustering algorithm that is similar to SAS varclus
1. PCA-based recursive decomposition
2. stopping criteria
3. plotting
4. OOP

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

In [106]:
# Synthetic data set: 100,000 samples with 500 features each, then
# standardised so the features are on a common scale.
feature_df, label_df = make_classification(
    n_samples=100000,
    n_features=500,
)
feature_df = scale(feature_df)

In [141]:
# Make sure data is z-scored: the largest absolute column mean should be ~0
# and the largest column standard deviation should be ~1.
# The axis is passed explicitly to std() because on a NumPy array a bare
# .std() collapses every column into one global value, which would hide a
# badly scaled individual feature.
np.max(np.abs(feature_df.mean(axis=0))), np.max(feature_df.std(axis=0))

(1.6914913913979035e-16, 1.0000050000375227)

In [122]:
# Wrap the feature matrix in a DataFrame with one named column per feature.
# The number of columns is read from the data itself so the names stay in
# sync if the number of generated features changes.
feature_df = pd.DataFrame(
    feature_df,
    columns=['feature_{}'.format(i) for i in range(feature_df.shape[1])],
)

In [125]:
def one_step_clustering(df):
    """
    Split the columns of `df` into two clusters using a two-component PCA.

    Every column is scored by its squared correlation with each of the two
    component projections and is placed in the cluster of the component
    where that score is highest.

    Returns a tuple `(clusters, pca)`: `clusters` maps 'cluster_0' and
    'cluster_1' to lists of column labels, and `pca` is the fitted PCA.
    """
    # Only the first two components are used
    num_split = 2

    # TODO: make a class to wrap pca features
    pca = PCA(n_components=num_split)
    pca.fit(df)

    # Squared correlation of every column with each component projection
    squared_corr = [
        df.corrwith(df.dot(component)) ** 2
        for component in pca.components_[:num_split]
    ]
    corr_table = pd.concat(squared_corr, axis=1)

    # Initial assignment: flag, per column of `df`, the component(s) whose
    # squared correlation equals that column's row-wise maximum
    best_corr = corr_table.max(axis=1)
    is_member = corr_table.eq(best_corr, axis=0)

    clusters = {}
    for idx in range(num_split):
        flags = is_member[idx].to_dict()
        clusters['cluster_{}'.format(idx)] = [
            feature for feature, flag in flags.items() if flag
        ]

    return clusters, pca

In [126]:
# First split of the full feature set: `step_1_0` maps 'cluster_0' / 'cluster_1'
# to lists of column names, and `pca` is the fitted two-component PCA.
step_1_0, pca = one_step_clustering(feature_df)

In [158]:
# Materialise each cluster as its own DataFrame holding only the columns
# that were assigned to it.
cluster_0_df = feature_df.loc[:, step_1_0['cluster_0']]
cluster_1_df = feature_df.loc[:, step_1_0['cluster_1']]

In [None]:
def _first_component_variance(cluster):
    """
    Returns the variance explained by the first principal component of
    `cluster`, or 0.0 when the cluster has no columns.
    """
    # PCA cannot be fitted on a frame with zero columns; an empty cluster
    # explains no variance.
    if cluster.shape[1] == 0:
        return 0.0

    # A fresh estimator per call, so no result depends on the order in which
    # a shared estimator was re-fitted.
    return PCA(n_components=1).fit(cluster).explained_variance_[0]


def try_reassign(clus_0, clus_1, feature_to_clus1):
    """
    Tries to re-assign a feature from cluster 0 to cluster 1

    The score of a clustering is the sum, over both clusters, of the variance
    explained by each cluster's first principal component. The score is
    computed before and after moving `feature_to_clus1`; neither input frame
    is modified.

    Parameters
    ----------
    clus_0 : DataFrame currently containing `feature_to_clus1`
    clus_1 : DataFrame the feature would be moved to
    feature_to_clus1 : column label in `clus_0` to move

    Returns
    -------
    A tuple `(improved, new_total, old_total)`: `improved` is True when the
    score after the move is strictly greater than before, followed by the
    score after the move and the score before it.
    """

    # TODO: parallelization
    total_variance_explained = (
        _first_component_variance(clus_0)
        + _first_component_variance(clus_1)
    )

    # Trial move; `new_clus_0` ends up with no columns when the feature was
    # the only member of cluster 0.
    new_clus_0 = clus_0.drop(feature_to_clus1, axis=1)
    new_clus_1 = clus_1.join(clus_0[feature_to_clus1])

    new_total_variance_explained = (
        _first_component_variance(new_clus_0)
        + _first_component_variance(new_clus_1)
    )

    return (
        new_total_variance_explained > total_variance_explained,
        new_total_variance_explained,
        total_variance_explained,
    )

In [None]:
%%time
# Time one trial move: the first column of cluster 0 into cluster 1
try_reassign(cluster_0_df, cluster_1_df, cluster_0_df.columns[0])