In [1]:
%run load_data_2.py

### Evaluating Model Pipelines

We will evaluate a total of 24 model pipelines:

1. the original data
1. the original data with outliers removed
1. the original data transformed by a PCA with 2 components 
1. the original data with outliers removed transformed by a PCA with 2 components 
1. the original data transformed by a PCA with 3 components 
1. the original data with outliers removed transformed by a PCA with 3 components  
1. scaled data
1. scaled data with outliers removed
1. scaled data transformed by a PCA with 2 components 
1. scaled data with outliers removed transformed by a PCA with 2 components
1. scaled data transformed by a PCA with 3 components  
1. scaled data with outliers removed transformed by a PCA with 3 components  
1. log transformed, scaled data
1. log transformed, scaled data with outliers removed
1. log transformed, scaled data transformed by a PCA with 2 components
1. log transformed, scaled data with outliers removed transformed by a PCA with 2 components 
1. log transformed, scaled data transformed by a PCA with 3 components  
1. log transformed, scaled data with outliers removed transformed by a PCA with 3 components  
1. box-cox transformed, scaled data
1. box-cox transformed, scaled data with outliers removed
1. box-cox transformed, scaled data transformed by a PCA with 2 components 
1. box-cox transformed, scaled data with outliers removed transformed by a PCA with 2 components 
1. box-cox transformed, scaled data transformed by a PCA with 3 components  
1. box-cox transformed, scaled data with outliers removed transformed by a PCA with 3 components  

### Experiment Design

We will pass each of these transformed data sets to a Gaussian Mixture Model and then assess the model using the BIC.

In [2]:
from sklearn.mixture import GaussianMixture

In [3]:
original_data = [
    ('original', customer_features),
    ('original - no outliers', customer_features_outliers_removed),
    ('original - pca, 2 components', customer_features_pca_2),
    ('original - pca, 3 components', customer_features_pca_3),
    ('original - no outliers, pca, 2 components', customer_features_outliers_removed_pca_2),
    ('original - no outliers, pca, 3 components', customer_features_outliers_removed_pca_3)
]

scaled_data = [
    ('scaled', customer_sc),
    ('scaled - no outliers', customer_sc_outliers_removed),
    ('scaled - pca, 2 components', customer_sc_pca_2),
    ('scaled - pca, 3 components', customer_sc_pca_3),
    ('scaled - no outliers, pca, 2 components', customer_sc_outliers_removed_pca_2),
    ('scaled - no outliers, pca, 3 components', customer_sc_outliers_removed_pca_3),
]

log_transformed_data = [
    ('log transformed, scaled', customer_log_sc),
    ('log transformed, scaled - no outliers', customer_log_sc_outliers_removed),
    ('log transformed, scaled - pca, 2 components', customer_log_sc_pca_2),
    ('log transformed, scaled - pca, 3 components', customer_log_sc_pca_3),
    ('log transformed, scaled - no outliers, pca, 2 components', customer_log_sc_outliers_removed_pca_2),
    ('log transformed, scaled - no outliers, pca, 3 components', customer_log_sc_outliers_removed_pca_3),
]

box_cox_transformed_data = [
    ('box-cox transformed, scaled', customer_box_cox_sc),
    ('box-cox transformed, scaled - no outliers', customer_box_cox_sc_outliers_removed),
    ('box-cox transformed, scaled - pca, 2 components', customer_box_cox_sc_pca_2),
    ('box-cox transformed, scaled - pca, 3 components', customer_box_cox_sc_pca_3),
    ('box-cox transformed, scaled - no outliers, pca, 2 components', customer_box_cox_sc_outliers_removed_pca_2),
    ('box-cox transformed, scaled - no outliers, pca, 3 components', customer_box_cox_sc_outliers_removed_pca_3),
]

In [29]:
def fit_and_score_bic(data, n_components=2):
    model = GaussianMixture(n_components=n_components, covariance_type='full')
    model.fit(data)
    return model.bic(data)

def fit_and_score_aic(data, n_components=2):
    model = GaussianMixture(n_components=n_components, covariance_type='full')
    model.fit(data)
    return model.aic(data)

In [30]:
n = 2

results_2_clusters = []

for name, data in original_data:
    results_2_clusters.append({
        'name' : name, 
        'n' : n, 
        'BIC' : fit_and_score_bic(data, n),
        'AIC' : fit_and_score_aic(data, n)
    })

for name, data in scaled_data:
    results_2_clusters.append({
        'name' : name, 
        'n' : n, 
        'BIC' : fit_and_score_bic(data, n),
        'AIC' : fit_and_score_aic(data, n)
    })
    
for name, data in log_transformed_data:
    results_2_clusters.append({
        'name' : name, 
        'n' : n, 
        'BIC' : fit_and_score_bic(data, n),
        'AIC' : fit_and_score_aic(data, n)
    })
    
for name, data in box_cox_transformed_data:
    results_2_clusters.append({
        'name' : name, 
        'n' : n, 
        'BIC' : fit_and_score_bic(data, n),
        'AIC' : fit_and_score_aic(data, n)
    })


In [31]:
pd.DataFrame(results_2_clusters).sort_values('BIC')[:10]

Unnamed: 0,AIC,BIC,n,name
10,1775.83587,1819.588695,2,"scaled - no outliers, pca, 2 components"
11,2335.965185,2412.062814,2,"scaled - no outliers, pca, 3 components"
7,2229.095845,2448.488723,2,scaled - no outliers
8,2541.680954,2586.635476,2,"scaled - pca, 2 components"
16,2997.341526,3042.170333,2,"log transformed, scaled - no outliers, pca, 2 ..."
22,3091.094701,3135.999109,2,"box-cox transformed, scaled - no outliers, pca..."
14,3109.912684,3154.867206,2,"log transformed, scaled - pca, 2 components"
20,3140.34971,3185.304232,2,"box-cox transformed, scaled - pca, 2 components"
9,3577.767824,3415.37618,2,"scaled - pca, 3 components"
17,3983.45678,4060.916733,2,"log transformed, scaled - no outliers, pca, 3 ..."


In [28]:
pd.DataFrame(results_2_clusters).sort_values('BIC')[:10]

Unnamed: 0,AIC,BIC,n,name
10,1930.149507,1924.537822,2,"scaled - no outliers, pca, 2 components"
11,2724.439357,2776.295855,2,"scaled - no outliers, pca, 3 components"
8,2917.940142,2950.627295,2,"scaled - pca, 2 components"
16,3011.219827,3043.822596,2,"log transformed, scaled - no outliers, pca, 2 ..."
22,3086.654983,3119.312735,2,"box-cox transformed, scaled - no outliers, pca..."
14,3133.341041,3166.035239,2,"log transformed, scaled - pca, 2 components"
20,3135.151588,3167.845786,2,"box-cox transformed, scaled - pca, 2 components"
7,3223.044664,3358.662468,2,scaled - no outliers
9,4043.925279,4097.056182,2,"scaled - pca, 3 components"
17,4054.064827,4107.040375,2,"log transformed, scaled - no outliers, pca, 3 ..."


In [8]:
n = 3

results_3_clusters = []

for name, data in original_data:
    results_3_clusters.append({
        'name' : name, 
        'n' : n, 
        'BIC' : fit_and_score(data, n)
    })

for name, data in scaled_data:
    results_3_clusters.append({
        'name' : name, 
        'n' : n, 
        'BIC' : fit_and_score(data, n)
    })
    
for name, data in log_transformed_data:
    results_3_clusters.append({
        'name' : name, 
        'n' : n, 
        'BIC' : fit_and_score(data, n)
    })
    
for name, data in box_cox_transformed_data:
    results_3_clusters.append({
        'name' : name, 
        'n' : n, 
        'BIC' : fit_and_score(data, n)
    })



In [9]:
pd.DataFrame(results_3_clusters).sort_values('BIC')[:5]

Unnamed: 0,BIC,n,name
10,1727.253533,3,"scaled - no outliers, pca, 2 components"
7,2255.218493,3,scaled - no outliers
11,2257.444963,3,"scaled - no outliers, pca, 3 components"
8,2474.559794,3,"scaled - pca, 2 components"
16,3078.845985,3,"log transformed, scaled - no outliers, pca, 2 ..."


### One More Thing ... What About Those Labels?

In [10]:
channel = customers.Channel.astype(int) - 1
# region = customers.Region

In [11]:
from sklearn.metrics import accuracy_score

def fit_and_score_predictions(data, labels, n_components=2):
    model = GaussianMixture(n_components=n_components)
    model.fit(data)
    predictions = model.predict(data)
    labels_pos = labels
    labels_neg = (labels == 0).astype(int)
    return max(accuracy_score(labels_pos, predictions), accuracy_score(labels_neg, predictions))

In [12]:
customer_sc_outliers_removed.shape

(399, 6)

In [13]:
original_data_with_labels = [
    ('original', customer_features, channel),
    ('original - no outliers', customer_features_outliers_removed, channel_original_outliers_removed),
    ('original - pca, 2 components', customer_features_pca_2, channel),
    ('original - pca, 3 components', customer_features_pca_3, channel),
    ('original - no outliers, pca, 2 components', customer_features_outliers_removed_pca_2, channel_original_outliers_removed),
    ('original - no outliers, pca, 3 components', customer_features_outliers_removed_pca_3, channel_original_outliers_removed)
]

scaled_data_with_labels = [
    ('scaled', customer_sc, channel),
    ('scaled - no outliers', customer_sc_outliers_removed, channel_scaled_outliers_removed),
    ('scaled - pca, 2 components', customer_sc_pca_2, channel),
    ('scaled - pca, 3 components', customer_sc_pca_3, channel),
    ('scaled - no outliers, pca, 2 components', customer_sc_outliers_removed_pca_2, channel_scaled_outliers_removed),
    ('scaled - no outliers, pca, 3 components', customer_sc_outliers_removed_pca_3, channel_scaled_outliers_removed),
]

log_transformed_data_with_labels = [
    ('log transformed, scaled', customer_log_sc, channel),
    ('log transformed, scaled - no outliers', customer_log_sc_outliers_removed, channel_log_outliers_removed),
    ('log transformed, scaled - pca, 2 components', customer_log_sc_pca_2, channel),
    ('log transformed, scaled - pca, 3 components', customer_log_sc_pca_3, channel),
    ('log transformed, scaled - no outliers, pca, 2 components', customer_log_sc_outliers_removed_pca_2, channel_log_outliers_removed),
    ('log transformed, scaled - no outliers, pca, 3 components', customer_log_sc_outliers_removed_pca_3, channel_log_outliers_removed),
]

box_cox_transformed_data_with_labels = [
    ('box-cox transformed, scaled', customer_box_cox_sc, channel),
    ('box-cox transformed, scaled - no outliers', customer_box_cox_sc_outliers_removed, channel_box_cox_outliers_removed),
    ('box-cox transformed, scaled - pca, 2 components', customer_box_cox_sc_pca_2, channel),
    ('box-cox transformed, scaled - pca, 3 components', customer_box_cox_sc_pca_3, channel),
    ('box-cox transformed, scaled - no outliers, pca, 2 components', customer_box_cox_sc_outliers_removed_pca_2, channel_box_cox_outliers_removed),
    ('box-cox transformed, scaled - no outliers, pca, 3 components', customer_box_cox_sc_outliers_removed_pca_3, channel_box_cox_outliers_removed),
]

In [14]:
n = 2

results_2_accuracy = []

for name, data, label in original_data_with_labels:
    results_2_accuracy.append({
        'name' : name, 
        'n' : n, 
        'accuracy' : fit_and_score_predictions(data, label, n)
    })

for name, data, label in scaled_data_with_labels:
    results_2_accuracy.append({
        'name' : name, 
        'n' : n, 
        'accuracy' : fit_and_score_predictions(data, label, n)
    })
    
for name, data, label in log_transformed_data_with_labels:
    results_2_accuracy.append({
        'name' : name, 
        'n' : n, 
        'accuracy' : fit_and_score_predictions(data, label, n)
    })
    
for name, data, label in box_cox_transformed_data_with_labels:
    results_2_accuracy.append({
        'name' : name, 
        'n' : n, 
        'accuracy' : fit_and_score_predictions(data, label, n)
    })



In [17]:
pd.DataFrame(results_2_accuracy).sort_values('accuracy', ascending=False)

Unnamed: 0,accuracy,n,name
12,0.895455,2,"log transformed, scaled"
13,0.891954,2,"log transformed, scaled - no outliers"
18,0.890909,2,"box-cox transformed, scaled"
19,0.890411,2,"box-cox transformed, scaled - no outliers"
23,0.883562,2,"box-cox transformed, scaled - no outliers, pca..."
22,0.876712,2,"box-cox transformed, scaled - no outliers, pca..."
16,0.875862,2,"log transformed, scaled - no outliers, pca, 2 ..."
21,0.872727,2,"box-cox transformed, scaled - pca, 3 components"
14,0.870455,2,"log transformed, scaled - pca, 2 components"
20,0.870455,2,"box-cox transformed, scaled - pca, 2 components"


![](complex_pipe_1.png)

![](complex_pipe_2.png)

![](complex_pipe_3.png)

![](complex_pipe_4.png)

![](complex_pipe_5.png)
