In [1]:
run load_data.py

### Evaluating Model Pipelines

We will evaluate a total of 24 model pipelines:

1. the original data
1. the original data with outliers removed
1. the original data transformed by a PCA with 2 components 
1. the original data with outliers removed transformed by a PCA with 2 components 
1. the original data transformed by a PCA with 3 components 
1. the original data with outliers removed transformed by a PCA with 3 components  
1. scaled data
1. scaled data with outliers removed
1. scaled data transformed by a PCA with 2 components 
1. scaled data with outliers removed transformed by a PCA with 2 components
1. scaled data transformed by a PCA with 3 components  
1. scaled data with outliers removed transformed by a PCA with 3 components  
1. log transformed, scaled data
1. log transformed, scaled data with outliers removed
1. log transformed, scaled data transformed by a PCA with 2 components
1. log transformed, scaled data with outliers removed transformed by a PCA with 2 components 
1. log transformed, scaled data transformed by a PCA with 3 components  
1. log transformed, scaled data with outliers removed transformed by a PCA with 3 components  
1. box-cox transformed, scaled data
1. box-cox transformed, scaled data with outliers removed
1. box-cox transformed, scaled data transformed by a PCA with 2 components 
1. box-cox transformed, scaled data with outliers removed transformed by a PCA with 2 components 
1. box-cox transformed, scaled data transformed by a PCA with 3 components  
1. box-cox transformed, scaled data with outliers removed transformed by a PCA with 3 components  

### Experiment Design

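We will fit a Gaussian Mixture Model to each of these data sets and then assess each fitted model using the Bayesian Information Criterion (BIC), where a lower BIC indicates a better trade-off between fit and model complexity. Note that BIC is computed from the likelihood of the data that was passed in, so values obtained on differently scaled, transformed, or dimension-reduced data sets are not strictly comparable; treat the cross-pipeline ranking below as a rough guide.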

In [2]:
from sklearn.mixture import GaussianMixture

In [3]:
# Each list below pairs a human-readable pipeline name with the matching
# pre-processed data set. None of these data sets is built in this notebook;
# they are expected to already be in the namespace (presumably created by
# load_data.py, which is run in the first cell - confirm).
#
# The position of an entry matters: results are appended in list order, so it
# determines the row index shown in the results tables further down.

# Pipelines built on the untransformed features.
original_data = [
    ('original', customer_features),
    ('original - no outliers', customer_features_outliers_removed),
    ('original - pca, 2 components', customer_features_pca_2),
    ('original - pca, 3 components', customer_features_pca_3),
    ('original - no outliers, pca, 2 components', customer_features_outliers_removed_pca_2),
    ('original - no outliers, pca, 3 components', customer_features_outliers_removed_pca_3)
]

# Pipelines built on the scaled features.
scaled_data = [
    ('scaled', customer_sc),
    ('scaled - no outliers', customer_sc_outliers_removed),
    ('scaled - pca, 2 components', customer_sc_pca_2),
    ('scaled - pca, 3 components', customer_sc_pca_3),
    ('scaled - no outliers, pca, 2 components', customer_sc_outliers_removed_pca_2),
    ('scaled - no outliers, pca, 3 components', customer_sc_outliers_removed_pca_3),
]

# Pipelines built on the log-transformed, then scaled features.
log_transformed_data = [
    ('log transformed, scaled', customer_log_sc),
    ('log transformed, scaled - no outliers', customer_log_sc_outliers_removed),
    ('log transformed, scaled - pca, 2 components', customer_log_sc_pca_2),
    ('log transformed, scaled - pca, 3 components', customer_log_sc_pca_3),
    ('log transformed, scaled - no outliers, pca, 2 components', customer_log_sc_outliers_removed_pca_2),
    ('log transformed, scaled - no outliers, pca, 3 components', customer_log_sc_outliers_removed_pca_3),
]

# Pipelines built on the Box-Cox-transformed, then scaled features.
box_cox_transformed_data = [
    ('box-cox transformed, scaled', customer_box_cox_sc),
    ('box-cox transformed, scaled - no outliers', customer_box_cox_sc_outliers_removed),
    ('box-cox transformed, scaled - pca, 2 components', customer_box_cox_sc_pca_2),
    ('box-cox transformed, scaled - pca, 3 components', customer_box_cox_sc_pca_3),
    ('box-cox transformed, scaled - no outliers, pca, 2 components', customer_box_cox_sc_outliers_removed_pca_2),
    ('box-cox transformed, scaled - no outliers, pca, 3 components', customer_box_cox_sc_outliers_removed_pca_3),
]

In [4]:
def fit_and_score(data, n_components=2, random_state=0):
    """Fit a Gaussian Mixture Model to ``data`` and return its BIC on that data.

    Parameters
    ----------
    data : array-like
        Observations to cluster. The same data is used for fitting and for
        computing the BIC.
    n_components : int, default 2
        Number of mixture components to fit.
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``GaussianMixture``. The fit starts from a random
        initialisation, so a fixed seed is what makes the reported BIC
        reproducible across kernel restarts. Pass ``None`` to get a
        different initialisation on every call.

    Returns
    -------
    float
        The value of ``GaussianMixture.bic(data)`` for the fitted model
        (lower is better).
    """
    model = GaussianMixture(n_components=n_components, random_state=random_state)
    model.fit(data)
    return model.bic(data)

In [5]:
n = 2

results_2_clusters = []

# A single loop over all four pipeline families replaces four copy-pasted
# loops. The lists are concatenated in the original visiting order, so the
# position of every result (and hence its row index in the table) is unchanged.
for name, data in (original_data + scaled_data
                   + log_transformed_data + box_cox_transformed_data):
    results_2_clusters.append({
        'name': name,
        'n': n,
        'BIC': fit_and_score(data, n),
    })


In [8]:
# Tabulate the 2-component results in ascending BIC order (best fit first).
pd.DataFrame(data=results_2_clusters).sort_values(by='BIC', ascending=True)

Unnamed: 0,BIC,n,name
10,1885.559367,2,"scaled - no outliers, pca, 2 components"
11,2412.065861,2,"scaled - no outliers, pca, 3 components"
7,2448.476967,2,scaled - no outliers
8,2586.743957,2,"scaled - pca, 2 components"
16,3042.170333,2,"log transformed, scaled - no outliers, pca, 2 ..."
22,3135.999109,2,"box-cox transformed, scaled - no outliers, pca..."
14,3154.867206,2,"log transformed, scaled - pca, 2 components"
20,3185.304232,2,"box-cox transformed, scaled - pca, 2 components"
9,3415.37618,2,"scaled - pca, 3 components"
17,4060.888354,2,"log transformed, scaled - no outliers, pca, 3 ..."


In [9]:
n = 3

results_3_clusters = []

# A single loop over all four pipeline families replaces four copy-pasted
# loops. The lists are concatenated in the original visiting order, so the
# position of every result (and hence its row index in the table) is unchanged.
for name, data in (original_data + scaled_data
                   + log_transformed_data + box_cox_transformed_data):
    results_3_clusters.append({
        'name': name,
        'n': n,
        'BIC': fit_and_score(data, n),
    })



In [10]:
# Tabulate the 3-component results in ascending BIC order (best fit first).
pd.DataFrame(data=results_3_clusters).sort_values(by='BIC', ascending=True)

Unnamed: 0,BIC,n,name
10,1727.215879,3,"scaled - no outliers, pca, 2 components"
7,2255.218493,3,scaled - no outliers
11,2257.444963,3,"scaled - no outliers, pca, 3 components"
8,2474.559794,3,"scaled - pca, 2 components"
16,3062.346233,3,"log transformed, scaled - no outliers, pca, 2 ..."
14,3157.549693,3,"log transformed, scaled - pca, 2 components"
22,3160.667316,3,"box-cox transformed, scaled - no outliers, pca..."
20,3188.782693,3,"box-cox transformed, scaled - pca, 2 components"
9,3359.809511,3,"scaled - pca, 3 components"
6,3660.361663,3,scaled


### One More Thing ... What About Those Labels?

In [11]:
# Shift the Channel codes down by one. Presumably Channel is coded 1/2, which
# would make these labels 0/1 and line them up with the 0-based cluster ids
# returned by GaussianMixture.predict - confirm against the raw data.
channel = customers.Channel.astype(int) - 1
# Region labels are left disabled; nothing in the cells shown here uses them.
# region = customers.Region

In [12]:
from sklearn.metrics import accuracy_score

def fit_and_score_predictions(data, labels, n_components=2, random_state=0):
    """Cluster ``data`` with a Gaussian Mixture Model and score it against labels.

    Cluster ids carry no inherent meaning, so the predictions are compared
    with both the labels as given and the labels with 0 and 1 swapped; the
    better of the two accuracies is returned.

    Parameters
    ----------
    data : array-like
        Observations to cluster; used for both fitting and predicting.
    labels : array-like supporting ``== 0`` and ``.astype``
        Reference labels aligned row-by-row with ``data`` (for example a
        NumPy array or pandas Series). The score is only meaningful when
        these are coded 0/1.
    n_components : int, default 2
        Number of mixture components. With more than two components the
        extra cluster ids can never equal a 0/1 label, so every sample
        assigned to them counts as misclassified.
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``GaussianMixture`` so the reported accuracy is
        reproducible across kernel restarts. Pass ``None`` for a different
        random initialisation on every call.

    Returns
    -------
    float
        The larger of the two accuracies described above.
    """
    model = GaussianMixture(n_components=n_components, random_state=random_state)
    model.fit(data)
    predictions = model.predict(data)

    # Labels with the two classes swapped: 0 becomes 1, everything else 0.
    swapped_labels = (labels == 0).astype(int)

    return max(
        accuracy_score(labels, predictions),
        accuracy_score(swapped_labels, predictions),
    )

In [13]:
# Sanity check: dimensions of the scaled data set with outliers removed.
customer_sc_outliers_removed.shape

(399, 6)

In [14]:
# Each entry is (pipeline name, data set, labels to score against).
# Pipelines that keep every row are paired with `channel`; each "no outliers"
# pipeline is paired with a channel_*_outliers_removed variable instead.
# Those variables are not defined in this notebook (presumably they come from
# load_data.py and hold the labels of the rows that survived the matching
# outlier removal - confirm).

# Pipelines built on the untransformed features.
original_data_with_labels = [
    ('original', customer_features, channel),
    ('original - no outliers', customer_features_outliers_removed, channel_original_outliers_removed),
    ('original - pca, 2 components', customer_features_pca_2, channel),
    ('original - pca, 3 components', customer_features_pca_3, channel),
    ('original - no outliers, pca, 2 components', customer_features_outliers_removed_pca_2, channel_original_outliers_removed),
    ('original - no outliers, pca, 3 components', customer_features_outliers_removed_pca_3, channel_original_outliers_removed)
]

# Pipelines built on the scaled features.
scaled_data_with_labels = [
    ('scaled', customer_sc, channel),
    ('scaled - no outliers', customer_sc_outliers_removed, channel_scaled_outliers_removed),
    ('scaled - pca, 2 components', customer_sc_pca_2, channel),
    ('scaled - pca, 3 components', customer_sc_pca_3, channel),
    ('scaled - no outliers, pca, 2 components', customer_sc_outliers_removed_pca_2, channel_scaled_outliers_removed),
    ('scaled - no outliers, pca, 3 components', customer_sc_outliers_removed_pca_3, channel_scaled_outliers_removed),
]

# Pipelines built on the log-transformed, then scaled features.
log_transformed_data_with_labels = [
    ('log transformed, scaled', customer_log_sc, channel),
    ('log transformed, scaled - no outliers', customer_log_sc_outliers_removed, channel_log_outliers_removed),
    ('log transformed, scaled - pca, 2 components', customer_log_sc_pca_2, channel),
    ('log transformed, scaled - pca, 3 components', customer_log_sc_pca_3, channel),
    ('log transformed, scaled - no outliers, pca, 2 components', customer_log_sc_outliers_removed_pca_2, channel_log_outliers_removed),
    ('log transformed, scaled - no outliers, pca, 3 components', customer_log_sc_outliers_removed_pca_3, channel_log_outliers_removed),
]

# Pipelines built on the Box-Cox-transformed, then scaled features.
box_cox_transformed_data_with_labels = [
    ('box-cox transformed, scaled', customer_box_cox_sc, channel),
    ('box-cox transformed, scaled - no outliers', customer_box_cox_sc_outliers_removed, channel_box_cox_outliers_removed),
    ('box-cox transformed, scaled - pca, 2 components', customer_box_cox_sc_pca_2, channel),
    ('box-cox transformed, scaled - pca, 3 components', customer_box_cox_sc_pca_3, channel),
    ('box-cox transformed, scaled - no outliers, pca, 2 components', customer_box_cox_sc_outliers_removed_pca_2, channel_box_cox_outliers_removed),
    ('box-cox transformed, scaled - no outliers, pca, 3 components', customer_box_cox_sc_outliers_removed_pca_3, channel_box_cox_outliers_removed),
]

In [15]:
n = 2

results_2_accuracy = []

# A single loop over all four labelled pipeline families replaces four
# copy-pasted loops. The lists are concatenated in the original visiting
# order, so the position of every result in the table is unchanged.
for name, data, label in (original_data_with_labels + scaled_data_with_labels
                          + log_transformed_data_with_labels
                          + box_cox_transformed_data_with_labels):
    results_2_accuracy.append({
        'name': name,
        'n': n,
        'accuracy': fit_and_score_predictions(data, label, n),
    })



In [16]:
# Tabulate the label-agreement scores, highest accuracy first.
pd.DataFrame(data=results_2_accuracy).sort_values(by='accuracy', ascending=False)

Unnamed: 0,accuracy,n,name
12,0.895455,2,"log transformed, scaled"
13,0.891954,2,"log transformed, scaled - no outliers"
18,0.890909,2,"box-cox transformed, scaled"
19,0.890411,2,"box-cox transformed, scaled - no outliers"
23,0.878995,2,"box-cox transformed, scaled - no outliers, pca..."
16,0.878161,2,"log transformed, scaled - no outliers, pca, 2 ..."
22,0.876712,2,"box-cox transformed, scaled - no outliers, pca..."
21,0.872727,2,"box-cox transformed, scaled - pca, 3 components"
14,0.870455,2,"log transformed, scaled - pca, 2 components"
20,0.870455,2,"box-cox transformed, scaled - pca, 2 components"


![](complex_pipe_1.png)

![](complex_pipe_2.png)

![](complex_pipe_3.png)

![](complex_pipe_4.png)

![](complex_pipe_5.png)
