### Import packages

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

### Set-up

In [2]:
# Global seaborn plotting style for the notebook.
sns.set(style="darkgrid")

# Name of the label column; it is removed from the feature matrix before clustering.
target = "default payment next month"

# Location of the CSV file that holds the modelling data.
infile = "https://www.dropbox.com/s/6grsxawwozuz661/credit_default_model_data.csv?dl=1"

### Read data

In [3]:
# Load the full table from the location configured in `infile`.
df = pd.read_csv(infile)

# Feature matrix: every column after the first one, with the target column removed.
X = df.iloc[:, 1:].drop(columns=target)

# Release the raw frame now that the features have been extracted.
del df

In [4]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(30000, 55)

### Standardize

In [5]:
# Scaler object is kept at notebook level because later cells reuse it.
X_scaler = StandardScaler()

# Cast once to float, then fit the scaler and apply it to the same data.
features_as_float = X.astype(float)
X_std = X_scaler.fit(features_as_float).transform(features_as_float)
del features_as_float

### Principal Component Analysis (PCA)

In [6]:
# Based on the results from one of the previous analyses, we will keep the top 30 components
components_to_keep = 30

# random_state is fixed so repeated runs are configured identically.
pca = PCA(n_components=components_to_keep, random_state=314)

# Fit PCA on the standardized features and project them onto the retained components.
X_pca = pca.fit_transform(X_std)

### k-means Clustering

In [7]:
# Candidate cluster counts to compare on the PCA-reduced data.
range_n_clusters = [2, 3, 4, 5, 6, 7]

for n_clus in range_n_clusters:

    # Define the k-means model.
    # n_init is pinned explicitly: scikit-learn 1.4 changed the default from 10 to
    # 'auto', so leaving it implicit makes the scores depend on the installed version.
    clusterer = KMeans(n_clusters=n_clus, n_init=10, random_state=314)

    # Get assigned cluster numbers for each record
    clus_labels = clusterer.fit_predict(X_pca)

    # Overall (average) Silhouette score
    silhouette_avg = silhouette_score(X_pca, clus_labels)

    print('Clusters:', n_clus, ', Silhouette Score:', silhouette_avg)

Clusters: 2 , Silhouette Score: 0.1504477349228697
Clusters: 3 , Silhouette Score: 0.15122483758919925
Clusters: 4 , Silhouette Score: 0.15769872582172456
Clusters: 5 , Silhouette Score: 0.15808057559331234
Clusters: 6 , Silhouette Score: 0.14731699956081895
Clusters: 7 , Silhouette Score: 0.10266483811831893


In [None]:
def perform_clus(components_to_keep_all, range_n_clusters_all):
    """Print the average silhouette score for each PCA / k-means combination.

    Reads the notebook-level ``X_std`` array. For every component count the data
    is reduced with PCA once, then k-means is fitted for every requested number
    of clusters and the resulting silhouette score is printed.

    Parameters
    ----------
    components_to_keep_all : iterable of int
        Numbers of principal components to try.
    range_n_clusters_all : iterable of int
        Numbers of clusters to try for each component count.

    Returns
    -------
    None
        Results are printed, one line per combination.
    """
    for components_to_keep in components_to_keep_all:

        pca = PCA(n_components=components_to_keep, random_state=314)

        X_pca = pca.fit_transform(X_std)

        for n_clus in range_n_clusters_all:

            # Define the k-means model.
            # n_init is pinned explicitly: scikit-learn 1.4 changed the default from
            # 10 to 'auto', which would make the scores version-dependent.
            clusterer = KMeans(n_clusters=n_clus, n_init=10, random_state=314)

            # Get assigned cluster numbers for each record
            clus_labels = clusterer.fit_predict(X_pca)

            # Overall (average) Silhouette score
            silhouette_avg = silhouette_score(X_pca, clus_labels)

            print('Components:', components_to_keep, ', Clusters:', n_clus, ', Silhouette Score:', silhouette_avg)


perform_clus([10, 20], [2, 3, 4, 5])

In [None]:
# NOTE(review): this cell repeats the `perform_clus` definition from the previous
# cell; defining it once and only calling it here would be enough.
def perform_clus(components_to_keep_all, range_n_clusters_all):
    """Print the average silhouette score for each PCA / k-means combination.

    Reads the notebook-level ``X_std`` array. For every component count the data
    is reduced with PCA once, then k-means is fitted for every requested number
    of clusters and the resulting silhouette score is printed.

    Parameters
    ----------
    components_to_keep_all : iterable of int
        Numbers of principal components to try.
    range_n_clusters_all : iterable of int
        Numbers of clusters to try for each component count.

    Returns
    -------
    None
        Results are printed, one line per combination.
    """
    for components_to_keep in components_to_keep_all:

        pca = PCA(n_components=components_to_keep, random_state=314)

        X_pca = pca.fit_transform(X_std)

        for n_clus in range_n_clusters_all:

            # Define the k-means model.
            # n_init is pinned explicitly: scikit-learn 1.4 changed the default from
            # 10 to 'auto', which would make the scores version-dependent.
            clusterer = KMeans(n_clusters=n_clus, n_init=10, random_state=314)

            # Get assigned cluster numbers for each record
            clus_labels = clusterer.fit_predict(X_pca)

            # Overall (average) Silhouette score
            silhouette_avg = silhouette_score(X_pca, clus_labels)

            print('Components:', components_to_keep, ', Clusters:', n_clus, ', Silhouette Score:', silhouette_avg)


perform_clus([5, 10], [2, 3, 4, 5])

In [None]:
# NOTE(review): this cell repeats the `perform_clus` definition from the earlier
# cells; defining it once and only calling it here would be enough.
def perform_clus(components_to_keep_all, range_n_clusters_all):
    """Print the average silhouette score for each PCA / k-means combination.

    Reads the notebook-level ``X_std`` array. For every component count the data
    is reduced with PCA once, then k-means is fitted for every requested number
    of clusters and the resulting silhouette score is printed.

    Parameters
    ----------
    components_to_keep_all : iterable of int
        Numbers of principal components to try.
    range_n_clusters_all : iterable of int
        Numbers of clusters to try for each component count.

    Returns
    -------
    None
        Results are printed, one line per combination.
    """
    for components_to_keep in components_to_keep_all:

        pca = PCA(n_components=components_to_keep, random_state=314)

        X_pca = pca.fit_transform(X_std)

        for n_clus in range_n_clusters_all:

            # Define the k-means model.
            # n_init is pinned explicitly: scikit-learn 1.4 changed the default from
            # 10 to 'auto', which would make the scores version-dependent.
            clusterer = KMeans(n_clusters=n_clus, n_init=10, random_state=314)

            # Get assigned cluster numbers for each record
            clus_labels = clusterer.fit_predict(X_pca)

            # Overall (average) Silhouette score
            silhouette_avg = silhouette_score(X_pca, clus_labels)

            print('Components:', components_to_keep, ', Clusters:', n_clus, ', Silhouette Score:', silhouette_avg)


perform_clus([2, 3], [2, 3, 4, 5])

Let's pick 2 principal components and 3 clusters.

In [None]:
# Chosen configuration: 2 principal components, 3 clusters.
components_to_keep = 2
n_clus = 3

pca = PCA(n_components=components_to_keep, random_state=314)

# Project the standardized features onto the retained components.
X_pca = pca.fit_transform(X_std)

# Define the k-means model.
# n_init is pinned explicitly: scikit-learn 1.4 changed the default from 10 to
# 'auto', so leaving it implicit makes the labels depend on the installed version.
clusterer = KMeans(n_clusters=n_clus, n_init=10, random_state=314)

# Get assigned cluster numbers for each record
clus_labels = clusterer.fit_predict(X_pca)

# Overall (average) Silhouette score
silhouette_avg = silhouette_score(X_pca, clus_labels)

print(silhouette_avg)

In [None]:
# Store each record's cluster label in a `cluster` column on X (modifies X in place).
X['cluster'] = clus_labels

# Preview the first rows, including the `cluster` column.
X.head()

In [None]:
# Number of records assigned to each cluster.
X['cluster'].value_counts()

In [None]:
# Share of all records that falls into each cluster.
X['cluster'].value_counts() / X.shape[0]

In [None]:
# Keep every record that was not assigned to cluster 1 and remove the label column.
# The column is dropped by name rather than by position, so the result does not
# depend on `cluster` being the last column, and the explicit copy makes X_core
# an independent frame that later cells can modify without touching X.
X_core = X.loc[X['cluster'] != 1].drop(columns='cluster').copy()

X_core.columns

In [None]:
def perform_clus(indf, components_to_keep_all, range_n_clusters_all):
    """Print the average silhouette score for each PCA / k-means combination on ``indf``.

    The frame is cast to float and standardized, then for every component count
    it is reduced with PCA once and k-means is fitted for every requested number
    of clusters. One result line is printed per combination.

    Parameters
    ----------
    indf : pandas.DataFrame
        Feature columns to cluster; all columns are cast to float.
    components_to_keep_all : iterable of int
        Numbers of principal components to try.
    range_n_clusters_all : iterable of int
        Numbers of clusters to try for each component count.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Use a scaler local to this call; refitting the shared notebook-level
    # `X_scaler` here would silently replace its fitted parameters.
    indf_std = StandardScaler().fit_transform(indf.astype(float))

    for components_to_keep in components_to_keep_all:

        pca = PCA(n_components=components_to_keep, random_state=314)

        X_pca = pca.fit_transform(indf_std)

        for n_clus in range_n_clusters_all:

            # Define the k-means model.
            # n_init is pinned explicitly: scikit-learn 1.4 changed the default from
            # 10 to 'auto', which would make the scores version-dependent.
            clusterer = KMeans(n_clusters=n_clus, n_init=10, random_state=314)

            # Get assigned cluster numbers for each record
            clus_labels = clusterer.fit_predict(X_pca)

            # Overall (average) Silhouette score
            silhouette_avg = silhouette_score(X_pca, clus_labels)

            print('Components:', components_to_keep, ', Clusters:', n_clus, ', Silhouette Score:', silhouette_avg)


perform_clus(X_core, [2, 3], [2, 3, 4, 5])

Let's again pick 2 principal components and 3 clusters, this time for the core records only (everything outside cluster 1).

In [None]:
# Chosen configuration for the core records: 2 principal components, 3 clusters.
components_to_keep = 2
n_clus = 3

# Work from the feature columns only. When this cell is re-run, X_core already
# carries the `cluster` column added at the end of the cell; dropping it here
# keeps the label out of the scaling / PCA inputs and makes the cell idempotent.
core_features = X_core.drop(columns='cluster', errors='ignore')

# Re-fit the notebook-level scaler on the core records only.
X_std = X_scaler.fit_transform(core_features.astype(float))

pca = PCA(n_components=components_to_keep, random_state=314)

X_pca = pca.fit_transform(X_std)

# Define the k-means model.
# n_init is pinned explicitly: scikit-learn 1.4 changed the default from 10 to
# 'auto', so leaving it implicit makes the labels depend on the installed version.
clusterer = KMeans(n_clusters=n_clus, n_init=10, random_state=314)

# Get assigned cluster numbers for each record
clus_labels = clusterer.fit_predict(X_pca)

# Overall (average) Silhouette score
silhouette_avg = silhouette_score(X_pca, clus_labels)

print(silhouette_avg)

# Build X_core as features + label, with `cluster` as the last column.
X_core = core_features.assign(cluster=clus_labels)

# Share of core records in each cluster.
X_core.cluster.value_counts() / len(X_core)

Let's bring the two dataframes together.

In [None]:
# Records from the first clustering's cluster 1 are relabelled as -1.
# `.assign` returns a new frame, so nothing is written into a filtered slice of X
# (the original assignment triggered pandas' SettingWithCopyWarning).
X_outlier = X[X['cluster'] == 1].assign(cluster=-1)

# Stack core and outlier records. `pd.concat` replaces `DataFrame.append`,
# which was deprecated in pandas 1.4 and removed in pandas 2.0.
X_final = pd.concat([X_core, X_outlier])

# Share of all records per final cluster label, in order of appearance.
X_final.cluster.value_counts(sort=False) / len(X_final)

### Cluster Profiles

In [None]:
# Mean of every feature within each final cluster label.
X_final.groupby('cluster').mean()