In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from category_encoders import OneHotEncoder

In [12]:
df = pd.read_csv("../data/clean_data.csv")
df.head()

Unnamed: 0,CustGender,CustAccountBalance,TransactionAmount (INR),Age
0,0,17874.44,459.0,20
1,0,866503.21,2060.0,43
2,0,973.46,566.0,24
3,1,95075.54,148.0,34
4,1,4279.22,289.11,32


### Split

In [13]:
cols = ["CustAccountBalance", "TransactionAmount (INR)", "Age"]
X = df[cols]

### Build Model

In [15]:
n_clusters = range(2, 10)
inertia_errors = []
silhouette_errors = []

for n in n_clusters:
    # build a model
    model = make_pipeline(
        StandardScaler(),
        KMeans(n_clusters=n, random_state=42)
    )
    # fit the model
    model.fit(X)

    # calculate the inertia
    i = model.inertia_
    inertia_errors.append(i)
    # calculate silhouette score
    ss = silhouette_score(X, model.labels_)
    silhouette_errors.append(ss)


#### Inertia

In [37]:
fig = px.line(
        x = n_clusters,
        y=inertia_errors,
        title="Inertia vs Number of Clusters"
)

fig.update_layout(xaxis_title="Clusters", yaxis_title="Inertia")
fig.show()

Silhouette Score

In [36]:
fig = px.line(
    x=n_clusters,
    y=silhouette_errors,
    title="Silhouette Scores vs Number of Clusters"
)
fig.update_layout(xaxis_title="Clusters", yaxis_title="Silhouette")
fig.show()

From the above observations we can see that the best inertia and silhouette score occurs around 3 and 4, that the line starts to flatten. so we can decide that the number of cluster is 3.

In [34]:
final_model = make_pipeline(
    StandardScaler(),
    KMeans(n_clusters=3, random_state=42)
)

final_model.fit(X)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=3, random_state=42))])

##### Labels

In [44]:
labels = final_model.named_steps["kmeans"].labels_.astype(str)
labels

array(['1', '2', '1', ..., '1', '1', '0'], dtype='<U11')

##### Inertia

In [35]:
final_model.named_steps["kmeans"].inertia_

136241.61450079203

##### Silhouette Score

In [33]:
silhouette_score(X, final_model.named_steps["kmeans"].labels_)

0.23143535621583544

### PCA

In [39]:
# Instantiate
pca = PCA(n_components=2, random_state=42)
# Transform 'X'
X_t = pca.fit_transform(X)
# Put 'X_t' into dataframe
X_pca = pd.DataFrame(X_t, columns=["PC1", "PC2"])
X_pca.head()

Unnamed: 0,PC1,PC2
0,-76697.655896,-717.064848
1,771932.591206,-479.435095
2,-93598.442147,-582.910442
3,502.844953,-1152.08622
4,-90293.131184,-865.106995


In [45]:
fig = px.scatter(
    x = "PC1",
    y = "PC2",
    data_frame=X_pca,
    color=labels
)

fig.show()