In [58]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


In [16]:
df = pd.read_csv('df_after_eda.csv', index_col='Unnamed: 0')

In [17]:
df.head()

Unnamed: 0,ID,Education,GradorPost,Marital_Status,Relationship,Dt_Customer,days,Year_Birth,Age,Income,...,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,SumAcceptedCmp
0,5524,Graduation,Undergrad Degree,Single,Single,2012-04-09,971,1957,57,58138.0,...,4,7,0,0,0,0,0,0,1,1
1,2174,Graduation,Undergrad Degree,Single,Single,2014-08-03,125,1954,60,46344.0,...,2,5,0,0,0,0,0,0,0,0
2,4141,Graduation,Undergrad Degree,Together,Relationship,2013-08-21,472,1965,49,71613.0,...,10,4,0,0,0,0,0,0,0,0
3,6182,Graduation,Undergrad Degree,Together,Relationship,2014-10-02,65,1984,30,26646.0,...,4,6,0,0,0,0,0,0,0,0
4,5324,PhD,PostGrad Degree,Married,Relationship,2014-01-19,321,1981,33,58293.0,...,6,5,0,0,0,0,0,0,0,0


In [18]:
df.dtypes

ID                       int64
Education               object
GradorPost              object
Marital_Status          object
Relationship            object
Dt_Customer             object
days                     int64
Year_Birth               int64
Age                      int64
Income                 float64
Kidhome                  int64
Teenhome                 int64
Recency                  int64
MntWines                 int64
MntFruits                int64
MntMeatProducts          int64
MntFishProducts          int64
MntSweetProducts         int64
MntGoldProds             int64
NumDealsPurchases        int64
NumWebPurchases          int64
NumCatalogPurchases      int64
NumStorePurchases        int64
NumWebVisitsMonth        int64
AcceptedCmp3             int64
AcceptedCmp4             int64
AcceptedCmp5             int64
AcceptedCmp1             int64
AcceptedCmp2             int64
Complain                 int64
Response                 int64
SumAcceptedCmp           int64
dtype: o

In [57]:
df.dtypes

ID                       int64
Education               object
GradorPost              object
Marital_Status          object
Relationship            object
Dt_Customer             object
days                     int64
Year_Birth               int64
Age                      int64
Income                 float64
Kidhome                  int64
Teenhome                 int64
Recency                  int64
MntWines                 int64
MntFruits                int64
MntMeatProducts          int64
MntFishProducts          int64
MntSweetProducts         int64
MntGoldProds             int64
NumDealsPurchases        int64
NumWebPurchases          int64
NumCatalogPurchases      int64
NumStorePurchases        int64
NumWebVisitsMonth        int64
AcceptedCmp3             int64
AcceptedCmp4             int64
AcceptedCmp5             int64
AcceptedCmp1             int64
AcceptedCmp2             int64
Complain                 int64
Response                 int64
SumAcceptedCmp           int64
cluster 

dfWe change date to year, month and day, and we remove id:

In [23]:
df2 = df.drop(columns=['ID', 'Dt_Customer', 'Year_Birth', 'Relationship', 'GradorPost'])

In [24]:
df_dummies = pd.get_dummies(df2)

### Scaling

In [33]:
scaler = StandardScaler()
X = scaler.fit_transform(df_dummies.to_numpy())

### K-Means

In [38]:
N_CLUSTERS_TO_TRY_OUT = range(1, 11)

In [40]:
models = []
for n_clusters in N_CLUSTERS_TO_TRY_OUT: # 1 to 10 clusters
    model = KMeans(n_clusters)
    model.fit(X)
    models.append(model)

In [50]:
px.line(
    x=N_CLUSTERS_TO_TRY_OUT,
    y=list(map(lambda x: x.inertia_, models)),
    labels={'x': 'n_clusters', 'y': 'wcss'},
    title='Elbow Method'
)

We´ll choose 5 clusters

In [83]:
FINAL_N_CLUSTERS = 5

In [84]:
df2['cluster'] = models[FINAL_N_CLUSTERS-1].predict(X)

In [85]:
px.bar(
    df2['cluster'].value_counts(),
    y='cluster',
    labels={'index': 'cluster', 'cluster': '#'},
    title='Number of occurences per cluster'
)

In [89]:
features = ['Income', 'Age', 'days', 'SumAcceptedCmp']

fig = make_subplots(
    len(features),
    1,
    subplot_titles=features
)

for i, feature in enumerate(features):

    fig.add_trace(
        go.Box(
            x=df2['cluster'],
            y=df2[feature],
            showlegend=False,
            name=feature
        ),
        row=i+1,
        col=1
    )


fig.update_layout(
    title='Cluster observation',
    height=1500,
    width=1000
)

fig.show()

In [97]:
campaign_columns = list(filter(lambda x: 'Accepted' in x, df2.columns))+['Response']
campaign_columns.remove('SumAcceptedCmp')
campaign_columns.sort()
campaign_columns

['AcceptedCmp1',
 'AcceptedCmp2',
 'AcceptedCmp3',
 'AcceptedCmp4',
 'AcceptedCmp5',
 'Response']

In [109]:
df2_melted_for_campaigns = pd.melt(
    df2,
    id_vars=['cluster'],
    value_vars=campaign_columns
)
df2_melted_for_campaigns = df2_melted_for_campaigns.loc[df2_melted_for_campaigns['value'] == 1]
df2_melted_for_campaigns = df2_melted_for_campaigns.drop(columns=['value'])
df2_melted_for_campaigns = df2_melted_for_campaigns.rename(columns={'variable': 'campaign'})
df2_melted_for_campaigns['campaign'] = df2_melted_for_campaigns['campaign'].replace({campaign_columns[i]: str(i) for i in range(len(campaign_columns))})
df2_melted_for_campaigns['count'] = 1
df2_melted_for_campaigns.groupby(['cluster', 'campaign']).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,count
cluster,campaign,Unnamed: 2_level_1
0,2,6
0,5,2
1,0,14
1,2,34
1,3,70
1,4,3
1,5,60
2,0,34
2,2,25
2,3,12


In [114]:
px.bar(
    df2_melted_for_campaigns,
    x='cluster',
    y='count',
    color='campaign',
    barmode='group'
)