In [8]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


In [9]:
df_after_eda = pd.read_csv('df_after_eda.csv', index_col='Unnamed: 0')
df_selected = pd.read_csv('df_selectedvariables', index_col=0)
df_all = pd.read_csv('df_selected_all', index_col=0)

In [10]:
df_after_eda.head()

Unnamed: 0,ID,Education,GradorPost,Marital_Status,Relationship,Dt_Customer,days,Year_Birth,Age,Income,...,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,SumAcceptedCmp
0,5524,Graduation,Undergrad Degree,Single,Single,2012-04-09,971,1957,57,58138.0,...,4,7,0,0,0,0,0,0,1,1
1,2174,Graduation,Undergrad Degree,Single,Single,2014-08-03,125,1954,60,46344.0,...,2,5,0,0,0,0,0,0,0,0
2,4141,Graduation,Undergrad Degree,Together,Relationship,2013-08-21,472,1965,49,71613.0,...,10,4,0,0,0,0,0,0,0,0
3,6182,Graduation,Undergrad Degree,Together,Relationship,2014-10-02,65,1984,30,26646.0,...,4,6,0,0,0,0,0,0,0,0
4,5324,PhD,PostGrad Degree,Married,Relationship,2014-01-19,321,1981,33,58293.0,...,6,5,0,0,0,0,0,0,0,0


In [11]:
df_selected.head()

Unnamed: 0,GradorPost,Relationship,Age,days_enrolled,kidsteenHome,Income,Total_Spent,NumAllPurchases,AverageCheck,NumWebVisitsMonth,ShareDealsPurchases,SumAcceptedOffers
0,Undergrad Degree,Single,57,971,0,58138.0,1617,22,73.5,7,13.6,1
1,Undergrad Degree,Single,60,125,2,46344.0,27,4,6.8,5,50.0,0
2,Undergrad Degree,Relationship,49,472,0,71613.0,776,20,38.8,4,5.0,0
3,Undergrad Degree,Relationship,30,65,1,26646.0,53,6,8.8,6,33.3,0
4,PostGrad Degree,Relationship,33,321,1,58293.0,422,14,30.1,5,35.7,0


In [12]:
df_all.head()

Unnamed: 0,GradorPost,Relationship,Age,days,kidsteenHome,Income,MntTotal,NumAllPurchases,AverageCheck,NumWebVisitsMonth,...,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response
0,Undergrad Degree,Single,57,971,0,58138.0,1617,22,73.5,7,...,8,10,4,0,0,0,0,0,0,1
1,Undergrad Degree,Single,60,125,2,46344.0,27,4,6.8,5,...,1,1,2,0,0,0,0,0,0,0
2,Undergrad Degree,Relationship,49,472,0,71613.0,776,20,38.8,4,...,8,2,10,0,0,0,0,0,0,0
3,Undergrad Degree,Relationship,30,65,1,26646.0,53,6,8.8,6,...,2,0,4,0,0,0,0,0,0,0
4,PostGrad Degree,Relationship,33,321,1,58293.0,422,14,30.1,5,...,5,3,6,0,0,0,0,0,0,0


In [6]:
df2 = df.drop(columns=['ID', 'Dt_Customer', 'Year_Birth', 'Relationship', 'GradorPost'])

In [20]:
df2 = df_selected.copy()

In [25]:
df_dummies = pd.get_dummies(df_selected, drop_first=True)

In [26]:
df_dummies.head()

Unnamed: 0,Age,days_enrolled,kidsteenHome,Income,Total_Spent,NumAllPurchases,AverageCheck,NumWebVisitsMonth,ShareDealsPurchases,SumAcceptedOffers,GradorPost_Undergrad Degree,Relationship_Single
0,57,971,0,58138.0,1617,22,73.5,7,13.6,1,1,1
1,60,125,2,46344.0,27,4,6.8,5,50.0,0,1,1
2,49,472,0,71613.0,776,20,38.8,4,5.0,0,1,0
3,30,65,1,26646.0,53,6,8.8,6,33.3,0,1,0
4,33,321,1,58293.0,422,14,30.1,5,35.7,0,0,0


### Scaling

In [27]:
scaler = StandardScaler()
X = scaler.fit_transform(df_dummies.to_numpy())

### Clustering

In [67]:
from sklearn.cluster import AgglomerativeClustering

In [28]:
N_CLUSTERS_TO_TRY_OUT = range(1, 11)

In [68]:
models = []
for n_clusters in N_CLUSTERS_TO_TRY_OUT: # 1 to 10 clusters
    model = AgglomerativeClustering(n_clusters)
    model.fit(X)
    models.append(model)

In [79]:
pd.Series(models[4].labels_).value_counts()

0    692
2    590
1    444
3    324
4    182
dtype: int64

In [69]:
px.line(
    x=N_CLUSTERS_TO_TRY_OUT,
    y=list(map(lambda x: x.inertia_, models)),
    labels={'x': 'n_clusters', 'y': 'wcss'},
    title='Elbow Method'
)

AttributeError: 'AgglomerativeClustering' object has no attribute 'inertia_'

We´ll choose 5 clusters

In [80]:
FINAL_N_CLUSTERS = 4

In [82]:
#df2['cluster'] = models[FINAL_N_CLUSTERS-1].predict(X)
df2['cluster'] = models[FINAL_N_CLUSTERS-1].labels_

In [83]:
px.bar(
    df2['cluster'].value_counts(),
    y='cluster',
    labels={'index': 'cluster', 'cluster': '#'},
    title='Number of occurences per cluster'
)

In [84]:
features = ['Income', 'Age', 'NumAllPurchases', 'Total_Spent', 'AverageCheck']
features = list(df2.columns)

fig = make_subplots(
    len(features),
    1,
    subplot_titles=features
)

for i, feature in enumerate(features):

    fig.add_trace(
        go.Box(
            x=df2['cluster'],
            y=df2[feature],
            showlegend=False,
            name=feature
        ),
        row=i+1,
        col=1
    )


fig.update_layout(
    title='Cluster observation',
    height=2500,
    width=1000
)

fig.show()

In [16]:
campaign_columns = list(filter(lambda x: 'Accepted' in x, df2.columns))+['Response']
campaign_columns.remove('SumAcceptedCmp')
campaign_columns.sort()
campaign_columns

['AcceptedCmp1',
 'AcceptedCmp2',
 'AcceptedCmp3',
 'AcceptedCmp4',
 'AcceptedCmp5',
 'Response']

In [17]:
df2_melted_for_campaigns = pd.melt(
    df2,
    id_vars=['cluster'],
    value_vars=campaign_columns
)
df2_melted_for_campaigns = df2_melted_for_campaigns.loc[df2_melted_for_campaigns['value'] == 1]
df2_melted_for_campaigns = df2_melted_for_campaigns.drop(columns=['value'])
df2_melted_for_campaigns = df2_melted_for_campaigns.rename(columns={'variable': 'campaign'})
df2_melted_for_campaigns['campaign'] = df2_melted_for_campaigns['campaign'].replace({campaign_columns[i]: str(i) for i in range(len(campaign_columns))})
df2_melted_for_campaigns['count'] = 1
df2_melted_for_campaigns.groupby(['cluster', 'campaign']).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,count
cluster,campaign,Unnamed: 2_level_1
0,0,16
0,1,3
0,2,34
0,3,74
0,4,3
0,5,61
1,0,97
1,1,25
1,2,30
1,3,76


In [18]:
px.bar(
    df2_melted_for_campaigns,
    x='cluster',
    y='count',
    color='campaign',
    barmode='group'
)

In [19]:
num_columns = list(filter(lambda x: 'Num' in x, df2.columns))
num_columns.sort()
num_columns


['NumCatalogPurchases',
 'NumDealsPurchases',
 'NumStorePurchases',
 'NumWebPurchases',
 'NumWebVisitsMonth']

In [27]:
df_for_nums = df2.loc[:, ['cluster']+num_columns].groupby(['cluster']).sum().reset_index()
df_melted_for_nums = pd.melt(
    df_for_nums,
    id_vars=['cluster'],
    value_vars=num_columns
)
df_melted_for_nums

Unnamed: 0,cluster,variable,value
0,0,NumCatalogPurchases,1760
1,1,NumCatalogPurchases,954
2,2,NumCatalogPurchases,504
3,3,NumCatalogPurchases,2719
4,4,NumCatalogPurchases,26
5,0,NumDealsPurchases,2273
6,1,NumDealsPurchases,193
7,2,NumDealsPurchases,1954
8,3,NumDealsPurchases,691
9,4,NumDealsPurchases,97


In [29]:
px.bar(
    df_melted_for_nums,
    x='cluster',
    y='value',
    color='variable',
    barmode='group'
)

## Scatter 3D

In [85]:
df2.columns

Index(['GradorPost', 'Relationship', 'Age', 'days_enrolled', 'kidsteenHome',
       'Income', 'Total_Spent', 'NumAllPurchases', 'AverageCheck',
       'NumWebVisitsMonth', 'ShareDealsPurchases', 'SumAcceptedOffers',
       'cluster'],
      dtype='object')

In [86]:
variables_for_scatter_3d = ['AverageCheck', 'NumAllPurchases', 'SumAcceptedOffers']

In [87]:
fig_3d = px.scatter_3d(
    df2,
    x=variables_for_scatter_3d[0],
    y=variables_for_scatter_3d[1],
    z=variables_for_scatter_3d[2],
    color='cluster'
)

In [88]:
import dash
from dash import html, dcc

In [89]:
app = dash.Dash(__name__)

In [90]:
app.layout = html.Div(
    children=[
        html.H1('Prueba'),
        dcc.Graph(
            figure=fig_3d
        )
    ]
)

In [91]:
app.run_server()

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:8050
Press CTRL+C to quit
