In [35]:
# Numerical packages
import numpy as np
import pandas as pd 

# Graphing
import plotly.express as px
# Clustering & PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans 

# Machine Learning
import pycaret.classification as clf

## 1.0 Load Data

In [26]:
# Load the cleaned marketing dataset from the local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source (i.e. the project's own preprocessing step).
data = pd.read_pickle('data/marketing_dataset_cleaned.pkl')
# Preview the first rows of the frame.
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,Duration,Age,Total_Campaigns,Frequency,Monetary,PropWines,PropFruits,PropMeatProducts,PropFishProducts,PropSweetProducts,PropGoldProds
0,5524,1957,2,1,58138.0,0,0,2012-09-04,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,1,663,57,0,25,1617,0.392703,0.054422,0.337662,0.10637,0.054422,0.054422
1,2174,1954,2,1,46344.0,1,1,2014-03-08,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,0,113,60,0,6,27,0.407407,0.037037,0.222222,0.074074,0.037037,0.222222
2,4141,1965,2,2,71613.0,0,0,2013-08-21,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,0,312,49,0,21,776,0.548969,0.063144,0.16366,0.143041,0.027062,0.054124
3,6182,1984,2,2,26646.0,1,0,2014-02-10,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,0,139,30,0,8,53,0.207547,0.075472,0.377358,0.188679,0.056604,0.09434
4,5324,1981,4,2,58293.0,1,0,2014-01-19,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,0,161,33,0,19,422,0.409953,0.101896,0.279621,0.109005,0.063981,0.035545


In [18]:
# Print the column names, dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2212 entries, 0 to 2239
Data columns (total 38 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   2212 non-null   int64         
 1   Year_Birth           2212 non-null   int64         
 2   Education            2212 non-null   int32         
 3   Marital_Status       2212 non-null   int32         
 4   Income               2212 non-null   float64       
 5   Kidhome              2212 non-null   int64         
 6   Teenhome             2212 non-null   int64         
 7   Dt_Customer          2212 non-null   datetime64[ns]
 8   Recency              2212 non-null   int64         
 9   MntWines             2212 non-null   int64         
 10  MntFruits            2212 non-null   int64         
 11  MntMeatProducts      2212 non-null   int64         
 12  MntFishProducts      2212 non-null   int64         
 13  MntSweetProducts     2212 non-null   i

The data has already been cleaned and pre-processed in a previous file.  

In [27]:
# Before dropping anything, measure how strongly the columns are correlated:
# a maximum |r| close to 1 means at least one column duplicates another.
corr_data = data.corr()
# Zero the diagonal (every column correlates perfectly with itself) so it does
# not hide the largest pairwise value. The mask is sized from corr_data itself
# rather than from data.columns: corr() can return fewer columns than the frame
# holds (e.g. when non-numeric columns are excluded), and indexing the diagonal
# by len(data.columns) would then run past the end of the matrix.
is_diagonal = np.eye(len(corr_data), dtype=bool)
corr_data = corr_data.mask(is_diagonal, 0.0)
# Find the max correlation value
print(f"The maximum absolute correlation value is {corr_data.abs().max().max()}.")

The maximum absolute correlation value is 1.000000000000001.


In [33]:
# Drop the identifier column plus two columns that appear to be duplicated by
# derived features (presumably Dt_Customer -> Duration and Year_Birth -> Age;
# confirm against the preprocessing notebook).
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
data = data.drop(columns=['ID', 'Dt_Customer', 'Year_Birth'], errors='ignore')
corr_data = data.corr()
# Zero the diagonal (self-correlation is always 1) so it does not dominate the
# maximum. The mask is built from corr_data's own size, not data.columns, so it
# stays valid even if corr() returns fewer columns than the frame has.
corr_data = corr_data.mask(np.eye(len(corr_data), dtype=bool), 0.0)
print(f"The maximum absolute correlation value is {corr_data.abs().max().max()}.")

The maximum absolute correlation value is 0.8929961009357751.


In [39]:
# Heatmap of the pairwise feature correlations held in corr_data
# (diagonal already zeroed by the previous cell).
px.imshow(
    corr_data,
    title="Feature Correlations",
    template='plotly_dark',
    height=1000,
    width=1000,
)

## 2.0 CLUSTERING FOR CUSTOMER SEGMENTATION
Note: all remaining columns are numeric and the dataset has only about 2,200 rows, so no additional feature encoding or down-sampling for computational reasons is needed.
- 2.1 Apply a Standard Scaler for the data to be used later in feature reduction and a clustering algorithm
- 2.2 Apply a clustering algorithm (K-Means)
- 2.3 Apply a clustering algorithm (K-Means) after feature reduction (PCA)

In [40]:
scaler = StandardScaler()

# Standardise every column and wrap the result back into a DataFrame.
# fit_transform returns a bare array, so both the column names and the row
# index are re-attached explicitly. Without index=data.index the scaled frame
# would get a fresh 0..n-1 index, while `data` keeps its original labels
# (which have gaps after cleaning), and any later label-based join or concat
# between the two frames would silently misalign rows.
df = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
df.head()

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,Duration,Age,Total_Campaigns,Frequency,Monetary,PropWines,PropFruits,PropMeatProducts,PropFishProducts,PropSweetProducts,PropGoldProds
0,-0.45885,-1.418411,0.287105,-0.822754,-0.929699,0.310353,0.97766,1.552041,1.690293,2.453472,1.483713,0.852576,0.35103,1.426865,2.503607,-0.555814,0.692181,-0.282048,-0.282981,-0.280175,-0.261914,-0.117256,-0.09552,2.375425,1.527721,1.018352,-0.43916,1.317945,1.676245,-0.290121,0.088947,0.70429,0.445503,0.059596,-0.602949
1,-0.45885,-1.418411,-0.260882,1.040021,0.908097,-0.380813,-0.872618,-0.637461,-0.71823,-0.651004,-0.634019,-0.733642,-0.168701,-1.12642,-0.57134,-1.17116,-0.132545,-0.282048,-0.282981,-0.280175,-0.261914,-0.117256,-0.09552,-0.420977,-1.189011,1.274785,-0.43916,-1.159273,-0.963297,-0.225781,-0.222814,-0.213734,0.031554,-0.225676,0.939234
2,-0.45885,0.062951,0.913196,-0.822754,-0.929699,-0.795514,0.357935,0.57054,-0.178542,1.339513,-0.147184,-0.037254,-0.688432,1.426865,-0.229679,1.290224,-0.544908,-0.282048,-0.282981,-0.280175,-0.261914,-0.117256,-0.09552,-0.420977,-0.206048,0.33453,-0.43916,0.796425,0.28011,0.393603,0.245369,-0.679444,0.915537,-0.389362,-0.605688
3,-0.45885,0.062951,-1.176114,1.040021,-0.929699,-0.795514,-0.872618,-0.561961,-0.655787,-0.504911,-0.585335,-0.752987,-0.168701,-0.761665,-0.913,-0.555814,0.279818,-0.282048,-0.282981,-0.280175,-0.261914,-0.117256,-0.09552,-0.420977,-1.060584,-1.289547,-0.43916,-0.898513,-0.920135,-1.100244,0.466436,1.019969,1.500499,0.095402,-0.236081
4,1.533251,0.062951,0.294307,1.040021,-0.929699,1.554453,-0.392257,0.41954,-0.218684,0.152508,-0.001133,-0.559545,1.390492,0.3326,0.111982,0.059532,-0.132545,-0.282048,-0.282981,-0.280175,-0.261914,-0.117256,-0.09552,-0.420977,-0.951915,-1.033114,-0.43916,0.535666,-0.307562,-0.214645,0.940298,0.242722,0.479276,0.216458,-0.776437


In [41]:
# Number of feature columns in the scaled frame.
df.shape[1]

35

In [43]:
# Fit K-Means with 4 clusters and a fixed seed on the scaled features.
# fit() returns the estimator itself, so construction and fitting are chained.
model_kmeans = KMeans(random_state=357, n_clusters=4).fit(df)
labels = model_kmeans.labels_

# Attach the cluster id to the unscaled rows as a categorical column.
# The labels are assigned positionally, so this relies on df and data holding
# the same rows in the same order.
df_kmeans = data.assign(label=pd.Categorical(labels))

In [44]:
# Scatter of Monetary against Income, coloured by K-Means cluster.
# Rows with Income of 600000 or more are excluded from the plot only;
# df_kmeans itself is left unchanged.
plot_kmeans = px.scatter(
    df_kmeans.query("Income < 600000"),
    title="K-MEANS ONLY",
    template="plotly_dark",
    x='Monetary',
    y='Income',
    color="label",
    opacity=0.4,
)
plot_kmeans