# Marketing Case Study

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

import warnings
warnings.filterwarnings("ignore")

## Customer Segmentation

A key objective is to build a predictive model that lets the company maximize the profit of its next marketing campaign. This section supports that goal by grouping customers into segments with unsupervised clustering (KMeans).

In [3]:
# Load the processed dataset.
# The path is handed straight to pandas so that read_csv manages the file handle
# and decodes the file with its own default encoding (UTF-8) instead of the
# platform/locale default that a bare open(..., "r") would apply.
data = pd.read_csv("processed_data.csv")

# Preview the first rows.
data.head()

Unnamed: 0.1,Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,...,AcceptedCmp2,Response,Complain,Country,Age,Length_customer_yrs,Dependents,Total_Spent,total_purchases,Total_Accepted_Camp
0,0,Graduation,Divorced,84835.0,0,0,0,15.88,8.74,31.85,...,0,1,0,SP,52,8.03,0,1190,14,0
1,1,Graduation,Single,57091.0,0,0,0,80.42,0.87,11.09,...,1,1,0,CA,61,8.03,0,577,17,1
2,2,Graduation,Married,67267.0,0,1,0,53.39,4.38,23.51,...,0,0,0,US,64,8.12,1,251,10,0
3,3,Graduation,Together,32474.0,1,1,0,90.91,0.0,9.09,...,0,0,0,AUS,55,8.13,2,11,3,0
4,4,Graduation,Single,21474.0,1,0,0,6.59,17.58,26.37,...,0,1,0,SP,33,8.22,1,91,6,1


In [4]:
# Discard the "Unnamed: 0" column (presumably an index saved with the CSV) and "Response".
# NOTE(review): the saved outputs below still list 'Response', so they appear to
# predate this version of the cell — re-run the notebook to refresh them.
data = data.drop(columns=["Unnamed: 0", "Response"])

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,AcceptedCmp2,Response,Complain,Country,Age,Length_customer_yrs,Dependents,Total_Spent,total_purchases,Total_Accepted_Camp
0,Graduation,Divorced,84835.0,0,0,0,15.88,8.74,31.85,9.33,...,0,1,0,SP,52,8.03,0,1190,14,0
1,Graduation,Single,57091.0,0,0,0,80.42,0.87,11.09,1.21,...,1,1,0,CA,61,8.03,0,577,17,1
2,Graduation,Married,67267.0,0,1,0,53.39,4.38,23.51,5.98,...,0,0,0,US,64,8.12,1,251,10,0
3,Graduation,Together,32474.0,1,1,0,90.91,0.0,9.09,0.0,...,0,0,0,AUS,55,8.13,2,11,3,0
4,Graduation,Single,21474.0,1,0,0,6.59,17.58,26.37,12.09,...,0,1,0,SP,33,8.22,1,91,6,1


In [6]:
# List every remaining column name to choose the segmentation features from.
list(data.columns)

['Education',
 'Marital_Status',
 'Income',
 'Kidhome',
 'Teenhome',
 'Recency',
 'MntWines',
 'MntFruits',
 'MntMeatProducts',
 'MntFishProducts',
 'MntSweetProducts',
 'MntGoldProds',
 'NumDealsPurchases',
 'NumWebPurchases',
 'NumCatalogPurchases',
 'NumStorePurchases',
 'NumWebVisitsMonth',
 'AcceptedCmp3',
 'AcceptedCmp4',
 'AcceptedCmp5',
 'AcceptedCmp1',
 'AcceptedCmp2',
 'Response',
 'Complain',
 'Country',
 'Age',
 'Length_customer_yrs',
 'Dependents',
 'Total_Spent',
 'total_purchases',
 'Total_Accepted_Camp']

In [7]:
# Features used for customer segmentation.
# The KMeans algorithm does not perform well with binary data; the per-campaign
# AcceptedCmp* flags are not part of this list (presumably for that reason).
# NOTE(review): 'Complain' is kept and looks like a 0/1 flag as well — confirm intent.
seg_cols = [
    # customer profile
    "Education", "Marital_Status", "Income",
    # days since last purchase
    "Recency",
    # amount spent per product category
    "MntWines", "MntFruits", "MntMeatProducts",
    "MntFishProducts", "MntSweetProducts", "MntGoldProds",
    # purchase / browsing behaviour
    "NumDealsPurchases", "NumWebVisitsMonth",
    # remaining customer attributes
    "Complain", "Country", "Age", "Length_customer_yrs", "Dependents",
    # aggregate columns
    "Total_Spent", "total_purchases", "Total_Accepted_Camp",
]

In [8]:
# Build a separate frame holding only the segmentation features.
seg_df = data.loc[:, seg_cols].copy()

In [9]:
# Preview the first rows of the segmentation subset to confirm the column selection.
seg_df.head()

Unnamed: 0,Education,Marital_Status,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebVisitsMonth,Complain,Country,Age,Length_customer_yrs,Dependents,Total_Spent,total_purchases,Total_Accepted_Camp
0,Graduation,Divorced,84835.0,0,15.88,8.74,31.85,9.33,15.88,18.32,1,1,0,SP,52,8.03,0,1190,14,0
1,Graduation,Single,57091.0,0,80.42,0.87,11.09,1.21,0.0,6.41,1,5,0,CA,61,8.03,0,577,17,1
2,Graduation,Married,67267.0,0,53.39,4.38,23.51,5.98,0.8,11.95,1,2,0,US,64,8.12,1,251,10,0
3,Graduation,Together,32474.0,0,90.91,0.0,9.09,0.0,0.0,0.0,1,7,0,AUS,55,8.13,2,11,3,0
4,Graduation,Single,21474.0,0,6.59,17.58,26.37,12.09,0.0,37.36,2,7,0,SP,33,8.22,1,91,6,1


In [10]:
# Inspect dtypes and non-null counts; the object-dtype columns will need encoding.
seg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2190 entries, 0 to 2189
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Education            2190 non-null   object 
 1   Marital_Status       2190 non-null   object 
 2   Income               2190 non-null   float64
 3   Recency              2190 non-null   int64  
 4   MntWines             2190 non-null   float64
 5   MntFruits            2190 non-null   float64
 6   MntMeatProducts      2190 non-null   float64
 7   MntFishProducts      2190 non-null   float64
 8   MntSweetProducts     2190 non-null   float64
 9   MntGoldProds         2190 non-null   float64
 10  NumDealsPurchases    2190 non-null   int64  
 11  NumWebVisitsMonth    2190 non-null   int64  
 12  Complain             2190 non-null   int64  
 13  Country              2190 non-null   object 
 14  Age                  2190 non-null   int64  
 15  Length_customer_yrs  2190 non-null   f

In [11]:
# Release the full frame; the cells shown below work only with seg_df.
del data

## Pre-Processing Data

Performing customer segmentation using machine learning requires some preprocessing of the data:
- First, if the data are skewed, they need to be transformed.
- Next, there are a few categorical features that need to be encoded.
- Finally, the data are centered and scaled using scikit-learn's `StandardScaler`.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.cluster import KMeans

In [None]:
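# scale numeric data
# NOTE: kept for reference only; superseded by the ColumnTransformer pipeline below.
# StandardScaler cannot be fit on seg_df.values directly because seg_df still
# contains string (object) columns such as Education, Marital_Status and Country.
# scaler = StandardScaler().fit_transform(seg_df.values)
# scaled_df = pd.DataFrame(scaler, index=seg_df.index, columns=seg_df.columns)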

Use a scikit-learn `Pipeline` together with a `ColumnTransformer` to streamline the pre-processing that the data need before modeling.

Reference: [Column Transformer with Mixed Types (scikit-learn example)](https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py)

In [None]:
# Build the preprocessing step for unsupervised learning: numeric, nominal and
# ordinal columns of seg_df are each routed to their own transformer.

# Numeric features are standardised.
# ColumnTransformer expects column labels (or indices, a mask, or a callable) as
# the selector, so take the *names* of the numeric columns rather than passing
# the DataFrame returned by select_dtypes.
num_features = seg_df.select_dtypes(include=np.number).columns.to_list()
num_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

# Nominal (unordered) categorical features are one-hot encoded; a category not
# seen during fit is encoded as all zeros instead of raising an error.
nom_cat = ["Marital_Status", "Country"]
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")

# Ordered categorical feature.
# NOTE(review): with its default categories="auto", OrdinalEncoder assigns codes
# in sorted order of the labels, which may not match the real education
# ranking — consider passing an explicit `categories=[[...]]` list.
ord_cat = ["Education"]
ord_encoder = OrdinalEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("nom", one_hot_encoder, nom_cat),
        ("ord", ord_encoder, ord_cat),
    ]
)

In [None]:
# Wrap the preprocessing in an outer pipeline. Only the "preprocessor" step is
# present in this cell; no model step has been added to `steps` here.
clf = Pipeline(steps=[("preprocessor", preprocessor)])