# Customer Segmentation


In [275]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings

warnings.filterwarnings(action="ignore")

## Load the data


In [276]:
data = pd.read_csv("marketing_campaign.csv", sep="\t")
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


### PEOPLE

- ID: Customer's unique identifier.
- Year_Birth: Customer's birth year.
- Education: Customer's education level.
- Marital_Status: Customer's marital status.
- Income: Customer's yearly household income.
- Kidhome: Number of children in customer's household.
- Teenhome: Number of teenagers in customer's household.
- Dt_Customer: Date of customer's enrollment with the company.
- Recency: Number of days since customer's last purchase.
- Complain: 1 if customer complained in the last 2 years, 0 otherwise.

### PRODUCTS

- MntWines: Amount spent on wine in last 2 years.
- MntFruits: Amount spent on fruits in last 2 years.
- MntMeatProducts: Amount spent on meat in last 2 years.
- MntFishProducts: Amount spent on fish in last 2 years.
- MntSweetProducts: Amount spent on sweets in last 2 years.
- MntGoldProds: Amount spent on gold in last 2 years.

### PROMOTION

- NumDealsPurchases: Number of purchases made with a discount.
- AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise.
- AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise.
- AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise.
- AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise.
- AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise.
- Response: 1 if customer accepted the offer in the last campaign, 0 otherwise.

### PLACE

- NumWebPurchases: Number of purchases made through the company’s web site.
- NumCatalogPurchases: Number of purchases made using a catalogue.
- NumStorePurchases: Number of purchases made directly in stores.
- NumWebVisitsMonth: Number of visits to company’s web site in the last month.


In [277]:
print(f"The Shape of the DataFrame is {data.shape}")

The Shape of the DataFrame is (2240, 29)


In [278]:
print(f"Summary of data")
data.info(memory_usage="deep")

Summary of data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2

In [279]:
print("Basic Statistics")
data.describe()

Basic Statistics


Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
count,2240.0,2240.0,2216.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,...,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0
mean,5592.159821,1968.805804,52247.251354,0.444196,0.50625,49.109375,303.935714,26.302232,166.95,37.525446,...,5.316518,0.072768,0.074554,0.072768,0.064286,0.013393,0.009375,3.0,11.0,0.149107
std,3246.662198,11.984069,25173.076661,0.538398,0.544538,28.962453,336.597393,39.773434,225.715373,54.628979,...,2.426645,0.259813,0.262728,0.259813,0.245316,0.114976,0.096391,0.0,0.0,0.356274
min,0.0,1893.0,1730.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
25%,2828.25,1959.0,35303.0,0.0,0.0,24.0,23.75,1.0,16.0,3.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
50%,5458.5,1970.0,51381.5,0.0,0.0,49.0,173.5,8.0,67.0,12.0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
75%,8427.75,1977.0,68522.0,1.0,1.0,74.0,504.25,33.0,232.0,50.0,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
max,11191.0,1996.0,666666.0,2.0,2.0,99.0,1493.0,199.0,1725.0,259.0,...,20.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,11.0,1.0


### Missing Values


In [280]:
print("Checking for missing values")
missing = data.isna().sum().sort_values(ascending=False)
missing

Checking for missing values


Income                 24
ID                      0
NumDealsPurchases       0
Z_Revenue               0
Z_CostContact           0
Complain                0
AcceptedCmp2            0
AcceptedCmp1            0
AcceptedCmp5            0
AcceptedCmp4            0
AcceptedCmp3            0
NumWebVisitsMonth       0
NumStorePurchases       0
NumCatalogPurchases     0
NumWebPurchases         0
MntGoldProds            0
Year_Birth              0
MntSweetProducts        0
MntFishProducts         0
MntMeatProducts         0
MntFruits               0
MntWines                0
Recency                 0
Dt_Customer             0
Teenhome                0
Kidhome                 0
Marital_Status          0
Education               0
Response                0
dtype: int64

In [281]:
print("Percentage of missing values")
round(((missing / data.isna().count()) * 100).sort_values(ascending=False), 3)

Percentage of missing values


Income                 1.071
AcceptedCmp1           0.000
MntMeatProducts        0.000
Z_CostContact          0.000
Year_Birth             0.000
Teenhome               0.000
Response               0.000
Recency                0.000
NumWebVisitsMonth      0.000
NumWebPurchases        0.000
NumStorePurchases      0.000
NumDealsPurchases      0.000
NumCatalogPurchases    0.000
MntWines               0.000
MntSweetProducts       0.000
MntGoldProds           0.000
AcceptedCmp2           0.000
MntFruits              0.000
MntFishProducts        0.000
Marital_Status         0.000
Kidhome                0.000
ID                     0.000
Education              0.000
Dt_Customer            0.000
Complain               0.000
AcceptedCmp5           0.000
AcceptedCmp4           0.000
AcceptedCmp3           0.000
Z_Revenue              0.000
dtype: float64

The missing data is present in Income column and it is only 1%
We can either remove it or fill it


In [282]:
data["Income"] = data["Income"].fillna(data["Income"].median())

In [283]:
data.isna().any()

ID                     False
Year_Birth             False
Education              False
Marital_Status         False
Income                 False
Kidhome                False
Teenhome               False
Dt_Customer            False
Recency                False
MntWines               False
MntFruits              False
MntMeatProducts        False
MntFishProducts        False
MntSweetProducts       False
MntGoldProds           False
NumDealsPurchases      False
NumWebPurchases        False
NumCatalogPurchases    False
NumStorePurchases      False
NumWebVisitsMonth      False
AcceptedCmp3           False
AcceptedCmp4           False
AcceptedCmp5           False
AcceptedCmp1           False
AcceptedCmp2           False
Complain               False
Z_CostContact          False
Z_Revenue              False
Response               False
dtype: bool

Now we have sucessfully removed missing values


### Duplicated Values


In [284]:
data.duplicated().sum()

0

No duplicates found


### Unique values in each feature


In [285]:
data.nunique().sort_values(ascending=True)

Z_Revenue                 1
Z_CostContact             1
Response                  2
AcceptedCmp3              2
AcceptedCmp4              2
AcceptedCmp2              2
Complain                  2
AcceptedCmp1              2
AcceptedCmp5              2
Kidhome                   3
Teenhome                  3
Education                 5
Marital_Status            8
NumCatalogPurchases      14
NumStorePurchases        14
NumDealsPurchases        15
NumWebPurchases          15
NumWebVisitsMonth        16
Year_Birth               59
Recency                 100
MntFruits               158
MntSweetProducts        177
MntFishProducts         182
MntGoldProds            213
MntMeatProducts         558
Dt_Customer             663
MntWines                776
Income                 1975
ID                     2240
dtype: int64

- Here Z_Revenue and Z_CostContact has only 1 values , So having them is useless
- Columns with 2 unique values are mostly binaries
- The only categorical values are Education and Marital Status , so we try to reduce no of unique values in them
- All the other columns are numerical values


In [286]:
data = data.drop(columns=["Z_Revenue", "Z_CostContact"], axis=1)
data.shape

(2240, 27)

In [287]:
data.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response'],
      dtype='object')

## Univariate Analysis


#### Year_Birth


In [288]:
data["Year_Birth"].nunique()

59

In [289]:
fig = px.histogram(data_frame=data, x=data["Year_Birth"], text_auto=True)
fig.show()

We can clearly see there are some outliers


In [290]:
fig = px.box(data_frame=data, x=data["Year_Birth"], title="Year of Birth")
fig.show()

#### Education


In [291]:
print("The number of unique values are ", data["Education"].nunique())
data["Education"].unique()

The number of unique values are  5


array(['Graduation', 'PhD', 'Master', 'Basic', '2n Cycle'], dtype=object)

In [292]:
fig = px.histogram(
    data_frame=data,
    x=data["Education"],
    title="Education before reducing",
    text_auto=True,
)
fig.show()

There are 5 category and we can try to reduce it


In [293]:
data["Education"] = data["Education"].replace(
    ["PhD", "Master", "2n Cycle", "Graduation"], "Post Graduate"
)
data["Education"] = data["Education"].replace("Basic", "Under Graduate")

In [294]:
fig = px.histogram(
    data_frame=data,
    x=data["Education"],
    title="Education after reducing",
    text_auto=True,
)
fig.show()

In [295]:
ug_percent = (
    round(
        data["Education"].value_counts()[1] / data["Education"].value_counts().sum(), 4
    )
    * 100
)
pg_percent = (
    round(
        data["Education"].value_counts()[0] / data["Education"].value_counts().sum(), 4
    )
    * 100
)

print(f"The percentage of Undergraduates = {ug_percent}%")
print(f"The percentage of Postgraduates = {pg_percent}%")

The percentage of Undergraduates = 2.41%
The percentage of Postgraduates = 97.59%


Majority of the customers are PostGraduates


#### Marital Status


In [296]:
print(
    f"The number of unique values in Marital Status = {data['Marital_Status'].nunique()}"
)
data["Marital_Status"].unique()

The number of unique values in Marital Status = 8


array(['Single', 'Together', 'Married', 'Divorced', 'Widow', 'Alone',
       'Absurd', 'YOLO'], dtype=object)

We can reduce the no of unique values


In [297]:
fig = px.histogram(
    data_frame=data,
    x=data["Marital_Status"],
    title="Marital Status before reducing",
    text_auto=True,
)
fig.show()

In [298]:
data["Marital_Status"] = data["Marital_Status"].replace(
    ["Single", "Divorced", "Alone", "YOLO", "Absurd", "Widow"], "Single"
)
data["Marital_Status"] = data["Marital_Status"].replace(
    ["Together", "Married"], "Relationship"
)

In [299]:
fig = px.histogram(
    data_frame=data,
    x=data["Marital_Status"],
    title="Marital Status after reducing",
    text_auto=True,
)
fig.show()

In [300]:
ug_percent = (
    round(
        data["Marital_Status"].value_counts()[1]
        / data["Marital_Status"].value_counts().sum(),
        4,
    )
    * 100
)
pg_percent = (
    round(
        data["Marital_Status"].value_counts()[0]
        / data["Marital_Status"].value_counts().sum(),
        4,
    )
    * 100
)

print(f"The percentage of people who are Single = {ug_percent}%")
print(f"The percentage of people who are in a Relationship = {pg_percent}%")

The percentage of people who are Single = 35.54%
The percentage of people who are in a Relationship = 64.46%


#### Income


In [301]:
fig = px.histogram(data_frame=data, x=data["Income"])
fig.show()

In [302]:
fig = ff.create_distplot(
    hist_data=[data["Income"]],
    group_labels=["Income"],
    show_hist=False,
    curve_type="kde",
    show_rug=False,
)
fig.update_layout(title="Income")
fig.show()

In [303]:
fig = px.box(data_frame=data, x=data["Income"], title="Income")
fig.show()

We seems to have some outliers


#### Kidhome and Teenhome


In [304]:
data["Kidhome"].value_counts()

Kidhome
0    1293
1     899
2      48
Name: count, dtype: int64

In [305]:
data["Teenhome"].value_counts()

Teenhome
0    1158
1    1030
2      52
Name: count, dtype: int64

In [306]:
data["Children"] = data["Kidhome"] + data["Teenhome"]

In [307]:
fig = px.histogram(
    data_frame=data, x=data["Children"], title="No of Children", text_auto=True
)
fig.update_layout(bargap=0.2)
fig.show()

In [308]:
for i in range(0, 4):
    percent = round(
        data["Children"].value_counts()[i]
        / data["Children"].value_counts().sum()
        * 100,
        2,
    )
    print(f"Percentage of people with {i} children {percent} %")

Percentage of people with 0 children 28.48 %
Percentage of people with 1 children 50.36 %
Percentage of people with 2 children 18.79 %
Percentage of people with 3 children 2.37 %


#### Spendings


In [309]:
data.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response', 'Children'],
      dtype='object')

In [310]:
data["TotalSpending"] = (
    data["MntFishProducts"]
    + data["MntFruits"]
    + data["MntGoldProds"]
    + data["MntMeatProducts"]
    + data["MntSweetProducts"]
    + data["MntWines"]
)

In [311]:
fig = ff.create_distplot(
    [data["TotalSpending"]], group_labels=["Total Spending"], show_rug=False
)
fig.update_layout(title="Total Spending", height=500)
fig.show()

In [312]:
fig = px.box(
    data_frame=data, x=data["TotalSpending"], title="Total Spending", points="all"
)
fig.show()

#### Accepted Complaints


In [313]:
data["TotalAcceptedCmp"] = (
    data["AcceptedCmp1"]
    + data["AcceptedCmp2"]
    + data["AcceptedCmp3"]
    + data["AcceptedCmp4"]
    + data["AcceptedCmp5"]
)

In [314]:
fig = px.histogram(
    data_frame=data,
    x=data["TotalAcceptedCmp"],
    title="Total number of accepted campaign",
    text_auto=True,
)
fig.update_layout(bargap=0.2)
fig.show()

In [315]:
for i in range(0, 5):
    percent = round(
        data["TotalAcceptedCmp"].value_counts()[i]
        / data["TotalAcceptedCmp"].value_counts().sum()
        * 100,
        2,
    )
    print(f"Percentage of people that accepted campaign {i} = {percent} %")

Percentage of people that accepted campaign 0 = 79.33 %
Percentage of people that accepted campaign 1 = 14.51 %
Percentage of people that accepted campaign 2 = 3.71 %
Percentage of people that accepted campaign 3 = 1.96 %
Percentage of people that accepted campaign 4 = 0.49 %


#### Number of Purchases


In [316]:
data.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response', 'Children', 'TotalSpending',
       'TotalAcceptedCmp'],
      dtype='object')

In [317]:
data["TotalPurchases"] = np.sum(
    data.loc[
        :,
        [
            "NumDealsPurchases",
            "NumWebPurchases",
            "NumCatalogPurchases",
            "NumStorePurchases",
        ],
    ],
    axis=1,
)

In [318]:
fig = ff.create_distplot(
    [data["TotalPurchases"]], group_labels=["TotalPurchases"], show_rug=False
)
fig.update_layout(title="Total Purchases")
fig.show()

## Preprocessing


#### Removing unnecessary columns


In [321]:
unwanted_columns = [
    "ID",
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
    "NumDealsPurchases",
    "NumWebPurchases",
    "NumCatalogPurchases",
    "NumStorePurchases",
    "NumWebVisitsMonth",
    "AcceptedCmp3",
    "AcceptedCmp4",
    "AcceptedCmp5",
    "AcceptedCmp1",
    "AcceptedCmp2",
    "Kidhome",
    "Teenhome",
]

In [322]:
data = data.drop(unwanted_columns, axis=1)

In [323]:
print("Remaining columns are")
for i in data.columns:
    print(i)

Remaining columns are
Year_Birth
Education
Marital_Status
Income
Dt_Customer
Recency
Complain
Response
Children
TotalSpending
TotalAcceptedCmp
TotalPurchases


#### Converting Dt_Customer to correct datatype


In [325]:
data["Dt_Customer"].dtype
data["Dt_Customer"] = pd.to_datetime(data["Dt_Customer"], format="%d-%m-%Y")

dtype('O')

In [329]:
data["Dt_Customer"].dtype  ## Converted to DateTime datatype

dtype('<M8[ns]')

#### Creating DaysEngaged columns


In [356]:
data["Days_Engaged"] = (pd.Timestamp("now") - data["Dt_Customer"]).dt.days  # type: ignore

#### Creating Age column


In [361]:
data["Age"] = np.floor(data["Days_Engaged"] / 365)

In [373]:
data["Age"].value_counts()

Age
10.0    1161
11.0     591
9.0      488
Name: count, dtype: int64

In [375]:
fig = px.histogram(data_frame=data, x=data["Age"], title="Customer Age", text_auto=True)
fig.update_layout(bargap=0.2)
fig.show()

#### Removing unwanted columns


In [377]:
unwanted_columns = ["Year_Birth", "Dt_Customer", "Recency", "Complain", "Response"]
data = data.drop(unwanted_columns, axis=1)

In [378]:
data.columns.values

array(['Education', 'Marital_Status', 'Income', 'Children',
       'TotalSpending', 'TotalAcceptedCmp', 'TotalPurchases',
       'Days_Engaged', 'Age'], dtype=object)

## Bivariate Analysis


#### Education and Income


In [383]:
fig = px.histogram(
    data_frame=data, x=data["Income"], color=data["Education"], barmode="overlay"
)
fig.show()

Undergraduates have very low Income


#### Education vs Expenses


In [397]:
fig = px.histogram(
    data_frame=data, x=data["TotalSpending"], color=data["Education"], barmode="overlay"
)
fig.show()

#### Marital Status and Income


In [401]:
fig = px.histogram(
    data_frame=data, x=data["Income"], color=data["Marital_Status"], barmode="group"
)
fig.show()