<a href="https://colab.research.google.com/github/joanitolopo/customer-segmentation/blob/main/processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#!pip install kmodes

In [3]:
# import main library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import librries for clustering
from kmodes.kprototypes import KPrototypes
from sklearn.cluster import KMeans
from kmodes.kmodes import KModes
from tqdm.auto import tqdm

from sklearn.preprocessing import LabelEncoder, StandardScaler

# Processing Data

```Link Dataset: ``` https://www.kaggle.com/imakash3011/customer-personality-analysis

In [5]:
# You can download data in kaggle or juts clone data from my github repository
#!git clone https://github.com/joanitolopo/customer-segmentation.git

In [49]:
# After download, the data will appear in left-panel and saved in data folder
data = pd.read_csv("/content/customer-segmentation/data/marketing_campaign.csv", index_col=["ID"], delimiter="\t")
data.head()

Unnamed: 0_level_0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1
2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0
4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0
6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0
5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0


Aggregate Binary Columns

In [50]:
# Based on information above, for simplicity we will aggregate two columns, it was just a binary data, so we can count on it
accepted_list = (data.columns[data.columns.str.startswith("Accepted")]).tolist() + ["Response"]
num_purchase = (data.columns[data.columns.str.startswith("Num")]).tolist()
data["TotalAcceptedCmp"] = data[accepted_list].agg("sum", axis="columns")
data["NumTotalPurchases"] = data[num_purchase].agg("sum", axis="columns")
data=data.drop(columns=accepted_list)
data=data.drop(columns=num_purchase)

# After aggregate, dont forget to drop remain columns and also we will drop unuseful columns 
data = data.drop(columns=["Year_Birth", "Dt_Customer", "Z_CostContact", "Z_Revenue"])

Find null data and drop it

In [51]:
# Find null data in columns, and then drop rows that have low percent of null values
null_data_list = [data.columns[i] for i in range(len(data.columns)) if data.isna().sum()[i] != 0]
null_data = data[null_data_list].isna().sum()
print(f"Total of null data in each columns: \n{null_data/len(data)*100}\n")

Total of null data in each columns: 
Income    1.071429
dtype: float64



In [52]:
# Base on that, we just drop the null value with dropna method
data = data.dropna()

Make a segmentation for some of categorics

In [53]:
# We can make a short categoric for columns become First, Second, and Third
data["Education"] = data["Education"].replace(["Basic"], 'First')
data["Education"] = data["Education"].replace(["Graduation", "Master", "2n Cycle"], 'Second')
data["Education"] = data["Education"].replace(["PhD"], 'Third')

# We do same as before
data['Marital_Status'] = data['Marital_Status'].replace(['Married', 'Together'],'relationship')
data['Marital_Status'] = data['Marital_Status'].replace(['Divorced', 'Widow', 'Alone', 'YOLO', 'Absurd'],'Single')

In [54]:
data.head()

Unnamed: 0_level_0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,Complain,TotalAcceptedCmp,NumTotalPurchases
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
5524,Second,Single,58138.0,0,0,58,635,88,546,172,88,88,0,1,32
2174,Second,Single,46344.0,1,1,38,11,1,6,2,1,6,0,0,11
4141,Second,relationship,71613.0,0,0,26,426,49,127,111,21,42,0,0,25
6182,Second,relationship,26646.0,1,0,26,11,4,20,10,3,5,0,0,14
5324,Third,relationship,58293.0,1,0,94,173,43,118,46,27,15,0,0,24


In [56]:
data.to_csv("data_cleaned.csv")