In [1]:
# import required libraries
import pandas as pd
import numpy as np
import seaborn as sns

<!-- ### E-commerce Costumer Segmentation System - is an intelligent model that groups costumers into meaningful clusters. -->

In [2]:
df = pd.read_csv("smartcart_customers.csv")
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,172,88,88,3,8,10,4,7,0,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,2,1,6,2,1,1,2,5,0,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,111,21,42,1,8,2,10,4,0,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,10,3,5,2,2,0,4,6,0,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,46,27,15,5,5,3,6,5,0,0


In [3]:
## see missing values
df.isnull().sum()

ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
Complain                0
Response                0
dtype: int64

## Phase 1 -->> Data PreProcessing

## 1.1 Handling Missing Values :
### In Phase 1, we fill we handling the missing values, find the attributes with missing values and fill them accordingly.
### for int/float type attributes -->> fill missing values with either mean or median of the values of that attribute.
### for object(category type) type attributes -->> fill missing values with either mode of the values of that attribute.

In [5]:
# Attributes with Missing values --->>
# 1. Income -->> type - int/float -->> fill with mean or median

df["Income"] = df["Income"].fillna(df["Income"].median())

## Phase 1.2 : Feature Engineering

In [11]:
# "Year_Birth" - attribute have year value, from this attribute we will create a new attribute called Age, that defines costumers(users) age
df["Age"] = 2026-df["Year_Birth"]

In [51]:
# "Dt_Customer" - attribute have date-value  that defines that date of joining of the costumer on the platform, from this attribute, we will create a new attribute called Costumer_Tenure that defines number of days users have been joined in the platform.

# first we will convert the attribute to date type attribute 
df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"], dayfirst= True)
reference_data =  df["Dt_Customer"].max()

# create new feature (Costumer_Tenure)
df["Costumer_Tenure"] = (reference_data - df["Dt_Customer"]).dt.days

# total spending
df["total_spending"] = df["MntWines"] + df["MntFruits"] + df["MntMeatProducts"] + df["MntFishProducts"] + df["MntSweetProducts"] + df["MntGoldProds"]

# total children
df["total_child"] = df["Kidhome"] + df["Teenhome"] 

# Education
df["Education"] = df["Education"].replace({
    "Basic":"Undergraduate",
    "2n Cycle":"Undergraduate",
    "Graduation":"Graduate",
    "Master":"Postgraduate",
    "PhD":"Postgraduate",
})

# Marital Status
df["Living_With"] = df["Marital_Status"].replace({
    "Married":"Partner",
    "Together":"Partner",
    "Single":"Alone",
    "Divorced":"Alone",
    "Widow":"Alone",
    "Absurd":"Alone",
    "YOLO":"Alone",
})


## Phase 1.3 : Drop Columns

In [None]:
cols = ["ID", "Year_Birth", "Marital_Status", "Kidhome", "Teenhome", "Dt_Customer"]
purchase_cols = ["MntWines","MntFruits","MntFruits","MntFishProducts","MntSweetProducts","MntGoldProds"]

columns_to_drop = cols + purchase_cols 
df.drop(columns=columns_to_drop)

In [56]:
df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'Complain', 'Response', 'Age', 'Costumer_Tenure', 'total_spending',
       'total_child', 'Living_With'],
      dtype='object')

In [58]:
df.head()
# df.columns

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Response,Age,Costumer_Tenure,total_spending,total_child,Living_With
0,5524,1957,Graduate,Alone,58138.0,0,0,2012-09-04,58,635,...,10,4,7,0,1,69,663,1617,0,Alone
1,2174,1954,Graduate,Alone,46344.0,1,1,2014-03-08,38,11,...,1,2,5,0,0,72,113,27,2,Alone
2,4141,1965,Graduate,Partner,71613.0,0,0,2013-08-21,26,426,...,2,10,4,0,0,61,312,776,0,Partner
3,6182,1984,Graduate,Partner,26646.0,1,0,2014-02-10,26,11,...,0,4,6,0,0,42,139,53,1,Partner
4,5324,1981,Postgraduate,Partner,58293.0,1,0,2014-01-19,94,173,...,3,6,5,0,0,45,161,422,1,Partner
