1 - Import Relevant Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import pickle

2 - Load Data

In [2]:
df_purchase = pd.read_csv('purchase-data.csv')

In [3]:
df_purchase.head()

Unnamed: 0,ID,Day,Incidence,Brand,Quantity,Last_Inc_Brand,Last_Inc_Quantity,Price_1,Price_2,Price_3,...,Promotion_3,Promotion_4,Promotion_5,Sex,Marital status,Age,Education,Income,Occupation,Settlement size
0,200000001,1,0,0,0,0,0,1.59,1.87,2.01,...,0,0,0,0,0,47,1,110866,1,0
1,200000001,11,0,0,0,0,0,1.51,1.89,1.99,...,0,0,0,0,0,47,1,110866,1,0
2,200000001,12,0,0,0,0,0,1.51,1.89,1.99,...,0,0,0,0,0,47,1,110866,1,0
3,200000001,16,0,0,0,0,0,1.52,1.89,1.98,...,0,0,0,0,0,47,1,110866,1,0
4,200000001,18,0,0,0,0,0,1.52,1.89,1.99,...,0,0,0,0,0,47,1,110866,1,0


In [4]:
df_purchase.isnull().sum()

ID                   0
Day                  0
Incidence            0
Brand                0
Quantity             0
Last_Inc_Brand       0
Last_Inc_Quantity    0
Price_1              0
Price_2              0
Price_3              0
Price_4              0
Price_5              0
Promotion_1          0
Promotion_2          0
Promotion_3          0
Promotion_4          0
Promotion_5          0
Sex                  0
Marital status       0
Age                  0
Education            0
Income               0
Occupation           0
Settlement size      0
dtype: int64

3 - Data Segmentation - Import Pickle Module

In [5]:
scaler = pickle.load(open('scaler.pickle', 'rb'))
pca = pickle.load(open('pca.pickle', 'rb'))
kmeans_pca = pickle.load(open('kmeans_pca.pickle', 'rb'))

4 - Data Segmentation - Standardization

In [6]:
# Standardization of Age and Income
df_purchase_segm_std = df_purchase[['Sex', 'Marital status', 'Age', 'Education', 'Income', 'Occupation', 'Settlement size']]

In [7]:
df_purchase_segm_std

Unnamed: 0,Sex,Marital status,Age,Education,Income,Occupation,Settlement size
0,0,0,47,1,110866,1,0
1,0,0,47,1,110866,1,0
2,0,0,47,1,110866,1,0
3,0,0,47,1,110866,1,0
4,0,0,47,1,110866,1,0
...,...,...,...,...,...,...,...
58688,0,0,42,1,120946,1,0
58689,0,0,42,1,120946,1,0
58690,0,0,42,1,120946,1,0
58691,0,0,42,1,120946,1,0


In [8]:
features = df_purchase_segm_std[['Age', 'Income']]
unscaled_inputs = scaler.transform(features)

In [9]:
pd.DataFrame(unscaled_inputs)

Unnamed: 0,0,1
0,0.946616,-0.264793
1,0.946616,-0.264793
2,0.946616,-0.264793
3,0.946616,-0.264793
4,0.946616,-0.264793
...,...,...
58688,0.519866,-0.000221
58689,0.519866,-0.000221
58690,0.519866,-0.000221
58691,0.519866,-0.000221


In [10]:
df_purchase_segm_std['Age'] = unscaled_inputs[:,0]
df_purchase_segm_std['Income'] = unscaled_inputs[:,1]
df_purchase_segm_std

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_purchase_segm_std['Age'] = unscaled_inputs[:,0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_purchase_segm_std['Income'] = unscaled_inputs[:,1]


Unnamed: 0,Sex,Marital status,Age,Education,Income,Occupation,Settlement size
0,0,0,0.946616,1,-0.264793,1,0
1,0,0,0.946616,1,-0.264793,1,0
2,0,0,0.946616,1,-0.264793,1,0
3,0,0,0.946616,1,-0.264793,1,0
4,0,0,0.946616,1,-0.264793,1,0
...,...,...,...,...,...,...,...
58688,0,0,0.519866,1,-0.000221,1,0
58689,0,0,0.519866,1,-0.000221,1,0
58690,0,0,0.519866,1,-0.000221,1,0
58691,0,0,0.519866,1,-0.000221,1,0


5 - Data Segmentation - PCA

In [11]:
# We apply pca on the purchase data and obtain 3 principal components for each row in the table from 7 categories
df_purchase_segm_pca = pca.transform(df_purchase_segm_std)

In [12]:
pd.DataFrame(df_purchase_segm_pca)

Unnamed: 0,0,1,2
0,0.144515,0.939955,-0.672035
1,0.144515,0.939955,-0.672035
2,0.144515,0.939955,-0.672035
3,0.144515,0.939955,-0.672035
4,0.144515,0.939955,-0.672035
...,...,...,...
58688,0.107374,0.559186,-0.494754
58689,0.107374,0.559186,-0.494754
58690,0.107374,0.559186,-0.494754
58691,0.107374,0.559186,-0.494754


6 - Data Segmentation - K-Means PCA

In [13]:
# Based on the principal components, we use the predict method from pca to segment the purchase data into the four segments.
purchase_segm_kmeans_pca = kmeans_pca.predict(df_purchase_segm_pca)



In [14]:
purchase_segm_kmeans_pca

array([3, 3, 3, ..., 3, 3, 3])

7 - New Dataset

In [15]:
# Copy resulting data frame as a purchase predictors data frame.
# We'll be changing the predictors data frame, by adding and changing columns.
# Therefore, we want to keep an original copy.
df_purchase_predictors = df_purchase.copy()
df_purchase_predictors['Segment'] = purchase_segm_kmeans_pca
df_purchase_predictors

Unnamed: 0,ID,Day,Incidence,Brand,Quantity,Last_Inc_Brand,Last_Inc_Quantity,Price_1,Price_2,Price_3,...,Promotion_4,Promotion_5,Sex,Marital status,Age,Education,Income,Occupation,Settlement size,Segment
0,200000001,1,0,0,0,0,0,1.59,1.87,2.01,...,0,0,0,0,47,1,110866,1,0,3
1,200000001,11,0,0,0,0,0,1.51,1.89,1.99,...,0,0,0,0,47,1,110866,1,0,3
2,200000001,12,0,0,0,0,0,1.51,1.89,1.99,...,0,0,0,0,47,1,110866,1,0,3
3,200000001,16,0,0,0,0,0,1.52,1.89,1.98,...,0,0,0,0,47,1,110866,1,0,3
4,200000001,18,0,0,0,0,0,1.52,1.89,1.99,...,0,0,0,0,47,1,110866,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58688,200000500,703,0,0,0,2,1,1.41,1.85,2.01,...,0,0,0,0,42,1,120946,1,0,3
58689,200000500,710,0,0,0,0,0,1.36,1.84,2.09,...,0,0,0,0,42,1,120946,1,0,3
58690,200000500,717,0,0,0,0,0,1.50,1.80,2.14,...,0,0,0,0,42,1,120946,1,0,3
58691,200000500,722,1,2,3,0,0,1.51,1.82,2.09,...,0,0,0,0,42,1,120946,1,0,3


In [16]:
df_purchase_predictors.to_csv("customer-segmented.csv", sep=',', index=False)