In [24]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from lazypredict.Supervised import LazyClassifier
from sklearn.cluster import KMeans

from utils import *

# Customer segmentation (RFM analysis)

Customers who use your platform have different needs and they have their own different profile. Your should adapt your actions depending on that.
You can do many different segmentations according to what you are trying to achieve. If you want to increase retention rate, you can do a segmentation based on churn probability and take actions. But there are very common and useful segmentation methods as well. Now we are going to implement one of them to our business: RFM.
RFM stands for Recency - Frequency - Monetary Value

Just like before, we'll be using the datasets from Group 1 as the basis for our visualizations and analyses.

In [25]:
# Group 1:
# items1 = pd.read_csv('data/Created in part 01/group1_items.csv', index_col='Invoice', parse_dates=['InvoiceDate'])
invoices = pd.read_csv('../data/Created in part 01/group1_invoices.csv', index_col='Invoice', parse_dates=['InvoiceDate'])

In [26]:
invoices = (
    invoices
    .pipe(adjust_time_window)
    .pipe(normalize_invoicedate)
    .pipe(clean_customer_id)
)

invoices.head(3)

Unnamed: 0_level_0,Quantity,Price,Customer ID,InvoiceDate
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
496349,228,65.51,14739,2010-01-02
496351,79,80.05,14370,2010-01-02
496354,98,25.61,12810,2010-01-02


- # Recency

In [27]:
invoices_by_user = pd.DataFrame({'CustomerID': invoices['Customer ID'].unique()})

In [28]:
invoices_max_date = (
    invoices
    .groupby('Customer ID')
    ['InvoiceDate']
    .max()
    .rename('MaxPurchase')
)

In [29]:
invoices_by_user = invoices_by_user.merge(invoices_max_date, left_on='CustomerID', right_on='Customer ID')

invoices_by_user.head()

Unnamed: 0,CustomerID,MaxPurchase
0,14739,2010-10-31
1,14370,2010-09-15
2,12810,2010-06-23
3,16684,2010-11-25
4,14047,2010-08-17


In [30]:
invoices_by_user['Recency'] = (invoices_by_user['MaxPurchase'].max() - invoices_by_user['MaxPurchase']).dt.days

In [31]:
px.histogram(data_frame=invoices_by_user, x='Recency')

2 ways to do:

In [32]:
wccs={}
df_for_clusters = invoices_by_user[['Recency']]
for i in range(1, 12):
    kmeans = KMeans(n_clusters=i, max_iter=500).fit(df_for_clusters)
    df_for_clusters["Clusters"] = kmeans.labels_
    wccs[i] = kmeans.inertia_ 

px.line(x=wccs.keys(), y=wccs.values())

In [33]:
# give credits to https://jtemporal.com/kmeans-and-elbow-method/
def calculate_wcss(data):
    wcss = []
    for n in range(2, 21):
        kmeans = KMeans(n_clusters=n)
        kmeans.fit(X=data)
        wcss.append(kmeans.inertia_)
    return wcss

def optimal_number_of_clusters(wcss):
    x1, y1 = 2, wcss[0]
    x2, y2 = 20, wcss[len(wcss)-1]
    distances = []
    for i in range(len(wcss)):
        x0 = i+2
        y0 = wcss[i]
        numerator = np.abs((y2-y1)*x0 - (x2-x1)*y0 + x2*y1 - y2*x1)
        denominator = np.sqrt((y2 - y1)**2 + (x2 - x1)**2)
        distances.append(numerator/denominator)
    return distances.index(max(distances)) + 2

In [34]:
optimal_number_of_clusters(calculate_wcss(invoices_by_user['Recency'].values.reshape(-1,1)))

5

Therefore, clusters = 5

In [35]:
kmeans = KMeans(n_clusters=5)
kmeans.fit(invoices_by_user[['Recency']])
invoices_by_user['RecencyCluster'] = kmeans.predict(invoices_by_user[['Recency']])

In [36]:
invoices_by_user.groupby('RecencyCluster').describe()

Unnamed: 0_level_0,Recency,Recency,Recency,Recency,Recency,Recency,Recency,Recency
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
RecencyCluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,453.0,202.32,22.24,165.0,186.0,204.0,221.0,242.0
1,1656.0,14.14,11.19,0.0,5.0,12.0,23.0,37.0
2,621.0,126.62,20.32,95.0,110.0,125.0,144.0,163.0
3,385.0,281.86,26.24,244.0,259.0,279.0,303.0,332.0
4,1005.0,60.57,15.66,39.0,46.0,61.0,74.0,93.0


# REVIEW, cluster number changes on every func call

In [37]:
invoices_by_user['RecencyCluster'] = invoices_by_user['RecencyCluster'] + 5

In [38]:
invoices_by_user.groupby('RecencyCluster').describe()

Unnamed: 0_level_0,Recency,Recency,Recency,Recency,Recency,Recency,Recency,Recency
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
RecencyCluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
5,453.0,202.32,22.24,165.0,186.0,204.0,221.0,242.0
6,1656.0,14.14,11.19,0.0,5.0,12.0,23.0,37.0
7,621.0,126.62,20.32,95.0,110.0,125.0,144.0,163.0
8,385.0,281.86,26.24,244.0,259.0,279.0,303.0,332.0
9,1005.0,60.57,15.66,39.0,46.0,61.0,74.0,93.0


- 9 should become 0
- 5 should become 1
- 7 should become 2
- 6 should become 3
- 8 should become 4

In [39]:
invoices_by_user['RecencyCluster'] = invoices_by_user['RecencyCluster'].replace([9, 5, 7, 6, 8], [0, 1, 2, 3, 4])
invoices_by_user.groupby('RecencyCluster').describe()

Unnamed: 0_level_0,Recency,Recency,Recency,Recency,Recency,Recency,Recency,Recency
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
RecencyCluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,1005.0,60.57,15.66,39.0,46.0,61.0,74.0,93.0
1,453.0,202.32,22.24,165.0,186.0,204.0,221.0,242.0
2,621.0,126.62,20.32,95.0,110.0,125.0,144.0,163.0
3,1656.0,14.14,11.19,0.0,5.0,12.0,23.0,37.0
4,385.0,281.86,26.24,244.0,259.0,279.0,303.0,332.0


In [40]:
invoices_by_user

Unnamed: 0,CustomerID,MaxPurchase,Recency,RecencyCluster
0,14739,2010-10-31,30,3
1,14370,2010-09-15,76,0
2,12810,2010-06-23,160,2
3,16684,2010-11-25,5,3
4,14047,2010-08-17,105,2
...,...,...,...,...
4115,15939,2010-11-30,0,3
4116,17826,2010-11-30,0,3
4117,16473,2010-11-30,0,3
4118,17820,2010-11-30,0,3


- # Frequency

In [41]:
frequency_to_merge = (
    invoices
    .groupby('Customer ID')
    ['InvoiceDate']
    .count()
    .rename('Frequency')
)

frequency_to_merge

Customer ID
12346    2
12347    2
12348    1
12349    2
12351    1
        ..
18283    6
18284    1
18285    1
18286    1
18287    4
Name: Frequency, Length: 4120, dtype: int64

In [42]:
invoices_by_user = invoices_by_user.merge(frequency_to_merge, left_on='CustomerID', right_on='Customer ID')

In [43]:
invoices_by_user

Unnamed: 0,CustomerID,MaxPurchase,Recency,RecencyCluster,Frequency
0,14739,2010-10-31,30,3,16
1,14370,2010-09-15,76,0,6
2,12810,2010-06-23,160,2,2
3,16684,2010-11-25,5,3,24
4,14047,2010-08-17,105,2,10
...,...,...,...,...,...
4115,15939,2010-11-30,0,3,1
4116,17826,2010-11-30,0,3,1
4117,16473,2010-11-30,0,3,1
4118,17820,2010-11-30,0,3,1


In [48]:
px.histogram(data_frame=invoices_by_user.query('Frequency < 100'), x='Frequency')