In [1]:
# Final Project - Machine Learning - Task 2
# Create a churn risk prediction model using the segmented customer data from Task 1
# Authors
# Jeevanthi Panawala
# Sreedharani

In [None]:
# All experiments revealed that recency, frequency and monetary, both as absolute values
# and as scores, are strong predictors of customer churn. 
# Reference Paper:APPLICATION OF MACHINE LEARNING FOR CHURN PREDICTION BASED ON TRANSACTIONAL DATA (RFM ANALYSIS)
# The refereced research paper has used RFM score as a feature.
# Instead we are using the Cluster value as an input categorical variable


In [1]:
# Library Imports
import pandas as pd
from datetime import datetime,timedelta
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#----------------------- Load Dataset -Resulting Customer Segmentation Dataset from Task 1--------------#
df=pd.read_csv("../Data/customer_segments.csv")
df.head()

Unnamed: 0,Customer ID,Recency,Amount,frequency,Recency_Log,Amount_Log,frequency_Log,Cluster,Archetype
0,12346.0,134,77556.46,34,4.905275,11.258774,3.555348,0,Loyal Luxury Shoppers
1,12347.0,55,3146.75,155,4.025352,8.054443,5.049856,0,Loyal Luxury Shoppers
2,12348.0,57,1709.4,48,4.060443,7.444483,3.89182,2,Lost Luxury Shoppers
3,12349.0,216,2671.14,102,5.379897,7.890635,4.634729,2,Lost Luxury Shoppers
4,12350.0,118,334.4,17,4.779123,5.815324,2.890372,1,Long Lost Rare Shoppers


In [3]:
df.shape

(4935, 9)

In [4]:
# Load the original cleaned dataset 
# Null values removed
# Purcahse returened removed
df_original_refined=pd.read_csv("../Data/refined_data.csv") 

In [5]:
df_original_refined.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [9]:
print(df_original_refined.isnull().sum())

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64


In [None]:
#--------------- Select data after 2011-06-01-----------------------------------------------#

In [6]:
# Convert TransactionDate to datetime
df_original_refined['InvoiceDate'] = pd.to_datetime(df_original_refined['InvoiceDate'])


In [7]:
filter_date=pd.to_datetime('2011-06-01')
df_original_refined = df_original_refined[df_original_refined['InvoiceDate'] > filter_date]
df_original_refined.shape

(252462, 8)

In [13]:
print(df_original_refined['InvoiceDate'].describe())

count                           252462
mean     2011-09-23 19:07:56.863131904
min                2011-06-01 07:37:00
25%                2011-08-10 16:19:00
50%                2011-10-04 15:55:00
75%                2011-11-10 12:37:00
max                2011-12-09 12:50:00
Name: InvoiceDate, dtype: object


In [20]:
#------------- Churn Condition----------------------------------#
# If the CustomerID in the customer segmentation dataset, is present in the df_original_refined dataset
# That means, he/she has done a purchase within 6 months
# So they are not in the churned list

In [8]:
# Check if 'Customer ID' in df exists in df_original_refined and set Chrurn =0
df['Churn'] = (~df['Customer ID'].isin(df_original_refined['Customer ID'])).astype(int)
df.head()

Unnamed: 0,Customer ID,Recency,Amount,frequency,Recency_Log,Amount_Log,frequency_Log,Cluster,Archetype,Churn
0,12346.0,134,77556.46,34,4.905275,11.258774,3.555348,0,Loyal Luxury Shoppers,1
1,12347.0,55,3146.75,155,4.025352,8.054443,5.049856,0,Loyal Luxury Shoppers,0
2,12348.0,57,1709.4,48,4.060443,7.444483,3.89182,2,Lost Luxury Shoppers,0
3,12349.0,216,2671.14,102,5.379897,7.890635,4.634729,2,Lost Luxury Shoppers,0
4,12350.0,118,334.4,17,4.779123,5.815324,2.890372,1,Long Lost Rare Shoppers,1


In [9]:
df.shape

(4935, 10)

In [10]:
# Check churned and retained shopper counts

churn_counts = df['Churn'].value_counts()
churn_0_count = churn_counts.get(0, 0)  # Rows with churn = 0
churn_1_count = churn_counts.get(1, 0)  # Rows with churn = 1

print(f"Retained Count = 0: {churn_0_count}")
print(f"Churned Count = 1: {churn_1_count}")

Retained Count = 0: 2602
Churned Count = 1: 2333


In [11]:
# Convert archetype to a column with ordinal values
# Loyal Luxury Shoppers are less likely to churn while Long Lost Rare Shoppers are most likely to churn
archetype_mapping = {
    "Loyal Luxury Shoppers": 0,  # Least likely to churn
    "Moderate Recent Shoppers": 1,  # Moderately likely to churn
    "Lost Luxury Shoppers ": 2,  # Likely to churn
    "Long Lost Rare Shoppers": 3  # Most likely to churn
}

df['Archetype_Value'] = df['Archetype'].map(archetype_mapping)
df.head()

Unnamed: 0,Customer ID,Recency,Amount,frequency,Recency_Log,Amount_Log,frequency_Log,Cluster,Archetype,Churn,Archetype_Value
0,12346.0,134,77556.46,34,4.905275,11.258774,3.555348,0,Loyal Luxury Shoppers,1,0
1,12347.0,55,3146.75,155,4.025352,8.054443,5.049856,0,Loyal Luxury Shoppers,0,0
2,12348.0,57,1709.4,48,4.060443,7.444483,3.89182,2,Lost Luxury Shoppers,0,2
3,12349.0,216,2671.14,102,5.379897,7.890635,4.634729,2,Lost Luxury Shoppers,0,2
4,12350.0,118,334.4,17,4.779123,5.815324,2.890372,1,Long Lost Rare Shoppers,1,3


In [12]:
# Remove cluster column
df.drop('Cluster', axis=1, inplace=True)
df.head()

Unnamed: 0,Customer ID,Recency,Amount,frequency,Recency_Log,Amount_Log,frequency_Log,Archetype,Churn,Archetype_Value
0,12346.0,134,77556.46,34,4.905275,11.258774,3.555348,Loyal Luxury Shoppers,1,0
1,12347.0,55,3146.75,155,4.025352,8.054443,5.049856,Loyal Luxury Shoppers,0,0
2,12348.0,57,1709.4,48,4.060443,7.444483,3.89182,Lost Luxury Shoppers,0,2
3,12349.0,216,2671.14,102,5.379897,7.890635,4.634729,Lost Luxury Shoppers,0,2
4,12350.0,118,334.4,17,4.779123,5.815324,2.890372,Long Lost Rare Shoppers,1,3


In [38]:
# Feature Set - Recency, Amount, Frequency and Archetype_Value
# Target Variable Churn
# Reference Papers: 1. APPLICATION OF MACHINE LEARNING FOR CHURN PREDICTION BASED ON TRANSACTIONAL DATA (RFM ANALYSIS)
# 2.Customer churn prediction system: a machine learning approach
# Both the papers recommended 2- class boosted decision tree for high accuracy in churn analysis
# So GradientBoostingClassifier is used for churn classification/prediction

In [None]:
# Feature scaling is not needed because decision tree algorithms are not sensitive to the magnitude of feature values
# Data preprocessing was done in Task 1

In [13]:
# Data Preparation
X=df[['Recency','Amount','frequency','Archetype_Value']] # Features
y=df['Churn'] # Target 
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
X_test.head()

Unnamed: 0,Recency,Amount,frequency,Archetype_Value
1972,94,1592.15,79,2
4683,204,171.71,24,3
151,187,276.5,13,3
2423,202,514.22,138,2
1412,463,9530.08,13,2


In [15]:
# Create the Gradient Boosting Classifier
gradient_boost_model = GradientBoostingClassifier(
    n_estimators=100,       # Number of boosting stages
    learning_rate=0.1,      # Shrinks the contribution of each tree
    max_depth=3,            # Depth of each tree
    subsample=0.8,          # Fraction of samples for training each tree
    random_state=42         # Reproducibility
)

# Train the model
gradient_boost_model.fit(X_train, y_train)


In [16]:
# Evaluate the model
# Predictions
y_predicted = gradient_boost_model.predict(X_test)
y_predicted_probability = gradient_boost_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_predicted))

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_predicted))

# ROC-AUC Score
print("\nROC-AUC Score:")
print(roc_auc_score(y_test, y_predicted_probability))

Confusion Matrix:
[[383 132]
 [115 357]]

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.74      0.76       515
           1       0.73      0.76      0.74       472

    accuracy                           0.75       987
   macro avg       0.75      0.75      0.75       987
weighted avg       0.75      0.75      0.75       987


ROC-AUC Score:
0.8240784926773078


In [17]:
# For improving the model performance, we decided to try hyper parameter tuning of Gradient Boost Classifier

# Define the model
model = GradientBoostingClassifier()

# Hyperparameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0]
}

# Grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='roc_auc', verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate
y_pred = best_model.predict(X_test)
print("Best Parameters:", grid_search.best_params_)
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))


Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.4s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.4s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.4s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.4

In [18]:
# Tried the gradient boost classifier with the best parameters given in hyper parameter tuning step
model1 = GradientBoostingClassifier(
    n_estimators=100,       # Number of boosting stages
    learning_rate=0.05,      # Shrinks the contribution of each tree
    max_depth=3,            # Depth of each tree
    subsample=0.8,          # Fraction of samples for training each tree
    random_state=42         # Reproducibility
)

# Train the model
model1.fit(X_train, y_train)

In [19]:
# Evaluate the new model
# Predictions
y_pred = model1.predict(X_test)
y_pred_probability = model1.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ROC-AUC Score
print("\nROC-AUC Score:")
print(roc_auc_score(y_test, y_pred_probability))

Confusion Matrix:
[[377 138]
 [113 359]]

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       515
           1       0.72      0.76      0.74       472

    accuracy                           0.75       987
   macro avg       0.75      0.75      0.75       987
weighted avg       0.75      0.75      0.75       987


ROC-AUC Score:
0.8303315780812901


In [None]:
# Gradient Boost model with the Tuned hyper parameters, has changed ROC-AUC Score marginally
# Precision, recall and F1 score for both churn and non churn classes are similar in both the models
# Marginal improvement in ROC-AUC value is taken into consideration and decided that the model after tuning
# Better in identifying churners and non-churners

In [20]:
# To prepare marketing recommendations report, 
# Churn probabily is calculated for the dataset (with RFM and customer segment values)
# using the churn prediction model after hyper parameter tuning
model1.fit(X, y)
churn_probabilities= model1.predict_proba(X)[:, 1]  # Probabilities for the positive class
df['churn_probability'] = churn_probabilities
df.head()


Unnamed: 0,Customer ID,Recency,Amount,frequency,Recency_Log,Amount_Log,frequency_Log,Archetype,Churn,Archetype_Value,churn_probability
0,12346.0,134,77556.46,34,4.905275,11.258774,3.555348,Loyal Luxury Shoppers,1,0,0.672486
1,12347.0,55,3146.75,155,4.025352,8.054443,5.049856,Loyal Luxury Shoppers,0,0,0.114948
2,12348.0,57,1709.4,48,4.060443,7.444483,3.89182,Lost Luxury Shoppers,0,2,0.256723
3,12349.0,216,2671.14,102,5.379897,7.890635,4.634729,Lost Luxury Shoppers,0,2,0.388233
4,12350.0,118,334.4,17,4.779123,5.815324,2.890372,Long Lost Rare Shoppers,1,3,0.641128


In [22]:
# Save the dataset for marketing recommendation report preparation
df.to_csv('../Data/churn_probabilities.csv', index=False)