In [1]:
import pandas as pd
import numpy as np
import datetime as dt

df = pd.read_csv("../data/processed/online_retail_clean.csv")
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"], errors="coerce")

orders = (
    df.groupby(["CustomerID", "InvoiceNo"], as_index=False)
      .agg(
          InvoiceDate=("InvoiceDate", "min"),
          Revenue=("Revenue", "sum")
      )
)

In [2]:
snapshot_date = orders["InvoiceDate"].max() + dt.timedelta(days=1)
snapshot_date

Timestamp('2011-12-10 12:50:00')

In [3]:
rfm = orders.groupby("CustomerID").agg(
    Recency=("InvoiceDate", lambda x: (snapshot_date - x.max()).days),
    Frequency=("InvoiceNo", "nunique"),
    Monetary=("Revenue", "sum")
)

In [4]:
# target variable - churn label
CHURN_THRESHOLD = 90  # days

rfm["Churn"] = (rfm["Recency"] > CHURN_THRESHOLD).astype(int)
rfm["Churn"].value_counts(normalize=True)

Churn
0    0.666052
1    0.333948
Name: proportion, dtype: float64

In [5]:
# train/test split
from sklearn.model_selection import train_test_split

X = rfm[["Recency", "Frequency", "Monetary"]]
y = rfm["Churn"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [6]:
# baseline model - logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

baseline = LogisticRegression(max_iter=1000)
baseline.fit(X_train, y_train)

baseline_preds = baseline.predict_proba(X_val)[:, 1]
print("Baseline ROC-AUC:", roc_auc_score(y_val, baseline_preds))

Baseline ROC-AUC: 1.0


In [7]:
# XGBoost
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

model.fit(X_train, y_train)

preds = model.predict_proba(X_val)[:, 1]
print("XGBoost ROC-AUC:", roc_auc_score(y_val, preds))

XGBoost ROC-AUC: 1.0


In [8]:
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_val, preds)

pd.DataFrame({
    "threshold": thresholds,
    "precision": precision[:-1],
    "recall": recall[:-1]
}).head()

Unnamed: 0,threshold,precision,recall
0,0.000149,0.334101,1.0
1,0.000165,0.353659,1.0
2,0.000166,0.35409,1.0
3,0.000168,0.355392,1.0
4,0.000169,0.356265,1.0


In [10]:
# model explaination
import pandas as pd

importance = pd.Series(
    model.feature_importances_,
    index=["Recency", "Frequency", "Monetary"]
).sort_values(ascending=False)

importance

Recency      0.955429
Frequency    0.042003
Monetary     0.002568
dtype: float32

In [12]:
# who should we target?
# identifying high risk customers
rfm["ChurnProbability"] = model.predict_proba(X)[:, 1]

high_risk = rfm[rfm["ChurnProbability"] >= 0.7]
high_risk.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary,Churn,ChurnProbability
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12346.0,326,1,77183.6,1,0.998975
12350.0,310,1,334.4,1,0.999227
12353.0,204,1,89.0,1,0.9994
12354.0,232,1,1079.4,1,0.999047
12355.0,214,1,459.4,1,0.999125


In [13]:
# revenue at risk
revenue_at_risk = high_risk["Monetary"].sum()
total_revenue = rfm["Monetary"].sum()

print(f"Revenue at Risk: £{revenue_at_risk:,.0f}")
print(f"% of Revenue at Risk: {revenue_at_risk / total_revenue:.2%}")

Revenue at Risk: £1,035,270
% of Revenue at Risk: 11.62%


In [14]:
rfm_segments = pd.read_csv("../data/processed/rfm_segments.csv", index_col="CustomerID")

churn_with_segments = rfm.join(rfm_segments["Segment"], how="left")

churn_with_segments.groupby("Segment")["ChurnProbability"].mean().sort_values(ascending=False)

Segment
Recent Big Spenders    0.999140
High-Value Loyal       0.209385
At-Risk Customers      0.137307
Low Engagement         0.027683
Name: ChurnProbability, dtype: float32

The model flagged **Recent Big Spenders** as the highest churn-risk segment. These customers make large one-time purchases but don’t return quickly, which the model correctly identifies as high churn risk. From a business perspective, this is actually a high-impact segment to target with post-purchase retention campaigns.