# CHURN RATE PREDICTION

In [2]:
import os
os.environ['LOKY_MAX_CPU_COUNT'] = '10'
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

## Preparing data

In [3]:
def prepare_features_and_target(file_path='cleaned_retail_data.csv'):
    # READ DATA
    df = pd.read_csv(file_path)
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
    # SPLIT DATA 
    holdout_start_date = df['InvoiceDate'].max() - timedelta(days=30)
    df_development = df[df['InvoiceDate'] < holdout_start_date].copy()
    df_holdout = df[df['InvoiceDate'] >= holdout_start_date].copy()

    # FEATURE ENGINEERING
    snapshot_date_dev = df_development['InvoiceDate'].max() + timedelta(days=1)
    # RFM
    features = df_development.groupby('CustomerID').agg(
        Recency=('InvoiceDate', lambda x: (snapshot_date_dev - x.max()).days),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('TotalPrice', 'sum')
    ).reset_index()

    # AVG BASKET VALUE
    avg_basket_value = df_development.groupby('CustomerID')['TotalPrice'].sum() / df_development.groupby('CustomerID')['InvoiceNo'].nunique()
    avg_basket_value = avg_basket_value.reset_index(name='avg_basket_value')

    # DISC PRODUCT
    distinct_products = df_development.groupby('CustomerID')['StockCode'].nunique().reset_index(name='distinct_products')

    # AVG BETWEEN DAY PURCHASE
    df_dev_sorted = df_development.sort_values(by=['CustomerID', 'InvoiceDate'])
    df_dev_sorted['days_between'] = df_dev_sorted.groupby('CustomerID')['InvoiceDate'].diff().dt.days
    avg_days_between = df_dev_sorted.groupby('CustomerID')['days_between'].mean().reset_index(name='avg_days_between_purchases')

    # MERGING
    features = pd.merge(features, avg_basket_value, on='CustomerID', how='left')
    features = pd.merge(features, distinct_products, on='CustomerID', how='left')
    features = pd.merge(features, avg_days_between, on='CustomerID', how='left')
    features.fillna(0, inplace=True) # Điền 0 cho các giá trị NaN

    # CREATE TARGET
    customers_in_holdout = df_holdout['CustomerID'].unique()
    features['will_buy'] = features['CustomerID'].isin(customers_in_holdout).astype(int)

    return features

## Training & Result

In [4]:
def train_and_evaluate_model(features):
    # Chuẩn bị dữ liệu X, y ban đầu (chưa có feature cluster)
    X = features[['Recency', 'Frequency', 'Monetary', 'avg_basket_value', 'distinct_products', 'avg_days_between_purchases']]
    y = features['will_buy']
    
    # Chia train/test TRƯỚC KHI làm clustering
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # K-MEANS
    rfm_cols = ['Recency', 'Frequency', 'Monetary']

    scaler_kmeans = StandardScaler()
    X_train_rfm_scaled = scaler_kmeans.fit_transform(X_train[rfm_cols])
    X_test_rfm_scaled = scaler_kmeans.transform(X_test[rfm_cols])

    k = 10
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_train_rfm_scaled)

    X_train['RFM_Cluster'] = kmeans.predict(X_train_rfm_scaled)
    X_test['RFM_Cluster'] = kmeans.predict(X_test_rfm_scaled)
    
    train_distances = kmeans.transform(X_train_rfm_scaled)
    X_train['Distance_to_Centroid'] = train_distances[np.arange(len(train_distances)), X_train['RFM_Cluster']]
    
    test_distances = kmeans.transform(X_test_rfm_scaled)
    X_test['Distance_to_Centroid'] = test_distances[np.arange(len(test_distances)), X_test['RFM_Cluster']]

    # RF
    best_params = {
        'n_estimators': 500, 'max_depth': 5, 'min_samples_leaf': 4,
        'class_weight': 'balanced', 'random_state': 42, 'n_jobs': 10
    }
    rf_model = RandomForestClassifier(**best_params)
    rf_model.fit(X_train, y_train)

    # LR
    scaler_lr = StandardScaler()
    X_train_scaled = scaler_lr.fit_transform(X_train)
    X_test_scaled = scaler_lr.transform(X_test)
    lr_model = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
    lr_model.fit(X_train_scaled, y_train)
    
    # RESULT
    print(f"{best_params}")
    # RF
    print("----- RF -----")
    y_pred_rf = rf_model.predict(X_test)
    y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]
    print(classification_report(y_test, y_pred_rf, target_names=['NO(0)', 'YES(1)'])) 
    print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.3f}")
    print(f"AUC-ROC Score: {roc_auc_score(y_test, y_pred_proba_rf):.3f}")

    # LR
    print("\n----- LR -----")
    y_pred_lr = lr_model.predict(X_test_scaled)
    y_pred_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]
    print(classification_report(y_test, y_pred_lr, target_names=['NO(0)', 'YES(1)']))
    print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.3f}")
    print(f"AUC-ROC Score: {roc_auc_score(y_test, y_pred_proba_lr):.3f}")

## Executing

In [5]:
if __name__ == "__main__":
    customer_features = prepare_features_and_target(file_path='cleaned_retail_data.csv')
    train_and_evaluate_model(customer_features)

{'n_estimators': 500, 'max_depth': 5, 'min_samples_leaf': 4, 'class_weight': 'balanced', 'random_state': 42, 'n_jobs': 10}
----- RF -----
              precision    recall  f1-score   support

       NO(0)       0.79      0.75      0.77       538
      YES(1)       0.56      0.62      0.59       276

    accuracy                           0.70       814
   macro avg       0.67      0.68      0.68       814
weighted avg       0.71      0.70      0.71       814

Accuracy: 0.704
AUC-ROC Score: 0.740

----- LR -----
              precision    recall  f1-score   support

       NO(0)       0.78      0.77      0.77       538
      YES(1)       0.56      0.58      0.57       276

    accuracy                           0.71       814
   macro avg       0.67      0.68      0.67       814
weighted avg       0.71      0.71      0.71       814

Accuracy: 0.705
AUC-ROC Score: 0.735


Both the Random Forest (RF) and Logistic Regression (LR) models are effective at predicting which customers will return. However, the Random Forest model is slightly better and is the recommended choice.

Key Metrics Explained
- AUC-ROC Score: This measures how well a model can distinguish between a customer who will return and one who will not. A higher score is better.

    + RF (0.740) is slightly better at telling these two groups apart than LR (0.735).

- Recall (for "YES"): This is the most important metric for this problem. It answers the question: "Of all the customers who actually returned, what percentage did our model correctly find?"

    + RF (0.62): Correctly identified 62% of all returning customers.

    + LR (0.58): Only found 58% of all returning customers.

    --> Conclusion: The Random Forest model is significantly better at finding potential opportunities and helps you miss fewer valuable customers.

- Precision (for "YES"): This answers the question: "When our model predicts a customer will return, how often is it correct?"

    + RF (0.56) & LR (0.56): Both models are equally precise. When they flag a customer, they are correct 56% of the time.