In [1]:
%load_ext autoreload
%autoreload 2
import os
import gc

os.chdir('../../')

In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss, roc_auc_score, average_precision_score, brier_score_loss, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from global_code.util import reduce_mem_usage, clf_metric_report, compute_and_plot_permutation_importance, plot_pr_calib_curve, plot_dis_probs, plot_shap_values
import optuna
import arfs.feature_selection.allrelevant as arfsgroot
import shap
import json
import joblib

sns.set(style='whitegrid')

  from .autonotebook import tqdm as notebook_tqdm


### Read the parquet file located at `./week_1/data/processed/feature_engineering_dataset.parquet`

In [3]:
df = pd.read_parquet('./week_1/data/processed/feature_engineering_dataset.parquet')
non_features_list = ['customer_id','name','date_of_birth','address','touchpoints','csat_scores','Usage','churn','next_date','days_diff', 'job']
df = df.drop(non_features_list, axis=1)
df.info()           

<class 'pandas.core.frame.DataFrame'>
Index: 5286530 entries, 2961532 to 5286529
Columns: 159 entries, Id to churn_18_months
dtypes: bool(1), category(2), datetime64[ns](2), float16(93), float32(37), int16(5), int32(3), int8(15), object(1)
memory usage: 2.0+ GB


### Train, Validation, Test Split

In [4]:
# target - Inactivity 365 days 
#train_max_date = '2022-01-01'
#validation_max_date = '2023-01-01'
#test_start_date = '2024-01-01'
#target = 'churn_365'

# target - no activity after 2022-06-01
#train_max_date = '2019-06-01'
#validation_max_date = '2022-06-01'
#test_start_date = '2024-01-01'
#target = 'churn_18_months'

# target - Inactivity >= 420 days target
train_max_date = '2021-10-01'
validation_max_date = '2022-10-01'
test_start_date = '2024-01-01'
target = 'churn_420'


# PROMPT: break this list in lines, add comments to group the features in categories
features = [
    # Basic customer information
    "interest_rate", "country", "customer_age", "broad_job_category", "tenure", "from_competitor",
    
    # Transaction counts
    "atm_transfer_in", "atm_transfer_out", "bank_transfer_in", "bank_transfer_out", 
    "crypto_in", "crypto_out",
    
    # Transaction volumes
    "bank_transfer_in_volume", "bank_transfer_out_volume",
    "crypto_in_volume", "crypto_out_volume",
    
    # Customer service and interactions
    "complaints", "churn_due_to_fraud", "model_predicted_fraud",
    "appointment", "email", "phone", "whatsapp", "days_between",
    
    # Historical interaction metrics
    "prior_emails", "prior_appointments", "prior_phones", "prior_whatsapps",
    "prior_touchpoints", "prior_count",
    
    # Historical balance and transaction metrics
    "prior_crypto_balance", "prior_bank_balance", "prior_mean_balance",
    "prior_sum_days_between", "prior_std_days_between", "prior_mean_days_between",
    "prior_min_days_between", "prior_max_days_between",
    
    # Prior mean transaction metrics
    "prior_mean_bank_transfer_in", "prior_mean_bank_transfer_out",
    "prior_mean_crypto_in", "prior_mean_crypto_out",
    "prior_mean_bank_transfer_in_volume", "prior_mean_bank_transfer_out_volume",
    "prior_mean_crypto_in_volume", "prior_mean_crypto_out_volume",
    
    # Prior sum volume metrics
    "prior_sum_bank_transfer_in_volume", "prior_sum_bank_transfer_out_volume",
    "prior_sum_crypto_in_volume", "prior_sum_crypto_out_volume",
    
    # 10-day window metrics
    "prior_10D_count", "prior_10D_sum_days_between", "prior_10D_mean_days_between",
    "prior_10D_std_days_between", "prior_10D_max_days_between", "prior_10D_min_days_between",
    "prior_10D_mean_bank_transfer_in", "prior_10D_mean_bank_transfer_out",
    "prior_10D_mean_crypto_in", "prior_10D_mean_crypto_out",
    "prior_10D_mean_bank_transfer_in_volume", "prior_10D_mean_bank_transfer_out_volume",
    "prior_10D_mean_crypto_in_volume", "prior_10D_mean_crypto_out_volume",
    "prior_10D_sum_bank_transfer_in_volume", "prior_10D_sum_bank_transfer_out_volume",
    "prior_10D_sum_crypto_in_volume", "prior_10D_sum_crypto_out_volume",
    "prior_10D_bank_balance", "prior_10D_mean_balance",
    
    # 90-day window metrics
    "prior_90D_count", "prior_90D_sum_days_between", "prior_90D_mean_days_between",
    "prior_90D_std_days_between", "prior_90D_max_days_between", "prior_90D_min_days_between",
    "prior_90D_mean_bank_transfer_in", "prior_90D_mean_bank_transfer_out",
    "prior_90D_mean_crypto_in", "prior_90D_mean_crypto_out",
    "prior_90D_mean_bank_transfer_in_volume", "prior_90D_mean_bank_transfer_out_volume",
    "prior_90D_mean_crypto_in_volume", "prior_90D_mean_crypto_out_volume",
    "prior_90D_sum_bank_transfer_in_volume", "prior_90D_sum_bank_transfer_out_volume",
    "prior_90D_sum_crypto_in_volume", "prior_90D_sum_crypto_out_volume",
    "prior_90D_bank_balance", "prior_90D_mean_balance",
    
    # 180-day window metrics
    "prior_180D_count", "prior_180D_sum_days_between", "prior_180D_mean_days_between",
    "prior_180D_std_days_between", "prior_180D_max_days_between", "prior_180D_min_days_between",
    "prior_180D_mean_bank_transfer_in", "prior_180D_mean_bank_transfer_out",
    "prior_180D_mean_crypto_in", "prior_180D_mean_crypto_out",
    "prior_180D_mean_bank_transfer_in_volume", "prior_180D_mean_bank_transfer_out_volume",
    "prior_180D_mean_crypto_in_volume", "prior_180D_mean_crypto_out_volume",
    "prior_180D_sum_bank_transfer_in_volume", "prior_180D_sum_bank_transfer_out_volume",
    "prior_180D_sum_crypto_in_volume", "prior_180D_sum_crypto_out_volume",
    "prior_180D_bank_balance", "prior_180D_mean_balance",
    
    # 365-day window metrics
    "prior_365D_count", "prior_365D_sum_days_between", "prior_365D_mean_days_between",
    "prior_365D_std_days_between", "prior_365D_max_days_between", "prior_365D_min_days_between",
    "prior_365D_mean_bank_transfer_in", "prior_365D_mean_bank_transfer_out",
    "prior_365D_mean_crypto_in", "prior_365D_mean_crypto_out",
    "prior_365D_mean_bank_transfer_in_volume", "prior_365D_mean_bank_transfer_out_volume",
    "prior_365D_mean_crypto_in_volume", "prior_365D_mean_crypto_out_volume",
    "prior_365D_sum_bank_transfer_in_volume", "prior_365D_sum_bank_transfer_out_volume",
    "prior_365D_sum_crypto_in_volume", "prior_365D_sum_crypto_out_volume",
    "prior_365D_bank_balance", "prior_365D_mean_balance",
    
    # 450-day window metrics
    "prior_450D_count", "prior_450D_sum_days_between", "prior_450D_mean_days_between",
    "prior_450D_std_days_between", "prior_450D_max_days_between", "prior_450D_min_days_between",
    "prior_450D_mean_bank_transfer_in", "prior_450D_mean_bank_transfer_out",
    "prior_450D_mean_crypto_in", "prior_450D_mean_crypto_out",
    "prior_450D_mean_bank_transfer_in_volume", "prior_450D_mean_bank_transfer_out_volume",
    "prior_450D_mean_crypto_in_volume", "prior_450D_mean_crypto_out_volume",
    "prior_450D_sum_bank_transfer_in_volume", "prior_450D_sum_bank_transfer_out_volume",
    "prior_450D_sum_crypto_in_volume", "prior_450D_sum_crypto_out_volume",
    "prior_450D_bank_balance", "prior_450D_mean_balance",
    
    # Current week metrics
    "this_week_bank_volume", "this_week_crypto_volume"
]

train_df = df.loc[df['date'] < train_max_date, features + [target]]
validation_df = df.loc[(df['date'] >= train_max_date) & (df['date'] < validation_max_date), features + [target]]

# Get Train and Validation Subsamples to speed up the process
_, train_df = train_test_split(train_df, test_size=0.10, random_state=42, stratify=train_df[target])
validation_df, calibration_df = train_test_split(validation_df, test_size=0.20, random_state=42, stratify=validation_df[target])

print('Train Shape: ', train_df.shape)
print('Validation shape: ', calibration_df.shape)
print('Calibration shape: ', validation_df.shape)

Train Shape:  (306649, 153)
Validation shape:  (70878, 153)
Calibration shape:  (283510, 153)


In [5]:
# Save training data
train_df.to_parquet('./week_1/data/processed/train_df.parquet')

# Save validation data
validation_df.to_parquet('./week_1/data/processed/validation_df.parquet')

# Save calibration data
calibration_df.to_parquet('./week_1/data/processed/calibration_df.parquet')
