In [5]:
import os
# os.chdir(r"/content/drive/MyDrive/billing_features/raw/")
import math
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import missingno as msno
import pickle
import lightgbm
import xgboost as xgb
#tuning hyperparameters
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score,average_precision_score
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.metrics import roc_curve,precision_recall_curve
from sklearn.metrics import auc as auc_score
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import utils

In [6]:
# Location of the pickled input frames; the numeric suffix in each file name
# matches the "N month buffer" sections further down in the notebook.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
data_dir = "/app/models/dij22"
df_buffer_0, df_buffer_1, df_buffer_2, df_buffer_3 = (
    pd.read_pickle(os.path.join(data_dir, f"df_buffer_{buffer_months}_pickle"))
    for buffer_months in range(4)
)

In [7]:
def train_test_data(data,feature_type,test_yr):
    """Build train/test feature matrices and churn labels for one feature set.

    Engineered columns are recognised purely by the first two characters of
    their name: ``L1/L2/L3/L6`` are treated as rolling-window columns,
    ``d1/d2/d3/d6`` as delta columns and ``r1/r2/r3/r6`` as ratio columns.
    Twelve-month columns (``L12_``, ``d12_``, ``r12_``) are covered by the
    ``L1``/``d1``/``r1`` prefixes, because only two characters are compared.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It must contain ``year``, ``churn`` and the identifier /
        date columns listed in ``exclude_cols``. It is not modified.
    feature_type : str
        One of ``"original"``, ``"original+rolling window"``,
        ``"original+rolling window+delta"`` or
        ``"original+rolling window+delta+ratio"``. Each option keeps the
        engineered column families named in it and drops the others.
    test_yr
        Rows whose ``year`` equals this value form the test split; all other
        rows form the training split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the ``y_*`` items are the
        ``churn`` column of the respective split.

    Raises
    ------
    NotImplementedError
        If ``feature_type`` is not one of the supported values.
    """
    exclude_cols=['policy_id', 'pivot_date', 'churn',"year","month","orig_policy_eff_dt", "policy_anniv_dt", "policy_term_dt"]

    rolling_prefixes = ("L1", "L2", "L3", "L6")
    delta_prefixes = ("d1", "d2", "d3", "d6")
    ratio_prefixes = ("r1", "r2", "r3", "r6")
    # Column-name prefixes to drop for each supported feature set.
    dropped_prefixes = {
        "original": rolling_prefixes + delta_prefixes + ratio_prefixes,
        "original+rolling window": delta_prefixes + ratio_prefixes,
        "original+rolling window+delta": ratio_prefixes,
        "original+rolling window+delta+ratio": (),
    }

    # Validate up front. The previous `raise NotImplemented(...)` failed with a
    # TypeError because NotImplemented is a constant, not an exception class.
    if feature_type not in dropped_prefixes:
        raise NotImplementedError("Unknown feature type.")

    prefixes = dropped_prefixes[feature_type]
    exclude_var = [
        col for col in data.columns
        if isinstance(col, str) and col.startswith(prefixes)
    ]

    # drop() returns a new frame, so the caller's data is left untouched.
    df = data.drop(columns=exclude_var)
    train_data = df[df["year"] != test_yr]
    test_data = df[df["year"] == test_yr]

    y_train = train_data.loc[:, "churn"]
    y_test = test_data.loc[:, "churn"]
    X_train = train_data.drop(exclude_cols, axis=1)
    X_test = test_data.drop(exclude_cols, axis=1)

    return X_train, X_test, y_train, y_test


def model_eval(X_train,X_test,y_train,y_test):
    """Tune, fit and score a LightGBM model on the given train/test split.

    Hyper-parameters come from ``utils.bayes_parameter_opt_lgb`` run on the
    training data. A booster is then trained with those parameters and both
    splits are scored with ``utils.model_evaluate``.

    Returns
    -------
    tuple
        ``(model, feature_importance, train_eval, test_eval)``, where
        ``feature_importance`` is a DataFrame with columns ``rank``,
        ``feature`` and ``importance`` sorted by descending importance.
    """
    tuned_params = utils.bayes_parameter_opt_lgb(
        X_train,
        y_train,
        init_round=5,
        opt_round=10,
        n_folds=3,
        random_seed=6,
        n_estimators=10000,
    )

    lgb_train = lightgbm.Dataset(X_train, label=y_train)
    lgb_test = lightgbm.Dataset(X_test, label=y_test)
    # NOTE(review): the test split is part of valid_sets while early stopping
    # is enabled, so the stopping round may be selected using test data -
    # confirm this is intended.
    booster = lightgbm.train(
        tuned_params,
        lgb_train,
        valid_sets=[lgb_train, lgb_test],
        num_boost_round=5000,
        early_stopping_rounds=100,
    )

    # Importance table: most important feature first, rank starting at 0.
    importance_table = pd.DataFrame({
        'feature': booster.feature_name(),
        'importance': booster.feature_importance(),
    })
    importance_table = importance_table.sort_values('importance', ascending=False)
    importance_table["rank"] = list(range(len(booster.feature_name())))
    importance_table = importance_table.loc[:, ["rank", "feature", "importance"]]
    importance_table = importance_table.reset_index(drop=True)

    # Score both splits from the booster's raw predictions.
    train_scores = utils.model_evaluate(y_train, booster.predict(X_train))
    test_scores = utils.model_evaluate(y_test, booster.predict(X_test))

    return booster, importance_table, train_scores, test_scores

def evaluation_table(eval_v1,eval_v2,eval_v3,eval_v4,type):
    dict_data={}
    dict_data["Features"]=["original feature","original + rolling window feature","original + rolling window + delta feature","original + rolling window + delta  + ratio feature"]
    # dict_data["# of feature"]=[len(feat_1),len(feat_2),len(feat_3)] 
    dict_data["# of sample"]=[eval_v1['nb_example'],eval_v2['nb_example'],eval_v3['nb_example'],eval_v4['nb_example']]
    # dict_data["true_prediction"]=[eval_v1['true_prediction'],eval_v2['true_prediction'],eval_v3['true_prediction']]
    # dict_data["false_prediction"]=[eval_v1['false_prediction'],eval_v2['false_prediction'],eval_v3['false_prediction']]
    # dict_data["accuracy"]=[eval_v1['accuracy'],eval_v2['accuracy'],eval_v3['accuracy']]
    dict_data["precision"]=[eval_v1['precision'],eval_v2['precision'],eval_v3['precision'],eval_v4['precision']]  
    dict_data["recall"]=[eval_v1['recall'],eval_v2['recall'],eval_v3['recall'],eval_v4['recall']] 
    dict_data["f1_score"]=[eval_v1['f1_score'],eval_v2['f1_score'],eval_v3['f1_score'],eval_v4['f1_score']] 
    dict_data["ROC-AUC"]=[eval_v1['AUC'],eval_v2['AUC'],eval_v3['AUC'],eval_v4['AUC']] 
    dict_data["pr-auc"]=[eval_v1['pr_auc'],eval_v2['pr_auc'],eval_v3['pr_auc'],eval_v4['pr_auc']] 
    data_df=pd.DataFrame(dict_data)
    # data_df=data_df.set_index("Model Type")
    # data_df.style.format({"# of sample":"{:,}","true_prediction":"{:,}","false_prediction":"{:,}","accuracy":"{:.2%}","precision":"{:.2%}","recall":"{:.2%}","f1_score":"{:.2%}","ROC-AUC":"{:.2%}","pr-auc":"{:.2%}"})\
    return data_df.style.format({"# of sample":"{:,}","precision":"{:.2%}","recall":"{:.2%}","f1_score":"{:.2%}","ROC-AUC":"{:.2%}","pr-auc":"{:.2%}"})\
    .set_caption(f"Model Performance Comparison {type}")\
    .set_table_styles([{
        'selector': 'caption',
        'props': [
            ('color', 'red'),
            ('font-size', '20px')
        ]
    }])

# 3 month buffer

In [8]:
# 3-month-buffer frame, original features only; year 2022 is the test split.
X_train, X_test, y_train, y_test = train_test_data(
    df_buffer_3,
    feature_type="original",
    test_yr=2022,
)
# Row counts of the two splits.
print(f"{'training features: ':<30}{len(X_train):<20,}")
print(f"{'testing features: ':<30}{len(X_test):<20,}")

# Share of each churn label in the test split, shown as percentages.
(
    pd.DataFrame(y_test, columns=["churn"])["churn"]
    .value_counts(dropna=False, normalize=True)
    .to_frame()
    .style.format({"churn": "{:.2%}"})
)

training features:            291,135             
testing features:             35,112              


Unnamed: 0,churn
0,96.79%
1,3.21%


In [9]:
# Fit and score the model on the split currently held in X_train / X_test.
(
    model_v03,
    feature_importance_v03,
    train_eval_v03,
    test_eval_v03,
) = model_eval(X_train, X_test, y_train, y_test)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.683   [0m | [0m 0.9738  [0m | [0m 0.2812  [0m | [0m 0.5985  [0m | [0m 49.98   [0m | [0m 24.1    [0m | [0m 10.26   [0m | [0m 35.74   [0m | [0m 74.94   [0m | [0m 0.4615  [0m |
| [0m 2       [0m | [0m 0.6673  [0m | [0m 0.9909  [0m | [0m 0.7939  [0m | [0m 0.9862  [0m | [0m 84.63   [0m | [0m 12.59   [0m | [0m 86.15   [0m | [0m 12.12   [0m | [0m 67.99   [0m | [0m 0.258   [0m |
| [95m 3       [0m | [95m 0.6891  [0m | [95m 0.548   [0m | [95m 0.8548  [0m | [95m 0.8278  [0m | [95m 56.28   [0m | [95m 26.84   [0m | [95m 62.05   [0m | [95m 45.01   [0m | [95m 62.09   [0m | [95m 0.4252  [0m |
| [95m 4       [0m | [95m 0.6905  [0m | [95m 0.820

In [10]:
# 3-month buffer: original + rolling-window features, 2022 as the test split.
X_train, X_test, y_train, y_test = train_test_data(
    df_buffer_3,
    feature_type="original+rolling window",
    test_yr=2022,
)
(
    model_v13,
    feature_importance_v13,
    train_eval_v13,
    test_eval_v13,
) = model_eval(X_train, X_test, y_train, y_test)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7108  [0m | [0m 0.9738  [0m | [0m 0.2812  [0m | [0m 0.5985  [0m | [0m 49.98   [0m | [0m 24.1    [0m | [0m 10.26   [0m | [0m 35.74   [0m | [0m 74.94   [0m | [0m 0.4615  [0m |
| [0m 2       [0m | [0m 0.6894  [0m | [0m 0.9909  [0m | [0m 0.7939  [0m | [0m 0.9862  [0m | [0m 84.63   [0m | [0m 12.59   [0m | [0m 86.15   [0m | [0m 12.12   [0m | [0m 67.99   [0m | [0m 0.258   [0m |
| [0m 3       [0m | [0m 0.7066  [0m | [0m 0.548   [0m | [0m 0.8548  [0m | [0m 0.8278  [0m | [0m 56.28   [0m | [0m 26.84   [0m | [0m 62.05   [0m | [0m 45.01   [0m | [0m 62.09   [0m | [0m 0.4252  [0m |
| [95m 4       [0m | [95m 0.7176  [0m | [95m 0.8202  [0m | 

In [11]:
# 3-month buffer: original + rolling-window + delta features, 2022 as the test split.
X_train, X_test, y_train, y_test = train_test_data(
    df_buffer_3,
    feature_type="original+rolling window+delta",
    test_yr=2022,
)
(
    model_v23,
    feature_importance_v23,
    train_eval_v23,
    test_eval_v23,
) = model_eval(X_train, X_test, y_train, y_test)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7592  [0m | [0m 0.9738  [0m | [0m 0.2812  [0m | [0m 0.5985  [0m | [0m 49.98   [0m | [0m 24.1    [0m | [0m 10.26   [0m | [0m 35.74   [0m | [0m 74.94   [0m | [0m 0.4615  [0m |
| [0m 2       [0m | [0m 0.732   [0m | [0m 0.9909  [0m | [0m 0.7939  [0m | [0m 0.9862  [0m | [0m 84.63   [0m | [0m 12.59   [0m | [0m 86.15   [0m | [0m 12.12   [0m | [0m 67.99   [0m | [0m 0.258   [0m |
| [0m 3       [0m | [0m 0.7484  [0m | [0m 0.548   [0m | [0m 0.8548  [0m | [0m 0.8278  [0m | [0m 56.28   [0m | [0m 26.84   [0m | [0m 62.05   [0m | [0m 45.01   [0m | [0m 62.09   [0m | [0m 0.4252  [0m |
| [95m 4       [0m | [95m 0.7697  [0m | [95m 0.8202  [0m | 

In [12]:
# 3-month buffer: all feature families (original, rolling window, delta, ratio).
X_train, X_test, y_train, y_test = train_test_data(
    df_buffer_3,
    feature_type="original+rolling window+delta+ratio",
    test_yr=2022,
)
(
    model_v33,
    feature_importance_v33,
    train_eval_v33,
    test_eval_v33,
) = model_eval(X_train, X_test, y_train, y_test)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7582  [0m | [0m 0.9738  [0m | [0m 0.2812  [0m | [0m 0.5985  [0m | [0m 49.98   [0m | [0m 24.1    [0m | [0m 10.26   [0m | [0m 35.74   [0m | [0m 74.94   [0m | [0m 0.4615  [0m |
| [0m 2       [0m | [0m 0.7357  [0m | [0m 0.9909  [0m | [0m 0.7939  [0m | [0m 0.9862  [0m | [0m 84.63   [0m | [0m 12.59   [0m | [0m 86.15   [0m | [0m 12.12   [0m | [0m 67.99   [0m | [0m 0.258   [0m |
| [0m 3       [0m | [0m 0.7502  [0m | [0m 0.548   [0m | [0m 0.8548  [0m | [0m 0.8278  [0m | [0m 56.28   [0m | [0m 26.84   [0m | [0m 62.05   [0m | [0m 45.01   [0m | [0m 62.09   [0m | [0m 0.4252  [0m |
| [95m 4       [0m | [95m 0.774   [0m | [95m 0.8202  [0m | 

In [13]:
# Training-set metrics of the four 3-month-buffer models, side by side.
evaluation_table(
    train_eval_v03,
    train_eval_v13,
    train_eval_v23,
    train_eval_v33,
    "Training Set",
)

Unnamed: 0,Features,# of sample,precision,recall,f1_score,ROC-AUC,pr-auc
0,original feature,291135,49.99%,33.54%,40.14%,75.72%,46.06%
1,original + rolling window feature,291135,55.41%,35.91%,43.58%,78.86%,49.88%
2,original + rolling window + delta feature,291135,59.82%,51.54%,55.37%,87.72%,63.11%
3,original + rolling window + delta + ratio feature,291135,51.54%,43.82%,47.36%,82.55%,53.34%


In [14]:
# Test-set metrics of the four 3-month-buffer models, side by side.
evaluation_table(
    test_eval_v03,
    test_eval_v13,
    test_eval_v23,
    test_eval_v33,
    "Test Set",
)

Unnamed: 0,Features,# of sample,precision,recall,f1_score,ROC-AUC,pr-auc
0,original feature,35112,92.03%,24.60%,38.82%,71.47%,31.82%
1,original + rolling window feature,35112,79.54%,27.62%,41.00%,74.52%,35.05%
2,original + rolling window + delta feature,35112,65.27%,26.20%,37.39%,75.38%,32.53%
3,original + rolling window + delta + ratio feature,35112,80.06%,23.53%,36.38%,75.64%,31.48%


In [15]:
def feature_importance(model):
    """Return the model's features ordered from most to least important.

    ``model`` must expose ``feature_name()`` and ``feature_importance()``.
    The result is a DataFrame with columns ``rank`` (0 = most important),
    ``feature`` and ``importance``, on a fresh 0..n-1 index.
    """
    ranked = pd.DataFrame({
        'feature': model.feature_name(),
        'importance': model.feature_importance(),
    })
    ranked = ranked.sort_values('importance', ascending=False)
    # Rank is assigned after sorting, so it follows the sorted row order.
    ranked["rank"] = list(range(len(model.feature_name())))
    ranked = ranked.loc[:, ["rank", "feature", "importance"]]
    return ranked.reset_index(drop=True)

In [16]:
# Ranked feature importances of the four 3-month-buffer models.
df_feature_importance_v0=feature_importance(model_v03)
df_feature_importance_v1=feature_importance(model_v13)
df_feature_importance_v2=feature_importance(model_v23)
df_feature_importance_v3=feature_importance(model_v33)

# .loc slicing is label-based and inclusive, so this keeps index labels 0..30.
f0=df_feature_importance_v0.loc[:30,['rank','feature']].rename(columns={"feature":"original feature"})
f1=df_feature_importance_v1.loc[:30,['rank','feature']].rename(columns={"feature":"original + rolling window feature"})
f2=df_feature_importance_v2.loc[:30,['rank','feature']].rename(columns={"feature":"original + rolling window + delta feature"})
f3=df_feature_importance_v3.loc[:30,['rank','feature']].rename(columns={"feature":"original + rolling window + delta + ratio feature"})

# Line the four rankings up side by side on `rank`.
# The result gets its own name: assigning it to `feature_importance` would
# replace the helper function with a DataFrame, so re-running this cell (or
# calling the helper later) would fail with "'DataFrame' object is not callable".
feature_importance_table = (
    f0.merge(f1, how="inner", on="rank")
    .merge(f2, how="inner", on="rank")
    .merge(f3, how="inner", on="rank")
)
feature_importance_table

Unnamed: 0,rank,original feature,original + rolling window feature,original + rolling window + delta feature,original + rolling window + delta + ratio feature
0,0,survival_month,survival_month,survival_month,survival_month
1,1,Lag12_cntPaidFull,Lag12_cntBills,d12_Lag12_cntBills,L12_AvgPaidFullCnt
2,2,Lag12_cntBills,L12_AvgPaidFullCnt,d12_Lag12_cntPaidFull,r12_Lag12_cntBills
3,3,AvgPdBilldueDays,L12_PaidBillLastGenDays,d1_AvgPdBilldueDays,r12_Lag12_cntPaidFull
4,4,CurrPaidAmt,Lag12_cntPaidFull,d12_CurrPaidAmt,r6_Lag12_cntBills
5,5,Lag12_cntFirstGenPaidFull,L6_AvgPdBilldueDays,d6_AvgPdBillLstGenDays,d12_Lag12_cntBills
6,6,Lag12_cntBillGens,Lag12_cntBillGens,L12_AvgPdBillLstGenDays,r6_Lag12_cntPaidFull
7,7,OrigBillAmt,L12_PaidBillDueDays,L12_AvgPaidFullCnt,r6_OrigBillAmt
8,8,CurrBillAmt,L12_CurrBillAmt,d12_Lag12_cntFirstGenPaidFull,r12_CurrPaidAmt
9,9,PaidBillLastGenDays,L2_AvgPdBilldueDays,d12_AvgPdBilldueDays,Lag12_cntPaidFull


# 1 month buffer

In [17]:
# 1-month-buffer frame, original features only; year 2022 is the test split.
X_train, X_test, y_train, y_test = train_test_data(
    df_buffer_1,
    feature_type="original",
    test_yr=2022,
)
# Row counts of the two splits.
print(f"{'training features: ':<30}{len(X_train):<20,}")
print(f"{'testing features: ':<30}{len(X_test):<20,}")

# Share of each churn label in the test split, shown as percentages.
(
    pd.DataFrame(y_test, columns=["churn"])["churn"]
    .value_counts(dropna=False, normalize=True)
    .to_frame()
    .style.format({"churn": "{:.2%}"})
)

training features:            293,523             
testing features:             35,694              


Unnamed: 0,churn
0,95.21%
1,4.79%


In [18]:
# Fit and score the model on the split currently held in X_train / X_test.
(
    model_v01,
    feature_importance_v01,
    train_eval_v01,
    test_eval_v01,
) = model_eval(X_train, X_test, y_train, y_test)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9799  [0m | [0m 0.9738  [0m | [0m 0.2812  [0m | [0m 0.5985  [0m | [0m 49.98   [0m | [0m 24.1    [0m | [0m 10.26   [0m | [0m 35.74   [0m | [0m 74.94   [0m | [0m 0.4615  [0m |
| [0m 2       [0m | [0m 0.9684  [0m | [0m 0.9909  [0m | [0m 0.7939  [0m | [0m 0.9862  [0m | [0m 84.63   [0m | [0m 12.59   [0m | [0m 86.15   [0m | [0m 12.12   [0m | [0m 67.99   [0m | [0m 0.258   [0m |
| [0m 3       [0m | [0m 0.9797  [0m | [0m 0.548   [0m | [0m 0.8548  [0m | [0m 0.8278  [0m | [0m 56.28   [0m | [0m 26.84   [0m | [0m 62.05   [0m | [0m 45.01   [0m | [0m 62.09   [0m | [0m 0.4252  [0m |
| [95m 4       [0m | [95m 0.981   [0m | [95m 0.8202  [0m | 

In [19]:
# 1-month buffer: original + rolling-window features, 2022 as the test split.
X_train, X_test, y_train, y_test = train_test_data(
    df_buffer_1,
    feature_type="original+rolling window",
    test_yr=2022,
)
(
    model_v11,
    feature_importance_v11,
    train_eval_v11,
    test_eval_v11,
) = model_eval(X_train, X_test, y_train, y_test)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9831  [0m | [0m 0.9738  [0m | [0m 0.2812  [0m | [0m 0.5985  [0m | [0m 49.98   [0m | [0m 24.1    [0m | [0m 10.26   [0m | [0m 35.74   [0m | [0m 74.94   [0m | [0m 0.4615  [0m |
| [0m 2       [0m | [0m 0.9707  [0m | [0m 0.9909  [0m | [0m 0.7939  [0m | [0m 0.9862  [0m | [0m 84.63   [0m | [0m 12.59   [0m | [0m 86.15   [0m | [0m 12.12   [0m | [0m 67.99   [0m | [0m 0.258   [0m |
| [0m 3       [0m | [0m 0.9823  [0m | [0m 0.548   [0m | [0m 0.8548  [0m | [0m 0.8278  [0m | [0m 56.28   [0m | [0m 26.84   [0m | [0m 62.05   [0m | [0m 45.01   [0m | [0m 62.09   [0m | [0m 0.4252  [0m |
| [95m 4       [0m | [95m 0.9836  [0m | [95m 0.8202  [0m | 

In [20]:
# 1-month buffer: original + rolling-window + delta features, 2022 as the test split.
X_train, X_test, y_train, y_test = train_test_data(
    df_buffer_1,
    feature_type="original+rolling window+delta",
    test_yr=2022,
)
(
    model_v21,
    feature_importance_v21,
    train_eval_v21,
    test_eval_v21,
) = model_eval(X_train, X_test, y_train, y_test)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9843  [0m | [0m 0.9738  [0m | [0m 0.2812  [0m | [0m 0.5985  [0m | [0m 49.98   [0m | [0m 24.1    [0m | [0m 10.26   [0m | [0m 35.74   [0m | [0m 74.94   [0m | [0m 0.4615  [0m |
| [0m 2       [0m | [0m 0.9705  [0m | [0m 0.9909  [0m | [0m 0.7939  [0m | [0m 0.9862  [0m | [0m 84.63   [0m | [0m 12.59   [0m | [0m 86.15   [0m | [0m 12.12   [0m | [0m 67.99   [0m | [0m 0.258   [0m |
| [0m 3       [0m | [0m 0.9831  [0m | [0m 0.548   [0m | [0m 0.8548  [0m | [0m 0.8278  [0m | [0m 56.28   [0m | [0m 26.84   [0m | [0m 62.05   [0m | [0m 45.01   [0m | [0m 62.09   [0m | [0m 0.4252  [0m |
| [95m 4       [0m | [95m 0.9857  [0m | [95m 0.8202  [0m | 

In [21]:
# 1-month buffer: all feature families (original, rolling window, delta, ratio).
X_train, X_test, y_train, y_test = train_test_data(
    df_buffer_1,
    feature_type="original+rolling window+delta+ratio",
    test_yr=2022,
)
(
    model_v31,
    feature_importance_v31,
    train_eval_v31,
    test_eval_v31,
) = model_eval(X_train, X_test, y_train, y_test)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9847  [0m | [0m 0.9738  [0m | [0m 0.2812  [0m | [0m 0.5985  [0m | [0m 49.98   [0m | [0m 24.1    [0m | [0m 10.26   [0m | [0m 35.74   [0m | [0m 74.94   [0m | [0m 0.4615  [0m |
| [0m 2       [0m | [0m 0.9732  [0m | [0m 0.9909  [0m | [0m 0.7939  [0m | [0m 0.9862  [0m | [0m 84.63   [0m | [0m 12.59   [0m | [0m 86.15   [0m | [0m 12.12   [0m | [0m 67.99   [0m | [0m 0.258   [0m |
| [0m 3       [0m | [0m 0.9825  [0m | [0m 0.548   [0m | [0m 0.8548  [0m | [0m 0.8278  [0m | [0m 56.28   [0m | [0m 26.84   [0m | [0m 62.05   [0m | [0m 45.01   [0m | [0m 62.09   [0m | [0m 0.4252  [0m |
| [95m 4       [0m | [95m 0.9859  [0m | [95m 0.8202  [0m | 

In [22]:
# Training-set metrics of the four 1-month-buffer models, side by side.
evaluation_table(
    train_eval_v01,
    train_eval_v11,
    train_eval_v21,
    train_eval_v31,
    "Training Set",
)

Unnamed: 0,Features,# of sample,precision,recall,f1_score,ROC-AUC,pr-auc
0,original feature,293523,88.99%,87.88%,88.43%,98.87%,94.80%
1,original + rolling window feature,293523,90.17%,88.29%,89.22%,98.94%,95.36%
2,original + rolling window + delta feature,293523,91.35%,88.75%,90.03%,99.16%,95.85%
3,original + rolling window + delta + ratio feature,293523,91.76%,93.93%,92.83%,99.69%,98.01%


In [23]:
# Test-set metrics of the four 1-month-buffer models, side by side.
evaluation_table(
    test_eval_v01,
    test_eval_v11,
    test_eval_v21,
    test_eval_v31,
    "Test Set",
)

Unnamed: 0,Features,# of sample,precision,recall,f1_score,ROC-AUC,pr-auc
0,original feature,35694,82.02%,72.37%,76.89%,98.12%,83.90%
1,original + rolling window feature,35694,83.57%,75.35%,79.25%,98.35%,86.43%
2,original + rolling window + delta feature,35694,80.90%,76.11%,78.43%,98.14%,85.37%
3,original + rolling window + delta + ratio feature,35694,78.34%,77.28%,77.81%,98.26%,85.21%


In [24]:
def feature_importance(model):
    """Return the model's features ordered from most to least important.

    Declared inside this cell so the cell does not depend on whatever
    ``feature_importance`` happens to be bound to when it runs.

    ``model`` must expose ``feature_name()`` and ``feature_importance()``.
    The result is a DataFrame with columns ``rank`` (0 = most important),
    ``feature`` and ``importance``, on a fresh 0..n-1 index.
    """
    ranked = pd.DataFrame({
        'feature': model.feature_name(),
        'importance': model.feature_importance(),
    })
    ranked = ranked.sort_values('importance', ascending=False)
    # Rank is assigned after sorting, so it follows the sorted row order.
    ranked["rank"] = list(range(len(model.feature_name())))
    ranked = ranked.loc[:, ["rank", "feature", "importance"]]
    return ranked.reset_index(drop=True)

# Ranked feature importances of the four 1-month-buffer models.
df_feature_importance_v0=feature_importance(model_v01)
df_feature_importance_v1=feature_importance(model_v11)
df_feature_importance_v2=feature_importance(model_v21)
df_feature_importance_v3=feature_importance(model_v31)

# .loc slicing is label-based and inclusive, so this keeps index labels 0..30.
f0=df_feature_importance_v0.loc[:30,['rank','feature']].rename(columns={"feature":"original feature"})
f1=df_feature_importance_v1.loc[:30,['rank','feature']].rename(columns={"feature":"original + rolling window feature"})
f2=df_feature_importance_v2.loc[:30,['rank','feature']].rename(columns={"feature":"original + rolling window + delta feature"})
f3=df_feature_importance_v3.loc[:30,['rank','feature']].rename(columns={"feature":"original + rolling window + delta + ratio feature"})

# Line the four rankings up side by side on `rank`.
# The result gets its own name: assigning it to `feature_importance` would
# replace the helper function with a DataFrame and make any later call to the
# helper fail with "'DataFrame' object is not callable".
feature_importance_table = (
    f0.merge(f1, how="inner", on="rank")
    .merge(f2, how="inner", on="rank")
    .merge(f3, how="inner", on="rank")
)
feature_importance_table

Unnamed: 0,rank,original feature,original + rolling window feature,original + rolling window + delta feature,original + rolling window + delta + ratio feature
0,0,survival_month,survival_month,survival_month,survival_month
1,1,AvgPdBilldueDays,L2_AvgPdBillLstGenDays,d12_Lag12_cntBills,d1_PaidBillDueDays
2,2,PaidBillDueDays,L1_AvgPdBilldueDays,PaidBillLastGenDays,d2_AvgPdBilldueDays
3,3,PaidBillLastGenDays,L1_AvgPdBillLstGenDays,AvgPdBilldueDays,L2_AvgPdBilldueDays
4,4,AvgPdBillLstGenDays,L2_PaidBillLastGenDays,d1_PaidBillDueDays,L1_AvgPdBilldueDays
5,5,Lag12_cntBills,L2_AvgPdBilldueDays,d1_AvgPdBilldueDays,d12_AvgPdBilldueDays
6,6,OrigBillAmt,L6_AvgPdBilldueDays,d12_Lag12_cntFirstGenPaidFull,r2_PaidBillDueDays
7,7,Lag12_cntFirstGenPaidFull,L3_PaidBillDueDays,d3_PaidBillDueDays,r12_PaidBillLastGenDays
8,8,CurrBillAmt,PaidBillDueDays,Lag12_cntBills,r1_AvgPdBilldueDays
9,9,Lag12_cntBillGens,L2_PaidBillDueDays,PaidBillDueDays,d3_AvgPdBilldueDays


# 0 month buffer

In [25]:
# 0-month-buffer frame, original features only; year 2022 is the test split.
X_train, X_test, y_train, y_test = train_test_data(
    df_buffer_0,
    feature_type="original",
    test_yr=2022,
)
# Row counts of the two splits.
print(f"{'training features: ':<30}{len(X_train):<20,}")
print(f"{'testing features: ':<30}{len(X_test):<20,}")

training features:            296,747             
testing features:             37,389              


In [26]:
# Fit and score the model on the split currently held in X_train / X_test.
(
    model_v00,
    feature_importance_v00,
    train_eval_v00,
    test_eval_v00,
) = model_eval(X_train, X_test, y_train, y_test)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9974  [0m | [0m 0.9738  [0m | [0m 0.2812  [0m | [0m 0.5985  [0m | [0m 49.98   [0m | [0m 24.1    [0m | [0m 10.26   [0m | [0m 35.74   [0m | [0m 74.94   [0m | [0m 0.4615  [0m |
| [0m 2       [0m | [0m 0.9959  [0m | [0m 0.9909  [0m | [0m 0.7939  [0m | [0m 0.9862  [0m | [0m 84.63   [0m | [0m 12.59   [0m | [0m 86.15   [0m | [0m 12.12   [0m | [0m 67.99   [0m | [0m 0.258   [0m |
| [0m 3       [0m | [0m 0.9962  [0m | [0m 0.548   [0m | [0m 0.8548  [0m | [0m 0.8278  [0m | [0m 56.28   [0m | [0m 26.84   [0m | [0m 62.05   [0m | [0m 45.01   [0m | [0m 62.09   [0m | [0m 0.4252  [0m |
| [0m 4       [0m | [0m 0.9974  [0m | [0m 0.8202  [0m | [0m

In [27]:
# 0-month buffer: original + rolling-window features, 2022 as the test split.
X_train, X_test, y_train, y_test = train_test_data(
    df_buffer_0,
    feature_type="original+rolling window",
    test_yr=2022,
)
(
    model_v10,
    feature_importance_v10,
    train_eval_v10,
    test_eval_v10,
) = model_eval(X_train, X_test, y_train, y_test)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.998   [0m | [0m 0.9738  [0m | [0m 0.2812  [0m | [0m 0.5985  [0m | [0m 49.98   [0m | [0m 24.1    [0m | [0m 10.26   [0m | [0m 35.74   [0m | [0m 74.94   [0m | [0m 0.4615  [0m |
| [0m 2       [0m | [0m 0.9966  [0m | [0m 0.9909  [0m | [0m 0.7939  [0m | [0m 0.9862  [0m | [0m 84.63   [0m | [0m 12.59   [0m | [0m 86.15   [0m | [0m 12.12   [0m | [0m 67.99   [0m | [0m 0.258   [0m |
| [0m 3       [0m | [0m 0.9962  [0m | [0m 0.548   [0m | [0m 0.8548  [0m | [0m 0.8278  [0m | [0m 56.28   [0m | [0m 26.84   [0m | [0m 62.05   [0m | [0m 45.01   [0m | [0m 62.09   [0m | [0m 0.4252  [0m |
| [95m 4       [0m | [95m 0.9981  [0m | [95m 0.8202  [0m | 

In [28]:
# 0-month buffer: original + rolling-window + delta features, 2022 as the test split.
X_train, X_test, y_train, y_test = train_test_data(
    df_buffer_0,
    feature_type="original+rolling window+delta",
    test_yr=2022,
)
(
    model_v20,
    feature_importance_v20,
    train_eval_v20,
    test_eval_v20,
) = model_eval(X_train, X_test, y_train, y_test)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9983  [0m | [0m 0.9738  [0m | [0m 0.2812  [0m | [0m 0.5985  [0m | [0m 49.98   [0m | [0m 24.1    [0m | [0m 10.26   [0m | [0m 35.74   [0m | [0m 74.94   [0m | [0m 0.4615  [0m |
| [0m 2       [0m | [0m 0.9968  [0m | [0m 0.9909  [0m | [0m 0.7939  [0m | [0m 0.9862  [0m | [0m 84.63   [0m | [0m 12.59   [0m | [0m 86.15   [0m | [0m 12.12   [0m | [0m 67.99   [0m | [0m 0.258   [0m |
| [0m 3       [0m | [0m 0.9977  [0m | [0m 0.548   [0m | [0m 0.8548  [0m | [0m 0.8278  [0m | [0m 56.28   [0m | [0m 26.84   [0m | [0m 62.05   [0m | [0m 45.01   [0m | [0m 62.09   [0m | [0m 0.4252  [0m |
| [95m 4       [0m | [95m 0.9984  [0m | [95m 0.8202  [0m | 

In [29]:
# 0-month buffer: all feature families (original, rolling window, delta, ratio).
X_train, X_test, y_train, y_test = train_test_data(
    df_buffer_0,
    feature_type="original+rolling window+delta+ratio",
    test_yr=2022,
)
(
    model_v30,
    feature_importance_v30,
    train_eval_v30,
    test_eval_v30,
) = model_eval(X_train, X_test, y_train, y_test)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9983  [0m | [0m 0.9738  [0m | [0m 0.2812  [0m | [0m 0.5985  [0m | [0m 49.98   [0m | [0m 24.1    [0m | [0m 10.26   [0m | [0m 35.74   [0m | [0m 74.94   [0m | [0m 0.4615  [0m |
| [0m 2       [0m | [0m 0.9969  [0m | [0m 0.9909  [0m | [0m 0.7939  [0m | [0m 0.9862  [0m | [0m 84.63   [0m | [0m 12.59   [0m | [0m 86.15   [0m | [0m 12.12   [0m | [0m 67.99   [0m | [0m 0.258   [0m |
| [0m 3       [0m | [0m 0.9969  [0m | [0m 0.548   [0m | [0m 0.8548  [0m | [0m 0.8278  [0m | [0m 56.28   [0m | [0m 26.84   [0m | [0m 62.05   [0m | [0m 45.01   [0m | [0m 62.09   [0m | [0m 0.4252  [0m |
| [95m 4       [0m | [95m 0.9984  [0m | [95m 0.8202  [0m | 

In [30]:
# Training-set metrics of the four 0-month-buffer models, side by side.
evaluation_table(
    train_eval_v00,
    train_eval_v10,
    train_eval_v20,
    train_eval_v30,
    "Training Set",
)

Unnamed: 0,Features,# of sample,precision,recall,f1_score,ROC-AUC,pr-auc
0,original feature,296747,95.46%,97.38%,96.41%,99.93%,99.03%
1,original + rolling window feature,296747,97.97%,98.04%,98.01%,99.98%,99.80%
2,original + rolling window + delta feature,296747,98.30%,99.14%,98.72%,99.99%,99.88%
3,original + rolling window + delta + ratio feature,296747,97.88%,99.02%,98.44%,99.99%,99.84%


In [31]:
# Test-set metrics of the four 0-month-buffer models, side by side.
evaluation_table(
    test_eval_v00,
    test_eval_v10,
    test_eval_v20,
    test_eval_v30,
    "Test Set",
)

Unnamed: 0,Features,# of sample,precision,recall,f1_score,ROC-AUC,pr-auc
0,original feature,37389,79.48%,94.95%,86.53%,99.24%,91.20%
1,original + rolling window feature,37389,95.14%,92.10%,93.59%,99.72%,98.19%
2,original + rolling window + delta feature,37389,96.21%,90.16%,93.08%,99.67%,98.14%
3,original + rolling window + delta + ratio feature,37389,91.89%,93.92%,92.89%,99.65%,98.05%


In [32]:
def feature_importance(model):
    """Return the model's features ordered from most to least important.

    Declared inside this cell so the cell does not depend on whatever
    ``feature_importance`` happens to be bound to when it runs.

    ``model`` must expose ``feature_name()`` and ``feature_importance()``.
    The result is a DataFrame with columns ``rank`` (0 = most important),
    ``feature`` and ``importance``, on a fresh 0..n-1 index.
    """
    ranked = pd.DataFrame({
        'feature': model.feature_name(),
        'importance': model.feature_importance(),
    })
    ranked = ranked.sort_values('importance', ascending=False)
    # Rank is assigned after sorting, so it follows the sorted row order.
    ranked["rank"] = list(range(len(model.feature_name())))
    ranked = ranked.loc[:, ["rank", "feature", "importance"]]
    return ranked.reset_index(drop=True)

# Ranked feature importances of the four 0-month-buffer models.
df_feature_importance_v0=feature_importance(model_v00)
df_feature_importance_v1=feature_importance(model_v10)
df_feature_importance_v2=feature_importance(model_v20)
df_feature_importance_v3=feature_importance(model_v30)

# .loc slicing is label-based and inclusive, so this keeps index labels 0..30.
f0=df_feature_importance_v0.loc[:30,['rank','feature']].rename(columns={"feature":"original feature"})
f1=df_feature_importance_v1.loc[:30,['rank','feature']].rename(columns={"feature":"original + rolling window feature"})
f2=df_feature_importance_v2.loc[:30,['rank','feature']].rename(columns={"feature":"original + rolling window + delta feature"})
f3=df_feature_importance_v3.loc[:30,['rank','feature']].rename(columns={"feature":"original + rolling window + delta + ratio feature"})

# Line the four rankings up side by side on `rank`.
# The result gets its own name: assigning it to `feature_importance` would
# replace the helper function with a DataFrame and make any later call to the
# helper fail with "'DataFrame' object is not callable".
feature_importance_table = (
    f0.merge(f1, how="inner", on="rank")
    .merge(f2, how="inner", on="rank")
    .merge(f3, how="inner", on="rank")
)
feature_importance_table

Unnamed: 0,rank,original feature,original + rolling window feature,original + rolling window + delta feature,original + rolling window + delta + ratio feature
0,0,survival_month,survival_month,survival_month,survival_month
1,1,OrigBillAmt,L12_PaidBillLastGenDays,d12_CurrPaidAmt,d12_AvgPdBilldueDays
2,2,CurrBillAmt,L3_AvgPdBillLstGenDays,d12_PaidBillDueDays,r1_Lag12_cntFirstGenPaidFull
3,3,AvgPdBillLstGenDays,L12_AvgPdBilldueDays,d12_AvgPdBilldueDays,L12_AvgPdBilldueDays
4,4,Lag12_cntFirstGenPaidFull,L2_AvgPdBilldueDays,d3_AvgPdBilldueDays,L2_AvgPdBilldueDays
5,5,Lag12_cntPaidFull,L12_AvgPdBillLstGenDays,L6_PaidBillLastGenDays,r2_AvgPdBillLstGenDays
6,6,PaidBillDueDays,CurrBillAmt,L2_PaidBillLastGenDays,r1_PaidBillDueDays
7,7,AvgPdBilldueDays,L2_PaidBillLastGenDays,AvgPdBillLstGenDays,r1_AvgPdBilldueDays
8,8,Lag12_cntBillGens,L6_AvgPdBillLstGenDays,d1_OrigBillAmt,d3_AvgPdBilldueDays
9,9,CurrPaidAmt,L3_PaidBillDueDays,d12_Lag12_cntBills,L2_PaidBillDueDays
