In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, fbeta_score, confusion_matrix, ConfusionMatrixDisplay

from aml import AutoMLClassifier

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


In [2]:
# sample data
cols = ['Attrition_Flag', 'Customer_Age', 'Gender',
       'Dependent_count', 'Education_Level', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

churn_data = pd.read_csv('data/BankChurners.csv', usecols=cols)

In [None]:
#churn_data.info()

In [3]:
label = 'Attrition_Flag'

y = churn_data[label]
X = churn_data.drop(label, axis=1)

# convert to list
labels = list(y.unique())
y = y.apply(lambda x: labels.index(x))

categorical = ['Gender', 'Education_Level',
                       'Marital_Status', 'Income_Category', 'Card_Category']
numeric = ['Customer_Age', 'Dependent_count', 'Months_on_book', 'Total_Relationship_Count',
                   'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
                   'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,random_state=1234, test_size=0.2)

In [5]:
def show_test_scores(m):
    for pipe, params, *_ in m.best_models:
        print("Model Type:", str(type(pipe[-1]))) #last step is a classifier
        print("Best pipeline parameters:", str(params))
        y_pred = pipe.predict(X_test)
        roc_auc = roc_auc_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        print("ROC AUC: ", roc_auc)
        print("Precision: ", precision)
        print("Recall :", recall)
        print("F1 score:", 2*precision*recall/(precision+recall))
        print()

In [6]:
aml = AutoMLClassifier('roc_auc', 10, try_LR=False, try_DT=False, try_RF=False, try_HGB=True, try_GB=False, try_SVC=False, try_KM=False, try_MLP=False)

In [7]:
%%time
#aml.fit(X_train, y_train, categorical, numeric)
aml.fit(X_train, y_train)

INFO:root:Trying <class 'sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier'>...
INFO:root:Best parameters: {'clf__max_leaf_nodes': 20, 'clf__max_iter': 250, 'clf__max_depth': 75, 'clf__l2_regularization': 9}
INFO:root:Best CV score: 0.9940058148819293
CPU times: user 17.9 s, sys: 1.07 s, total: 18.9 s
Wall time: 22.6 s


In [8]:
show_test_scores(aml)

Model Type: <class 'sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier'>
Best pipeline parameters: {'clf__max_leaf_nodes': 20, 'clf__max_iter': 250, 'clf__max_depth': 75, 'clf__l2_regularization': 9}
ROC AUC:  0.9271541627097183
Precision:  0.909967845659164
Recall : 0.8707692307692307
F1 score: 0.889937106918239

