In [None]:
#| default_exp train

# train

> Train tree models.

In [None]:
#|hide
from nbdev.showdoc import *

In [None]:
#|export
from typing import List
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

def train_tree_models(df_train: pd.DataFrame, # train dataset
                      df_test: pd.DataFrame,  # test dataset
                      target: str,            # target variable
                      folds: int,             # cross-validation number of folds
                      cat_features: List,     # categorical features
                      num_features: List,     # numeric features
                      seed: int               # seed
                      ) -> dict:              # AUC per model and per split
    """Fit a decision tree and a random forest and report their train/test ROC AUC.

    Both classifiers are wrapped in a pipeline that imputes the numeric
    features, imputes the categorical features and one-hot encodes the
    categorical features (all three transformers use their library defaults).
    Each pipeline is fitted on `df_train` and scored on both `df_train` and
    `df_test`.

    `folds` is currently not used: this function performs no cross-validation.
    The parameter is kept so existing callers keep working.

    Returns a dict keyed by model name (`'dt'`, `'rf'`); each value has the
    shape `{'auc': {'train': float, 'test': float}}`.
    """
    # Copy into plain lists so any sequence (e.g. a tuple) can be concatenated.
    cat_cols, num_cols = list(cat_features), list(num_features)
    features = cat_cols + num_cols
    X_train, y_train = df_train[features], df_train[target]
    X_test, y_test = df_test[features], df_test[target]

    def _preprocessing_steps():
        # Build new, unfitted transformers on every call. Pipeline stores the
        # estimator objects it is given, so reusing one list of steps would
        # make both models share (and re-fit) the same transformer instances.
        return [
            ('numeric_imputer', ArbitraryNumberImputer(variables=num_cols)),
            ('categoric_imputer', CategoricalImputer(variables=cat_cols)),
            ('one_hot_encoder', OneHotEncoder(variables=cat_cols)),
        ]

    classifiers = [
        ('dt', DecisionTreeClassifier(max_depth=5, random_state=seed)),
        ('rf', RandomForestClassifier(n_estimators=300, max_depth=5, random_state=seed)),
    ]

    metrics = {}
    for name, classifier in classifiers:
        model = Pipeline(steps=_preprocessing_steps() + [(name, classifier)])
        model.fit(X_train, y_train)

        # Column 1 of predict_proba is used as the score of the positive class.
        # NOTE(review): this assumes a binary target — confirm for new datasets.
        y_prob_train = model.predict_proba(X_train)[:, 1]
        y_prob_test = model.predict_proba(X_test)[:, 1]

        metrics[name] = {
            'auc': {
                'train': roc_auc_score(y_train, y_prob_train),
                'test': roc_auc_score(y_test, y_prob_test)
            }
        }

    return metrics

# Basic usage

First, load the analytical base table:

In [None]:
from pathlib import Path

# Locate the datasets folder relative to the notebook directory.
path = Path('..')
datasets_path = path.joinpath('datasets')

# Analytical base table used by the examples below.
df = pd.read_csv(datasets_path.joinpath('churn_abt.csv'))

In [None]:
# Preview the loaded table; the rendered output shows only the first and last rows.
df

Unnamed: 0,data_ref_safra,seller_id,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,nao_revendeu_next_6m
0,2018-01-01,0015a82c2db000af6aaaf3ae2ecb0532,SP,3,3,1,2685.00,74,1
1,2018-01-01,001cca7ae9ae17fb1caed9dfb1094831,ES,171,207,9,21275.23,2,0
2,2018-01-01,002100f778ceb8431b7a1020ff7ab48f,SP,38,42,15,781.80,2,0
3,2018-01-01,003554e2dce176b5555353e4f3555ac8,GO,1,1,1,120.00,16,1
4,2018-01-01,004c9cd9d87a3c30c522c48c4fc07416,SP,130,141,75,16228.88,8,0
...,...,...,...,...,...,...,...,...,...
5364,2018-03-01,ff82e8873fba613f2261a9acc896fd84,MG,4,4,3,124.60,12,1
5365,2018-03-01,ffc470761de7d0232558ba5e786e57b7,SP,5,5,5,385.59,0,0
5366,2018-03-01,ffdd9f82b9a447f6f8d4b91554cc7dd3,PR,11,12,8,1450.20,7,0
5367,2018-03-01,ffeee66ac5d5a62fe688b9d26f83f534,SP,13,13,3,1709.87,0,0


Split the data into a training set and an out-of-time (test) set:

In [None]:
# Reference periods before the cutoff are used for training;
# the cutoff period itself is held out as the out-of-time set.
oot_safra = "2018-03-01"
df_train = df.query('data_ref_safra < @oot_safra')
df_oot = df.query('data_ref_safra == @oot_safra')

Get feature metadata and types:

In [None]:
# Identifier columns and the target are never used as model features.
key_vars = ['data_ref_safra', 'seller_id']
target = 'nao_revendeu_next_6m'

# Numeric columns, minus the target, are the numeric features.
num_vars = [
    column
    for column in df.select_dtypes(include='number').columns.tolist()
    if column != target
]

# Non-numeric columns, minus the identifier columns, are the categorical features.
cat_vars = [
    column
    for column in df.select_dtypes(exclude='number').columns.tolist()
    if column not in key_vars
]

Train tree-based models:

In [None]:
# Fit both tree models and display their train / out-of-time AUC.
train_tree_models(
    df_train,
    df_oot,
    target=target,
    folds=5,
    cat_features=cat_vars,
    num_features=num_vars,
    seed=42,
)

{'dt': {'auc': {'train': 0.9139680595991275, 'test': 0.8968114296299949}},
 'rf': {'auc': {'train': 0.9072972070544887, 'test': 0.8964968670043654}}}