# Ejemplo del ejercicio utilizando XGBoost

Esta notebook contiene un análisis de las *features* aprendidas por el modelo, con el fin de extraer *insights* que puedan ser de utilidad para el negocio

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import graphviz
import shap
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

## Lectura de datos

In [2]:
df_data = pd.read_csv('dataset_balanceado.csv')
df_data.head()

Unnamed: 0,admin_visits,intercom_conversations,products_with_description,total_products_with_images,total_product_categories,total_events_on_Android,total_events_on_Web,total_events_on_iOS,country_AR,country_BR,...,source_pulido_none,source_pulido_partners,creation_weekday_0,creation_weekday_1,creation_weekday_2,creation_weekday_3,creation_weekday_4,creation_weekday_5,creation_weekday_6,target
0,0.045455,0.0,0.0,0.0,0.0,0.0,0.001096,0.0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,0.045455,0.0,0.0,0.0,0.0,0.0,0.001576,0.0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,0.045455,0.0,3.5e-05,3.5e-05,0.0,0.0,0.001713,0.0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0.045455,0.0,3.5e-05,3.5e-05,0.0,0.0,0.004522,0.0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0.045455,0.0,0.0,0.0,0.0,0.002345,0.0,0.0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [3]:
df_data.columns

Index(['admin_visits', 'intercom_conversations', 'products_with_description',
       'total_products_with_images', 'total_product_categories',
       'total_events_on_Android', 'total_events_on_Web', 'total_events_on_iOS',
       'country_AR', 'country_BR', 'country_CL', 'country_CO', 'country_MX',
       'creation_platform_desktop', 'creation_platform_mobile_app',
       'creation_platform_mobile_web', 'creation_platform_tablet',
       'source_pulido_0', 'source_pulido_Brand', 'source_pulido_Direct',
       'source_pulido_Facebook CPC', 'source_pulido_Google CPC Competitors',
       'source_pulido_Google CPC DSA', 'source_pulido_Google CPC no Brand',
       'source_pulido_Google Organic', 'source_pulido_Other',
       'source_pulido_Store Referral', 'source_pulido_none',
       'source_pulido_partners', 'creation_weekday_0', 'creation_weekday_1',
       'creation_weekday_2', 'creation_weekday_3', 'creation_weekday_4',
       'creation_weekday_5', 'creation_weekday_6', 'target'],
    

In [4]:
df_data = df_data[['admin_visits', 'intercom_conversations', 'products_with_description',
                   'total_products_with_images', 'total_product_categories','total_events_on_Android', 
                   'total_events_on_Web', 'total_events_on_iOS','country_AR', 'country_BR', 'country_CL',
                   'country_CO', 'country_MX', 'target']]

## Dataset entrenamiento y test

A continuación se define una función que divide el dataset en dos: uno para entrenamiento del modelo y otro para la evaluación del mismo

In [5]:
y = df_data['target']
X = df_data.drop(columns=['target'])
#Es de utilidad para XGBoost
data_dmatrix = xgb.DMatrix(data=X,label=y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

## Definición y entrenamiento del modelo

Se entrena un clasificador *XGBoost*. Primero se entrena con un modelo clásico dividiendo el dataset en *train* y *test*. Luego se entrena un modelo más robusto utilizando la estrategia de *kfolds*

In [6]:
clf = xgb.XGBClassifier(colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)
clf.fit(X_train, y_train)

XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=10, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=10,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [7]:
param = {'max_depth': 5, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = ['auc', 'aucpr']

In [8]:
cv_results = xgb.cv(
    dtrain=data_dmatrix, 
    params=param, 
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10, 
    as_pandas=True
)

In [9]:
cv_results

Unnamed: 0,test-auc-mean,test-auc-std,test-aucpr-mean,test-aucpr-std,train-auc-mean,train-auc-std,train-aucpr-mean,train-aucpr-std
0,0.843467,0.001916,0.812899,0.003681,0.84526,0.000746,0.816104,0.001356
1,0.847583,0.001833,0.818485,0.002835,0.850505,0.00024,0.824656,0.000422
2,0.849196,0.00121,0.820948,0.002696,0.852677,0.000368,0.827704,0.001164
3,0.851205,0.001435,0.822649,0.00292,0.855376,0.000645,0.830432,0.000708
4,0.851973,0.000476,0.822814,0.00215,0.857065,0.000712,0.833248,0.000404


### Hiper parametrización del modelo

Se analizan diferentes configuraciones del algoritmo con el fin de encontrar aquella que tiene un mejor rendimiento

In [14]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }

In [16]:
def objective(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    y_pred = clf.predict(X_test)
    f1_result = f1_score(y_test, y_pred, zero_division=1)
    print ("F1 score:", f1_result)
    return {'loss': -f1_result, 'status': STATUS_OK }

In [None]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)


F1 score:                                              
0.7499631974090976                                     
F1 score:                                                                         
0.7420309398059708                                                                
F1 score:                                                                         
0.7436691157274458                                                                
F1 score:                                                                         
0.7474238268218548                                                                
F1 score:                                                                         
0.7506542331650142                                                                
F1 score:                                                                         
0.7514986576440734                                                                
F1 score:                                                 

In [None]:
param_tuning = {
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7, 10],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.5, 0.7],
        'colsample_bytree': [0.3, 0.5, 0.7],
        'n_estimators' : [100, 200, 500],
        'objective': ['binary:logistic']
    }

In [None]:
xg_clf = xgb.XGBClassifier()

gsearch = GridSearchCV(estimator = xg_clf,
                           param_grid = param_tuning,                        
                           scoring = 'f1', #F1 score
                           cv = 5,
                           n_jobs = -1,
                           verbose = 1)

gsearch.fit(X_train,y_train)

In [None]:
gsearch.best_params_

In [None]:
xg_clf = xgb.train(params=gsearch.best_params_, dtrain=data_dmatrix, num_boost_round=10)

## Análisis de los features utilizados por el modelo

Para poder obtener *insights* que puedan ser de utilidad para el negocio se analizan la importancia que el modelo le asigna a cada uno de los *features*.

Para ello se utiliza la librería *shap* que permite realizar un análisis de la utilización de los *features* por el modelo

In [None]:
#Path for using shap library

model_bytearray = xg_clf.save_raw()[4:]
def myfun(self=None):
    return model_bytearray

xg_clf.save_raw = myfun

In [None]:
explainer = shap.TreeExplainer(xg_clf)

In [None]:
pred = xg_clf.predict(data_dmatrix, output_margin=True)
explainer = shap.TreeExplainer(xg_clf)
shap_values = explainer.shap_values(data_dmatrix)
np.abs(shap_values.sum(1) + explainer.expected_value - pred).max()

Primero se observa la importancia que le asigna el modelo a cada *feature*

In [None]:
shap.summary_plot(shap_values, X, plot_type='bar')

In [None]:
plt.rcParams['figure.figsize'] = [10,10]
xgb.plot_importance(xg_clf)

Luego se observa el impacto que cada *feature* tuvo para el modelo a la hora de generar las predicciones sobre el dataset

In [None]:
shap.summary_plot(shap_values, X)

In [None]:
# visualize the first prediction's explanation
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:])

In [None]:
plt.rcParams['figure.figsize'] = [50, 50]
xgb.plot_tree(xg_clf, num_trees=1)