# Construcción del modelo XGBoost

## Importar librerías

In [None]:
import pandas as pd
import pyarrow
import numpy as np
import xgboost as xgb
import pickle
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn import metrics

## Lectura de datos

In [None]:
# Load the cleaned training data file from parquet, reading it with the pyarrow engine.
df_train = pd.read_parquet(
    "data/data_modelo_train_cleaned.parquet",
    engine="pyarrow",
)

In [None]:
# Load the cleaned test data file from parquet, reading it with the pyarrow engine.
df_test = pd.read_parquet(
    "data/data_modelo_test_cleaned.parquet",
    engine="pyarrow",
)

In [None]:
# Keep only the training rows whose SEMANA value is one of 14, 15, 16 or 17.
selected_weeks_mask = df_train['SEMANA'].isin([14, 15, 16, 17])
df_train = df_train[selected_weeks_mask]

## Transformación de datos

In [None]:
# Remove the columns that add no value to the model.
# The drops rebind the names instead of using `inplace=True`: `df_train` was
# produced by boolean filtering, and mutating such a frame in place can emit
# SettingWithCopyWarning on pre-3.0 pandas. The column list is declared once
# so the train and test frames always lose exactly the same columns.
columns_without_model_value = ['CLIENTE_ID', 'PRODUCTO_ID', 'ANIO', 'SEMANA']
df_train = df_train.drop(columns=columns_without_model_value)
df_test = df_test.drop(columns=columns_without_model_value)

In [None]:
# Feature columns kept for the XGBoost model (selected via RFECV, per the
# original author's note). The order below is the column order of the
# feature matrices built from this list.
important_columns_xgb = [
    'FRECUENCIA',
    'RECENCY',
    'DURATION_CLIENT',
    'TAMANIO',
    'UNIDAD_EMPAQUE',
    # CANAL_* / SUBCANAL_* columns
    'CANAL_RSR',
    'SUBCANAL_OTHER',
    'SUBCANAL_VINATERÍA',
    # MARCA_* columns
    'MARCA_CIEL',
    'MARCA_COCA-COLA',
    'MARCA_DEL VALLE',
    'MARCA_FANTA',
    'MARCA_FRESCA',
    'MARCA_FUZE',
    'MARCA_MONSTER - PREDATOR',
    'MARCA_MUNDET',
    'MARCA_SANTA CLARA',
    'MARCA_SPRITE',
    'MARCA_YOLI - SENZAO-AMEYAL',
    # SABOR_* columns
    'SABOR_COLA',
    'SABOR_FRUTAS',
    'SABOR_LIMA-LIMON',
    'SABOR_MANGO',
    'SABOR_MORAS',
    'SABOR_NATURAL',
    'SABOR_UVA',
    # EMPAQUE_* / TIPO_CONSUMO_* columns
    'EMPAQUE_PET',
    'EMPAQUE_REF PET',
    'EMPAQUE_VIDRIO',
    'TIPO_CONSUMO_PERSONALES (SS)',
    # CLUSTER_* columns
    'CLUSTER_1',
    'CLUSTER_2',
    # Remaining numeric columns
    'ROTATION_MEAN_CF',
    'ROTATION_MEAN_DAYS',
    'CF_LOG',
]

In [None]:
# Build the independent-variable matrices and the target vectors for the
# train and test sets.
# The feature columns are selected directly: 'PURCHASE' is not part of
# `important_columns_xgb`, so dropping it first (which copied the whole frame
# and mixed `columns=` with `axis=`) had no effect on the result.
X_train = df_train[important_columns_xgb]
y_train = df_train['PURCHASE']

X_test = df_test[important_columns_xgb]
y_test = df_test['PURCHASE']

## Entrenamiento del modelo

In [None]:
# Fit the XGBoost classifier with fixed hyperparameters and print its ROC AUC
# on the test set.
# NOTE(review): this seeds NumPy's global RNG only; no `random_state` is
# passed to the classifier below — confirm this is the intended way to make
# the run reproducible.
np.random.seed(1234)

params = dict(
    # Learning task and tree-construction settings.
    objective='binary:logistic',
    booster='gbtree',
    n_jobs=-1,
    verbosity=0,
    tree_method='hist',
    grow_policy='lossguide',
    # Numeric hyperparameters (presumably the output of a hyperparameter
    # search — TODO confirm where these values come from).
    max_bin=320,
    max_depth=8,
    learning_rate=0.024179414158163107,
    subsample=0.5,
    colsample_bytree=0.5,
    min_child_weight=1,
    reg_alpha=9.728680734039675e-07,
    reg_lambda=1.7595368049095486,
    gamma=0.020556145669101262,
    n_estimators=123,
    scale_pos_weight=2,
)

clf_xgb = xgb.XGBClassifier(**params)
clf_xgb.fit(X_train, y_train)

# The second column of predict_proba is the score handed to roc_auc_score.
y_pred = clf_xgb.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred)

print('ROC AUC score:', auc_score)


## Evaluación del modelo

In [None]:
# Plot the ROC curve of the fitted classifier on the test set.
# The style is selected before the figure is created so it applies to it.
plt.style.use('bmh')

# Scores for the curve: second column of predict_proba.
y_pred_proba = clf_xgb.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
# AUC rounded to two decimals; it is only used for the legend label.
auc = round(metrics.roc_auc_score(y_test, y_pred_proba), 2)

# Draw the ROC curve.
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label="AUC=" + str(auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)  # loc=4 is matplotlib's numeric code for 'lower right'
plt.show()

## Guardado del modelo en formato pickle

In [None]:
# Serialize the trained XGBoost classifier to disk with pickle.
# NOTE(review): the file name says "lightgbm" although the object saved is the
# XGBoost classifier, and the path is absolute ('/ml_models/'). Both are kept
# unchanged here in case other code reads this exact path — confirm.
filename = '/ml_models/lightgbm_model.sav'
# `clf_xgb` is the model fitted in this notebook (the previous `clf_rf` is not
# defined anywhere in it). The `with` block closes the file handle even if
# pickling fails, so the file is always flushed to disk.
with open(filename, 'wb') as model_file:
    pickle.dump(clf_xgb, model_file)
 