# Construcción del modelo LightGBM

## Importar librerías

In [None]:
import pandas as pd
import pyarrow
import numpy as np
import lightgbm as lgb
import pickle
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn import metrics

## Lectura de datos

In [None]:
# Load the training data from its cleaned Parquet file using the pyarrow engine.
df_train = pd.read_parquet("data/data_modelo_train_cleaned.parquet", engine="pyarrow")

In [None]:
# Load the test data from its cleaned Parquet file using the pyarrow engine.
df_test = pd.read_parquet("data/data_modelo_test_cleaned.parquet", engine="pyarrow")

In [None]:
# Keep only the training rows whose SEMANA value is 14, 15, 16 or 17.
in_training_window = df_train['SEMANA'].isin([17, 16, 15, 14])
df_train = df_train.loc[in_training_window]

## Transformación de datos

In [None]:
# Remove columns that add no value to the model.
# The result of `drop` is reassigned instead of using `inplace=True`: an
# in-place drop on a frame that was itself produced by filtering another
# frame can trigger pandas' SettingWithCopyWarning, and reassignment makes
# the data flow explicit. The column list is shared so that the train and
# test frames always lose exactly the same columns.
columns_to_drop = ['CLIENTE_ID', 'PRODUCTO_ID', 'ANIO', 'SEMANA']
df_train = df_train.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

In [None]:
# Feature columns selected via RFECV; the model is trained on exactly
# these columns, in this order.
important_columns_lgb = [
    'FRECUENCIA',
    'RECENCY',
    'DURATION_CLIENT',
    'TAMANIO',
    'UNIDAD_EMPAQUE',
    'CANAL_COMERCIO DE ABARROTES',
    'CANAL_OTROS',
    'SUBCANAL_HOGAR CON VENTA',
    'SUBCANAL_MINI SUPER INDEPENDIENTE',
    'SUBCANAL_MISCELÁNEA',
    'SUBCANAL_OTHER',
    'SUBCANAL_VINATERÍA',
    'MARCA_CIEL',
    'MARCA_COCA-COLA',
    'MARCA_DEL VALLE',
    'MARCA_FANTA',
    'MARCA_FRESCA',
    'MARCA_FUZE',
    'MARCA_MEZCLADA/MULTIPRODUC',
    'MARCA_MONSTER - PREDATOR',
    'MARCA_MUNDET',
    'MARCA_SANTA CLARA',
    'MARCA_SPRITE',
    'MARCA_YOLI - SENZAO-AMEYAL',
    'SABOR_COLA',
    'SABOR_FRUTAS',
    'SABOR_MANGO',
    'SABOR_MANZANA',
    'SABOR_MORAS',
    'SABOR_NATURAL',
    'SABOR_OTROS',
    'EMPAQUE_PET',
    'EMPAQUE_REF PET',
    'EMPAQUE_VIDRIO',
    'TIPO_CONSUMO_PERSONALES (SS)',
    'CLUSTER_1',
    'CLUSTER_2',
    'ROTATION_MEAN_CF',
    'ROTATION_MEAN_DAYS',
    'ROTATION_MEDIAN_DAYS',
    'CF_LOG',
]

In [None]:
# Prepare the independent variables (features) and the target variable for
# the train and test sets.
# The feature frames are taken directly as the `important_columns_lgb`
# subset. A separate drop of the PURCHASE target beforehand is unnecessary
# (that column is not part of the selected feature list) and only produced
# an extra full copy of each frame.
X_train = df_train[important_columns_lgb]
y_train = df_train['PURCHASE']

X_test = df_test[important_columns_lgb]
y_test = df_test['PURCHASE']

## Entrenamiento del modelo

In [None]:
# Train a LightGBM binary classifier with a fixed hyperparameter set and
# report its ROC AUC on the test set.

# NOTE(review): this seeds NumPy's global RNG only; LightGBM's own seeding is
# controlled through its `random_state` parameter, which is not set here.
# Confirm whether this call is still needed.
np.random.seed(1234)

params = {
    'boosting_type': 'goss',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'max_bin': 394,
    'max_depth': 11,
    'learning_rate': 0.01364872413582553,
    'subsample': 0.5,
    'colsample_bytree': 1.0,
    'min_child_samples': 118,
    'reg_alpha': 1.6115057027697895e-08,
    'reg_lambda': 0.0001452320544019507,
    'num_leaves': 108,
    'n_estimators': 874,
    'scale_pos_weight': 2,
}

clf_lgb = lgb.LGBMClassifier(**params)
clf_lgb.fit(X_train, y_train)

# Score the test set using the second column of the predicted probabilities
# (presumably the positive PURCHASE class - confirm against clf_lgb.classes_).
y_pred = clf_lgb.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred)

print('ROC AUC score:', auc_score)


## Evaluación del modelo

In [None]:
# Plot the ROC curve of the fitted classifier on the test set, showing the
# AUC (rounded to two decimals) in the legend.
plt.style.use('bmh')

y_pred_proba = clf_lgb.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = round(metrics.roc_auc_score(y_test, y_pred_proba), 2)

# Draw the ROC curve on a single 10x6 axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, label="AUC=" + str(auc))
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
ax.legend(loc=4)  # loc=4 places the legend in the lower-right corner
plt.show()

## Guardado del modelo en formato pickle

In [None]:
# Serialize the trained classifier to disk with pickle.
# NOTE(review): this path is absolute (rooted at '/'), whereas the input data
# is read from relative paths - confirm that '/ml_models' is the intended
# location and that it exists.
filename = '/ml_models/lightgbm_model.sav'

# The context manager guarantees the file handle is flushed and closed, even
# if pickling raises; a bare open() here would leave the handle dangling.
with open(filename, 'wb') as model_file:
    pickle.dump(clf_lgb, model_file)
 