# Hiperparametrización Light GBM

## Importando librerías

In [9]:
import optuna
import pandas as pd
import pyarrow
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import  StratifiedKFold
import matplotlib.pyplot as plt

In [10]:
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_param_importances

# Lectura de datos

In [25]:
df = pd.read_parquet(engine="pyarrow", path="/content/drive/My Drive/UCM/tfm_code/data/data_modelo_train_cleaned.parquet")

## Transformación de datos

In [26]:
df = df[df['SEMANA'].isin([17,16,15,14])]
df.drop(columns=['CLIENTE_ID','PRODUCTO_ID','ANIO','SEMANA'], inplace=True)

In [27]:
important_columns = ['FRECUENCIA', 'RECENCY', 'DURATION_CLIENT', 'TAMANIO',
       'UNIDAD_EMPAQUE', 'CANAL_COMERCIO DE ABARROTES', 'CANAL_OTROS',
       'SUBCANAL_HOGAR CON VENTA', 'SUBCANAL_MINI SUPER INDEPENDIENTE',
       'SUBCANAL_MISCELÁNEA', 'SUBCANAL_OTHER', 'SUBCANAL_VINATERÍA',
       'MARCA_CIEL', 'MARCA_COCA-COLA', 'MARCA_DEL VALLE', 'MARCA_FANTA',
       'MARCA_FRESCA', 'MARCA_FUZE', 'MARCA_MEZCLADA/MULTIPRODUC',
       'MARCA_MONSTER - PREDATOR', 'MARCA_MUNDET', 'MARCA_SANTA CLARA',
       'MARCA_SPRITE', 'MARCA_YOLI - SENZAO-AMEYAL', 'SABOR_COLA',
       'SABOR_FRUTAS', 'SABOR_MANGO', 'SABOR_MANZANA', 'SABOR_MORAS',
       'SABOR_NATURAL', 'SABOR_OTROS', 'EMPAQUE_PET', 'EMPAQUE_REF PET',
       'EMPAQUE_VIDRIO', 'TIPO_CONSUMO_PERSONALES (SS)', 'CLUSTER_1',
       'CLUSTER_2', 'ROTATION_MEAN_CF', 'ROTATION_MEAN_DAYS',
       'ROTATION_MEDIAN_DAYS', 'CF_LOG']

In [28]:
X = df.drop(columns=['PURCHASE'], axis=1)
X = X[important_columns]
y = df['PURCHASE']

## Uso de Optuna

In [35]:
np.random.seed(1234)

def objective(trial):
    params = {
        'boosting_type': 'goss',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'max_bin': trial.suggest_int('max_bin', 300, 512),
        'max_depth': trial.suggest_int('max_depth', 4, 16),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),
        'num_leaves': trial.suggest_int('num_leaves', 50, 256),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 2, 10)
    }
    
    # Inicializar StratifiedKFold con 4 folds
    skf = StratifiedKFold(n_splits=4)
    f1_scores = []
    
    # Hacer validación cruzada
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Entrenar y evaluar la clasificación basado en el score f1
        clf = lgb.LGBMClassifier(**params)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        f1_scores.append(f1)
    
    # Calcula el promedio del F1-score
    avg_f1 = np.mean(f1_scores)
    return avg_f1

# Hiperparametrización con Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print the best hyperparameters and the corresponding ROC AUC score
print('Mejor F1-Score:', study.best_value)
print('Mejores hyperparametros:', study.best_params)

[32m[I 2023-03-30 08:38:16,827][0m A new study created in memory with name: no-name-a1996613-d914-423f-a8cf-d594352e57d1[0m
[32m[I 2023-03-30 08:38:26,291][0m Trial 0 finished with value: 0.6059747421659583 and parameters: {'max_bin': 413, 'max_depth': 8, 'learning_rate': 0.0627598493441382, 'subsample': 0.9, 'colsample_bytree': 0.8, 'min_child_samples': 149, 'reg_alpha': 0.3183195158327383, 'reg_lambda': 0.0003191006684649057, 'num_leaves': 155, 'n_estimators': 133, 'scale_pos_weight': 3}. Best is trial 0 with value: 0.6059747421659583.[0m
[32m[I 2023-03-30 08:38:47,795][0m Trial 1 finished with value: 0.5916784586916295 and parameters: {'max_bin': 471, 'max_depth': 11, 'learning_rate': 0.026827902418426846, 'subsample': 1.0, 'colsample_bytree': 0.8, 'min_child_samples': 70, 'reg_alpha': 0.36025276962607994, 'reg_lambda': 1.3997088725076523, 'num_leaves': 101, 'n_estimators': 342, 'scale_pos_weight': 4}. Best is trial 0 with value: 0.6059747421659583.[0m
[32m[I 2023-03-30 08

Best F1-Score: 0.6119862586995604
Best hyperparameters: {'max_bin': 394, 'max_depth': 11, 'learning_rate': 0.01364872413582553, 'subsample': 0.5, 'colsample_bytree': 1.0, 'min_child_samples': 118, 'reg_alpha': 1.6115057027697895e-08, 'reg_lambda': 0.0001452320544019507, 'num_leaves': 108, 'n_estimators': 874, 'scale_pos_weight': 2}


In [36]:
plot_optimization_history(study)

In [37]:
plot_param_importances(study)