In [1]:
# %pip install polars

In [2]:
import os
import pickle
from time import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import polars as pl
import seaborn as sns
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

In [3]:
# !gsutil cp /home/clas_giulia_s/buckets/b1/datasets/competencia_02_fe_v01_undersampled.parquet /home/clas_giulia_s/datasets/

In [4]:

# Root folder that holds the datasets/, modelos/ and db/ subfolders.
# It can be overridden per machine through the DMEF_BASE_PATH environment
# variable (e.g. '/home/clas_giulia_s/buckets/b1/'), so switching environments
# no longer requires editing this cell. Without the variable the original
# local path is used.
base_path = os.environ.get('DMEF_BASE_PATH', '/Users/ignacio/MAESTRIA/DMEF/')
if not base_path.endswith('/'):
    base_path += '/'  # the paths below are built by plain string concatenation

dataset_path = base_path + 'datasets/'
modelos_path = base_path + 'modelos/'
db_path = base_path + 'db/'
# Alternative input: 'competencia_02_fe_v01_undersampled.parquet'
# (that one has to be loaded with pd.read_parquet instead of pd.read_csv).
dataset_file = 'competencia_01_fe_modelito_undersampled.csv'

# Amounts used by lgb_gan_eval: ganancia_acierto is added for each row whose
# weight equals 1.00002, costo_estimulo is subtracted for every row below it.
ganancia_acierto = 273000
costo_estimulo = 7000

data = pd.read_csv(dataset_path + dataset_file)

In [5]:
# cantidad_semillas = 100
# semillas = [np.random.randint(0, 10000) for _ in range(cantidad_semillas)]
# print(semillas)

In [6]:
# Fixed list of 100 seeds. The first one seeds the feature-importance model and
# the training loop further down fits one model per seed.
semillas = [
    5623, 292, 7494, 8504, 1663, 785, 5377, 4838, 2141, 2235,
    9836, 1258, 3273, 8349, 1639, 1597, 3195, 40, 5186, 9278,
    6281, 7515, 2046, 5642, 505, 4611, 3008, 2063, 2280, 1148,
    618, 4806, 1503, 3926, 6363, 400, 2662, 9432, 1632, 386,
    2545, 228, 1561, 3523, 4508, 9190, 8181, 7302, 6250, 7762,
    8141, 6854, 622, 5327, 6379, 3867, 5420, 3030, 7275, 2040,
    6042, 4365, 231, 8330, 8527, 2420, 2558, 9618, 3937, 555,
    122, 4907, 7838, 5246, 100, 3243, 1449, 1052, 1906, 7657,
    753, 4320, 4576, 9621, 8868, 8155, 7410, 2320, 6355, 1994,
    7775, 8358, 3508, 3064, 3904, 3602, 5308, 6947, 1544, 624,
]

In [7]:
# Wider training window kept for reference (202106 would be left out for testing):
# meses_train = [201906, 201907, 201908, 201909, 201910, 201911, 201912,
#                202001, 202002, 202003, 202004, 202005, 202006,
#                202007, 202008, 202009, 202010, 202011, 202012,
#                202101, 202102, 202103, 202104, 202105]

meses_train = [202101, 202102, 202103] # 202104 is left out for testing

# Keep only the training months. The explicit .copy() makes the result an
# independent DataFrame, so the columns added to `data` in the following cells
# are written to a real frame rather than to a slice of the loaded one
# (this is what triggers pandas' SettingWithCopyWarning otherwise).
data = data[data['foto_mes'].isin(meses_train)].copy()
data.shape

(51606, 679)

In [8]:
# Assign a weight to each class: 1.00002 for 'BAJA+2', 1.00001 for 'BAJA+1'
# and 1.0 for everything else. lgb_gan_eval compares against these values.
clase = data['clase_ternaria']
data['clase_peso'] = np.where(
    clase == 'BAJA+2',
    1.00002,
    np.where(clase == 'BAJA+1', 1.00001, 1.0),
)

In [9]:
# Binary target: 1 for 'BAJA+2' rows, 0 for every other class.
# (The former `data['clase_binaria'] = 0` initialisation was a dead store: the
# whole column is replaced by the np.where result on the very next statement.)
data['clase_binaria'] = np.where(data['clase_ternaria'] == 'BAJA+2', 1, 0)

In [10]:
# The original label and the two columns derived from it are not features.
columnas_no_features = ['clase_ternaria', 'clase_peso', 'clase_binaria']

w_train = data['clase_peso']            # per-row weights
y_train_binaria = data['clase_binaria'] # binary target
X_train = data.drop(columns=columnas_no_features)

In [11]:
def lgb_gan_eval(y_pred, data):
    """Best cumulative gain reachable when rows are taken by descending score.

    Each row contributes `ganancia_acierto` when its weight equals 1.00002 and
    `-costo_estimulo` when its weight is below 1.00002. Rows are ordered from
    highest to lowest `y_pred`, the contributions are accumulated, and the
    maximum of that running total is reported.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores, one per row.
    data : object exposing ``get_weight()``
        Source of the per-row weights (presumably a ``lightgbm.Dataset`` when
        this is used as a custom metric -- TODO confirm against the caller).

    Returns
    -------
    tuple
        ``('gan_eval', best_gain, True)``; the trailing ``True`` presumably
        marks the metric as higher-is-better.
    """
    pesos = data.get_weight()

    aciertos = np.where(pesos == 1.00002, ganancia_acierto, 0)
    costos = np.where(pesos < 1.00002, costo_estimulo, 0)
    aporte_por_fila = aciertos - costos

    # Highest score first, then running total of the gain.
    orden_descendente = np.argsort(y_pred)[::-1]
    curva_ganancia = np.cumsum(aporte_por_fila[orden_descendente])

    return 'gan_eval', np.max(curva_ganancia), True

# Entrenamiento

Cargamos el study de optuna que optimizamos en el script anterior

In [12]:

# Location and name of the Optuna study written by the optimisation script.
storage_name = "sqlite:///" + db_path + "optimizacion_lgbm_modelito.db"
study_name = "competencia1_modelito_lgbm" # UPDATE

# Load the existing study instead of create_study(..., load_if_exists=True):
# with a wrong name or database path the latter silently creates an empty
# study, and the notebook only breaks later when study.best_trial is read.
# load_study fails right here if the study is not in the storage.
study = optuna.load_study(
    study_name=study_name,
    storage=storage_name,
)

[I 2024-11-29 00:31:44,939] Using an existing study with name 'competencia1_modelito_lgbm' instead of creating a new one.


In [13]:
# Tabular view of the trials stored in the loaded study; showing its shape is a
# quick check that the expected optimisation history was actually loaded.
resultados = study.trials_dataframe()
resultados.shape

(60, 12)

Entrenamos un modelo solo para ver feature importance

In [14]:
# Take the hyperparameters of the study's best trial, plus the number of
# boosting rounds it stored under the "best_iter" user attribute.
mejor_trial = study.best_trial
best_iter = mejor_trial.user_attrs["best_iter"]
print(f"Mejor cantidad de árboles para el mejor model {best_iter}")

# Hyperparameters copied from the best trial; everything else is fixed here.
# NOTE(review): no bagging_freq is set; per the LightGBM docs bagging_fraction
# only takes effect when bagging_freq > 0 -- confirm against the optimisation script.
hiperparametros_optimizados = (
    'num_leaves',
    'learning_rate',
    'min_data_in_leaf',
    'feature_fraction',
    'bagging_fraction',
)

params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_bin': 31,
    **{nombre: mejor_trial.params[nombre] for nombre in hiperparametros_optimizados},
    'seed': semillas[0],
    'verbose': 0,
}

# Single model on the whole training set, used only to inspect feature importance.
train_data = lgb.Dataset(X_train, label=y_train_binaria, weight=w_train)
model = lgb.train(params, train_data, num_boost_round=best_iter)


Mejor cantidad de árboles para el mejor model 139


In [15]:
# Display the resolved parameter dict (fixed settings plus the best trial's values).
params

{'objective': 'binary',
 'boosting_type': 'gbdt',
 'first_metric_only': True,
 'boost_from_average': True,
 'feature_pre_filter': False,
 'max_bin': 31,
 'num_leaves': 16,
 'learning_rate': 0.06537511998403012,
 'min_data_in_leaf': 1295,
 'feature_fraction': 0.6760496941965302,
 'bagging_fraction': 0.6807642069212421,
 'seed': 5623,
 'verbose': 0}

Variables más importantes:

In [16]:
# Pair each training column with the importance reported by the fitted model,
# ordered from most to least important.
feature_names = X_train.columns.tolist()
importances = model.feature_importance()

importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Show only the features with a non-zero importance.
tiene_importancia = importance_df['importance'] > 0
importance_df.loc[tiene_importancia]

Unnamed: 0,feature,importance
189,avg3_mrentabilidad_annual,51
12,cproductos,39
5,cliente_edad,37
147,Visa_fechaalta,35
8,mrentabilidad_annual,35
...,...,...
31,mtarjeta_master_consumo,1
232,avg3_ccuenta_debitos_automaticos,1
126,Master_mconsumototal,1
285,avg3_Master_msaldototal,1


### Entrenamos con la totalidad de las semillas y guardamos los modelos

In [17]:
version = 'modelito' # UPDATE

# Read the best trial once, so the study is not queried again on every
# iteration of the seed loop below.
best_trial = study.best_trial
best_iter = best_trial.user_attrs["best_iter"]
print(f"Mejor cantidad de árboles para el mejor model {best_iter}")

# Everything except the seed is identical for all models, so it is built once.
# NOTE(review): no bagging_freq is set; per the LightGBM docs bagging_fraction
# only takes effect when bagging_freq > 0 -- confirm against the optimisation script.
params_base = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_bin': 31,
    'num_leaves': best_trial.params['num_leaves'],
    'learning_rate': best_trial.params['learning_rate'],
    'min_data_in_leaf': best_trial.params['min_data_in_leaf'],
    'feature_fraction': best_trial.params['feature_fraction'],
    'bagging_fraction': best_trial.params['bagging_fraction'],
}

# Create the output folder up front so save_model cannot fail on a missing
# directory after a model has already been trained.
os.makedirs(modelos_path + version, exist_ok=True)

# Train one model per seed and persist each of them as a text file.
for semilla in semillas:

    params = {**params_base, 'seed': semilla, 'verbose': 0}

    # A fresh Dataset is built for every seed, exactly as before.
    train_data = lgb.Dataset(X_train,
                             label=y_train_binaria,
                             weight=w_train)

    model = lgb.train(params,
                      train_data,
                      num_boost_round=best_iter)

    model.save_model(modelos_path + f'{version}/lgb_competencia1_{version}_s{semilla}.txt')

    print(f'Modelo generado con semilla {semilla}: DONE')

Mejor cantidad de árboles para el mejor model 139
Modelo generado con semilla 5623: DONE
Modelo generado con semilla 292: DONE
Modelo generado con semilla 7494: DONE
Modelo generado con semilla 8504: DONE
Modelo generado con semilla 1663: DONE
Modelo generado con semilla 785: DONE
Modelo generado con semilla 5377: DONE
Modelo generado con semilla 4838: DONE
Modelo generado con semilla 2141: DONE
Modelo generado con semilla 2235: DONE
Modelo generado con semilla 9836: DONE
Modelo generado con semilla 1258: DONE
Modelo generado con semilla 3273: DONE
Modelo generado con semilla 8349: DONE
Modelo generado con semilla 1639: DONE
Modelo generado con semilla 1597: DONE
Modelo generado con semilla 3195: DONE
Modelo generado con semilla 40: DONE
Modelo generado con semilla 5186: DONE
Modelo generado con semilla 9278: DONE
Modelo generado con semilla 6281: DONE
Modelo generado con semilla 7515: DONE
Modelo generado con semilla 2046: DONE
Modelo generado con semilla 5642: DONE
Modelo generado co