In [None]:
import os
import pickle
import random

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import polars as pl

In [None]:
# Root folder holding the datasets, models and Optuna database.
# The default is unchanged; set the EYF_BASE_PATH environment variable to run
# the notebook from another location (for example
# 'C:/Users/Cristian Burich/Desktop/MA/segundo/eyf/') instead of editing and
# commenting/uncommenting paths in this cell.
base_path = os.environ.get('EYF_BASE_PATH', '/home/cburich_pymnts/buckets/b1/')
if not base_path.endswith(('/', '\\')):
    # Paths are built by plain string concatenation, so a trailing separator is required.
    base_path += '/'
dataset_path = base_path + 'datasets/'
modelos_path = base_path + 'modelos/'
db_path = base_path + 'db/'


dataset_file = 'competencia_03_fe_U_k300.parquet'   # TODO: decide whether the version without "U" should be used

# Values used by the gain metric (lgb_gan_eval): gain credited for a row tagged
# as BAJA+2 and cost charged for every other row.
ganancia_acierto = 273000
costo_estimulo = 7000

# add your seeds
semillas = [165229,165211,165203,165237,165247]

# Alternative dataset location, kept for reference:
# data = pd.read_parquet('/home/eanegrin/datasets/' + dataset_file)
data = pd.read_parquet(dataset_path + dataset_file)

In [None]:
# Remove columns that are not used for modelling.
# errors='ignore' keeps this cell idempotent: because `data` is reassigned,
# running the cell a second time would otherwise raise KeyError for columns
# that were already dropped.
data = data.drop(columns=['clase_ternaria_1', 'tmobile_app', 'cmobile_app_trx'], errors='ignore')

In [None]:
# Binary targets derived from 'clase_ternaria'.
# Rows whose ternary class is missing keep NaN in both targets.
ternaria = data['clase_ternaria']
sin_clase = ternaria.isna()

# clase_binaria1: 1 for BAJA+2, 0 for every other known class.
data['clase_binaria1'] = np.where(sin_clase, np.nan,
                                  np.where(ternaria == 'BAJA+2', 1, 0))

# clase_binaria2: 0 for CONTINUA, 1 for every other known class.
data['clase_binaria2'] = np.where(sin_clase, np.nan,
                                  np.where(ternaria == 'CONTINUA', 0, 1))

In [None]:
# Asignamos pesos a las clases

# Assign class weights.
# The weights are almost 1 for every row; the tiny offsets act as class tags
# (lgb_gan_eval recognises BAJA+2 rows through weight == 1.00002).
ternaria = data['clase_ternaria']
es_baja2 = ternaria == 'BAJA+2'
es_baja1 = ternaria == 'BAJA+1'

data['clase_peso'] = np.where(es_baja2, 1.00002,
                              np.where(es_baja1, 1.00001, 1.0))

# 1 for BAJA+2, 0 otherwise (rows with a missing class also get 0).
# NOTE(review): this column is computed from the target, so it must not be
# used as a model feature.
data['clase_binaria'] = np.where(es_baja2, 1, 0)

In [8]:
def lgb_gan_eval(y_pred, data):
    """Custom LightGBM evaluation metric: best cumulative gain over the ranking.

    Rows are ordered by descending predicted score. Each row whose weight is
    exactly 1.00002 contributes ``ganancia_acierto``; every row with a smaller
    weight contributes ``-costo_estimulo``. The metric is the maximum of the
    running total along that ordering.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores, one per row.
    data : object exposing ``get_weight()``
        Dataset whose weights encode the class of each row.

    Returns
    -------
    tuple
        ``('gan_eval', best_gain, True)``; the final ``True`` marks the metric
        as higher-is-better.
    """
    pesos = data.get_weight()

    # Per-row payoff, decided purely from the weight tag.
    premio = np.where(pesos == 1.00002, ganancia_acierto, 0)
    castigo = np.where(pesos < 1.00002, costo_estimulo, 0)
    payoff = premio - castigo

    # Walk the rows from highest to lowest score and accumulate the payoff.
    orden_desc = np.argsort(y_pred)[::-1]
    curva = np.cumsum(payoff[orden_desc])

    return 'gan_eval', np.max(curva), True

In [None]:
# Training periods (foto_mes as YYYYMM): 201906 through 202107, skipping 202006.
# NOTE(review): the reason 202006 is left out is not stated in this notebook — confirm.
periodos_por_anio = ((2019, range(6, 13)), (2020, range(1, 13)), (2021, range(1, 8)))
mes_excluido = 202006

meses_train = [
    anio * 100 + mes
    for anio, meses in periodos_por_anio
    for mes in meses
    if anio * 100 + mes != mes_excluido
]

In [None]:
# Columns that must never reach the model: the raw target, every column
# derived from it, the sample weight and the period key.
# 'clase_binaria' is listed because it is computed from 'clase_ternaria'
# (1 for BAJA+2, 0 otherwise, including rows with a missing class); leaving it
# in the feature matrix leaks the label into training.
columnas_no_features = ['clase_ternaria', 'clase_peso', 'clase_binaria',
                        'clase_binaria1', 'clase_binaria2', 'foto_mes']

# Training set: all rows of the months listed in meses_train.
train_data = data[data['foto_mes'].isin(meses_train)]
assert not train_data.empty, "no rows found for the months in meses_train"

X_train = train_data.drop(columns=columnas_no_features)
y_train_binaria1 = train_data['clase_binaria1']
y_train_binaria2 = train_data['clase_binaria2']
w_train = train_data['clase_peso']

# Scoring set: the month the predictions are generated for.
mes_test = 202109
future_data = data[data['foto_mes'] == mes_test]
assert not future_data.empty, f"no rows found for foto_mes == {mes_test}"

# Same exclusions as X_train so both matrices share the same feature columns.
X_test = future_data.drop(columns=columnas_no_features)

# Entrenamiento

Cargamos el study de optuna que optimizamos en el script anterior

In [None]:
storage_name = "sqlite:///" + db_path + "optimization_lgbm.db"
study_name = "competencia3_lgbm_k300" # UPDATE

# This notebook only consumes a study that was optimised beforehand, so it is
# loaded rather than created. load_study raises if `study_name` is not present
# in the storage, whereas create_study(..., load_if_exists=True) would silently
# create a new, empty study and the failure would only surface later when
# study.best_trial is read.
study = optuna.load_study(
    study_name=study_name,
    storage=storage_name,
)

[I 2024-11-18 08:00:05,806] Using an existing study with name 'competencia2_lgbm_v08' instead of creating a new one.


In [10]:
# Trials of the loaded study as a DataFrame; the shape is displayed as a quick
# check of what was loaded.
resultados = study.trials_dataframe()
resultados.shape

(100, 12)

In [11]:
# Display the hyperparameters of the best trial; these values are copied into
# the `params` dict used for the final training.
study.best_trial.params

{'num_leaves': 95,
 'learning_rate': 0.014790793124814124,
 'min_data_in_leaf': 1162,
 'feature_fraction': 0.32039319093779284,
 'bagging_fraction': 0.7236519946486292}

In [None]:
# Best trial of the loaded study: source of the tuned hyperparameters and of
# the number of boosting rounds stored in its user attributes.
mejor_trial = study.best_trial
best_iter = mejor_trial.user_attrs["best_iter"]

# Hyperparameters taken verbatim from the best trial.
hiperparametros_optimizados = (
    'num_leaves',
    'learning_rate',
    'min_data_in_leaf',
    'feature_fraction',
    'bagging_fraction',
)

# Fixed settings plus the tuned values.
# NOTE(review): LightGBM only applies bagging_fraction when bagging_freq > 0,
# and bagging_freq is not set here — confirm this matches the optimisation script.
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_bin': 31,
    **{nombre: mejor_trial.params[nombre] for nombre in hiperparametros_optimizados},
    'seed': semillas[0],
    'verbose': 0,
}


# Number of top-ranked clients that get Predicted = 1. Edit if needed.
corte = 10500

Mejor cantidad de árboles para el mejor model 871
Mejor cantidad de árboles para el mejor model 871
Mejor cantidad de árboles para el mejor model 871
Mejor cantidad de árboles para el mejor model 871
Mejor cantidad de árboles para el mejor model 871


In [None]:
# Seed the stdlib RNG so the list of model seeds drawn below is the same on every run.
initial_seed = 165229
random.seed(initial_seed)

# Draw 30 seeds: one model is trained per seed and their predictions are averaged.
new_seeds = [random.randint(0, 200000) for _ in range(30)]

# Make sure the output folder exists before spending time on training;
# DataFrame.to_csv does not create missing directories.
output_dir = base_path + 'exp/KA2000/'
os.makedirs(output_dir, exist_ok=True)

# Client ids in X_test row order. model.predict(X_test) returns one value per
# row in that same order, so predictions from different seeds are already
# aligned and can be combined positionally instead of through key merges
# (repeated merges on the id would multiply rows if an id ever appeared twice).
clientes_test = X_test['numero_de_cliente'].to_numpy()

# Per-seed results:
#   combined_results  -> ranked frame (numero_de_cliente, pred_lgm) for each seed
#   probs_por_semilla -> raw prediction vectors, row-aligned with X_test
combined_results = []
probs_por_semilla = []

for seed in new_seeds:
    print(f"Running iteration with seed {seed}")

    # Per-seed copy of the parameters: the shared `params` dict is not mutated.
    params_semilla = {**params, 'seed': seed}

    # Train the model. A dedicated name is used for the lgb.Dataset so the
    # `train_data` DataFrame from the split cell is not overwritten.
    lgb_train = lgb.Dataset(
        X_train,
        label=y_train_binaria2,
        weight=w_train
    )
    model = lgb.train(params_semilla, lgb_train, num_boost_round=best_iter)

    # Predict on test data
    y_pred_lgm = model.predict(X_test)
    probs_por_semilla.append(y_pred_lgm)

    # Rank clients by descending probability and flag the first `corte` as 1.
    # Only the id and the prediction are needed, so X_test itself is not copied.
    idx = np.argsort(y_pred_lgm)[::-1]
    envios = np.zeros(len(idx), dtype=int)
    envios[:corte] = 1
    ranking = pd.DataFrame(
        {
            'numero_de_cliente': clientes_test[idx],
            'pred_lgm': y_pred_lgm[idx],
            'Predicted': envios,
        },
        index=idx,
    )

    # Save output file for this seed (probabilities are not written)
    file_name = f'k_306_results_seed_{seed}.csv'
    output_path = output_dir + file_name
    output = ranking[['numero_de_cliente', 'Predicted']]
    output.to_csv(output_path, index=False)

    # Keep the ranked probabilities of this seed
    combined_results.append(ranking[['numero_de_cliente', 'pred_lgm']])

# Combine results by averaging probabilities
print("Combining results from all seeds...")

# One column per seed (prob_0 ... prob_N-1), rows in X_test order.
prob_cols = [f'prob_{i}' for i in range(len(new_seeds))]
final_results = pd.DataFrame(np.column_stack(probs_por_semilla), columns=prob_cols)
final_results.insert(0, 'numero_de_cliente', clientes_test)

# Average the probabilities across all seeds
final_results['average_prob'] = final_results[prob_cols].mean(axis=1)

# Assign "Predicted" labels based on averaged probabilities
final_results = final_results.sort_values(by='average_prob', ascending=False)
final_results['Predicted'] = 0
final_results.iloc[:corte, final_results.columns.get_loc('Predicted')] = 1

# Save the final combined output
final_output = final_results[['numero_de_cliente', 'Predicted']]
final_file_name = 'final_combined_results_k306.csv'
final_output_path = output_dir + final_file_name
final_output.to_csv(final_output_path, index=False)

print(f"Final combined results saved to {final_output_path}")