In [None]:
import os

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
import polars as pl

In [None]:
# Root folder; every other location below is derived from it.
base_path = '/home/clas_giulia_s/buckets/b1/'

# Sub-folders under the root and the parquet file to read.
dataset_path = f'{base_path}datasets/'
modelos_path = f'{base_path}modelos/'
db_path = f'{base_path}db/'
dataset_file = 'competencia_02.parquet'

# NOTE(review): presumably the gain per correct hit and the cost per stimulus
# sent, going by the names; neither is referenced in the cells visible here.
ganancia_acierto = 273_000
costo_estimulo = 7_000

In [None]:
# Seeds of the ensemble: each seed maps to one saved model file and one
# prediction column. Laid out ten per row for readability.
semillas = [
    5623, 292, 7494, 8504, 1663, 785, 5377, 4838, 2141, 2235,
    9836, 1258, 3273, 8349, 1639, 1597, 3195, 40, 5186, 9278,
    6281, 7515, 2046, 5642, 505, 4611, 3008, 2063, 2280, 1148,
    618, 4806, 1503, 3926, 6363, 400, 2662, 9432, 1632, 386,
    2545, 228, 1561, 3523, 4508, 9190, 8181, 7302, 6250, 7762,
    8141, 6854, 622, 5327, 6379, 3867, 5420, 3030, 7275, 2040,
    6042, 4365, 231, 8330, 8527, 2420, 2558, 9618, 3937, 555,
    122, 4907, 7838, 5246, 100, 3243, 1449, 1052, 1906, 7657,
    753, 4320, 4576, 9621, 8868, 8155, 7410, 2320, 6355, 1994,
    7775, 8358, 3508, 3064, 3904, 3602, 5308, 6947, 1544, 624,
]

In [None]:
# Scan the parquet file lazily and materialise only the rows of month 202108.
mes_prediccion = 202108
consulta = pl.scan_parquet(dataset_path + dataset_file)
consulta = consulta.filter(pl.col("foto_mes") == mes_prediccion)
data = consulta.collect()

In [None]:
# Convert the polars frame to a pandas DataFrame.
# The type guard keeps this cell re-runnable: once `data` is already a pandas
# frame it has no `to_pandas` method, so an unguarded second run would raise.
if isinstance(data, pl.DataFrame):
    data = data.to_pandas()

In [None]:
# Assign a weight to each class: 1.00002 for 'BAJA+2', 1.00001 for 'BAJA+1'
# and 1.0 for every other row.
clase = data['clase_ternaria']

data['clase_peso'] = np.select(
    [clase == 'BAJA+2', clase == 'BAJA+1'],
    [1.00002, 1.00001],
    default=1.0,
)

In [None]:
# Binary label: 1 where the ternary class is 'BAJA+2', 0 everywhere else.
es_baja_2 = data['clase_ternaria'] == 'BAJA+2'
data['clase_binaria'] = np.where(es_baja_2, 1, 0)

In [None]:
# Feature matrix: everything except the class label column and the two
# columns derived from it.
columnas_no_predictoras = ['clase_ternaria', 'clase_peso', 'clase_binaria']
X_test = data.drop(columns=columnas_no_predictoras)

### Predicciones para Kaggle

Se genera la predicción de cada modelo individual, se promedian las predicciones y luego se ordenan de mayor a menor para determinar el número de envíos.

In [None]:
version = 'vxx'  # UPDATE: version tag used in the model file names and folders

# One saved-model file name per seed, in the same order as `semillas`.
modelos = []
for semilla in semillas:
    modelos.append(f'lgb_competencia2_{version}_s{semilla}_final.txt')

In [None]:
# Display how many model file names were built (one per seed).
len(modelos)

In [None]:
# Results frame: starts as a copy of the feature columns; the per-model
# prediction columns are added to it afterwards.
columnas_excluidas = ['clase_ternaria', 'clase_peso', 'clase_binaria']
X_test_with_predictions = data.drop(columns=columnas_excluidas).copy()

In [None]:
# Feature matrix for August. It does not change between models, so it is
# built once here instead of being rebuilt on every iteration.
X_test = data.drop(['clase_ternaria', 'clase_peso', 'clase_binaria'], axis=1)

# Prediction vector of each model, keyed by its destination column name.
predicciones = {}

for counter, (semilla, modelo) in enumerate(zip(semillas, modelos)):

    # Load the saved booster for this seed
    model = lgb.Booster(model_file=f'{modelos_path}{version}/{modelo}')

    # Predict for August
    y_pred_lgm = model.predict(X_test)
    predicciones[f'pred_lgm_{semilla}'] = y_pred_lgm

    print(f'{counter}. Predicciones del modelo: {modelo} DONE')

# Attach every prediction column in a single concat: inserting the columns one
# at a time fragments the frame. Columns left over from a previous run of this
# cell are dropped first so that re-running does not duplicate them.
X_test_with_predictions = pd.concat(
    [
        X_test_with_predictions.drop(columns=list(predicciones), errors='ignore'),
        pd.DataFrame(predicciones, index=X_test_with_predictions.index),
    ],
    axis=1,
)

In [None]:
# Rebind the name to a fresh copy of the frame.
# NOTE(review): presumably done to consolidate the frame after the per-model
# column inserts — confirm.
X_test_with_predictions = X_test_with_predictions.copy()

In [None]:
# Names of the per-seed prediction columns, derived from the seed list instead
# of "the last 100 columns": the positional slice hard-codes the number of
# seeds and picks the wrong columns once anything is appended after them.
individual_predictions = [f'pred_lgm_{semilla}' for semilla in semillas]

# Fail early if a model's predictions were never added to the frame.
columnas_faltantes = [
    columna for columna in individual_predictions
    if columna not in X_test_with_predictions.columns
]
assert not columnas_faltantes, f'Missing prediction columns: {columnas_faltantes}'

individual_predictions

### Semillerio 100

In [None]:
# Average the per-seed predictions row by row. The columns are selected by
# name rather than by position: with a positional "last 100 columns" slice,
# re-running this cell would average `predicted_avg` itself together with only
# 99 of the models.
columnas_prediccion = [f'pred_lgm_{semilla}' for semilla in semillas]
X_test_with_predictions['predicted_avg'] = X_test_with_predictions[columnas_prediccion].mean(axis=1)

In [None]:
# Sort the rows from highest to lowest averaged prediction.
X_test_with_predictions = X_test_with_predictions.reset_index(drop=True)
orden_descendente = np.argsort(X_test_with_predictions['predicted_avg'])[::-1]
X_test_with_predictions = X_test_with_predictions.iloc[orden_descendente]

Se prueban varios puntos de corte

In [None]:
# 50 evenly spaced cutoffs between 9000 and 13000 (inclusive), cast to integers.
corte_minimo, corte_maximo, cantidad_cortes = 9000, 13000, 50
puntos_corte = np.linspace(corte_minimo, corte_maximo, num=cantidad_cortes).astype(int)
puntos_corte

In [None]:
# Folder that receives one submission file per cutoff. It is created up front
# so that `to_csv` does not fail when this version has no output folder yet.
output_dir = base_path + f'exp/competencia_2/{version}/'
os.makedirs(output_dir, exist_ok=True)

for counter, corte in enumerate(puntos_corte, start=1):

    # Flag the first `corte` rows with 1 and the rest with 0. The frame was
    # sorted by descending `predicted_avg` in an earlier cell, so these are
    # the rows with the highest averaged prediction.
    envios = np.zeros(len(X_test_with_predictions), dtype=int)
    envios[:corte] = 1
    X_test_with_predictions['Predicted'] = envios

    output = X_test_with_predictions[['numero_de_cliente', 'Predicted']]

    # Files are numbered 1..N in cutoff order
    file_name = f'results_{version}_{counter}.csv'
    output_path = output_dir + file_name

    output.to_csv(output_path, index=False)

    print(f'{counter}. corte: {corte}, {output_path}')