In [1]:
import pandas as pd
import os
import numpy as np
# Funciones auxiliares sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import cohen_kappa_score, make_scorer, accuracy_score, balanced_accuracy_score, confusion_matrix, roc_curve, auc, accuracy_score, mean_squared_error, mean_absolute_error, r2_score, classification_report # Metricas
# Preprocessing
from sklearn.preprocessing import StandardScaler

#Guardado de objetos en archivos joblib
from joblib import load, dump

# Optimizacion de hiperparametros
import optuna
from optuna.artifacts import FileSystemArtifactStore, upload_artifact

from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

import warnings

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Subimos dos niveles para quedar en la carpeta que contiene input y lab2-mcd-austral
BASE_DIR = './'

#Salida de modelos entrenados
PATH_TO_MODELS = os.path.join(BASE_DIR, "work/models")

#Artefactos a subir a optuna
PATH_TO_TEMP_FILES = os.path.join(BASE_DIR, "work/optuna_temp_artifacts")

#Artefactos que optuna gestiona
PATH_TO_OPTUNA_ARTIFACTS = os.path.join(BASE_DIR, "work/optuna_artifacts")

In [27]:
#df = pd.read_csv('../Text-Mining-Austral/dataset.csv',delimiter=';')
df = pd.read_csv('dataset_pesos.csv',sep=';')
df.head(2)

Unnamed: 0.1,Unnamed: 0,Ejercicio,Tipo_Cpbte,N°_Entrada,Entidad_Nº,Cod_Ret,Fte_Fin.,Cuit/DniOtros,Clase_Registro,Clase_Gasto,Glosa,Sueldo,texto_limpio,pesos,cuentas_sueldo,pesos_cuentas
0,0,2021,AF,137,25,217,10,710,ANT,,RES 7/21 ANTICIPO SUB. Y SUBVEN. CTA 360000200...,0,ANTICIPO SUB SUBVEN,95,360000200981158,4
1,1,2021,AF,138,25,202,10,710,ANT,,RES 8/21 ANT. VIATICOS Y MOVIL. CUENTA Nº3600...,0,ANT VIATICOS MOVIL CUENTA,93,360000200982335,0


In [28]:
# onehot encoding
df.join(pd.get_dummies(df[["Tipo_Cpbte"]]))
df.drop(columns=["Tipo_Cpbte"], inplace=True)

df.join(pd.get_dummies(df[["Clase_Registro"]]))
df.drop(columns=["Clase_Registro"], inplace=True)

df.join(pd.get_dummies(df[["Clase_Gasto"]]))
df.drop(columns=["Clase_Gasto"], inplace=True)



In [29]:
SEED = 12345
TEST_SIZE = 0.2

# X = df.drop(columns = ["Sueldo","Glosa","N°_Entrada","Ejercicio", "Cod_Ret"])
X = df.drop(columns = ["Sueldo","Glosa","N°_Entrada","Ejercicio", "texto_limpio", "cuentas_sueldo"])
#X = df[['pesos']]
y = df.Sueldo

# División en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED)

# Combinar X_train y y_train en un solo DataFrame
train = pd.concat([X_train, y_train], axis=1)

# Combinar X_test y y_test en un solo DataFrame
test = pd.concat([X_test, y_test], axis=1)

In [31]:
def cv_es_gbc_objective(trial):

    #Parametros para LightGBM
    gbc_params = {      
                       # 'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
                        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),
                       # 'max_depth': trial.suggest_int('max_depth', 3, 10),
                        ##'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
                        #'subsample': trial.suggest_uniform('subsample', 0.5, 1.0)
                        #'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
                        #'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None, 'auto']),
                        #'loss': trial.suggest_categorical('loss', ['deviance', 'exponential']),
                        } 

    #Voy a generar estimaciones de los 5 modelos del CV sobre los datos test y los acumulo en la matriz scores_ensemble
    scores_ensemble = np.zeros((len(y_test),len(y_train.unique())))

    #Score del 5 fold CV inicializado en 0
    score_folds = 0

    #Numero de splits del CV
    n_splits = 5

    #Objeto para hacer el split estratificado de CV
    skf = StratifiedKFold(n_splits=n_splits)

    for i, (if_index, oof_index) in enumerate(skf.split(X_train, y_train)):
        
        # Dataset in fold (donde entreno)
        X_if, y_if = X_train.iloc[if_index], y_train.iloc[if_index]
        
        # Dataset Out of fold (donde mido la performance del CV)
        X_oof, y_oof = X_train.iloc[oof_index], y_train.iloc[oof_index]

        # Crear el modelo RandomForestClassifier con los parámetros sugeridos
        gbc_model = GradientBoostingClassifier(**gbc_params, random_state=42)
        
        # Entrenar el modelo
        gbc_model.fit(X_if, y_if)
        
        # Acumular los scores (probabilidades) de cada clase para cada uno de los modelos que determino en los folds
        scores_ensemble += gbc_model.predict_proba(X_test)
        
        # Score del fold (registros de dataset train que en este fold quedan out of fold)
        score_folds += accuracy_score(y_oof, gbc_model.predict(X_oof)) / n_splits


    #Guardo prediccion del trial sobre el conjunto de test
    # Genero nombre de archivo
    predicted_filename = os.path.join(PATH_TO_TEMP_FILES,f'test_{trial.study.study_name}_{trial.number}.joblib')
    # Copia del dataset para guardar la prediccion
    predicted_df = test.copy()
    # Genero columna pred con predicciones sumadas de los 5 folds
    predicted_df['pred'] = [scores_ensemble[p,:] for p in range(scores_ensemble.shape[0])]
    # Grabo dataframe en temp_artifacts
    dump(predicted_df, predicted_filename)
    # Indico a optuna que asocie el archivo generado al trial
    upload_artifact(trial, predicted_filename, artifact_store)    


    #Determino score en conjunto de test y asocio como metrica adicional en optuna
    test_score = accuracy_score(y_test, scores_ensemble.argmax(axis=1))
    trial.set_user_attr("test_score", test_score)

    #Devuelvo score del 5fold cv a optuna para que optimice en base a eso
    return(score_folds)

In [32]:
#Inicio el store de artefactos (archivos) de optuna
artifact_store = FileSystemArtifactStore(base_path=PATH_TO_OPTUNA_ARTIFACTS)

#Genero estudio
study = optuna.create_study(direction='maximize',
                            storage="sqlite:///sueldos2.sqlite3",  # Specify the storage URL here.
                            study_name="gbc con score con cuentas",
                            load_if_exists = True)
#Corro la optimizacion
study.optimize(cv_es_gbc_objective, n_trials=10)

[I 2024-08-10 12:38:31,351] A new study created in RDB with name: gbc con score con cuentas
[I 2024-08-10 12:40:39,172] Trial 0 finished with value: 0.9807436536647812 and parameters: {'learning_rate': 0.2838521462594924}. Best is trial 0 with value: 0.9807436536647812.
[I 2024-08-10 12:42:49,574] Trial 1 finished with value: 0.9623331120103464 and parameters: {'learning_rate': 0.01488214080774319}. Best is trial 0 with value: 0.9807436536647812.
[I 2024-08-10 12:44:53,736] Trial 2 finished with value: 0.8815385374253708 and parameters: {'learning_rate': 0.0030929087617836905}. Best is trial 0 with value: 0.9807436536647812.
