In [5]:
import sys
from pathlib import Path
# Make the parent of the working directory importable so that `src.feature_engineering`
# resolves (presumably the project root — confirm against the repo layout).
# Guarded so that re-running this cell does not append duplicate entries to sys.path.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
from src.feature_engineering import FeatureEngineeringPipeline
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss
import optuna

# ————————————————————————————————————————————————————————————————————————————————
# 1) Load the train/test data (previously exported to data/)
# ————————————————————————————————————————————————————————————————————————————————
data_dir = Path('../data')
train_df = pd.read_parquet(data_dir / 'train.parquet')
test_df  = pd.read_parquet(data_dir / 'test.parquet')

# Name of the label column; declared once instead of repeating the literal.
TARGET_COL = 'target'

# Split each frame into features (everything except the label) and the label vector.
# `to_numpy()` always returns a NumPy array, whereas `.values` may return a pandas
# extension array depending on the column dtype.
X_train = train_df.drop(columns=[TARGET_COL])
y_train = train_df[TARGET_COL].to_numpy()

X_test  = test_df.drop(columns=[TARGET_COL])
y_test  = test_df[TARGET_COL].to_numpy()

# Feature columns grouped by kind
CATEGORICAL_COLS = ['cat1', 'cat2']
CONTINUOUS_COLS  = ['cont1', 'cont2', 'cont3', 'cont4']
DISCRETE_COLS    = ['disc1', 'disc2']

In [7]:
# Pipeline factory shared by the cells below: `fe.build_pipeline(trial)` is called once
# per Optuna trial. The column groups are taken from the constants declared with the
# data-loading code so the lists live in a single place and cannot drift apart.
# SGDClassifier with loss='log_loss' is a logistic-regression model trained by SGD;
# random_state pins its shuffling.
fe = FeatureEngineeringPipeline(
    categorical_cols=CATEGORICAL_COLS,
    discrete_cols=DISCRETE_COLS,
    continuous_cols=CONTINUOUS_COLS,
    model=SGDClassifier(loss='log_loss', n_jobs=-1, random_state=42)
)

def objective(trial):
    """Optuna objective: score the pipeline built for ``trial`` by log-loss.

    Builds a pipeline with ``fe.build_pipeline(trial)``, fits it on
    ``X_train`` / ``y_train`` and returns the log-loss between ``y_test`` and
    column 1 of ``predict_proba(X_test)``.

    Args:
        trial: The Optuna trial handed to ``fe.build_pipeline``.

    Returns:
        The log-loss value (lower is better).

    NOTE(review): trials are scored on the test split, so the study picks its
    best configuration using the same data that would normally be held out for
    the final estimate; ``study.best_value`` is therefore not an unbiased
    estimate of generalization. Consider scoring on a validation split or with
    cross-validation — confirm the intent.
    """
    candidate = fe.build_pipeline(trial)
    candidate.fit(X_train, y_train)

    # Keep only the second probability column returned by predict_proba.
    class1_proba = candidate.predict_proba(X_test)[:, 1]
    score = log_loss(y_test, class1_proba)
    return score

In [10]:
if __name__ == "__main__":

    # Seed the sampler (TPE is Optuna's default single-objective sampler) with the same
    # seed used for the model, so the sampler's own randomness is pinned.
    # Trials still run in parallel (n_jobs=-1): their completion order can differ
    # between runs, so use n_jobs=1 when strictly reproducible results are required.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(
        objective,
        n_trials=100,
        # The study keeps running when a trial raises one of these exceptions.
        catch=(ValueError, RuntimeError),
        n_jobs=-1,
    )

    # Summary of the best trial found by the search.
    print("Mejor trial:", study.best_trial.number)
    print("→ Parámetros:", study.best_params)
    print("→ Log-loss:", study.best_value)

[I 2025-05-03 18:51:33,174] A new study created in memory with name: no-name-1669c37a-d70b-4439-a9e7-e8d729114587
[I 2025-05-03 18:51:37,189] Trial 16 finished with value: 0.33626786839907347 and parameters: {'cat_enc': 'woe', 'disc_enc': 'none', 'cont_tr': 'geometric_encode', 'cont_q': 7, 'cont_enc': 'mean'}. Best is trial 16 with value: 0.33626786839907347.
[I 2025-05-03 18:51:37,401] Trial 3 finished with value: 0.3532390023559228 and parameters: {'cat_enc': 'ordinal', 'ordinal_method': 'arbitrary', 'disc_enc': 'none', 'cont_tr': 'equal_freq', 'cont_q': 6}. Best is trial 16 with value: 0.33626786839907347.
[I 2025-05-03 18:51:37,610] Trial 25 finished with value: 0.3725033466029106 and parameters: {'cat_enc': 'none', 'disc_enc': 'countfreq', 'disc_countfreq_method': 'frequency', 'cont_tr': 'equal_width_encode', 'cont_q': 4, 'cont_enc': 'woe'}. Best is trial 16 with value: 0.33626786839907347.
[I 2025-05-03 18:51:38,132] Trial 15 finished with value: 0.37546610689767745 and parameter

Mejor trial: 38
→ Parámetros: {'cat_enc': 'mean', 'disc_enc': 'dtree', 'disc_dtree_method': 'arbitrary', 'cont_tr': 'none'}
→ Log-loss: 0.2974056430620006


In [11]:
import joblib

# 1) Rebuild the pipeline from the study's best trial and fit it on the training
#    split only (X_train / y_train).
best_pipe = fe.build_pipeline(study.best_trial)
best_pipe.fit(X_train, y_train)

# 2) Serialize the fitted pipeline to ../pipelines/best_pipeline.pkl with joblib,
#    creating the target directory first if it does not exist yet.
pipe_dir = Path('../pipelines')
pipe_dir.mkdir(exist_ok=True, parents=True)
pipe_path = pipe_dir / 'best_pipeline.pkl'
joblib.dump(value=best_pipe, filename=str(pipe_path))

print('Pipeline guardada en best_pipeline.pkl')

Pipeline guardada en best_pipeline.pkl
