# In [None]:
# ============================================================
# 0️ Configuración inicial y librerías
# ============================================================
import sys
import os
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

# Make the project root importable so that the `src` package resolves.
# The working directory is assumed to sit directly below the project root
# (e.g. notebooks/), so the root is the parent of the current directory.
root_path = Path.cwd().parent
sys.path.append(os.fspath(root_path))

# Directory where the trained models are saved; created if it is missing.
model_dir = root_path / "models"
model_dir.mkdir(exist_ok=True)

# ML y Scikit-Learn
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

# Modelos
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# MLflow
import mlflow
import mlflow.sklearn

# SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Nuestro código
from src.data.load_data import load_arff
from src.features.feature_engineering import FeatureEngineering
from src.models.train_models import PipelineML

# ============================================================
# 1. Load data
# ============================================================
# The project root is taken to be the parent of the working directory;
# adjust this if the project layout differs.
base_path = Path.cwd().parent
data_path = base_path / "data/raw/CEE_DATA.arff"

# Use the loader that this file actually imports from src.data.load_data.
# The previous code instantiated `ARFFLoader`, a name that is never imported
# or defined here, so this step failed with NameError.
# NOTE(review): assumes load_arff(path) returns a pandas DataFrame - confirm
# against src/data/load_data.py.
df = load_arff(data_path)

# ============================================================
# 2. Categorical and ordinal column definitions
# ============================================================
# Nominal columns; the later steps run rare-category combination and
# frequency features over this list.
categorical_cols = [
    'Gender',
    'Caste',
    'coaching',
    'time',
    'Class_ten_education',
    'twelve_education',
    'medium',
    'Father_occupation',
    'Mother_occupation',
]

# Columns handed to the ordinal feature step. The space inside
# "Class_ X_Percentage" is part of the literal column name used here.
ordinal_cols = [
    "Class_ X_Percentage",
    "Class_XII_Percentage",
]

# Labels passed to FeatureEngineering as its ordinal map
# (presumably ordered from lowest to highest grade - confirm).
ord_map = [
    "Poor",
    "Average",
    "Good",
    "Vg",
    "Excellent",
]

# ============================================================
# 3. Preprocessing and feature creation with FeatureEngineering
# ============================================================
fe = FeatureEngineering(ordinal_map=ord_map)

# Combine rare categories, one categorical column at a time
# (threshold=0.2 is forwarded to fe.combine_rare unchanged).
for cat_col in categorical_cols:
    df = fe.combine_rare(df, cat_col, threshold=0.2)

# Ordinal features built from the columns listed in ordinal_cols.
df = fe.create_ordinal_features(df, ordinal_cols)

# Group the target: 'Average' and 'Good' collapse into 'Average/Good',
# while 'Vg' and 'Excellent' map to themselves. Series.replace leaves any
# label that is not a key of this mapping untouched.
performance_groups = {
    'Average': 'Average/Good',
    'Good': 'Average/Good',
    'Vg': 'Vg',
    'Excellent': 'Excellent',
}
df['Performance_grouped'] = df['Performance'].replace(performance_groups)

# Integer-encode the grouped target labels.
target_encoder = LabelEncoder()
df['Performance_num'] = target_encoder.fit_transform(df['Performance_grouped'])

# Frequency / mean-encoding features for the categorical columns.
# NOTE(review): this receives the target column and runs on the full frame
# before the train/test split - if it computes target statistics, test rows
# leak into the features. Confirm against FeatureEngineering.
df = fe.add_frequency_features(df, categorical_cols, target_col='Performance_num')

# ============================================================
# 4. Prepare data for training
# ============================================================
# Drop every form of the target (raw, grouped, numeric) from the features.
X = df.drop(columns=['Performance', 'Performance_grouped', 'Performance_num'])
y = df['Performance_num']

# Stratify on the target so that train and test keep the same class
# proportions. The workflow oversamples with SMOTE, i.e. it treats the
# classes as imbalanced, and an unstratified 20% hold-out can under-represent
# the minority class and distort the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ============================================================
# 5. Models and hyper-parameter grids
#    (no class_weight is set; SMOTE is applied in the pipeline instead)
# ============================================================
# Every grid key carries the "classifier__" prefix because the estimator is
# registered under the step name 'classifier' in the training pipeline.
random_forest_grid = {
    'classifier__n_estimators': [300, 350],
    'classifier__max_depth': [5, 10],
    'classifier__min_samples_leaf': [3, 5],
    'classifier__min_samples_split': [10, 12, 15],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__bootstrap': [True, False],
}

xgboost_grid = {
    'classifier__n_estimators': [100, 150],
    'classifier__max_depth': [3, 4],
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__subsample': [0.7, 0.9],
    'classifier__colsample_bytree': [0.7, 0.9],
    'classifier__gamma': [0, 1],
    'classifier__reg_alpha': [0, 0.1],
    'classifier__reg_lambda': [1, 1.5],
    'classifier__min_child_weight': [1, 3, 5],
}

catboost_grid = {
    'classifier__depth': [4, 6],
    'classifier__learning_rate': [0.05, 0.07],
    'classifier__l2_leaf_reg': [1, 3, 5],
    'classifier__border_count': [64],
}

extra_trees_grid = {
    "classifier__n_estimators": [200, 400],
    "classifier__max_depth": [5, 10, None],
    "classifier__min_samples_split": [2, 5],
    "classifier__min_samples_leaf": [1, 3],
    "classifier__max_features": ["sqrt", "log2"],
}

# Model name -> unfitted estimator ('model') and its search grid ('params').
# Insertion order is the order in which the models are trained.
models = {
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'params': random_forest_grid,
    },
    'XGBoost': {
        'model': XGBClassifier(
            objective='multi:softprob',
            eval_metric='mlogloss',
            random_state=42,
            tree_method='hist',
            n_jobs=-1,
        ),
        'params': xgboost_grid,
    },
    'CatBoost': {
        'model': CatBoostClassifier(iterations=300, verbose=0, random_seed=42),
        'params': catboost_grid,
    },
    'ExtraTrees': {
        "model": ExtraTreesClassifier(random_state=42, n_jobs=-1),
        "params": extra_trees_grid,
    },
}

# ============================================================
# 6. Instantiate PipelineML and train every model
# ============================================================
pipeline_ml = PipelineML(model_dir=model_dir, cv=5)

# Route columns by dtype. Selecting every numeric dtype (not just
# int64/float64) and both object and category columns matters because
# ColumnTransformer drops any column not listed in a transformer
# (remainder='drop' is the default), so e.g. an int32 or float32 feature
# would otherwise silently disappear from the model input.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Report anything that still is not routed (e.g. bool or datetime columns).
# print is used on purpose: warnings are globally silenced at the top of
# this script, so warnings.warn would not be shown.
routed_features = set(numeric_features) | set(categorical_features)
unrouted_features = [name for name in X.columns if name not in routed_features]
if unrouted_features:
    print(f"Warning: columns ignored by the preprocessor: {unrouted_features}")

# Scale numeric columns; one-hot encode categorical ones, ignoring categories
# unseen during fit instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

for model_name, model_dict in models.items():
    print(f"Entrenando {model_name}...")

    # Pipeline: preprocessor -> SMOTE -> classifier. Inside an imblearn
    # pipeline the sampler step is applied during fit only, so the
    # oversampling never touches the data being predicted on.
    pipe = ImbPipeline([
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model_dict['model'])
    ])

    # One MLflow run per model, named with the model and a minute-resolution
    # timestamp.
    run_name = f"{model_name}_{datetime.now().strftime('%Y%m%d_%H%M')}"
    with mlflow.start_run(run_name=run_name):
        pipeline_ml.train_and_evaluate_model(pipe, X_train, y_train, X_test, y_test, params=model_dict['params'])

# ============================================================
# 7. Save the results summary
# ============================================================
# Summaries go to <base_path>/results; the directory is created if missing.
results_dir = base_path / "results"
results_dir.mkdir(exist_ok=True)

# A minute-resolution timestamp in the file name gives each run its own
# summary file.
timestamp = datetime.now().strftime('%Y%m%d_%H%M')
summary_path = results_dir / f"summary_{timestamp}.csv"

# pipeline_ml.results_summary holds the results of all trained models.
summary = pd.DataFrame(pipeline_ml.results_summary)
summary.to_csv(summary_path, index=False)

print(f"Resumen guardado en {summary_path}")
