In [None]:
# ============================================================
# 1 Configuración de rutas y entorno
# ============================================================
from pathlib import Path
import sys
import pickle
import mlflow
import pandas as pd
from datetime import datetime

# --- Project root ---
# The notebook is expected to run from <project>/notebooks, so the parent of
# the current working directory is taken as the project root.
root_dir = Path.cwd().parent
# Make the project-local 'src' package importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed again.
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

# --- MLflow tracking store ---
mlruns_path = root_dir / "mlruns"
mlruns_path.mkdir(exist_ok=True)
# Path.as_uri() yields a well-formed file URI on every platform
# (file:///home/... on POSIX, file:///C:/... on Windows) and percent-encodes
# special characters. Formatting the URI by hand as "file:///" + posix path
# gives four leading slashes for POSIX absolute paths.
mlflow.set_tracking_uri(mlruns_path.as_uri())

# --- Create or select the experiment ---
experiment_name = "default"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    mlflow.create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)  # make it the active experiment

# --- Data and output locations ---
data_raw = root_dir / "data" / "raw"
data_intermediate = root_dir / "data" / "intermediate"
data_processed = root_dir / "data" / "processed"
model_dir = root_dir / "models"
model_summary_dir = root_dir / "reports" / "models_summary"
executed_notebook_dir = root_dir / "notebooks" / "_executed"

# Create every directory this script writes into. data_intermediate and
# data_processed are included because the pipeline pickles files into them;
# without this a fresh checkout fails with FileNotFoundError on first save.
for output_dir in (data_intermediate, data_processed, model_dir, model_summary_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

# --- Imports propios ---
from src.data.load_data import load_arff
from src.features.feature_engineering import combine_rare, create_ordinal_features, add_frequency_features
from src.models.train_models import train_model, evaluate_model, save_model

# --- Sklearn & ML ---
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

# ============================================================
# 2 Data loading
# ============================================================
raw_dataset_path = data_raw / "CEE_DATA.arff"
df = load_arff(raw_dataset_path)

# ============================================================
# 3 Preprocessing
# ============================================================
# Columns handled as categorical by the rare-category and frequency steps.
categorical_cols = [
    'Gender',
    'Caste',
    'coaching',
    'time',
    'Class_ten_education',
    'twelve_education',
    'medium',
    'Father_occupation',
    'Mother_occupation',
]

# Discard duplicated rows.
df = df.drop_duplicates()

# Group the Performance classes: 'Average' and 'Good' collapse into a single
# label, while 'Vg' and 'Excellent' map to themselves.
performance_groups = {
    'Average': 'Average/Good',
    'Good': 'Average/Good',
    'Vg': 'Vg',
    'Excellent': 'Excellent',
}
df['Performance_grouped'] = df['Performance'].replace(performance_groups)

# Numeric encoding of the grouped target variable.
target_encoder = LabelEncoder()
df['Performance_num'] = target_encoder.fit_transform(df['Performance_grouped'])

# Combine rare categories, one categorical column at a time.
# NOTE(review): the return value is discarded, so this relies on combine_rare
# modifying df in place - confirm against src.features.feature_engineering.
for col in categorical_cols:
    combine_rare(df, col)

# Create ordinal variables from the two percentage columns, using the level
# names listed from lowest ('Poor') to highest ('Excellent').
ordinal_source_cols = ["Class_ X_Percentage", "Class_XII_Percentage"]
ordinal_levels = ["Poor", "Average", "Good", "Vg", "Excellent"]
df = create_ordinal_features(df, ordinal_source_cols, ordinal_levels)

# Add frequency features for the categorical columns.
df = add_frequency_features(df, categorical_cols)

# ============================================================
# 4 Save the intermediate dataset
# ============================================================
intermediate_path = data_intermediate / "df.pkl"
df.to_pickle(intermediate_path)

# ============================================================
# 5 Split and class balancing
# ============================================================
# Model inputs: every column whose name ends in one of the suffixes below,
# plus 'Academic_Score'.
# NOTE(review): no statement in this file creates '_target_mean' or
# 'Academic_Score' columns by name - presumably they come from the
# src.features helpers; confirm.
feature_suffixes = ('_freq', '_target_mean')
feature_cols = [col for col in df.columns if col.endswith(feature_suffixes)]
feature_cols.append('Academic_Score')
X = df[feature_cols]
y = df['Performance_num']

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample the training portion only; the test portion is left untouched.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Save the processed datasets as (features, target) tuples.
for pickle_name, payload in (
    ("Xy_train_resampled.pkl", (X_train_res, y_train_res)),
    ("Xy_test.pkl", (X_test, y_test)),
):
    with (data_processed / pickle_name).open("wb") as fh:
        pickle.dump(payload, fh)

# ============================================================
# 6 Model training
# ============================================================
# Each candidate is an (estimator, parameter grid) pair. Every grid below
# holds a single value per hyperparameter.
rf_estimator = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_grid = {"n_estimators": [300], "max_depth": [5], "min_samples_leaf": [3]}

xgb_estimator = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    tree_method='hist',
    use_label_encoder=False,
    n_jobs=-1,
    random_state=42,
)
xgb_grid = {"n_estimators": [100], "max_depth": [3], "learning_rate": [0.05]}

# NOTE(review): class_weights has three entries, which presumably match the
# three grouped target classes in LabelEncoder order - confirm.
catboost_estimator = CatBoostClassifier(
    iterations=300,
    verbose=0,
    random_seed=42,
    class_weights=[1, 2, 2],
)
catboost_grid = {"depth": [4], "learning_rate": [0.05], "l2_leaf_reg": [1]}

extra_trees_estimator = ExtraTreesClassifier(
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)
extra_trees_grid = {"n_estimators": [200], "max_depth": [5], "min_samples_leaf": [1]}

models = {
    'RandomForest': (rf_estimator, rf_grid),
    'XGBoost': (xgb_estimator, xgb_grid),
    'CatBoost': (catboost_estimator, catboost_grid),
    'ExtraTrees': (extra_trees_estimator, extra_trees_grid),
}

results_summary = []
# Minute-resolution stamp shared by every file written during this run.
timestamp = datetime.now().strftime("%Y%m%d_%H%M")

for name, candidate in models.items():
    model, params = candidate
    print(f"\n--- Entrenando {name} ---")

    # One MLflow run per candidate, named after the model.
    with mlflow.start_run(run_name=name):
        best_model, best_score, best_params = train_model(
            model, params, X_train_res, y_train_res
        )
        print(f"F1 CV: {best_score:.4f}")

        # NOTE(review): y_pred is not used afterwards in this file; no
        # test-set result is logged to MLflow or added to the summary.
        y_pred = evaluate_model(best_model, X_test, y_test)

        # Save the fitted model under the models directory.
        save_path = model_dir / f"{name}_best_model_{timestamp}.pkl"
        save_model(best_model, save_path)

        # Log parameters, the CV score and the model itself to MLflow.
        mlflow.log_params(best_params)
        mlflow.log_metric("F1_CV", best_score)
        mlflow.sklearn.log_model(best_model, artifact_path="models")

        # Add this candidate to the summary table.
        results_summary.append({"Model": name, "F1_CV": best_score})

# ============================================================
# 7 Save the results summary
# ============================================================
# One CSV row per trained model, written without the DataFrame index.
summary_path = model_summary_dir / f"results_summary_{timestamp}.csv"
summary_df = pd.DataFrame(results_summary)
summary_df.to_csv(summary_path, index=False)

print("\n Pipeline completado correctamente.")
