In [None]:
# ============================================================
# 1. Path, environment, and MLflow configuration
# ============================================================
from pathlib import Path
import sys
import pickle
from datetime import datetime
import mlflow
import pandas as pd

# --- Project root ---
# Assumes the notebook is executed from the `notebooks/` directory, so the
# parent of the current working directory is the project root.
root_dir = Path.cwd().parent
sys.path.append(str(root_dir))  # lets Python resolve the `src` package

# --- Main folders ---
data_raw = root_dir / "data" / "raw"
data_intermediate = root_dir / "data" / "intermediate"
data_processed = root_dir / "data" / "processed"
model_dir = root_dir / "models"
model_summary_dir = root_dir / "reports" / "models_summary"
executed_notebook_dir = root_dir / "notebooks" / "_executed"
mlruns_path = root_dir / "mlruns"

# Create every output folder up front; `data_raw` is only read from and is
# therefore not created here.
for folder in [
    data_intermediate,
    data_processed,
    model_dir,
    model_summary_dir,
    executed_notebook_dir,
    mlruns_path,
]:
    folder.mkdir(parents=True, exist_ok=True)

# --- MLflow configuration ---
# Path.as_uri() produces a well-formed, percent-encoded file URI on every
# platform. Hand-building it as "file:///" + as_posix() yields four slashes
# on POSIX systems ("file:////home/...") and leaves spaces unescaped.
mlflow.set_tracking_uri(mlruns_path.as_uri())
experiment_name = "default"
# set_experiment() creates the experiment when it does not exist yet, so no
# separate existence check / create_experiment() call is needed.
mlflow.set_experiment(experiment_name)

# ============================================================
# 2. Library and project-module imports
# ============================================================
from src.data.load_data import load_arff
from src.features.feature_engineering import combine_rare, create_ordinal_features, add_frequency_features
from src.models.train_models import train_and_evaluate_model

# Scikit-Learn
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Modelos
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# ============================================================
# 3. Data loading and cleaning
# ============================================================
df = load_arff(data_raw / "CEE_DATA.arff")

# Drop exact duplicate rows.
df = df.drop_duplicates()

# Group the Performance classes: 'Average' and 'Good' are merged into a
# single 'Average/Good' class, while 'Vg' and 'Excellent' keep their labels.
performance_groups = {
    'Average': 'Average/Good',
    'Good': 'Average/Good',
    'Vg': 'Vg',
    'Excellent': 'Excellent',
}
df['Performance_grouped'] = df['Performance'].replace(performance_groups)

# Numeric target variable. The fitted encoder is kept (instead of being
# thrown away) so integer predictions can be mapped back to class names with
# label_encoder.inverse_transform(...) / label_encoder.classes_.
label_encoder = LabelEncoder()
df['Performance_num'] = label_encoder.fit_transform(df['Performance_grouped'])

# ============================================================
# 4. Feature engineering
# ============================================================
categorical_cols = [
    'Gender',
    'Caste',
    'coaching',
    'time',
    'Class_ten_education',
    'twelve_education',
    'medium',
    'Father_occupation',
    'Mother_occupation',
]

# Combine rare categories, one column at a time.
# NOTE(review): the return value is ignored, so this presumably mutates `df`
# in place — confirm against src.features.feature_engineering.combine_rare.
for cat_col in categorical_cols:
    combine_rare(df, cat_col)

# Ordinal features for the two percentage columns, using the listed levels.
ordinal_cols = ["Class_ X_Percentage", "Class_XII_Percentage"]
ordinal_levels = ["Poor", "Average", "Good", "Vg", "Excellent"]
df = create_ordinal_features(df, ordinal_cols, ordinal_levels)

# Frequency features for the categorical columns.
df = add_frequency_features(df, categorical_cols)

# Persist the intermediate dataset.
intermediate_df_path = data_intermediate / "df.pkl"
df.to_pickle(intermediate_df_path)

# ============================================================
# 5. Train/test split and class balancing
# ============================================================
# Model inputs: every column whose name ends in one of the engineered
# suffixes, plus 'Academic_Score'.
engineered_suffixes = ('_freq', '_target_mean')
feature_cols = [c for c in df.columns if c.endswith(engineered_suffixes)]
feature_cols.append('Academic_Score')

X = df[feature_cols]
y = df['Performance_num']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Persist both splits as (X, y) tuples.
for filename, payload in (
    ("Xy_train_resampled.pkl", (X_train_res, y_train_res)),
    ("Xy_test.pkl", (X_test, y_test)),
):
    with open(data_processed / filename, "wb") as f:
        pickle.dump(payload, f)

# ============================================================
# 6. Preprocessing with a scikit-learn Pipeline
# ============================================================
# ColumnTransformer selects columns by name from the frame the pipeline is
# fitted on. That frame (X / X_train_res) is built from `feature_cols` only,
# so the transformer must reference exactly those columns. Pointing it at the
# raw categorical columns or the percentage columns, none of which are part
# of X, makes fit() fail with a missing-column error.
#
# All model features have already been passed through SMOTE, so they are
# expected to be numeric and are standardised together.
# NOTE(review): to one-hot encode the raw categorical columns instead, they
# would first have to be added to X (and the resampling step adapted).
numeric_cols = list(feature_cols)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
    ]
)

# ============================================================
# 7. Model training and tracking
# ============================================================
# Candidate classifiers, keyed by the name used for the MLflow run and for
# the pickled model file.
models = {
    'RandomForest': RandomForestClassifier(
        class_weight='balanced',
        random_state=42,
    ),
    'XGBoost': XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        tree_method='hist',
        use_label_encoder=False,
        random_state=42,
        n_jobs=-1,
    ),
    # NOTE(review): class_weights gives one weight per class — presumably the
    # three grouped Performance classes; confirm the intended order.
    'CatBoost': CatBoostClassifier(
        iterations=300,
        verbose=0,
        random_seed=42,
        class_weights=[1, 2, 2],
    ),
    'ExtraTrees': ExtraTreesClassifier(
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
    ),
}

# One timestamp shared by every file written during this execution.
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
results_summary = []

# Dataset files attached to every MLflow run.
run_artifacts = (
    data_intermediate / "df.pkl",
    data_processed / "Xy_train_resampled.pkl",
    data_processed / "Xy_test.pkl",
)

for name, model in models.items():
    print(f"\n--- Entrenando {name} ---")
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

    with mlflow.start_run(run_name=name):
        best_model, f1_score, best_params = train_and_evaluate_model(
            pipe, X_train_res, y_train_res, X_test, y_test
        )

        # Save the fitted model to disk.
        save_path = model_dir / f"{name}_best_model_{timestamp}.pkl"
        with save_path.open("wb") as f:
            pickle.dump(best_model, f)

        # MLflow logging: parameters, score and the model itself.
        mlflow.log_params(best_params)
        mlflow.log_metric("F1_CV", f1_score)
        mlflow.sklearn.log_model(best_model, artifact_path="models")

        # Attach the datasets used by this run.
        for artifact in run_artifacts:
            mlflow.log_artifact(artifact)

        results_summary.append({"Model": name, "F1_CV": f1_score})

# ============================================================
# 8. Save the results summary
# ============================================================
summary_file = model_summary_dir / f"results_summary_{timestamp}.csv"
pd.DataFrame(results_summary).to_csv(summary_file, index=False)

# Log the summary inside an explicit run. Calling log_artifact() with no
# active run makes MLflow start one implicitly and leave it open; in a live
# notebook kernel the next mlflow.start_run(...) would then fail because a
# run is already active. The context manager guarantees the run is closed.
with mlflow.start_run(run_name=f"results_summary_{timestamp}"):
    mlflow.log_artifact(str(summary_file))

print("\n Pipeline completado correctamente.")
