In [1]:
import os, sys
from pathlib import Path

# Ruta al directorio raíz del proyecto:
# notebooks/  →  subir uno → raíz
ROOT = Path(os.getcwd()).resolve().parent

# Agregar datasets/ y scripts/ al path
sys.path.append(str(ROOT))
sys.path.append(str(ROOT / "datasets"))
sys.path.append(str(ROOT / "scripts"))

print("ROOT =", ROOT)

# ============================================================
# 0) IMPORTS Y CONFIGURACIÓN GENERAL
# ============================================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    f1_score,
    balanced_accuracy_score,
    matthews_corrcoef,
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
    average_precision_score,
)

from imblearn.over_sampling import SMOTE, BorderlineSMOTE

from datasets.cargar_dataset import cargar_dataset                      # Función para cargar datasets según configuración
from datasets.config_datasets import config_datasets  
from scripts.pc_smote import PCSMOTE  

# ⚠️ Ajustar estos imports según tu estructura real
# ------------------------------------------------
# from scripts.config_datasets import config_datasets
# from scripts.cargar_dataset import cargar_dataset
# from scripts.PCSMOTE import PCSMOTE

RANDOM_STATE = 42

# ============================================================
# 1) FUNCIÓN AUXILIAR: CARGAR Y PREPARAR DATASET
# ============================================================
def cargar_y_preparar_dataset(nombre_dataset, config, test_size=0.2, random_state=RANDOM_STATE):
    """
    Usa tu función cargar_dataset + config_datasets para cargar,
    hacer split train/test y escalar con RobustScaler.

    Devuelve:
    - X_train_scaled, y_train
    - X_test_scaled, y_test
    """
    df_features, y, clases = cargar_dataset(
        path=config["path"],
        clase_minoria=config.get("clase_minoria"),
        col_features=config.get("col_features"),
        col_target=config.get("col_target"),
        sep=config.get("sep"),
        header=config.get("header"),
        binarizar=config.get("binarizar", False),
        tipo=config.get("tipo", "tabular"),
        impute=config.get("impute", "median"),
        na_values=config.get("na_values", ('?', 'NA', 'None')),
        dataset_name=config.get("dataset_name", nombre_dataset),
        names=config.get("esquema"),
    )

    X = df_features.values
    y = np.asarray(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, y_train, X_test_scaled, y_test, clases


# ============================================================
# 2) DEFINIR RF + ESPACIO DE BÚSQUEDA
# ============================================================
rf_base = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_jobs=1,
    bootstrap=True,
    oob_score=False,
)

# Espacio de hiperparámetros sencillo pero razonable
param_space = {
    "n_estimators": [100, 150, 200],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
}

def construir_cv(n_clases, n_muestras, random_state=RANDOM_STATE):
    """
    Estrategia simple para este ensayo:
      - siempre 5 folds estratificados, salvo que haya muy pocas muestras.
    """
    return StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)


# ============================================================
# 3) FUNCIÓN DE ENTRENAR + EVALUAR (CON TODAS LAS MÉTRICAS)
# ============================================================
def entrenar_y_evaluar(X, y, X_test, y_test, nombre_experimento):
    """
    Entrena un RF con RandomizedSearchCV (scoring=f1_macro)
    y evalúa en el test con varias métricas, incluyendo:
      - F1 macro, Balanced Accuracy, MCC, Accuracy
      - Precision macro, Recall macro
      - ROC AUC macro OVR, AUC-PR macro
    """
    cv = construir_cv(
        n_clases=len(np.unique(y)),
        n_muestras=len(y),
        random_state=RANDOM_STATE
    )

    search = RandomizedSearchCV(
        rf_base,
        param_distributions=param_space,
        n_iter=20,
        scoring="f1_macro",
        cv=cv,
        n_jobs=1,
        random_state=RANDOM_STATE,
        verbose=0,
    )

    search.fit(X, y)
    best_model = search.best_estimator_

    # -------------------------
    # PREDICCIONES EN TEST
    # -------------------------
    y_pred = best_model.predict(X_test)

    f1_test = f1_score(y_test, y_pred, average="macro")
    bacc_test = balanced_accuracy_score(y_test, y_pred)
    mcc_test = matthews_corrcoef(y_test, y_pred)
    acc_test = accuracy_score(y_test, y_pred)

    precision_macro = precision_score(y_test, y_pred, average="macro")
    recall_macro = recall_score(y_test, y_pred, average="macro")

    # -------------------------
    # PROBABILIDADES → ROC-AUC y AUC-PR
    # -------------------------
    roc_auc_macro_ovr = np.nan
    auc_pr_macro = np.nan

    y_proba = None
    if hasattr(best_model, "predict_proba"):
        try:
            y_proba = best_model.predict_proba(X_test)
        except Exception:
            y_proba = None

    if y_proba is not None:
        try:
            roc_auc_macro_ovr = roc_auc_score(
                y_test,
                y_proba,
                multi_class="ovr",
                average="macro",
            )
        except Exception:
            roc_auc_macro_ovr = np.nan

        try:
            auc_pr_macro = average_precision_score(
                y_test,
                y_proba,
                average="macro",
            )
        except Exception:
            auc_pr_macro = np.nan

    resultados = {
        "nombre_experimento": nombre_experimento,
        "f1_cv": float(search.best_score_),
        "f1_test": float(f1_test),
        "bacc_test": float(bacc_test),
        "mcc_test": float(mcc_test),
        "accuracy_test": float(acc_test),
        "precision_macro_test": float(precision_macro),
        "recall_macro_test": float(recall_macro),
        "roc_auc_macro_ovr_test": float(roc_auc_macro_ovr),
        "auc_pr_macro_test": float(auc_pr_macro),
        "mejores_params": search.best_params_,
    }

    return resultados


# ============================================================
# 4) APLICAR RESAMPLERS (BASE, SMOTE, BORDERLINE, PCSMOTE)
# ============================================================
def aplicar_resampleo_smote(X_train, y_train, random_state=RANDOM_STATE):
    sm = SMOTE(random_state=random_state)
    X_res, y_res = sm.fit_resample(X_train, y_train)
    return X_res, y_res

def aplicar_resampleo_borderline(X_train, y_train, random_state=RANDOM_STATE):
    bl = BorderlineSMOTE(random_state=random_state, kind="borderline-1")
    X_res, y_res = bl.fit_resample(X_train, y_train)
    return X_res, y_res

def aplicar_resampleo_pcsmote(
    X_train,
    y_train,
    random_state=RANDOM_STATE,
    percentil_densidad=80.0,
    percentil_riesgo=40.0,
    criterio_pureza="entropia",
):
    """
    Usa tu implementación de PCSMOTE con parámetros fijos
    (los podés tunear luego si querés hacer una grilla).
    """
    sampler = PCSMOTE(
        random_state=random_state,
        criterio_pureza=criterio_pureza,
        percentil_dist_densidad=percentil_densidad,
        percentil_dist_riesgo=percentil_riesgo,
    )
    X_res, y_res = sampler.fit_resample(X_train, y_train)
    return X_res, y_res


# ============================================================
# 5) EXPERIMENTO COMPLETO PARA UN DATASET
# ============================================================
def experimento_completo(nombre_dataset, config):
    """
    Para un dataset dado (por nombre + config_datasets),
    ejecuta 4 condiciones:
        - base train/test
        - SMOTE
        - Borderline-SMOTE
        - PCSMOTE

    Devuelve un DataFrame con las métricas de cada condición.
    """
    print(f"\n======================")
    print(f"Dataset: {nombre_dataset}")
    print(f"======================")

    # 1) Cargar, split, escalar
    X_train, y_train, X_test, y_test, clases = cargar_y_preparar_dataset(
        nombre_dataset,
        config,
    )

    resultados = []

    # ---------------------------------
    # A) CASO BASE (sin sobremuestreo)
    # ---------------------------------
    print("▶ Caso BASE (sin sobremuestreo)")
    res_base = entrenar_y_evaluar(
        X_train, y_train, X_test, y_test,
        nombre_experimento=f"{nombre_dataset}_base"
    )
    resultados.append(res_base)

    # ---------------------------------
    # B) SMOTE
    # ---------------------------------
    print("▶ SMOTE")
    try:
        X_sm, y_sm = aplicar_resampleo_smote(X_train, y_train)
        res_smote = entrenar_y_evaluar(
            X_sm, y_sm, X_test, y_test,
            nombre_experimento=f"{nombre_dataset}_smote"
        )
        resultados.append(res_smote)
    except Exception as e:
        print(f"   ⚠️ Error SMOTE: {e}")

    # ---------------------------------
    # C) Borderline-SMOTE
    # ---------------------------------
    print("▶ Borderline-SMOTE")
    try:
        X_bl, y_bl = aplicar_resampleo_borderline(X_train, y_train)
        res_bl = entrenar_y_evaluar(
            X_bl, y_bl, X_test, y_test,
            nombre_experimento=f"{nombre_dataset}_borderlinesmote"
        )
        resultados.append(res_bl)
    except Exception as e:
        print(f"   ⚠️ Error BorderlineSMOTE: {e}")

    # ---------------------------------
    # D) PCSMOTE
    # ---------------------------------
    print("▶ PCSMOTE (D=80, R=40, P=entropia)")
    try:
        X_pc, y_pc = aplicar_resampleo_pcsmote(
            X_train,
            y_train,
            random_state=RANDOM_STATE,
            percentil_densidad=80.0,
            percentil_riesgo=40.0,
            criterio_pureza="entropia",
        )
        res_pc = entrenar_y_evaluar(
            X_pc, y_pc, X_test, y_test,
            nombre_experimento=f"{nombre_dataset}_pcsmote_D80_R40_entropia"
        )
        resultados.append(res_pc)
    except Exception as e:
        print(f"   ⚠️ Error PCSMOTE: {e}")

    df_resultados = pd.DataFrame(resultados)
    return df_resultados


# ============================================================
# 6) EJEMPLOS DE EJECUCIÓN
# ============================================================
# ⚠️ Ajustar nombres de datasets a los que realmente tengas en config_datasets
#     por ejemplo: "ecoli", "glass", "heart", "wdbc", etc.

# Ejemplo 1: ecoli
df_ecoli = experimento_completo("ecoli", config_datasets["ecoli"])
print("\nResultados ecoli:")
display(df_ecoli)

# Ejemplo 2: glass
df_glass = experimento_completo("glass", config_datasets["glass"])
print("\nResultados glass:")
display(df_glass)

# Ejemplo 3: heart
df_heart = experimento_completo("heart", config_datasets["heart"])
print("\nResultados heart:")
display(df_heart)

# Ejemplo 4: wdbc
df_wdbc = experimento_completo("wdbc", config_datasets["wdbc"])
print("\nResultados wdbc:")
display(df_wdbc)

ROOT = D:\Documentos_D\TESIS\armado tesina\codigo

Dataset: ecoli
▶ Caso BASE (sin sobremuestreo)




▶ SMOTE
   ⚠️ Error SMOTE: Expected n_neighbors <= n_samples,  but n_samples = 2, n_neighbors = 6
▶ Borderline-SMOTE
   ⚠️ Error BorderlineSMOTE: Expected n_neighbors <= n_samples,  but n_samples = 4, n_neighbors = 6
▶ PCSMOTE (D=80, R=40, P=entropia)





Resultados ecoli:


Unnamed: 0,nombre_experimento,f1_cv,f1_test,bacc_test,mcc_test,accuracy_test,precision_macro_test,recall_macro_test,roc_auc_macro_ovr_test,auc_pr_macro_test,mejores_params
0,ecoli_base,0.701282,0.884625,0.898539,0.844957,0.882353,0.89596,0.898539,,,"{'n_estimators': 150, 'min_samples_split': 2, ..."
1,ecoli_pcsmote_D80_R40_entropia,0.715572,0.895387,0.91369,0.865789,0.897059,0.901515,0.91369,,,"{'n_estimators': 100, 'min_samples_split': 5, ..."



Dataset: glass
▶ Caso BASE (sin sobremuestreo)
▶ SMOTE
▶ Borderline-SMOTE
▶ PCSMOTE (D=80, R=40, P=entropia)

Resultados glass:


Unnamed: 0,nombre_experimento,f1_cv,f1_test,bacc_test,mcc_test,accuracy_test,precision_macro_test,recall_macro_test,roc_auc_macro_ovr_test,auc_pr_macro_test,mejores_params
0,glass_base,0.682426,0.796825,0.809524,0.720761,0.790698,0.808761,0.809524,0.973207,0.91134,"{'n_estimators': 100, 'min_samples_split': 5, ..."
1,glass_smote,0.907995,0.793861,0.868254,0.710486,0.767442,0.787897,0.868254,0.966524,0.908052,"{'n_estimators': 150, 'min_samples_split': 2, ..."
2,glass_borderlinesmote,0.914598,0.789752,0.884127,0.715014,0.767442,0.76338,0.884127,0.967063,0.908509,"{'n_estimators': 200, 'min_samples_split': 5, ..."
3,glass_pcsmote_D80_R40_entropia,0.75018,0.788414,0.809524,0.724166,0.790698,0.795635,0.809524,0.975133,0.933796,"{'n_estimators': 150, 'min_samples_split': 2, ..."



Dataset: heart
▶ Caso BASE (sin sobremuestreo)


  _warn_prf(average, modifier, msg_start, len(result))


▶ SMOTE


  _warn_prf(average, modifier, msg_start, len(result))


▶ Borderline-SMOTE


  _warn_prf(average, modifier, msg_start, len(result))


▶ PCSMOTE (D=80, R=40, P=entropia)

Resultados heart:


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,nombre_experimento,f1_cv,f1_test,bacc_test,mcc_test,accuracy_test,precision_macro_test,recall_macro_test,roc_auc_macro_ovr_test,auc_pr_macro_test,mejores_params
0,heart_base,0.307265,0.209337,0.230303,0.252731,0.557377,0.192461,0.230303,0.787065,0.375244,"{'n_estimators': 100, 'min_samples_split': 5, ..."
1,heart_smote,0.889726,0.179636,0.181818,0.155068,0.459016,0.177922,0.181818,0.74848,0.351797,"{'n_estimators': 150, 'min_samples_split': 2, ..."
2,heart_borderlinesmote,0.910862,0.24716,0.258874,0.268017,0.540984,0.237143,0.258874,0.735732,0.354726,"{'n_estimators': 150, 'min_samples_split': 2, ..."
3,heart_pcsmote_D80_R40_entropia,0.307265,0.209337,0.230303,0.252731,0.557377,0.192461,0.230303,0.787065,0.375244,"{'n_estimators': 100, 'min_samples_split': 5, ..."



Dataset: wdbc
▶ Caso BASE (sin sobremuestreo)
▶ SMOTE
▶ Borderline-SMOTE
▶ PCSMOTE (D=80, R=40, P=entropia)

Resultados wdbc:


Unnamed: 0,nombre_experimento,f1_cv,f1_test,bacc_test,mcc_test,accuracy_test,precision_macro_test,recall_macro_test,roc_auc_macro_ovr_test,auc_pr_macro_test,mejores_params
0,wdbc_base,0.962402,0.94223,0.933532,0.887244,0.947368,0.953947,0.933532,,,"{'n_estimators': 100, 'min_samples_split': 5, ..."
1,wdbc_smote,0.970163,0.961911,0.957341,0.924518,0.964912,0.96723,0.957341,,,"{'n_estimators': 100, 'min_samples_split': 5, ..."
2,wdbc_borderlinesmote,0.97368,0.971277,0.964286,0.944155,0.973684,0.98,0.964286,,,"{'n_estimators': 100, 'min_samples_split': 5, ..."
3,wdbc_pcsmote_D80_R40_entropia,0.966592,0.952129,0.945437,0.905824,0.95614,0.960513,0.945437,,,"{'n_estimators': 100, 'min_samples_split': 5, ..."
