In [2]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime

from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score, f1_score

from config_datasets import config_datasets
from cargar_dataset import cargar_dataset, graficar_distribucion_clases


# ======================================================================================
# CONFIG
# ======================================================================================
K_VECINOS_LISTA = [1, 3, 5, 7]
N_SPLITS_CV = 5
RANDOM_STATE = 42
SIGMA_RUIDO = 0.01

# Crear carpetas si no existen
Path("resultados").mkdir(exist_ok=True)
Path("figuras").mkdir(exist_ok=True)


# ======================================================================================
# MÉTRICAS (sin helpers "mágicos", todo explícito)
# ======================================================================================
def macro_f1(y_true, y_pred) -> float:
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``.

    The value produced by ``f1_score(..., average="macro")`` is converted to
    a built-in ``float`` before being returned.
    """
    puntaje = f1_score(y_true, y_pred, average="macro")
    return float(puntaje)


def balanced_acc(y_true, y_pred) -> float:
    """Return the balanced accuracy of ``y_pred`` against ``y_true``.

    The value produced by ``balanced_accuracy_score`` is converted to a
    built-in ``float`` before being returned.
    """
    puntaje = balanced_accuracy_score(y_true, y_pred)
    return float(puntaje)


# ======================================================================================
# ESTUDIO 1: Consistencia vecinal (pureza de vecinos de misma clase)
# ======================================================================================
def estudio_consistencia_vecinal(X: np.ndarray, y: np.ndarray, k: int) -> dict:
    """Measure how often the k nearest neighbours of a point share its label.

    For every row of ``X`` the consistency is the fraction of its ``k``
    Euclidean nearest neighbours (the point itself excluded) whose label
    equals the point's own label.

    Args:
        X: Feature matrix, one sample per row.
        y: Labels aligned with the rows of ``X``.
        k: Number of neighbours inspected per point.

    Returns:
        A dict with:
        - ``"consistencia_global_media"``: mean consistency over all points.
        - ``"consistencia_media_por_clase"``: mean consistency per label,
          keyed by the values of ``np.unique(y)``.
    """
    y = np.asarray(y)
    clases = np.unique(y)

    nn = NearestNeighbors(n_neighbors=k, metric="euclidean")
    nn.fit(X)
    # Calling kneighbors() without a query returns the neighbours of the
    # training points and never reports a point as its own neighbour. The
    # previous "ask for k+1 and drop column 0" approach dropped an arbitrary
    # neighbour (and kept the point itself) whenever duplicated or tied
    # points put something other than the point itself in column 0.
    idx_vecinos = nn.kneighbors(return_distance=False)

    # Row i: fraction of the k neighbours of point i that share its label.
    consistencias = (y[idx_vecinos] == y[:, None]).mean(axis=1)

    media_global = float(consistencias.mean()) if consistencias.size > 0 else 0.0

    medias_clase = {}
    for c in clases:
        vals = consistencias[y == c]
        medias_clase[c] = float(vals.mean()) if vals.size > 0 else 0.0

    return {
        "consistencia_global_media": media_global,
        "consistencia_media_por_clase": medias_clase,
    }


# ======================================================================================
# ESTUDIO 2: kNN baseline (CV)
# ======================================================================================
def estudio_knn_baseline_cv(X: np.ndarray, y: np.ndarray, k: int, n_splits: int, random_state: int) -> dict:
    """Cross-validate a Euclidean kNN classifier and summarise its scores.

    A shuffled ``StratifiedKFold`` with ``n_splits`` folds (seeded with
    ``random_state``) is used. On each fold a ``KNeighborsClassifier`` with
    ``k`` neighbours is fitted on the training part and scored on the test
    part with balanced accuracy and macro F1.

    Returns:
        A dict with the mean and the sample standard deviation (``ddof=1``)
        of both scores across folds, under the keys ``"knn_bac_media"``,
        ``"knn_bac_std"``, ``"knn_f1_macro_media"`` and ``"knn_f1_macro_std"``.
        A standard deviation is reported as 0.0 when fewer than two folds
        were scored.
    """

    def _media(valores: list) -> float:
        # Mean of the fold scores; 0.0 when there are none.
        return float(np.mean(valores)) if len(valores) > 0 else 0.0

    def _desviacion(valores: list) -> float:
        # Sample standard deviation; needs at least two fold scores.
        return float(np.std(valores, ddof=1)) if len(valores) > 1 else 0.0

    particionador = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    puntajes_bac = []
    puntajes_f1 = []

    for idx_entrenamiento, idx_prueba in particionador.split(X, y):
        modelo = KNeighborsClassifier(n_neighbors=k, metric="euclidean")
        modelo.fit(X[idx_entrenamiento], y[idx_entrenamiento])

        reales = y[idx_prueba]
        predichos = modelo.predict(X[idx_prueba])

        puntajes_bac.append(balanced_acc(reales, predichos))
        puntajes_f1.append(macro_f1(reales, predichos))

    return {
        "knn_bac_media": _media(puntajes_bac),
        "knn_bac_std": _desviacion(puntajes_bac),
        "knn_f1_macro_media": _media(puntajes_f1),
        "knn_f1_macro_std": _desviacion(puntajes_f1),
    }


# ======================================================================================
# ESTUDIO 3: Estabilidad de vecindario (Jaccard A vs B)
# B = X + ruido gaussiano leve
# ======================================================================================
def estudio_estabilidad_vecindario_jaccard(X: np.ndarray, k: int, sigma_ruido: float, random_state: int) -> dict:
    """Measure how stable k-neighbourhoods are under small Gaussian noise.

    View A is ``X``; view B is ``X`` plus zero-mean Gaussian noise with
    standard deviation ``sigma_ruido`` drawn from a generator seeded with
    ``random_state``. For every point, the sets of its ``k`` Euclidean
    nearest neighbours (the point itself excluded) in A and in B are
    compared with the Jaccard index.

    Args:
        X: Feature matrix, one sample per row.
        k: Number of neighbours per point.
        sigma_ruido: Standard deviation of the added noise.
        random_state: Seed for ``np.random.default_rng``.

    Returns:
        A dict with ``"jaccard_media"``, ``"jaccard_std"`` (sample standard
        deviation, 0.0 with fewer than two points) and the ``"sigma_ruido"``
        that was used.
    """
    rng = np.random.default_rng(random_state)
    X_b = X + rng.normal(loc=0.0, scale=sigma_ruido, size=X.shape)

    # kneighbors() without a query excludes each training point from its own
    # neighbourhood. Dropping column 0 of a (k+1)-query, as done before, is
    # only equivalent when the point itself is guaranteed to come first,
    # which does not hold when X contains duplicated rows.
    idx_a = (
        NearestNeighbors(n_neighbors=k, metric="euclidean")
        .fit(X)
        .kneighbors(return_distance=False)
    )
    idx_b = (
        NearestNeighbors(n_neighbors=k, metric="euclidean")
        .fit(X_b)
        .kneighbors(return_distance=False)
    )

    jaccards = []
    for fila_a, fila_b in zip(idx_a, idx_b):
        set_a = set(fila_a.tolist())
        set_b = set(fila_b.tolist())
        uni = len(set_a | set_b)
        jaccards.append(len(set_a & set_b) / float(uni) if uni > 0 else 0.0)

    return {
        "jaccard_media": float(np.mean(jaccards)) if len(jaccards) > 0 else 0.0,
        "jaccard_std": float(np.std(jaccards, ddof=1)) if len(jaccards) > 1 else 0.0,
        "sigma_ruido": sigma_ruido,
    }


# ======================================================================================
# ESTUDIO 4: Margen local (inter/intra)
# ======================================================================================
def estudio_margen_local(X: np.ndarray, y: np.ndarray, k: int, eps: float = 1e-9) -> dict:
    """Compute the local inter/intra distance ratio over k-neighbourhoods.

    For every point, its ``k`` Euclidean nearest neighbours (the point
    itself excluded) are split into same-label ("intra") and other-label
    ("inter") neighbours. The point's ratio is
    ``mean(inter distances) / (mean(intra distances) + eps)``.
    Points whose neighbourhood lacks either kind of neighbour are skipped.
    In particular, with ``k == 1`` a single neighbour can never provide both
    kinds, so no point is valid and the reported means are 0.0.

    Args:
        X: Feature matrix, one sample per row.
        y: Labels aligned with the rows of ``X``.
        k: Number of neighbours per point.
        eps: Small constant added to the denominator to avoid division by
            zero.

    Returns:
        A dict with:
        - ``"margen_local_media_global"``: mean ratio over valid points
          (0.0 when there are none).
        - ``"margen_local_media_por_clase"``: mean ratio per label, keyed by
          the values of ``np.unique(y)`` (0.0 for labels without valid points).
        - ``"n_validos"``: number of points that contributed a ratio.
        - ``"n_total"``: number of rows in ``X``.
    """
    y = np.asarray(y)
    clases = np.unique(y)

    nn = NearestNeighbors(n_neighbors=k, metric="euclidean")
    nn.fit(X)
    # kneighbors() without a query never returns a point as its own
    # neighbour. The previous "k+1 and drop column 0" approach could keep
    # the point itself (distance 0, same label) when duplicated rows exist,
    # which biased the intra-class mean distance towards zero.
    distancias, indices = nn.kneighbors()

    ratios = []
    por_clase = {c: [] for c in clases}

    for i in range(X.shape[0]):
        y_i = y[i]
        misma = y[indices[i]] == y_i

        # A ratio needs at least one neighbour of each kind.
        if misma.all() or not misma.any():
            continue

        d_intra = float(np.mean(distancias[i][misma]))
        d_inter = float(np.mean(distancias[i][~misma]))
        r_i = d_inter / (d_intra + eps)

        ratios.append(r_i)
        por_clase[y_i].append(r_i)

    media_global = float(np.mean(ratios)) if len(ratios) > 0 else 0.0

    medias_clase = {}
    for c in clases:
        vals = por_clase[c]
        medias_clase[c] = float(np.mean(vals)) if len(vals) > 0 else 0.0

    return {
        "margen_local_media_global": media_global,
        "margen_local_media_por_clase": medias_clase,
        "n_validos": int(len(ratios)),
        "n_total": int(X.shape[0]),
    }


# ======================================================================================
# MAIN
# ======================================================================================
# Minute-resolution stamp shared by every artifact written during this run.
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")

nombre_archivo_txt = f"resultados/reporte_distribucion_y_vecindad_{timestamp}.txt"
nombre_archivo_csv = f"resultados/diagnostico_vecindad_{timestamp}.csv"

# Accumulators: text lines for the TXT report and one dict per (dataset, k)
# combination for the CSV.
lineas_resultado = []
filas_csv = []

for nombre, cfg in config_datasets.items():
    encabezado = f"\nüîç Analizando dataset: {nombre.upper()}"
    lineas_resultado.append(encabezado)
    print(encabezado)

    try:
        # The "esquema" entry is forwarded as `names` only when the config
        # carries no "header" value.
        header = cfg.get("header", None)
        names = None if header is not None else cfg.get("esquema")

        X, y, _ = cargar_dataset(
            path=cfg.get("path"),
            clase_minoria=cfg.get("clase_minoria"),
            col_features=cfg.get("col_features"),
            col_target=cfg.get("col_target"),
            sep=cfg.get("sep", ","),
            header=header,
            binarizar=False,
            tipo=cfg.get("tipo", "tabular"),
            impute="median",
            names=names,
        )

        # Force numpy containers so positional row indexing always works.
        X = X.to_numpy() if isinstance(X, pd.DataFrame) else np.asarray(X)
        y = np.asarray(y)

        # -------------------------
        # Class distribution
        # -------------------------
        conteo = pd.Series(y).value_counts()
        clase_min_real = conteo.idxmin()
        total = conteo.sum()
        proporcion = (conteo / total * 100).round(2)

        linea_valores = f"üéØ Valores √∫nicos del target: {list(conteo.index)}"
        titulo_distribucion = "üìä Distribuci√≥n de clases:"
        print(linea_valores)
        print(titulo_distribucion)
        lineas_resultado.extend([linea_valores, titulo_distribucion])

        for clase, count in conteo.items():
            linea_clase = f"   - {clase}: {count} ({proporcion[clase]}%)"
            print(linea_clase)
            lineas_resultado.append(linea_clase)

        # These two lines go to the report only; they are not printed.
        lineas_resultado.extend([
            f"‚úÖ Clase minoritaria real: {clase_min_real}",
            f"‚ö†Ô∏è Clase configurada como minoritaria: {cfg.get('clase_minoria')}",
        ])

        # Class-distribution figure
        nombre_figura = f"figuras/{nombre.lower()}_distribucion_{timestamp}.png"
        graficar_distribucion_clases(y, nombre_dataset=nombre, guardar_en=nombre_figura)

        # -------------------------
        # Neighbourhood diagnostics (4 studies), once per k
        # -------------------------
        lineas_resultado.append("üß≠ Diagn√≥stico de vecindad (m√©trica eucl√≠dea sobre X cargado):")

        n_muestras, n_features = int(X.shape[0]), int(X.shape[1])

        for k in K_VECINOS_LISTA:
            r1 = estudio_consistencia_vecinal(X, y, k)
            r2 = estudio_knn_baseline_cv(X, y, k, N_SPLITS_CV, RANDOM_STATE)
            r3 = estudio_estabilidad_vecindario_jaccard(X, k, SIGMA_RUIDO, RANDOM_STATE)
            r4 = estudio_margen_local(X, y, k)

            partes = [
                f"   k={k:<2}",
                f"consistencia={r1['consistencia_global_media']:.4f}",
                f"kNN_BAC={r2['knn_bac_media']:.4f}¬±{r2['knn_bac_std']:.4f}",
                f"kNN_F1m={r2['knn_f1_macro_media']:.4f}¬±{r2['knn_f1_macro_std']:.4f}",
                f"Jaccard={r3['jaccard_media']:.4f}¬±{r3['jaccard_std']:.4f}",
                f"margen={r4['margen_local_media_global']:.4f} (validos={r4['n_validos']}/{r4['n_total']})",
            ]
            linea = " | ".join(partes)
            print(linea)
            lineas_resultado.append(linea)

            # Key order here defines the CSV column order.
            fila = {
                "dataset": nombre,
                "k": k,
                "n_muestras": n_muestras,
                "n_features": n_features,
                "consistencia_vecinal_media": float(r1["consistencia_global_media"]),
                "knn_bac_media": float(r2["knn_bac_media"]),
                "knn_bac_std": float(r2["knn_bac_std"]),
                "knn_f1_macro_media": float(r2["knn_f1_macro_media"]),
                "knn_f1_macro_std": float(r2["knn_f1_macro_std"]),
                "jaccard_media": float(r3["jaccard_media"]),
                "jaccard_std": float(r3["jaccard_std"]),
                "margen_local_media": float(r4["margen_local_media_global"]),
                "margen_local_n_validos": int(r4["n_validos"]),
                "margen_local_n_total": int(r4["n_total"]),
            }
            filas_csv.append(fila)

    except Exception as e:
        # Per-dataset boundary: record the failure and continue with the
        # next dataset instead of aborting the whole run.
        mensaje_error = f"‚ùå Error al analizar {nombre}: {e}"
        print(mensaje_error)
        lineas_resultado.append(mensaje_error)

# Save the TXT report
with open(nombre_archivo_txt, "w", encoding="utf-8") as f:
    f.write("\n".join(lineas_resultado))

# Save the CSV
df_out = pd.DataFrame(filas_csv)
df_out.to_csv(nombre_archivo_csv, index=False, encoding="utf-8")

print(f"\nüìÅ Reporte guardado en: {nombre_archivo_txt}")
print(f"üìÅ CSV guardado en: {nombre_archivo_csv}")



üîç Analizando dataset: US_CRIME
üéØ Valores √∫nicos del target: [-1, 1]
üìä Distribuci√≥n de clases:
   - -1: 1844 (92.48%)
   - 1: 150 (7.52%)
   k=1  | consistencia=0.9208 | kNN_BAC=0.6473¬±0.0563 | kNN_F1m=0.6670¬±0.0621 | Jaccard=0.9273¬±0.2597 | margen=0.0000 (validos=0/1994)
   k=3  | consistencia=0.9162 | kNN_BAC=0.6538¬±0.0440 | kNN_F1m=0.6909¬±0.0530 | Jaccard=0.9051¬±0.1973 | margen=1.0071 (validos=228/1994)
   k=5  | consistencia=0.9142 | kNN_BAC=0.6304¬±0.0473 | kNN_F1m=0.6737¬±0.0608 | Jaccard=0.9042¬±0.1549 | margen=1.0110 (validos=336/1994)
   k=7  | consistencia=0.9128 | kNN_BAC=0.6243¬±0.0447 | kNN_F1m=0.6675¬±0.0565 | Jaccard=0.9100¬±0.1290 | margen=1.0099 (validos=413/1994)

üîç Analizando dataset: SHUTTLE
üéØ Valores √∫nicos del target: [1, 4, 5, 3, 2, 7, 6]
üìä Distribuci√≥n de clases:
   - 1: 45586 (78.6%)
   - 4: 8903 (15.35%)
   - 5: 3267 (5.63%)
   - 3: 171 (0.29%)
   - 2: 50 (0.09%)
   - 7: 13 (0.02%)
   - 6: 10 (0.02%)
   k=1  | consistencia=0.9983 | 



   k=1  | consistencia=0.8125 | kNN_BAC=0.7308¬±0.0964 | kNN_F1m=0.6681¬±0.1240 | Jaccard=0.6964¬±0.4605 | margen=0.0000 (validos=0/336)
   k=3  | consistencia=0.8026 | kNN_BAC=0.7432¬±0.0914 | kNN_F1m=0.7135¬±0.1218 | Jaccard=0.7113¬±0.2812 | margen=1.0428 (validos=92/336)
   k=5  | consistencia=0.7958 | kNN_BAC=0.7540¬±0.0859 | kNN_F1m=0.7365¬±0.0965 | Jaccard=0.7357¬±0.2114 | margen=1.0680 (validos=134/336)
   k=7  | consistencia=0.7925 | kNN_BAC=0.7642¬±0.0839 | kNN_F1m=0.7485¬±0.0928 | Jaccard=0.7562¬±0.1752 | margen=1.0610 (validos=165/336)

üîç Analizando dataset: PREDICT_FAULTS
üéØ Valores √∫nicos del target: ['No Failure', 'Heat Dissipation Failure', 'Power Failure', 'Overstrain Failure', 'Tool Wear Failure', 'Random Failures']
üìä Distribuci√≥n de clases:
   - No Failure: 9652 (96.52%)
   - Heat Dissipation Failure: 112 (1.12%)
   - Power Failure: 95 (0.95%)
   - Overstrain Failure: 78 (0.78%)
   - Tool Wear Failure: 45 (0.45%)
   - Random Failures: 18 (0.18%)
   k=1  | co