### Obtención de parámetros

In [None]:
!pip -q install -U flaml[automl] shap scikit-plot plotly==5.* kaleido

In [None]:
import pandas as pd
from google.colab import files

# Ask the user for the dataset through the Colab upload widget; the result maps
# each uploaded file name to its content.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty/cancelled upload surfaces as a bare StopIteration.
    raise RuntimeError("No se subió ningún archivo; vuelve a ejecutar la celda y selecciona el CSV.")
fname = next(iter(uploaded))
# sep=None asks pandas to sniff the delimiter. Only the python engine supports
# that, so it is named explicitly instead of relying on the warning-emitting fallback.
df = pd.read_csv(fname, sep=None, engine="python") # exoplanetas_unificado.csv
print(df.shape)
df.head(3)

In [None]:
import re
import numpy as np
# Compatibility shim: recreate the `np.NaN` alias when the installed NumPy no
# longer provides it (presumably for a dependency that still references it).
if not hasattr(np, "NaN"):
    np.NaN = np.nan
import pandas as pd

# 1) Detect the target column: the first candidate name (in priority order)
#    that exists in the dataframe.
POSSIBLE_TARGETS = ["label","clase","class","status","disposition","disposicion","koi_disposition","objetivo","target"]
target = next((c for c in POSSIBLE_TARGETS if c in df.columns), None)
if target is None:
    # Fail here with the full context instead of later with an opaque KeyError
    # on a guessed column name.
    raise KeyError(
        f"No se encontró columna objetivo; se esperaba una de {POSSIBLE_TARGETS}. "
        f"Columnas disponibles: {list(df.columns)}"
    )
print("Target:", target)

# 2) Raw spellings that are mapped onto the 3 canonical classes.
LABEL_MAP_FP = {"FALSE POSITIVE","FP","FA","REFUTED","REFUTED PLANET","REFUTED [PLANET]","FALSE POSITIVE [CANDIDATE]"}
LABEL_MAP_CONFIRMED = {"CONFIRMED","CP","KP","KNOWN PLANET","CONFIRMED [PLANET]"}
# The second "NOT DISPOSITIONED" variant carries a non-breaking space; it is
# written as an escape so the difference is visible in the source.
LABEL_MAP_CANDIDATE = {"CANDIDATE","PC","APC","NOT DISPOSITIONED","NOT\xa0DISPOSITIONED"}

def _norm(s: str) -> str:
    s = str(s).replace("\xa0"," ")
    s = re.sub(r"\[[^\]]*\]","", s)
    s = re.sub(r"[^A-Za-z ]+"," ", s).upper()
    return re.sub(r"\s+"," ", s).strip()

# Normalised lookup sets (false positive, confirmed, candidate) built by running
# every raw spelling through `_norm`, so membership tests compare like with like.
FP, CF, CD = (
    {_norm(raw_label) for raw_label in raw_group}
    for raw_group in (LABEL_MAP_FP, LABEL_MAP_CONFIRMED, LABEL_MAP_CANDIDATE)
)

def canonical_label(s):
    """Map a raw label onto "FALSE POSITIVE", "CONFIRMED" or "CANDIDATE".

    Missing values and labels that match no rule yield ``np.nan``. The rules
    are checked in order and the first hit wins: exact membership in the
    normalised sets (plus the "REFUTED" / "KNOWN PLANET" / "NOT DISPOSITIONED"
    substrings) comes first, followed by looser substring fallbacks.
    """
    if pd.isna(s):
        return np.nan

    t = _norm(s)
    ordered_rules = (
        ("FALSE POSITIVE", t in FP or "REFUTED" in t),
        ("CONFIRMED",      t in CF or "KNOWN PLANET" in t),
        ("CANDIDATE",      t in CD or "NOT DISPOSITIONED" in t),
        ("FALSE POSITIVE", "FALSE" in t and "POSITIVE" in t),
        ("CONFIRMED",      "CONFIRM" in t),
        ("CANDIDATE",      "CANDIDATE" in t),
    )
    for canonical, matched in ordered_rules:
        if matched:
            return canonical
    return np.nan

# Keep a copy of the labels as they were before canonicalisation.
df["target_raw"] = df[target]
df[target] = df["target_raw"].apply(canonical_label)

# Retain only rows whose label resolved to one of the three canonical classes.
valid_classes = ["CONFIRMED","CANDIDATE","FALSE POSITIVE"]
has_valid_class = df[target].isin(valid_classes)
df = df.loc[has_valid_class].copy()
print(df[target].value_counts())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import numpy as np

# A) Columns that are ALWAYS excluded (identifiers), as in the AutoML run.
# NOTE(review): these are substring matches, so short keys such as "id" or "tic"
# also hit any column whose name merely contains those letters — inspect
# X.columns to confirm no real feature is being dropped.
id_cols = [c for c in df.columns if any(k in c.lower() for k in ["id","name","pl_name","tic","kepid","kepoi"])]

# B) Columns with a HIGH leakage risk (predictions, labels or anything derived from them).
LEAK_PATTERNS = [
    "prediction", "predic", "predicho", "pred", "label", "disposition",
    "disposicion", "status", "class", "target", "objetivo",
    "koi_disposition", "result", "score", "proba", "fold", "kfold", "split"
]
LEAK_EXACT = {"prediction_label","prediction_score","Label","Score","target_raw"}

leak_cols = [c for c in df.columns if c in LEAK_EXACT or any(pat in c.lower() for pat in LEAK_PATTERNS)]

# C) Additionally excluded columns (RA and DEC).
coords_cols = [c for c in df.columns if c.strip().upper() in ["RA", "DEC"]]

# D) Build X and y WITHOUT any of the columns collected above.
y = df[target].astype(str)
X = df.drop(columns=list(set([target] + id_cols + leak_cols + coords_cols)))

print(f"Columnas eliminadas (potencial fuga): {sorted(set(leak_cols + coords_cols + [target]))}")
print(f"Total features elegidas: {X.shape[1]}")
# (to inspect them) print(sorted(X.columns.tolist())[:50])

# E) Preprocessing definition: median imputation for numeric columns,
#    most-frequent imputation + one-hot encoding for the remaining ones.
# NOTE(review): this transformer is only defined in this cell; it is neither
# fitted nor passed to the AutoML search below.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

preprocess = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), num_cols),
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols)
])

# Stratified 80/20 split; the 20% part is the held-out test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

from flaml import AutoML
automl = AutoML()
# The held-out test split is deliberately NOT passed as X_val/y_val: using it to
# pick the estimator and hyperparameters would make the later test metrics
# optimistic. Without an explicit validation set FLAML resamples the training
# data on its own to score each trial.
automl.fit(
    X_train=X_train, y_train=y_train,
    task="classification",
    metric="macro_f1",              # averages F1 over classes, so every class weighs the same
    estimator_list=["lgbm","xgboost","rf","extra_tree","lrl1"],
    time_budget=600,
    seed=42,
    log_file_name="automl.log"
)
print("Best:", automl.best_estimator, automl.best_config)

In [None]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Score the AutoML winner on the held-out split: class probabilities and hard labels.
probas = automl.predict_proba(X_test)
preds = np.array(automl.predict(X_test))

# Per-class precision / recall / F1 with three decimals.
report_text = classification_report(y_test, preds, digits=3)
print(report_text)

# Confusion matrix with raw counts (no normalisation).
fig, ax = plt.subplots(figsize=(5.5, 5))
ConfusionMatrixDisplay.from_predictions(
    y_test,
    preds,
    normalize=None,
    cmap="Blues",
    ax=ax,
)
ax.set_title("Confusion Matrix (3 classes)")
plt.show()

In [None]:
# Force plot for a single sample (why was it classified this way?).
# NOTE: this cell needs `shap`, `explainer`, `shap_values` and the matching
# `X_test` created by the SHAP cell further down, so it must be run after it.
i = 0

# `expected_value` is a scalar for single-output explainers and a sequence for
# multi-output ones; atleast_1d makes indexing valid in both cases.
base_value = np.atleast_1d(explainer.expected_value)[0]

# Older SHAP versions return one array per class (a list); newer ones return a
# single array, which gains a trailing class axis for multi-output models.
sample_shap = shap_values[0][i] if isinstance(shap_values, list) else shap_values[i]
if np.ndim(sample_shap) == 2:
    sample_shap = sample_shap[:, 0]  # class 0, consistent with base_value above

shap.force_plot(base_value,
                sample_shap,
                features=X_test.iloc[i],  # labels each contribution with its feature name and value
                matplotlib=True)


In [None]:
def explain_row(idx, k=8):
    """Summarise the AutoML prediction for one row of ``X_test``.

    Args:
        idx: Positional index of the row in ``X_test`` / ``y_test``.
        k: Number of top features (by absolute SHAP value) to report.

    Returns:
        dict with keys ``y_true``, ``y_pred`` (both as strings), ``probas``
        (class label -> probability) and ``top_features`` (feature name ->
        |SHAP value|, or ``{"info": ...}`` when the local explanation could
        not be computed).
    """
    x = X_test.iloc[[idx]]
    y_true = y_test.iloc[idx]
    proba = automl.predict_proba(x)[0]
    y_pred = automl.predict(x)[0]
    # Use the AutoML-level `classes_`: it exposes the original class labels,
    # whereas the wrapped estimator may only know internally encoded classes.
    class_labels = list(automl.classes_)
    # Local SHAP explanation (best effort: any failure is reported, not raised).
    # NOTE(review): relies on the global `explainer` and on `preprocess` having
    # been fitted elsewhere — neither is established in this function.
    try:
        x_t = preprocess.transform(x)
        sv_all = explainer.shap_values(x_t)
        if isinstance(sv_all, list):
            # One array per class: take the one for the predicted class.
            class_idx = class_labels.index(y_pred)
            sv = sv_all[class_idx][0]
        else:
            sv = sv_all[0]
        names = preprocess.get_feature_names_out()
        top = dict(sorted(zip(names, np.abs(sv)), key=lambda t: t[1], reverse=True)[:k])
    except Exception as e:
        top = {"info": f"No SHAP local por {e}"}
    return {"y_true": str(y_true), "y_pred": str(y_pred),
            "probas": dict(zip(class_labels, proba)),
            "top_features": top}

# Mapping from fuzzy-system input name to dataset column name.
MAP = {
    "radius":  "pl_radio",
    "teq":     "pl_temperatura_eq",
    "insol":   "insolacion",
    "period":  "periodo_orbital",
    "st_teff": "st_temperatura",
    "st_rad":  "st_radio",
    "st_logg": "st_gravedad",
}

def to_fuzzy_input(row):
    """Build the fuzzy-system input dict from a dataset row.

    Args:
        row: Mapping-like object with a ``.get(key, default)`` method
            (e.g. a ``dict`` or a ``pandas.Series``) keyed by dataset column.

    Returns:
        dict with the keys of ``MAP`` and float values. A per-field default is
        used when the column is absent OR when its value is missing
        (``None`` / NaN), so missing data never reaches the fuzzy system.

    Raises:
        TypeError / ValueError: if a present value cannot be converted to float.
    """
    def g(k, default=np.nan):
        value = row.get(MAP[k], default)
        # A column that exists but holds a missing value must fall back too;
        # `.get` alone only covers the "column absent" case.
        if pd.isna(value):
            value = default
        return float(value)
    return {"radius": g("radius",1.0),"teq":g("teq",290),"insol":g("insol",1.0),
            "period":g("period",50),"st_teff":g("st_teff",5777),
            "st_rad":g("st_rad",1.0),"st_logg":g("st_logg",4.4)}

from logicaDifusa import definir_variables as fuzzy_ihl

def predict_with_fuzzy(idx):
    """Explain one test row and, for planet-like predictions, add the fuzzy output.

    The result is the dict produced by ``explain_row(idx, k=6)`` plus a
    ``"fuzzy"`` entry. That entry stays ``None`` unless the predicted class is
    "CONFIRMED" or "CANDIDATE"; in that case it holds the two values returned
    by ``fuzzy_ihl`` under ``ihl_score`` (cast to float) and ``ihl_categorias``.
    """
    sample = X_test.iloc[[idx]]
    # The fuzzy inputs are read from the full dataframe row with the same index label.
    source_row = df.loc[sample.index[0]]
    predicted = automl.predict(sample)[0]

    result = explain_row(idx, k=6)
    result["fuzzy"] = None
    if predicted not in ("CONFIRMED","CANDIDATE"):
        return result

    score, categories = fuzzy_ihl(to_fuzzy_input(source_row))
    result["fuzzy"] = {"ihl_score": float(score), "ihl_categorias": categories}
    return result

predict_with_fuzzy(0)


### Uso de los parámetros

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# ========= Load data =========
# RA and DEC are dropped right after reading the file.
df = pd.read_csv("exoplanetas_unificado.csv").drop(columns=["RA", "DEC"])

# ========= Normalise labels =========
LABEL_MAP_FP = {"FALSE POSITIVE", "FP", "FA", "REFUTED [PLANET]", "FALSE POSITIVE [CANDIDATE]"}
LABEL_MAP_CONFIRMED = {"CONFIRMED", "CP", "KP", "KNOWN PLANET"}
LABEL_MAP_CANDIDATE = {"CANDIDATE", "PC", "APC", "NOT DISPOSITIONED"}

def normalize_label(label):
    """Map an exact raw label to one of the three classes, or "OTHER" if unknown."""
    class_aliases = (
        ("FALSE POSITIVE", LABEL_MAP_FP),
        ("CONFIRMED", LABEL_MAP_CONFIRMED),
        ("CANDIDATE", LABEL_MAP_CANDIDATE),
    )
    for canonical, aliases in class_aliases:
        if label in aliases:
            return canonical
    return "OTHER"

# Apply the normalisation to the stringified label column.
y = df["label"].astype(str).apply(normalize_label)

# Keep only the rows that belong to a valid class.
mask = y.ne("OTHER")
df, y = df[mask], y[mask]

# ========= Separate X and y =========
# Drop the target plus the non-numeric columns that XGBoost does not accept.
X = df.drop(columns=["label", "mission", "object_id"])

# ========= Encode labels =========
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)
class_names = le.classes_

# ========= Train/test split =========
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# ========= Model =========
# Hyperparameters in the shape of a FLAML `best_config` for its xgboost learner.
params = {
        'n_estimators': 1080,
        'max_leaves': 45,
        'min_child_weight': 0.18534714676420808,
        'learning_rate': 0.07207682269307049,
        'subsample': 0.922847602771779,
        'colsample_bylevel': 1.0,
        'colsample_bytree': 1.0,
        'reg_alpha': 0.0014249188153002787,
        'reg_lambda': 0.2021107529299507,
        # FLAML's xgboost learner grows leaf-wise trees bounded by `max_leaves`
        # (no depth limit, histogram method). These three settings are not part
        # of the printed best_config, so they are restored here; otherwise
        # XGBoost's default max_depth would cap the trees instead.
        'max_depth': 0,
        'grow_policy': 'lossguide',
        'tree_method': 'hist'
}

model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)

# ========= Predictions =========
# Hard class predictions and per-class probabilities for the test split.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

# ========= Confusion matrix =========
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the raw counts, drawn on an explicit Axes and saved to disk.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap=sns.color_palette(["#00ffe5", "#ff00ff", "#00aa88"]),
    xticklabels=class_names,
    yticklabels=class_names,
    cbar=False,
    ax=ax,
)
ax.set_xlabel("Predicción", fontsize=12)
ax.set_ylabel("Verdadero", fontsize=12)
ax.set_title("Matriz de Confusión", fontsize=14, fontweight="bold")
fig.savefig("matriz_confusion.png", dpi=300, bbox_inches="tight")
plt.show()

# ========= Multiclass ROC curves =========
# One indicator column per class, so each class gets its own one-vs-rest curve.
y_test_bin = label_binarize(y_test, classes=np.arange(len(class_names)))
n_classes = y_test_bin.shape[1]

colors = ["#00ffe5", "#ff00ff", "#00aa88"]
fpr, tpr, roc_auc = {}, {}, {}

# Per-class inputs plus the micro-average, which pools every (sample, class) pair.
roc_inputs = {k: (y_test_bin[:, k], y_pred_proba[:, k]) for k in range(n_classes)}
roc_inputs["micro"] = (y_test_bin.ravel(), y_pred_proba.ravel())
for key, (truth, scores) in roc_inputs.items():
    fpr[key], tpr[key], _ = roc_curve(truth, scores)
    roc_auc[key] = auc(fpr[key], tpr[key])

fig, ax = plt.subplots(figsize=(7, 6))
for i, color in enumerate(colors[:n_classes]):
    ax.plot(fpr[i], tpr[i], color=color, lw=2, label=f"{class_names[i]} (AUC = {roc_auc[i]:.2f})")

ax.plot(fpr["micro"], tpr["micro"], linestyle="--", color="black", label=f"Micro-average (AUC = {roc_auc['micro']:.2f})", lw=2)

# Diagonal reference line.
ax.plot([0, 1], [0, 1], "k--", lw=1)
ax.set_xlabel("Tasa de Falsos Positivos", fontsize=12)
ax.set_ylabel("Tasa de Verdaderos Positivos", fontsize=12)
ax.set_title("Curvas ROC Multiclase", fontsize=14, fontweight="bold")
ax.legend(loc="lower right")
ax.grid(alpha=0.3)
fig.savefig("curva_ROC_multiclase.png", dpi=300, bbox_inches="tight")
plt.show()

# ========= Classification report =========
print("===== Reporte de Clasificación =====")
report_text = classification_report(y_test, y_pred, target_names=class_names)
print(report_text)

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib

# ========= Load data =========
# RA and DEC are dropped right after reading the file.
df = pd.read_csv("exoplanetas_unificado.csv").drop(columns=["RA", "DEC"])

# ========= Normalise labels =========
LABEL_MAP_FP = {"FALSE POSITIVE", "FP", "FA", "REFUTED [PLANET]", "FALSE POSITIVE [CANDIDATE]"}
LABEL_MAP_CONFIRMED = {"CONFIRMED", "CP", "KP", "KNOWN PLANET"}
LABEL_MAP_CANDIDATE = {"CANDIDATE", "PC", "APC", "NOT DISPOSITIONED"}

def normalize_label(label):
    """Map an exact raw label to "FALSE POSITIVE", "PLANET" or "OTHER".

    "PLANET" is the positive class and groups confirmed planets and candidates.
    """
    if label in LABEL_MAP_FP:
        return "FALSE POSITIVE"
    if label in LABEL_MAP_CONFIRMED or label in LABEL_MAP_CANDIDATE:
        return "PLANET"
    return "OTHER"

y = df["label"].astype(str).apply(normalize_label)

# Keep only the rows that belong to a valid class.
mask = y.ne("OTHER")
df, y = df[mask], y[mask]

# ========= Separate X and y =========
X = df.drop(columns=["label", "mission", "object_id"])

# ========= Encode labels =========
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)
class_names = le.classes_   # ["FALSE POSITIVE", "PLANET"]

# ========= Train/test split =========
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# ========= Binary model =========
# Hyperparameters in the shape of a FLAML `best_config` for its xgboost learner.
params = {
    'n_estimators': 228,
    'max_leaves': 143,
    'min_child_weight': 0.5296177148580371,
    'learning_rate': 0.07186279434453446,
    'subsample': 0.9911870640734798,
    'colsample_bylevel': 0.8415892418572928,
    'colsample_bytree': 1.0,
    'reg_alpha': 0.018870980731499835,
    'reg_lambda': 0.8602626316154423,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    # FLAML's xgboost learner grows leaf-wise trees bounded by `max_leaves`
    # (no depth limit, histogram method). These three settings are not part of
    # the printed best_config, so they are restored here; with XGBoost's
    # default max_depth the 143-leaf budget could never be reached.
    'max_depth': 0,
    'grow_policy': 'lossguide',
    'tree_method': 'hist'
}

model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)

# ========= Save model and encoder =========
# Both objects go into one file so predictions can be decoded back to labels.
joblib.dump({"model": model, "label_encoder": le}, "xgb_exoplanet_model.pkl")
print("✅ Modelo guardado en xgb_exoplanet_model.pkl")

# ========= Predictions =========
# Hard predictions, and the probability of the class encoded as 1.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# ========= Confusion matrix =========
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the raw counts, drawn on an explicit Axes and saved to disk.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap=sns.color_palette(["#00ffe5", "#ff00ff"]),  # project palette
    xticklabels=class_names,
    yticklabels=class_names,
    cbar=False,
    ax=ax,
)
ax.set_xlabel("Predicción", fontsize=12)
ax.set_ylabel("Verdadero", fontsize=12)
ax.set_title("Matriz de Confusión (Binaria)", fontsize=14, fontweight="bold")
fig.savefig("matriz_confusion_binaria.png", dpi=300, bbox_inches="tight")
plt.show()

# ========= ROC curve =========
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# The curve colour is defined in this cell so it does not depend on a `colors`
# list created by a different cell (which breaks on a fresh kernel).
roc_color = "#00ffe5"

plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, lw=2, color=roc_color, label=f"PLANET (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], "k--", lw=1)  # diagonal reference line
plt.xlabel("Tasa de Falsos Positivos", fontsize=12)
plt.ylabel("Tasa de Verdaderos Positivos", fontsize=12)
plt.title("Curva ROC (Binaria)", fontsize=14, fontweight="bold")
plt.legend(loc="lower right")
plt.grid(alpha=0.3)
plt.savefig("curva_ROC_binaria.png", dpi=300, bbox_inches="tight")
plt.show()

# ========= Classification report =========
print("===== Reporte de Clasificación =====")
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
import shap

# Imported here so the colormap class comes from its public module instead of
# the incidental `plt.cm.colors` attribute path.
from matplotlib.colors import ListedColormap

# ========= Compute SHAP values =========
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Palette defined locally so this cell does not depend on a `colors` list
# created by another cell.
shap_palette = ["#00ffe5", "#ff00ff", "#00aa88"]

# ========= Summary plot (importance per variable) =========
plt.figure(figsize=(8, 6))
shap.summary_plot(
    shap_values,
    X_test,
    plot_type="bar",
    color=shap_palette[0],  # main colour: "#00ffe5"
    plot_size=(8, 6),       # summary_plot resizes the figure itself; pin the intended size
    show=False
)
plt.title("Importancia de las Variables (SHAP)", fontsize=14, fontweight="bold")
plt.xlabel("Valor medio |SHAP| (impacto promedio en la predicción)", fontsize=11)
plt.savefig("importancia_SHAP_bar.png", dpi=300, bbox_inches="tight")
plt.show()

# ========= Detailed plot (beeswarm / scatter) =========
plt.figure(figsize=(10, 6))
shap.summary_plot(
    shap_values,
    X_test,
    cmap=ListedColormap(shap_palette),
    plot_size=(10, 6),      # same reason as above
    show=False
)
plt.title("Contribución de cada Variable (SHAP)", fontsize=14, fontweight="bold")
plt.savefig("importancia_SHAP_beeswarm.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
import joblib
import os 
from google.colab import drive 

# Mount Google Drive BEFORE touching the mount point: listing it first fails
# when the directory does not exist yet.
drive.mount('/content/drive')
print(os.listdir("/content/drive"))

# Save the trained model to Drive.
# NOTE(review): the local file with the same name stores a dict with both the
# model and the label encoder, while this one stores only the model — confirm
# which layout the consumers of the Drive copy expect.
joblib.dump(model, "/content/drive/MyDrive/xgb_exoplanet_model.pkl")