# 🔭 Exoplanet Classification with ML (XGBoost, CatBoost, LightGBM, Voting Ensemble)
This notebook trains models on exoplanet datasets (Kepler, K2, TESS merged).
We predict whether a planet is **CONFIRMED, CANDIDATE, or FALSE POSITIVE**.

In [3]:
# Cell 1: Imports and Setup
import os
import warnings
from pathlib import Path
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import optuna


warnings.filterwarnings("ignore")
RND = 42
N_FOLDS = 5
N_TRIALS = 30
RESULTS_DIR = Path("ml_models")
RESULTS_DIR.mkdir(exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Cell 2: Utility Functions
def load_data(path):
    return pd.read_csv(path)


def feature_engineering(df):
    df = df.copy()
    if "planet_radius" in df.columns and "star_radius" in df.columns:
        df["planet_density_ratio"] = df["planet_radius"] / (df["star_radius"] + 1e-6)
    if "period" in df.columns:
        df["log_period"] = np.log1p(df["period"])
    if "insolation" in df.columns and "star_radius" in df.columns:
        df["stellar_flux"] = df["insolation"] / (df["star_radius"] ** 2 + 1e-6)
    if "equilibrium_temp" in df.columns and "star_teff" in df.columns:
        df["temp_ratio"] = df["equilibrium_temp"] / (df["star_teff"] + 1e-6)
    return df

In [5]:

# Cell 3: Optuna Objective Functions
def objective_xgb(trial, X, y):
        params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 400),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                "gamma": trial.suggest_float("gamma", 0, 5),
                "random_state": RND,
                "n_jobs": -1,
                "objective": "multi:softprob",
                "eval_metric": "mlogloss",
                "use_label_encoder": False
        }
        model = xgb.XGBClassifier(**params)
        cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RND)
        scores = cross_val_score(model, X, y, cv=cv, scoring="f1_macro", n_jobs=-1)
        return scores.mean()


def objective_lgb(trial, X, y):
        params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 400),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                "random_state": RND,
                "n_jobs": -1,
                "objective": "multiclass"
        }
        model = lgb.LGBMClassifier(**params)
        cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RND)
        scores = cross_val_score(model, X, y, cv=cv, scoring="f1_macro", n_jobs=-1)
        return scores.mean()


def objective_cat(trial, X, y):
        params = {
                "iterations": trial.suggest_int("iterations", 200, 600),
                "depth": trial.suggest_int("depth", 4, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                "loss_function": "MultiClass",
                "random_state": RND,
                "verbose": 0
        }
        model = cb.CatBoostClassifier(**params, thread_count=1)
        cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RND)
        scores = cross_val_score(model, X, y, cv=cv, scoring="f1_macro", n_jobs=-1)
        return scores.mean()

In [6]:
# Cell 4: Main Training and Evaluation
def main(data_path="merged_unified_dataset.csv", target_col="label"):
    print("Loading data...")
    df = load_data(data_path)
    df = df.dropna(subset=[target_col])
    df = feature_engineering(df)


    feature_cols = [c for c in df.columns if c not in [target_col, "source"]]
    X = df[feature_cols]
    y = df[target_col]


    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    labels = le.classes_.tolist()


    X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, stratify=y_enc, random_state=RND)


    imputer = SimpleImputer(strategy="median")
    scaler = StandardScaler()
    smote = SMOTE(random_state=RND)


    X_train_imp = imputer.fit_transform(X_train)
    X_train_res, y_train_res = smote.fit_resample(X_train_imp, y_train)
    X_train_scaled = scaler.fit_transform(X_train_res)


    X_test_imp = imputer.transform(X_test)
    X_test_scaled = scaler.transform(X_test_imp)


    models = {}


    # XGBoost
    print("\n=== Tuning XGBoost ===")
    study_xgb = optuna.create_study(direction="maximize")
    study_xgb.optimize(lambda t: objective_xgb(t, X_train_scaled, y_train_res), n_trials=N_TRIALS)
    best_xgb = xgb.XGBClassifier(**study_xgb.best_params)
    best_xgb.fit(X_train_scaled, y_train_res)
    models["XGBoost"] = Pipeline([("imputer", imputer), ("scaler", scaler), ("model", best_xgb)])
    joblib.dump((models["XGBoost"], le.classes_), RESULTS_DIR / "XGBoost_pipeline.pkl")


    # LightGBM
    print("\n=== Tuning LightGBM ===")
    study_lgb = optuna.create_study(direction="maximize")
    study_lgb.optimize(lambda t: objective_lgb(t, X_train_scaled, y_train_res), n_trials=N_TRIALS)
    best_lgb = lgb.LGBMClassifier(**study_lgb.best_params)
    best_lgb.fit(X_train_scaled, y_train_res)
    models["LightGBM"] = Pipeline([("imputer", imputer), ("scaler", scaler), ("model", best_lgb)])
    joblib.dump((models["LightGBM"], le.classes_), RESULTS_DIR / "LightGBM_pipeline.pkl")


    # CatBoost
    print("\n=== Tuning CatBoost ===")
    study_cat = optuna.create_study(direction="maximize")
    study_cat.optimize(lambda t: objective_cat(t, X_train_scaled, y_train_res), n_trials=N_TRIALS)
    best_cat = cb.CatBoostClassifier(**study_cat.best_params, thread_count=1)
    best_cat.fit(X_train_scaled, y_train_res)
    models["CatBoost"] = Pipeline([("imputer", imputer), ("scaler", scaler), ("model", best_cat)])
    joblib.dump((models["CatBoost"], le.classes_), RESULTS_DIR / "CatBoost_pipeline.pkl")


    # Voting Ensemble
    voting = VotingClassifier(estimators=[("xgb", best_xgb), ("lgb", best_lgb), ("cat", best_cat)], voting="soft", weights=[2,1,1], n_jobs=-1)
    voting.fit(X_train_scaled, y_train_res)
    models["VotingEnsemble"] = Pipeline([("imputer", imputer), ("scaler", scaler), ("model", voting)])
    joblib.dump((models["VotingEnsemble"], le.classes_), RESULTS_DIR / "VotingEnsemble_pipeline.pkl")


    # Evaluation
    for name, pipeline in models.items():
        print(f"\n\n===== {name} =====")
        y_pred = pipeline.predict(X_test)
        y_pred_str = le.inverse_transform(y_pred)
        y_true_str = le.inverse_transform(y_test)


        acc = accuracy_score(y_true_str, y_pred_str)
        f1 = f1_score(y_true_str, y_pred_str, average="macro")
        cm = confusion_matrix(y_true_str, y_pred_str, labels=labels)


        print(f"Accuracy (test set): {acc:.4f}")
        print(f"F1 Score (macro, test set): {f1:.4f}")
        print("\nClassification Report (test set):")
        print(classification_report(y_true_str, y_pred_str, target_names=labels, zero_division=0))
        print("Confusion Matrix:\n", cm)


        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="f1_macro")
        print("\n**5-Fold CV F1 Scores (train set):", cv_scores)
        print("Mean CV F1 Score (train set):", np.mean(cv_scores))


    print("\nTraining + Evaluation complete!")
       

In [7]:
# Cell 5: Run main
if __name__ == "__main__":
    DATA_PATH = "merged_unified_dataset.csv"
    main(DATA_PATH)

[I 2025-09-27 13:44:09,480] A new study created in memory with name: no-name-3aa7b854-bdb8-4be5-b14e-714de98ccb31


Loading data...

=== Tuning XGBoost ===


[I 2025-09-27 13:44:28,455] Trial 0 finished with value: 0.7673112182502901 and parameters: {'n_estimators': 322, 'max_depth': 5, 'learning_rate': 0.03292790798834129, 'subsample': 0.5397993928005097, 'colsample_bytree': 0.9362822434660988, 'gamma': 3.792119043878215}. Best is trial 0 with value: 0.7673112182502901.
[I 2025-09-27 13:44:31,627] Trial 1 finished with value: 0.7916418516312883 and parameters: {'n_estimators': 253, 'max_depth': 9, 'learning_rate': 0.27570103993852046, 'subsample': 0.7652415454051997, 'colsample_bytree': 0.7865856903748073, 'gamma': 2.0724871810333867}. Best is trial 1 with value: 0.7916418516312883.
[I 2025-09-27 13:44:35,057] Trial 2 finished with value: 0.7976953393575863 and parameters: {'n_estimators': 208, 'max_depth': 5, 'learning_rate': 0.13910104986617627, 'subsample': 0.7257309350871248, 'colsample_bytree': 0.6786079789065522, 'gamma': 1.2693410567656072}. Best is trial 2 with value: 0.7976953393575863.
[I 2025-09-27 13:44:37,376] Trial 3 finished


=== Tuning LightGBM ===


[I 2025-09-27 13:48:20,423] Trial 0 finished with value: 0.7700994382412013 and parameters: {'n_estimators': 326, 'max_depth': 4, 'learning_rate': 0.026341101709275345, 'subsample': 0.9027261638048898, 'colsample_bytree': 0.6541286672167924}. Best is trial 0 with value: 0.7700994382412013.
[I 2025-09-27 13:48:26,759] Trial 1 finished with value: 0.8262192313529635 and parameters: {'n_estimators': 342, 'max_depth': 8, 'learning_rate': 0.15538840825648545, 'subsample': 0.959588771366159, 'colsample_bytree': 0.594194390895906}. Best is trial 1 with value: 0.8262192313529635.
[I 2025-09-27 13:48:33,702] Trial 2 finished with value: 0.8298024704200244 and parameters: {'n_estimators': 345, 'max_depth': 7, 'learning_rate': 0.2779967432702859, 'subsample': 0.6141041203770192, 'colsample_bytree': 0.6011270401158461}. Best is trial 2 with value: 0.8298024704200244.
[I 2025-09-27 13:48:37,701] Trial 3 finished with value: 0.8245554788655728 and parameters: {'n_estimators': 164, 'max_depth': 10, '

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000756 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 11613, number of used features: 15
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-09-27 13:51:04,196] A new study created in memory with name: no-name-be72bce3-0b8e-490c-94d8-a61db1d9ac5e



=== Tuning CatBoost ===


[I 2025-09-27 13:51:38,746] Trial 0 finished with value: 0.8135330487028206 and parameters: {'iterations': 248, 'depth': 7, 'learning_rate': 0.2559496386593807}. Best is trial 0 with value: 0.8135330487028206.
[I 2025-09-27 13:51:51,476] Trial 1 finished with value: 0.7900855052171563 and parameters: {'iterations': 432, 'depth': 4, 'learning_rate': 0.20143264911925096}. Best is trial 0 with value: 0.8135330487028206.
[I 2025-09-27 13:51:58,909] Trial 2 finished with value: 0.6716291809552494 and parameters: {'iterations': 231, 'depth': 4, 'learning_rate': 0.010932078509843639}. Best is trial 0 with value: 0.8135330487028206.
[I 2025-09-27 13:52:20,336] Trial 3 finished with value: 0.7405874240578847 and parameters: {'iterations': 463, 'depth': 5, 'learning_rate': 0.016018619368624223}. Best is trial 0 with value: 0.8135330487028206.
[I 2025-09-27 13:52:38,404] Trial 4 finished with value: 0.7589679924501902 and parameters: {'iterations': 245, 'depth': 6, 'learning_rate': 0.038760545640

0:	learn: 0.9645096	total: 674ms	remaining: 6m 21s
1:	learn: 0.8817848	total: 1.16s	remaining: 5m 26s
2:	learn: 0.8166938	total: 1.7s	remaining: 5m 19s
3:	learn: 0.7696109	total: 2.2s	remaining: 5m 9s
4:	learn: 0.7358597	total: 2.72s	remaining: 5m 5s
5:	learn: 0.7114677	total: 3.21s	remaining: 5m
6:	learn: 0.6869285	total: 3.74s	remaining: 4m 59s
7:	learn: 0.6692094	total: 4.24s	remaining: 4m 56s
8:	learn: 0.6547931	total: 4.74s	remaining: 4m 53s
9:	learn: 0.6404206	total: 5.26s	remaining: 4m 52s
10:	learn: 0.6246805	total: 5.77s	remaining: 4m 51s
11:	learn: 0.6090909	total: 6.25s	remaining: 4m 49s
12:	learn: 0.5983859	total: 6.75s	remaining: 4m 47s
13:	learn: 0.5886366	total: 7.23s	remaining: 4m 45s
14:	learn: 0.5781875	total: 7.76s	remaining: 4m 45s
15:	learn: 0.5687927	total: 8.25s	remaining: 4m 44s
16:	learn: 0.5608138	total: 8.75s	remaining: 4m 43s
17:	learn: 0.5503317	total: 9.23s	remaining: 4m 41s
18:	learn: 0.5416471	total: 9.78s	remaining: 4m 42s
19:	learn: 0.5355819	total: 10

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv("merged_unified_dataset.csv")

# Apply feature engineering (copy same function from your training script)
def feature_engineering(df):
    df = df.copy()
    if "planet_radius" in df.columns and "star_radius" in df.columns:
        df["planet_density_ratio"] = df["planet_radius"] / (df["star_radius"] + 1e-6)
    if "period" in df.columns:
        df["log_period"] = np.log1p(df["period"])
    if "insolation" in df.columns and "star_radius" in df.columns:
        df["stellar_flux"] = df["insolation"] / (df["star_radius"] ** 2 + 1e-6)
    if "equilibrium_temp" in df.columns and "star_teff" in df.columns:
        df["temp_ratio"] = df["equilibrium_temp"] / (df["star_teff"] + 1e-6)
    return df

df = feature_engineering(df)

# Prepare features & target
target_col = "label"
feature_cols = [c for c in df.columns if c not in [target_col, "source"]]
X = df[feature_cols]
y = df[target_col]

# Encode labels
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, stratify=y_enc, random_state=42
)

print("✅ Data reloaded and split. Now you can evaluate your saved models.")


✅ Data reloaded and split. Now you can evaluate your saved models.


In [12]:
xgb_acc = accuracy_score(y_test, xgb_model.predict(X_test))
lgbm_acc = accuracy_score(y_test, lgbm_model.predict(X_test))
cat_acc = accuracy_score(y_test, cat_model.predict(X_test))
ensemble_acc = accuracy_score(y_test, ensemble_model.predict(X_test))

print("✅ Model Accuracies:")
print(f"XGBoost:        {xgb_acc:.4f}")
print(f"LightGBM:       {lgbm_acc:.4f}")
print(f"CatBoost:       {cat_acc:.4f}")
print(f"VotingEnsemble: {ensemble_acc:.4f}")


✅ Model Accuracies:
XGBoost:        0.7454
LightGBM:       0.7439
CatBoost:       0.7428
VotingEnsemble: 0.7595


In [None]:
import joblib
import pandas as pd
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# =========================
# Feature engineering function (same as used during training)
# =========================
def feature_engineering(df):
    df = df.copy()
    if "planet_radius" in df.columns and "star_radius" in df.columns:
        df["planet_density_ratio"] = df["planet_radius"] / (df["star_radius"] + 1e-6)
    if "period" in df.columns:
        df["log_period"] = np.log1p(df["period"])
    if "insolation" in df.columns and "star_radius" in df.columns:
        df["stellar_flux"] = df["insolation"] / (df["star_radius"] ** 2 + 1e-6)
    if "equilibrium_temp" in df.columns and "star_teff" in df.columns:
        df["temp_ratio"] = df["equilibrium_temp"] / (df["star_teff"] + 1e-6)
    return df

# =========================
# Load raw dataset
# =========================
DATA_PATH = "merged_unified_dataset.csv"
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=["label"])  # remove rows without target

# Apply feature engineering
df_fe = feature_engineering(df)

# Separate features and target
feature_cols = [c for c in df_fe.columns if c not in ["label", "source"]]
X = df_fe[feature_cols]
y = df_fe["label"]

# Encode target
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Split into train/test (same way as original code)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, stratify=y_enc, random_state=42
)

# =========================
# Load saved pipelines
# =========================
xgb_pipe, _ = joblib.load("ml_models/XGBoost_pipeline.pkl")
lgbm_pipe, _ = joblib.load("ml_models/LightGBM_pipeline.pkl")
cat_pipe, _ = joblib.load("ml_models/CatBoost_pipeline.pkl")

# =========================
# Create Stacking Classifier
# =========================
stacking_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_pipe),
        ('lgbm', lgbm_pipe),
        ('cat', cat_pipe)
    ],
    final_estimator=xgb_pipe.named_steps["model"],  # XGBoost as meta model
    cv=5,
    n_jobs=-1,
    passthrough=True
)

# =========================
# Train stacking (preprocessing handled by pipelines)
# =========================
stacking_model.fit(X_train, y_train)

# =========================
# Evaluate
# =========================
y_pred = stacking_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Stacking Accuracy:", acc)
