In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.utils import resample

df = pd.read_csv("exoplanets_clean_full.csv")

# ============================================================
# STEP 1: CREATE A BALANCED TEST SET (EVALUATION ONLY)
# ============================================================

X_test_0 = X_test[y_test == 0]
X_test_1 = X_test[y_test == 1]

y_test_0 = y_test[y_test == 0]
y_test_1 = y_test[y_test == 1]

# Downsample majority class
X_test_0_down, y_test_0_down = resample(
    X_test_0,
    y_test_0,
    replace=False,
    n_samples=len(y_test_1),
    random_state=42
)

# Balanced test data
X_test_bal = np.vstack((X_test_0_down, X_test_1))
y_test_bal = np.hstack((y_test_0_down, y_test_1))


# ============================================================
# MODEL 1: Random Forest (Balanced Evaluation)
# ============================================================

rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_leaf=20,
    min_samples_split=30,
    max_features=0.6,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

rf_probs = rf.predict_proba(X_test_bal)[:, 1]
rf_preds = (rf_probs >= 0.5).astype(int)

print("\n Random Forest (Balanced Evaluation)")
print("-------------------------------------")
print("Accuracy :", accuracy_score(y_test_bal, rf_preds))
print("Precision:", precision_score(y_test_bal, rf_preds))
print("Recall   :", recall_score(y_test_bal, rf_preds))
print("F1-score :", f1_score(y_test_bal, rf_preds))
print("ROC-AUC  :", roc_auc_score(y_test_bal, rf_probs))


# ============================================================
# MODEL 2: Logistic Regression (Balanced Evaluation)
# ============================================================

lr = LogisticRegression(
    penalty="l2",
    C=0.3,
    class_weight="balanced",
    solver="liblinear",
    max_iter=5000
)

lr.fit(X_train, y_train)

lr_probs = lr.predict_proba(X_test_bal)[:, 1]
lr_preds = (lr_probs >= 0.5).astype(int)

print("\n Logistic Regression (Balanced Evaluation)")
print("-------------------------------------------")
print("Accuracy :", accuracy_score(y_test_bal, lr_preds))
print("Precision:", precision_score(y_test_bal, lr_preds))
print("Recall   :", recall_score(y_test_bal, lr_preds))
print("F1-score :", f1_score(y_test_bal, lr_preds))
print("ROC-AUC  :", roc_auc_score(y_test_bal, lr_probs))


# ============================================================
# MODEL 3: XGBoost (Balanced Evaluation)
# ============================================================

xgb_bin = XGBClassifier(
    objective="binary:logistic",
    n_estimators=150,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.5,
    reg_lambda=2.0,
    random_state=42,
    eval_metric="logloss"
)

xgb_bin.fit(X_train, y_train)

xgb_probs = xgb_bin.predict_proba(X_test_bal)[:, 1]
xgb_preds = (xgb_probs >= 0.5).astype(int)

print("\n XGBoost (Balanced Evaluation)")
print("--------------------------------")
print("Accuracy :", accuracy_score(y_test_bal, xgb_preds))
print("Precision:", precision_score(y_test_bal, xgb_preds))
print("Recall   :", recall_score(y_test_bal, xgb_preds))
print("F1-score :", f1_score(y_test_bal, xgb_preds))
print("ROC-AUC  :", roc_auc_score(y_test_bal, xgb_probs))



 Random Forest (Balanced Evaluation)
-------------------------------------
Accuracy : 0.9788732394366197
Precision: 0.9722222222222222
Recall   : 0.9859154929577465
F1-score : 0.9790209790209791
ROC-AUC  : 0.9986113866296369

 Logistic Regression (Balanced Evaluation)
-------------------------------------------
Accuracy : 0.852112676056338
Precision: 0.8378378378378378
Recall   : 0.8732394366197183
F1-score : 0.8551724137931035
ROC-AUC  : 0.931362824836342

 XGBoost (Balanced Evaluation)
--------------------------------
Accuracy : 0.9084507042253521
Precision: 1.0
Recall   : 0.8169014084507042
F1-score : 0.8992248062015504
ROC-AUC  : 0.9998016266613767
