In [None]:



import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)


# STEP 1: Load Dataset
#
# Read the exoplanet table from the working directory and report its size.

df = pd.read_csv("exoplanets_clean_full.csv")
print("Dataset shape:", df.shape)


# STEP 2: Create Binary Habitability Label (Rule-based)
#
# A row gets label 1 only when BOTH conditions hold; every other row gets 0.
#   * pl_eqt  lies in [230, 330]
#   * pl_rade lies in [0.9, 2.2]
# Series.between is inclusive on both ends by default. A missing value in
# either column compares False, so such rows fall into class 0.
# NOTE(review): presumably pl_eqt is in Kelvin and pl_rade in Earth radii —
# confirm against the dataset's column dictionary.

temperature_ok = df["pl_eqt"].between(230, 330)
radius_ok = df["pl_rade"].between(0.9, 2.2)
df["habitable_label"] = (temperature_ok & radius_ok).astype(int)

print("\nClass distribution:")
print(df["habitable_label"].value_counts())


# STEP 3: Select RAW Features
#
# Columns handed to the model. pl_eqt (used by the labelling rule) is
# deliberately left out.
# NOTE(review): pl_rade is BOTH a feature and one of the two columns that
# define habitable_label, so the feature set is not fully independent of the
# target — the model can partly re-learn the rule. Confirm this is intended.

FEATURES = [
    "pl_rade", "pl_bmasse", "pl_orbper", "pl_orbsmax",
    "st_teff", "st_rad", "sy_dist",
]

X = df.loc[:, FEATURES]
y = df["habitable_label"]


# STEP 4: Train–Test Split
#
# 70/30 hold-out split. Stratifying on y keeps the class ratio the same in
# both parts; the fixed seed makes the split repeatable.

split_options = dict(test_size=0.30, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


# STEP 5: Feature Scaling
#
# The standardisation statistics are learned from the training split only and
# then applied unchanged to the test split. Note that X_train / X_test are
# rebound here to the scaled arrays.

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# STEP 6: Train Regularized Logistic Regression
#
# C is the INVERSE regularization strength, so 0.3 penalises the weights more
# heavily than scikit-learn's default of 1.0. class_weight="balanced"
# re-weights the samples inversely to class frequency.

lr_settings = {
    "penalty": "l2",
    "C": 0.3,
    "class_weight": "balanced",
    "solver": "liblinear",
    "max_iter": 5000,
}

lr = LogisticRegression(**lr_settings)
lr.fit(X_train, y_train)


# STEP 7: Evaluation on Test Data
#
# Hard predictions feed the threshold-based metrics; the probability of the
# second class (column 1 of predict_proba) feeds ROC-AUC.

y_pred = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:, 1]

print("\nMODEL EVALUATION (TEST DATA)")
print("--------------------------------")

# (caption, scorer) pairs evaluated on the hard predictions, in print order.
label_metrics = (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
)
for caption, scorer in label_metrics:
    print(caption, scorer(y_test, y_pred))

# ROC-AUC is the only metric computed from probabilities rather than labels.
print("ROC-AUC  :", roc_auc_score(y_test, y_prob))


# STEP 8: Cross-Validation
#
# 5-fold stratified CV scored by ROC-AUC over the full dataset.
#
# Scaling happens INSIDE each fold: the pipeline fits a fresh StandardScaler
# on that fold's training part only, so no statistics from the validation
# part leak into preprocessing.
#
# The raw (unscaled) X is passed in and the module-level `scaler` is not
# touched. It therefore stays fitted on the training split, matching the
# scaling `lr` was trained with, for any later use.
#
# cross_val_score clones the estimator for every fold, so `lr` itself is not
# refitted here either.

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_model = make_pipeline(StandardScaler(), lr)

cv_scores = cross_val_score(
    cv_model,
    X,
    y,
    cv=cv,
    scoring="roc_auc"
)

print("\nCROSS-VALIDATION ROC-AUC")
print("--------------------------------")
print("CV Scores:", cv_scores)
print("Mean CV ROC-AUC:", cv_scores.mean())
print("Std CV ROC-AUC :", cv_scores.std())


# STEP 9: Rank Exoplanets by Habitability Probability
#
# Score every row of the dataset with the fitted model and keep, per planet
# name, the row with the highest probability.
# NOTE(review): X contains the training rows too, so their scores are
# in-sample. The result also relies on `scaler` still holding the statistics
# that `lr` was trained with — verify it has not been refitted before this
# point.

X_all_scaled = scaler.transform(X)
df["habitability_probability"] = lr.predict_proba(X_all_scaled)[:, 1]

ranking_columns = ["pl_name", "habitability_probability"]

# Sorting first means drop_duplicates (keep="first" by default) retains the
# top-scoring row of each planet.
ranked_exoplanets = (
    df.sort_values(by="habitability_probability", ascending=False)
      .drop_duplicates(subset="pl_name")
      .loc[:, ranking_columns]
)

print("\nTop 10 Ranked Exoplanets:")
print(ranked_exoplanets.head(10))


Dataset shape: (34993, 62)

Class distribution:
habitable_label
0    34756
1      237
Name: count, dtype: int64

MODEL EVALUATION (TEST DATA)
--------------------------------
Accuracy : 0.7992951038293008
Precision: 0.028703703703703703
Recall   : 0.8732394366197183
F1-score : 0.055580457194083374
ROC-AUC  : 0.913392506183162

CROSS-VALIDATION ROC-AUC
--------------------------------
CV Scores: [0.85735928 0.89224872 0.91663669 0.92334487 0.8864636 ]
Mean CV ROC-AUC: 0.8952106334701109
Std CV ROC-AUC : 0.023531341307247124

Top 10 Ranked Exoplanets:
                  pl_name  habitability_probability
33591        TRAPPIST-1 h                  0.988017
33571        TRAPPIST-1 d                  0.987990
33577        TRAPPIST-1 e                  0.987743
33581        TRAPPIST-1 f                  0.987524
33565        TRAPPIST-1 c                  0.987500
33564        TRAPPIST-1 b                  0.987452
33586        TRAPPIST-1 g                  0.987376
14157       Kepler-1649 b   