In [1]:



import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)


# STEP 1: Load Dataset

df = pd.read_csv("exoplanets_clean_full.csv")
print("Dataset shape:", df.shape)


# STEP 2: Create Binary Habitability Label (Rule-based)

df["habitable_label"] = (
    (df["pl_eqt"].between(230, 330)) &
    (df["pl_rade"].between(0.9, 2.2))
).astype(int)

print("\nClass distribution:")
print(df["habitable_label"].value_counts())


# STEP 3: Select RAW Features (No Leakage)

FEATURES = [
    "pl_rade",
    "pl_bmasse",
    "pl_orbper",
    "pl_orbsmax",
    "st_teff",
    "st_rad",
    "sy_dist"
]

X = df[FEATURES]
y = df["habitable_label"]


# STEP 4: Trainâ€“Test Split 

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y
)


# STEP 5: Feature Scaling 

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)


# STEP 6: Train Regularized Logistic Regression 

lr = LogisticRegression(
    penalty="l2",
    C=0.3,                      # Strong regularization
    class_weight="balanced",
    solver="liblinear",
    max_iter=5000
)

lr.fit(X_train, y_train)


# STEP 7: Evaluation on Test Data

y_pred  = lr.predict(X_test)
y_prob  = lr.predict_proba(X_test)[:, 1]

print("\nMODEL EVALUATION (TEST DATA)")
print("--------------------------------")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1-score :", f1_score(y_test, y_pred))
print("ROC-AUC  :", roc_auc_score(y_test, y_prob))


# STEP 8: Cross-Validation 

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(
    lr,
    scaler.fit_transform(X),
    y,
    cv=cv,
    scoring="roc_auc"
)

print("\nCROSS-VALIDATION ROC-AUC")
print("--------------------------------")
print("CV Scores:", cv_scores)
print("Mean CV ROC-AUC:", cv_scores.mean())
print("Std CV ROC-AUC :", cv_scores.std())


# STEP 9: Rank Exoplanets by Habitability Probability

df["habitability_probability"] = lr.predict_proba(
    scaler.transform(X)
)[:, 1]

ranked_exoplanets = (
    df.sort_values("habitability_probability", ascending=False)
      .drop_duplicates("pl_name")
      [["pl_name", "habitability_probability"]]
)

print("\nTop 10 Ranked Exoplanets:")
print(ranked_exoplanets.head(10))


Dataset shape: (34993, 62)

Class distribution:
habitable_label
0    34756
1      237
Name: count, dtype: int64

MODEL EVALUATION (TEST DATA)
--------------------------------
Accuracy : 0.7992951038293008
Precision: 0.028703703703703703
Recall   : 0.8732394366197183
F1-score : 0.055580457194083374
ROC-AUC  : 0.913392506183162

CROSS-VALIDATION ROC-AUC
--------------------------------
CV Scores: [0.85735928 0.89224872 0.91663669 0.92334487 0.8864636 ]
Mean CV ROC-AUC: 0.8952106334701109
Std CV ROC-AUC : 0.023531341307247124

Top 10 Ranked Exoplanets:
                  pl_name  habitability_probability
33591        TRAPPIST-1 h                  0.988017
33571        TRAPPIST-1 d                  0.987990
33577        TRAPPIST-1 e                  0.987743
33581        TRAPPIST-1 f                  0.987524
33565        TRAPPIST-1 c                  0.987500
33564        TRAPPIST-1 b                  0.987452
33586        TRAPPIST-1 g                  0.987376
14157       Kepler-1649 b   

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# ------------------------------------------------------------
# Train Random Forest (regularized)
# ------------------------------------------------------------
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_leaf=20,
    min_samples_split=30,
    max_features=0.6,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

# ------------------------------------------------------------
# Evaluate on TEST data
# ------------------------------------------------------------
rf_preds = rf.predict(X_test)
rf_probs = rf.predict_proba(X_test)[:, 1]

rf_accuracy = accuracy_score(y_test, rf_preds)

print("Random Forest Evaluation")
print("-------------------------")
print("Accuracy :", rf_accuracy)
print("Precision:", precision_score(y_test, rf_preds))
print("Recall   :", recall_score(y_test, rf_preds))
print("F1-score :", f1_score(y_test, rf_preds))
print("ROC-AUC  :", roc_auc_score(y_test, rf_probs))


Random Forest Evaluation
-------------------------
Accuracy : 0.9859020765860164
Precision: 0.3225806451612903
Recall   : 0.9859154929577465
F1-score : 0.4861111111111111
ROC-AUC  : 0.9981048658885316


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

# ============================================================
# MODEL 1: Random Forest (Binary Classification)
# ============================================================
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_leaf=20,
    min_samples_split=30,
    max_features=0.6,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

rf_preds = rf.predict(X_test)
rf_probs = rf.predict_proba(X_test)[:, 1]

print("\nðŸŒ² Random Forest Evaluation")
print("--------------------------------")
print("Accuracy :", accuracy_score(y_test, rf_preds))
print("Precision:", precision_score(y_test, rf_preds))
print("Recall   :", recall_score(y_test, rf_preds))
print("F1-score :", f1_score(y_test, rf_preds))
print("ROC-AUC  :", roc_auc_score(y_test, rf_probs))


# ============================================================
# MODEL 2: Logistic Regression (Baseline, Realistic)
# ============================================================
lr = LogisticRegression(
    penalty="l2",
    C=0.3,
    class_weight="balanced",
    solver="liblinear",
    max_iter=5000
)

lr.fit(X_train, y_train)

lr_preds = lr.predict(X_test)
lr_probs = lr.predict_proba(X_test)[:, 1]

print("\nðŸ“Š Logistic Regression Evaluation")
print("--------------------------------")
print("Accuracy :", accuracy_score(y_test, lr_preds))
print("Precision:", precision_score(y_test, lr_preds))
print("Recall   :", recall_score(y_test, lr_preds))
print("F1-score :", f1_score(y_test, lr_preds))
print("ROC-AUC  :", roc_auc_score(y_test, lr_probs))


# ============================================================
# MODEL 3: XGBoost (Binary Classification)
# ============================================================
xgb_bin = XGBClassifier(
    objective="binary:logistic",
    n_estimators=150,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.5,
    reg_lambda=2.0,
    random_state=42,
    eval_metric="logloss"
)

xgb_bin.fit(X_train, y_train)

xgb_preds = xgb_bin.predict(X_test)
xgb_probs = xgb_bin.predict_proba(X_test)[:, 1]

print("\nðŸš€ XGBoost (Binary) Evaluation")
print("--------------------------------")
print("Accuracy :", accuracy_score(y_test, xgb_preds))
print("Precision:", precision_score(y_test, xgb_preds))
print("Recall   :", recall_score(y_test, xgb_preds))
print("F1-score :", f1_score(y_test, xgb_preds))
print("ROC-AUC  :", roc_auc_score(y_test, xgb_probs))



ðŸŒ² Random Forest Evaluation
--------------------------------
Accuracy : 0.9859020765860164
Precision: 0.3225806451612903
Recall   : 0.9859154929577465
F1-score : 0.4861111111111111
ROC-AUC  : 0.9981048658885316

ðŸ“Š Logistic Regression Evaluation
--------------------------------
Accuracy : 0.7992951038293008
Precision: 0.028703703703703703
Recall   : 0.8732394366197183
F1-score : 0.055580457194083374
ROC-AUC  : 0.913392506183162

ðŸš€ XGBoost (Binary) Evaluation
--------------------------------
Accuracy : 0.9982853876928939
Precision: 0.9206349206349206
Recall   : 0.8169014084507042
F1-score : 0.8656716417910447
ROC-AUC  : 0.9995650511875319


In [4]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.utils import resample
import joblib
# ============================================================
# STEP 1: CREATE A BALANCED TEST SET (EVALUATION ONLY)
# ============================================================

X_test_0 = X_test[y_test == 0]
X_test_1 = X_test[y_test == 1]

y_test_0 = y_test[y_test == 0]
y_test_1 = y_test[y_test == 1]

# Downsample majority class
X_test_0_down, y_test_0_down = resample(
    X_test_0,
    y_test_0,
    replace=False,
    n_samples=len(y_test_1),
    random_state=42
)

# Balanced test data
X_test_bal = np.vstack((X_test_0_down, X_test_1))
y_test_bal = np.hstack((y_test_0_down, y_test_1))


# ============================================================
# MODEL 1: Random Forest (Balanced Evaluation)
# ============================================================

rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_leaf=20,
    min_samples_split=30,
    max_features=0.6,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

rf_probs = rf.predict_proba(X_test_bal)[:, 1]
rf_preds = (rf_probs >= 0.5).astype(int)

print("\nðŸŒ² Random Forest (Balanced Evaluation)")
print("-------------------------------------")
print("Accuracy :", accuracy_score(y_test_bal, rf_preds))
print("Precision:", precision_score(y_test_bal, rf_preds))
print("Recall   :", recall_score(y_test_bal, rf_preds))
print("F1-score :", f1_score(y_test_bal, rf_preds))
print("ROC-AUC  :", roc_auc_score(y_test_bal, rf_probs))


# ============================================================
# MODEL 2: Logistic Regression (Balanced Evaluation)
# ============================================================

lr = LogisticRegression(
    penalty="l2",
    C=0.3,
    class_weight="balanced",
    solver="liblinear",
    max_iter=5000
)

lr.fit(X_train, y_train)

lr_probs = lr.predict_proba(X_test_bal)[:, 1]
lr_preds = (lr_probs >= 0.5).astype(int)

print("\nðŸ“Š Logistic Regression (Balanced Evaluation)")
print("-------------------------------------------")
print("Accuracy :", accuracy_score(y_test_bal, lr_preds))
print("Precision:", precision_score(y_test_bal, lr_preds))
print("Recall   :", recall_score(y_test_bal, lr_preds))
print("F1-score :", f1_score(y_test_bal, lr_preds))
print("ROC-AUC  :", roc_auc_score(y_test_bal, lr_probs))


# ============================================================
# MODEL 3: XGBoost (Balanced Evaluation)
# ============================================================

xgb_bin = XGBClassifier(
    objective="binary:logistic",
    n_estimators=150,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.5,
    reg_lambda=2.0,
    random_state=42,
    eval_metric="logloss"
)

xgb_bin.fit(X_train, y_train)

xgb_probs = xgb_bin.predict_proba(X_test_bal)[:, 1]
xgb_preds = (xgb_probs >= 0.5).astype(int)

print("\nðŸš€ XGBoost (Balanced Evaluation)")
print("--------------------------------")
print("Accuracy :", accuracy_score(y_test_bal, xgb_preds))
print("Precision:", precision_score(y_test_bal, xgb_preds))
print("Recall   :", recall_score(y_test_bal, xgb_preds))
print("F1-score :", f1_score(y_test_bal, xgb_preds))
print("ROC-AUC  :", roc_auc_score(y_test_bal, xgb_probs))
# joblib.dump(model, "model.pkl")
# print("model.pkl saved successfully")



ðŸŒ² Random Forest (Balanced Evaluation)
-------------------------------------
Accuracy : 0.9788732394366197
Precision: 0.9722222222222222
Recall   : 0.9859154929577465
F1-score : 0.9790209790209791
ROC-AUC  : 0.9986113866296369

ðŸ“Š Logistic Regression (Balanced Evaluation)
-------------------------------------------
Accuracy : 0.852112676056338
Precision: 0.8378378378378378
Recall   : 0.8732394366197183
F1-score : 0.8551724137931035
ROC-AUC  : 0.931362824836342

ðŸš€ XGBoost (Balanced Evaluation)
--------------------------------
Accuracy : 0.9084507042253521
Precision: 1.0
Recall   : 0.8169014084507042
F1-score : 0.8992248062015504
ROC-AUC  : 0.9998016266613767


In [5]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load dataset
df = pd.read_csv("exoplanets_clean_full.csv")

# Create habitability label (same rule as before)
df["habitable_label"] = (
    (df["pl_eqt"].between(200, 300)) &
    (df["pl_rade"].between(0.5, 2.0))
).astype(int)

# Select features
FEATURES = [
    "pl_rade",
    "pl_bmasse",
    "pl_eqt",
    "pl_orbper",
    "st_teff",
    "st_rad",
    "st_lum",
    "sy_dist"
]

X = df[FEATURES]
y = df["habitable_label"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Train model (simple & stable for API)
model = LogisticRegression(
    class_weight="balanced",
    max_iter=5000
)

model.fit(X_train, y_train)

print("Model trained successfully")
joblib.dump(model, "model.pkl")
print("model.pkl saved successfully")


Model trained successfully
model.pkl saved successfully


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import joblib

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(
        class_weight="balanced",
        max_iter=1000,
        random_state=42
    ))
])

pipeline.fit(X_train, y_train)

joblib.dump(pipeline, "model.pkl")


['model.pkl']

In [7]:
# import joblib

# model = joblib.load("model.pkl")

# print("Model type:", type(model))
# print("Number of features expected:", model.n_features_in_)
# print("Feature names:", getattr(model, "feature_names_in_", "Not available"))


In [8]:
from sklearn.calibration import CalibratedClassifierCV

base_model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000
)

calibrated_model = CalibratedClassifierCV(
    base_model,
    method="sigmoid",
    cv=5
)

calibrated_model.fit(X_train, y_train)
joblib.dump(calibrated_model, "model.pkl")


['model.pkl']

In [9]:
FEATURES = [
    "pl_rade",
    "pl_bmasse",
    "pl_eqt",
    "pl_orbper",
    "st_teff",
    "st_rad"
]

X = df[FEATURES]
y = df["habitability_score"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model = XGBRegressor(...)
model.fit(X_scaled, y)

joblib.dump(model, "model.pkl")
joblib.dump(scaler, "scaler.pkl")


KeyError: 'habitability_score'