In [1]:
import pandas as pd 
url = 'datasets/KOI.csv'
kepler_df = pd.read_csv(url, comment='#')
kepler_df.head(5)

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [2]:
subset = ['kepoi_name', 'kepler_name', 'koi_pdisposition', 'koi_score', 'koi_tce_delivname', 'koi_teq', 'koi_time0bk']
kepler_clean_df = kepler_df.drop(subset, axis=1)
kepler_clean_df.isnull().sum().sort_values(ascending=False)

koi_teq_err2         9564
koi_teq_err1         9564
koi_steff_err2        483
koi_srad_err2         468
koi_slogg_err2        468
koi_slogg_err1        468
koi_srad_err1         468
koi_steff_err1        468
koi_time0bk_err1      454
koi_period_err2       454
koi_depth_err2        454
koi_depth_err1        454
koi_time0bk_err2      454
koi_impact_err1       454
koi_duration_err2     454
koi_duration_err1     454
koi_impact_err2       454
koi_period_err1       454
koi_srad              363
koi_slogg             363
koi_prad              363
koi_prad_err2         363
koi_depth             363
koi_impact            363
koi_model_snr         363
koi_prad_err1         363
koi_steff             363
koi_tce_plnt_num      346
koi_insol_err2        321
koi_insol             321
koi_insol_err1        321
koi_kepmag              1
kepid                   0
koi_disposition         0
koi_period              0
koi_fpflag_ss           0
koi_fpflag_co           0
koi_fpflag_ec           0
koi_fpflag_n

In [3]:
candidates_df = kepler_clean_df[kepler_clean_df["koi_disposition"].str.strip().str.upper() == "CANDIDATE"].copy()

labeled_df = kepler_clean_df[kepler_clean_df["koi_disposition"].str.strip().str.upper().isin(["CONFIRMED","FALSE POSITIVE"])].copy()

labeled_df["label"] = (
    labeled_df["koi_disposition"]
    .str.strip().str.upper()
    .map({"CONFIRMED": 1, "FALSE POSITIVE": 0})
)

labeled_df.drop(["koi_disposition"], axis=1, inplace=True)
labeled_df.head(5)

Unnamed: 0,kepid,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk_err1,koi_time0bk_err2,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,label
0,10797460,0,0,0,0,9.488036,2.775e-05,-2.775e-05,0.00216,-0.00216,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
1,10797460,0,0,0,0,54.418383,0.0002479,-0.0002479,0.00352,-0.00352,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
3,10848459,0,1,0,0,1.736952,2.63e-07,-2.63e-07,0.000115,-0.000115,...,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,0
4,10854555,0,0,0,0,2.525592,3.761e-06,-3.761e-06,0.00113,-0.00113,...,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,1
5,10872983,0,0,0,0,11.094321,2.036e-05,-2.036e-05,0.00141,-0.00141,...,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714,1


In [4]:
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

drop = ["label", "kepid"]
feat_cols = [c for c in labeled_df.columns if c not in drop]
cont = ["koi_period", "koi_impact", "koi_duration", "koi_depth", "koi_prad", "koi_insol", "koi_model_snr", 
        "koi_steff", "koi_slogg", "koi_srad", "ra", "dec", "koi_kepmag"]
flags = ["koi_fpflag_nt", "koi_fpflag_ss", "koi_fpflag_co", "koi_fpflag_ec", "koi_tce_plnt_num"]

In [5]:
X = labeled_df[feat_cols]
y = labeled_df["label"]
groups = labeled_df["kepid"]

X = labeled_df[feat_cols].copy()
X["kepid_copy"] = labeled_df["kepid"]

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train = groups.iloc[train_idx]


In [6]:
preprocess_rf = ColumnTransformer([
    ("pass", "passthrough", cont + flags)
])

pipeline_rf = Pipeline([
    ("preprocessing", preprocess_rf),
    ("rf", RandomForestClassifier(random_state=42, n_jobs=-1))
])

param_grid_rf = {
    "rf__max_depth": [None, 10],              
    "rf__min_samples_leaf": [1, 2, 4],         
    "rf__max_features": ["sqrt", 0.5],       
    "rf__class_weight": [None, "balanced_subsample"]
}

kf = GroupKFold(n_splits=5)
rf_cv = GridSearchCV(pipeline_rf, param_grid_rf, scoring="average_precision", cv=kf, n_jobs=-1, refit=True, verbose=1)
rf_cv.fit(X_train, y_train, groups=groups_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [7]:
rf_model = rf_cv.best_estimator_
print("RF best PR-AUC (CV):", rf_cv.best_score_)
print("RF best params:", rf_cv.best_params_)

RF best PR-AUC (CV): 0.9987173173599088
RF best params: {'rf__class_weight': 'balanced_subsample', 'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 2}


In [8]:
y_pred_rf = rf_model.predict(X_test)
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       965
           1       1.00      0.99      0.99       579

    accuracy                           0.99      1544
   macro avg       0.99      0.99      0.99      1544
weighted avg       0.99      0.99      0.99      1544

[[963   2]
 [  8 571]]


In [19]:
proba = rf_model.predict_proba(X_test)[:, 1]  

results_df = pd.DataFrame({
    "kepid": X_test["kepid_copy"].values,
    "probability_exoplanet": proba,
    "true_label": y_test.values
})


results_df_sorted = results_df.sort_values(by="probability_exoplanet", ascending=False)

print(results_df_sorted.head(100))
results_df_sorted.head(100).to_csv('exoplanet_probabilities.csv', index=False)

        kepid  probability_exoplanet  true_label
486   7603200                    1.0           1
536  11566064                    1.0           1
537   3328080                    1.0           1
540  12785320                    1.0           1
480   6291837                    1.0           1
..        ...                    ...         ...
406   5364071                    1.0           1
392   5364071                    1.0           1
411   9837685                    1.0           1
402   5364071                    1.0           1
403  11253827                    1.0           1

[100 rows x 3 columns]
