In [99]:
import pandas as pd 

X_train = pd.read_csv("/Users/janma/Desktop/SpaceAppsExoplanets/src/backend/datasets/kepler/X_train_imputed_scaled.csv", index_col=0)
X_test = pd.read_csv("/Users/janma/Desktop/SpaceAppsExoplanets/src/backend/datasets/kepler/X_test_imputed_scaled.csv",  index_col=0)
y_train = pd.read_csv("/Users/janma/Desktop/SpaceAppsExoplanets/src/backend/datasets/kepler/y_train.csv", index_col=0)["label"]
y_test = pd.read_csv("/Users/janma/Desktop/SpaceAppsExoplanets/src/backend/datasets/kepler/y_test.csv",  index_col=0)["label"]
groups = pd.read_csv("/Users/janma/Desktop/SpaceAppsExoplanets/src/backend/datasets/kepler/groups.csv", index_col=0)["kepid"]

In [100]:
y = pd.concat([y_train, y_test], axis=0)
y.value_counts()

label
0    4839
1    2746
Name: count, dtype: int64

In [112]:
from sklearn.model_selection import StratifiedGroupKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

kf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
log = LogisticRegression()
score = cross_validate(log, X_train, y_train, cv=kf, scoring="f1_weighted", n_jobs=-1, groups=groups)
log.fit(X_train, y_train)
y_pred = log.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred)}")
print(f"Score: {score}")

Accuracy: 0.7784974093264249
Precision: 0.697171381031614
Recall: 0.7236614853195165
F1: 0.7101694915254237
Score: {'fit_time': array([0.00587606, 0.02391195, 0.00587606, 0.01220298, 0.00756693]), 'score_time': array([0.00141788, 0.00460601, 0.00141287, 0.00345588, 0.00154209]), 'test_score': array([0.81966891, 0.83039239, 0.8595988 , 0.83949967, 0.84944641])}


In [102]:
from sklearn.model_selection import GridSearchCV
import numpy as np 

C_l2 = np.logspace(-3, 3, 7)                 
C_l1 = [1e-3, 1e-2, 1e-1, 1, 10, 100]        

param_grid = [
    {
        "penalty": ["l2"],
        "solver": ["lbfgs"],
        "C": C_l2,
        "class_weight": [None, "balanced"],
        "max_iter": [3000],
        "tol": [1e-4, 1e-3],
    },
    {
        "penalty": ["l1"],
        "solver": ["liblinear"],
        "C": C_l1,
        "class_weight": [None, "balanced"],
        "max_iter": [5000],
        "tol": [1e-3],   
    },
    {
        "penalty": ["elasticnet"],
        "solver": ["saga"],
        "l1_ratio": [0.1, 0.5, 0.9],
        "C": [0.1, 1, 10],
        "class_weight": [None, "balanced"],
        "max_iter": [5000],
        "tol": [1e-3],
    },
]

log_cv = GridSearchCV(LogisticRegression(), param_grid=param_grid, scoring="f1", cv=kf, refit=True)
log_cv.fit(X_train, y_train, groups=groups)

log_reg_best_model = log_cv.best_estimator_ 
print("Best f1:", log_cv.best_score_)
print("Best params:", log_cv.best_params_)

Best f1: 0.8072357762821755
Best params: {'C': 100, 'class_weight': None, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}


In [103]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred_2 = log_reg_best_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_2)}")
print(f"Precision: {precision_score(y_test, y_pred_2)}")
print(f"Recall: {recall_score(y_test, y_pred_2)}")
print(f"F1: {f1_score(y_test, y_pred_2)}")

print(classification_report(y_test, y_pred_2))
print(confusion_matrix(y_test, y_pred_2))

Accuracy: 0.6593264248704663
Precision: 0.5240690281562216
Recall: 0.9965457685664939
F1: 0.6869047619047619
              precision    recall  f1-score   support

           0       1.00      0.46      0.63       965
           1       0.52      1.00      0.69       579

    accuracy                           0.66      1544
   macro avg       0.76      0.73      0.66      1544
weighted avg       0.82      0.66      0.65      1544

[[441 524]
 [  2 577]]


In [105]:
from sklearn.ensemble import RandomForestClassifier

kf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier()
score = cross_validate(rf, X_train, y_train, cv=kf, scoring="f1", n_jobs=-1, groups=groups)
rf.fit(X_train, y_train)
y_pred_3 = rf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_3)}")
print(f"Precision: {precision_score(y_test, y_pred_3)}")
print(f"Recall: {recall_score(y_test, y_pred_3)}")
print(f"F1: {f1_score(y_test, y_pred_3)}")

Accuracy: 0.8290155440414507
Precision: 0.9112271540469974
Recall: 0.6027633851468048
F1: 0.7255717255717256


In [106]:
param_grid_rf = {
    "max_depth": [None, 10],              
    "min_samples_leaf": [1, 2, 4],         
    "max_features": ["sqrt", 0.5],       
    "class_weight": [None, "balanced_subsample"]
}

rf_cv = GridSearchCV(RandomForestClassifier(), param_grid=param_grid_rf, scoring="f1", cv=kf, refit=True)
rf_cv.fit(X_train, y_train, groups=groups)

rf_cv_best_model = rf_cv.best_estimator_ 
print("Best f1:", rf_cv.best_score_)
print("Best params:", rf_cv.best_params_)

Best f1: 0.8868726229402348
Best params: {'class_weight': None, 'max_depth': None, 'max_features': 0.5, 'min_samples_leaf': 1}


In [107]:
y_pred_4 = rf_cv_best_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_4)}")
print(f"Precision: {precision_score(y_test, y_pred_4)}")
print(f"Recall: {recall_score(y_test, y_pred_4)}")
print(f"F1: {f1_score(y_test, y_pred_4)}")

print(classification_report(y_test, y_pred_4))
print(confusion_matrix(y_test, y_pred_4))

Accuracy: 0.8335492227979274
Precision: 0.900497512437811
Recall: 0.6252158894645942
F1: 0.7380224260958206
              precision    recall  f1-score   support

           0       0.81      0.96      0.88       965
           1       0.90      0.63      0.74       579

    accuracy                           0.83      1544
   macro avg       0.86      0.79      0.81      1544
weighted avg       0.84      0.83      0.83      1544

[[925  40]
 [217 362]]
