In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("cumulative.csv")
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Create a Train Test Split

Use `koi_disposition` for the y values

In [3]:
target = df["koi_disposition"]
data = df.drop("koi_disposition", axis=1)

In [4]:
# Split data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [5]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
8017,0,1,1,0,0.806277,4.947e-06,-4.947e-06,131.78567,0.00672,-0.00672,...,-184.0,4.471,0.054,-0.229,0.996,0.324,-0.108,290.81723,38.53912,13.614
1233,0,1,1,0,3.582077,4.318e-06,-4.318e-06,355.515064,0.000864,-0.000864,...,-235.0,4.422,0.09,-0.195,0.993,0.283,-0.131,296.07822,43.13694,15.193
2592,0,0,0,0,5.060923,2.616e-05,-2.616e-05,134.47316,0.00473,-0.00473,...,-112.0,4.492,0.048,-0.112,0.911,0.121,-0.06,289.91742,40.828606,13.346
4770,0,1,0,1,8.480304,3.32e-07,-3.32e-07,135.854534,3.1e-05,-3.1e-05,...,-169.0,3.946,0.195,-0.105,2.21,0.375,-0.563,298.8002,46.665539,7.631
6632,0,0,0,1,4.994716,4.495e-05,-4.495e-05,136.1833,0.0095,-0.0095,...,-194.0,3.706,0.32,-0.08,2.83,0.458,-1.068,282.58215,46.81551,13.352


# Pre-processing

Scale the data using the MinMaxScaler

In [6]:
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)

In [7]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Logistic Regression Model

In [12]:
# Create logistic regression model
from sklearn.linear_model import LogisticRegression
model2 = LogisticRegression(multi_class='auto', solver='lbfgs', max_iter=500)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
#Fit data to the model
model2.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

Training Data Score: 0.8555962183592559
Testing Data Score: 0.8435498627630376


# Hyperparameter Tuning

Use `GridSearchCV` to tune the `C` and `gamma` parameters

In [25]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'max_iter': [500, 1000, 5000]}
grid = GridSearchCV(model2, param_grid, verbose=2)

In [26]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, max_iter=500 ...............................................
[CV] ................................ C=1, max_iter=500, total=   0.3s
[CV] C=1, max_iter=500 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ................................ C=1, max_iter=500, total=   0.3s
[CV] C=1, max_iter=500 ...............................................
[CV] ................................ C=1, max_iter=500, total=   0.3s
[CV] C=1, max_iter=1000 ..............................................
[CV] ............................... C=1, max_iter=1000, total=   0.3s
[CV] C=1, max_iter=1000 ..............................................
[CV] ............................... C=1, max_iter=1000, total=   0.3s
[CV] C=1, max_iter=1000 ..............................................
[CV] ............................... C=1, max_iter=1000, total=   0.3s
[CV] C=1, max_iter=5000 ..............................................
[CV] ............................... C=1, max_iter=5000, total=   0.3s
[CV] C=1, max_iter=5000 ..............................................
[CV] ............................... C=1, max_iter=5000, total=   0.3s
[CV] C=1, max_iter=5000 ..............................................
[CV] .



[CV] ................................ C=5, max_iter=500, total=   0.6s
[CV] C=5, max_iter=1000 ..............................................
[CV] ............................... C=5, max_iter=1000, total=   0.6s
[CV] C=5, max_iter=1000 ..............................................
[CV] ............................... C=5, max_iter=1000, total=   0.6s
[CV] C=5, max_iter=1000 ..............................................
[CV] ............................... C=5, max_iter=1000, total=   0.6s
[CV] C=5, max_iter=5000 ..............................................
[CV] ............................... C=5, max_iter=5000, total=   0.6s
[CV] C=5, max_iter=5000 ..............................................
[CV] ............................... C=5, max_iter=5000, total=   0.6s
[CV] C=5, max_iter=5000 ..............................................
[CV] ............................... C=5, max_iter=5000, total=   0.6s
[CV] C=10, max_iter=500 ..............................................




[CV] ............................... C=10, max_iter=500, total=   0.5s
[CV] C=10, max_iter=500 ..............................................




[CV] ............................... C=10, max_iter=500, total=   0.5s
[CV] C=10, max_iter=500 ..............................................




[CV] ............................... C=10, max_iter=500, total=   0.6s
[CV] C=10, max_iter=1000 .............................................
[CV] .............................. C=10, max_iter=1000, total=   0.8s
[CV] C=10, max_iter=1000 .............................................
[CV] .............................. C=10, max_iter=1000, total=   0.8s
[CV] C=10, max_iter=1000 .............................................
[CV] .............................. C=10, max_iter=1000, total=   0.8s
[CV] C=10, max_iter=5000 .............................................
[CV] .............................. C=10, max_iter=5000, total=   0.7s
[CV] C=10, max_iter=5000 .............................................
[CV] .............................. C=10, max_iter=5000, total=   0.7s
[CV] C=10, max_iter=5000 .............................................
[CV] .............................. C=10, max_iter=5000, total=   0.8s


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   14.1s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=500, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'max_iter': [500, 1000, 5000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [27]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'max_iter': 1000}
0.8702348276913693


In [28]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled)

In [30]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["confirmed", "notconfirmed", "candidate"]))

              precision    recall  f1-score   support

   confirmed       0.83      0.62      0.71       523
notconfirmed       0.74      0.87      0.80       594
   candidate       0.98      1.00      0.99      1069

    accuracy                           0.87      2186
   macro avg       0.85      0.83      0.83      2186
weighted avg       0.88      0.87      0.87      2186

