In [16]:
import pandas as pd
import warnings
warnings.simplefilter('ignore')
import matplotlib.pyplot as plt
import numpy as np

# Read the CSV and Perform Basic Data Cleaning

In [17]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [18]:
df_xo = df.drop (columns = ['koi_period_err1', 'koi_period_err2', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_time0bk_err2', 'koi_impact_err2', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth_err1', 'koi_depth_err1', 'koi_depth_err2', 'koi_prad_err1', 'koi_prad_err2', 'koi_insol_err1', 'koi_insol_err2', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad_err1', 'koi_srad_err2'])
df_xo.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_impact_err1,koi_duration,...,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,162.51384,0.586,0.059,4.507,...,443,9.11,25.8,2,5455,4.467,0.927,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,0.969,5.126,1.7822,...,638,39.3,76.3,1,5853,4.544,0.868,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,0.115,2.40641,...,1395,891.96,505.6,1,5805,4.564,0.791,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,171.59555,0.701,0.235,1.6545,...,1406,926.16,40.9,1,6031,4.438,1.046,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,172.97937,0.762,0.139,3.1402,...,1160,427.65,40.2,2,6046,4.486,0.972,296.28613,48.22467,15.714


# Select your features (columns)

In [19]:
# Set features. This will also be used as your x values.
selected_features = df_xo[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_time0bk']]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [20]:
X = selected_features
y = df_xo["koi_disposition"]
print(X.shape, y.shape)

(6991, 6) (6991,)


In [21]:
from sklearn.model_selection import train_test_split

X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y)

In [22]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk
6080,1,0,0,0,12.496435,132.0358
3001,0,0,0,0,11.615625,131.96843
570,0,1,0,0,10.980246,137.137607
4897,1,0,0,0,466.90824,136.3731
625,0,1,1,1,1.061933,133.850441


In [23]:
y_train.head()

6080    FALSE POSITIVE
3001         CANDIDATE
570     FALSE POSITIVE
4897    FALSE POSITIVE
625     FALSE POSITIVE
Name: koi_disposition, dtype: object

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [24]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [25]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', max_iter=10000, multi_class='auto')

model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

# Train the Model



In [26]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.737745565515926
Testing Data Score: 0.7408466819221968


In [28]:
predictions = model.predict(X_test_scaled)
model_prediction = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
model_prediction.head()

Unnamed: 0,Prediction,Actual
0,CANDIDATE,CANDIDATE
1,FALSE POSITIVE,FALSE POSITIVE
2,FALSE POSITIVE,FALSE POSITIVE
3,FALSE POSITIVE,FALSE POSITIVE
4,CANDIDATE,CANDIDATE


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [30]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10]}

grid = GridSearchCV(model, param_grid, verbose=3)

In [31]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] C=1 .............................................................
[CV] .................... C=1, score=0.7934782608695652, total=   0.2s
[CV] C=1 .............................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] .................... C=1, score=0.7797482837528604, total=   0.2s
[CV] C=1 .............................................................
[CV] .................... C=1, score=0.7847738981110475, total=   0.2s
[CV] C=5 .............................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] .................... C=5, score=0.7934782608695652, total=   0.2s
[CV] C=5 .............................................................
[CV] .................... C=5, score=0.7797482837528604, total=   0.1s
[CV] C=5 .............................................................
[CV] .................... C=5, score=0.7847738981110475, total=   0.3s
[CV] C=10 ............................................................
[CV] ................... C=10, score=0.7934782608695652, total=   0.2s
[CV] C=10 ............................................................
[CV] ................... C=10, score=0.7797482837528604, total=   0.2s
[CV] C=10 ............................................................
[CV] ................... C=10, score=0.7853463079564968, total=   0.3s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.9s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10]}, pre_dispatch='2*n_jobs', refit=True,
       return_train_score='warn', scoring=None, verbose=3)

In [33]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10}
0.7861911119588022


In [34]:
predictions = grid.predict(X_test)

In [35]:
    from sklearn.metrics import classification_report
    print(classification_report(y_test, predictions,
                                target_names=["blue", "red", "green"]))

              precision    recall  f1-score   support

        blue       0.66      0.29      0.40       422
         red       0.56      0.84      0.67       450
       green       0.99      1.00      0.99       876

   micro avg       0.79      0.79      0.79      1748
   macro avg       0.74      0.71      0.69      1748
weighted avg       0.80      0.79      0.77      1748



# Save the Model

In [37]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'klif_borja_LR.sav'
joblib.dump(model, filename)

['klif_borja_LR.sav']