In [1]:
# Update sklearn
#!pip install sklearn --upgrade
# Install joblib
#!pip install joblib

In [2]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [3]:
# Load in exoplanet data
planets_df = pd.read_csv("resources/exoplanet_data.csv")
planets_df

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
6988,CANDIDATE,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,...,-220,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


In [4]:
# Show all columns
planets_df.columns

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')

In [5]:
# Drop unneccessary columns
planets_df = planets_df[['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
        'koi_fpflag_ec', 'koi_period','koi_time0bk', 'koi_impact','koi_duration', 'koi_depth', 
        'koi_prad','koi_teq', 'koi_insol','koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 
        'koi_slogg', 'koi_srad', 'ra', 'dec', 'koi_kepmag']]


In [6]:
# Drop null columns & rows
planets_df = planets_df.dropna(axis='columns', how='all')
planets_df = planets_df.dropna(how="any")
planets_df

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,...,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,162.513840,0.586,4.50700,874.8,...,443,9.11,25.8,2,5455,4.467,0.927,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,175.850252,0.969,1.78220,10829.0,...,638,39.30,76.3,1,5853,4.544,0.868,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,...,1395,891.96,505.6,1,5805,4.564,0.791,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,171.595550,0.701,1.65450,603.3,...,1406,926.16,40.9,1,6031,4.438,1.046,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,172.979370,0.762,3.14020,686.0,...,1160,427.65,40.2,2,6046,4.486,0.972,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,0,0,0,1,8.589871,132.016100,0.765,4.80600,87.7,...,929,176.40,8.4,1,5638,4.296,1.088,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,131.705093,1.252,3.22210,1579.2,...,2088,4500.53,453.3,1,5638,4.529,0.903,297.18875,47.093819,14.082
6988,CANDIDATE,0,0,0,0,1.739849,133.001270,0.043,3.11400,48.5,...,1608,1585.81,10.6,1,6119,4.444,1.031,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0,0,1,0,0.681402,132.181750,0.147,0.86500,103.6,...,2218,5713.41,12.3,1,6173,4.447,1.041,294.16489,47.176281,15.385


## Select Features

In [7]:
X = planets_df.drop("koi_disposition", axis=1)
y = planets_df["koi_disposition"]
print(X.shape, y.shape)

(6991, 20) (6991,)


## Train, Test, Split

In [8]:
# Separate the data into training & testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [9]:
X_train

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
6122,0,0,0,0,6.768901,133.077240,0.150,3.61600,123.1,1.24,1017,253.30,10.8,1,5737,4.327,1.125,294.40472,39.351681,14.725
6370,0,1,0,1,0.733726,132.020050,0.291,2.30900,114.6,0.86,1867,2891.64,13.8,1,5855,4.578,0.797,284.50391,42.463860,15.770
2879,1,0,0,0,7.652707,134.460380,0.970,79.89690,641.1,3.21,989,226.81,254.3,1,6328,4.481,0.963,295.50211,38.983540,13.099
107,0,0,0,0,7.953547,174.662240,0.300,2.63120,875.4,2.25,696,55.37,38.4,1,4768,4.536,0.779,291.15878,40.750271,15.660
29,0,0,0,0,4.959319,172.258529,0.831,2.22739,9802.0,12.21,1103,349.40,696.5,1,5712,4.359,1.082,292.16705,48.727589,15.263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,0,1,0,0,8.268081,135.056330,1.099,3.47103,71479.0,50.31,863,130.83,1320.5,1,5749,4.502,0.922,292.53125,46.728699,15.768
5191,0,0,0,0,11.161938,133.553800,0.739,5.19500,124.4,1.97,1093,337.23,13.4,3,6200,4.072,1.640,295.21268,49.562180,13.374
5226,0,1,0,0,6.150251,134.422825,1.270,1.68923,2128.7,100.03,2251,6066.49,471.0,1,8914,3.896,2.867,297.18176,45.988441,10.622
5390,1,0,0,0,3.343285,134.845100,1.210,27.29000,166.8,64.00,2094,4535.97,79.5,2,6541,3.773,2.652,296.86258,41.147419,13.276


## Pre-Processing

In [10]:
# MinMaxScaler: scale the numerical data
# Define the scaler
X_scaler = MinMaxScaler().fit(X_train)

# Transform x train & test (with scaler)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Encode labels for KOI disposition
label_encoder = LabelEncoder()

# Train the encoder
label_encoder.fit(y_train)

encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [12]:
# One hot encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [13]:
# Feature selection & remove unnecessary features


### Canidate: 0
### Confirmed: 1
### False Positive: 2

In [14]:
# # Loop through to show original class & label
# for label, original_class in zip(encoded_y, y):
#     print('Original Class: ' + str(original_class))
#     print('Encoded Label: ' + str(label))
#     print('-' * 12)

## Train the Model

In [15]:
# Create model
model = Sequential()
# Input layer
model.add(Dense(100, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(100, activation='relu'))

In [16]:
# Find the shape of the y_train (categorical)
y_train_categorical.shape

(5243, 3)

In [17]:
# Output layer
model.add(Dense(y_train_categorical.shape[1], activation='softmax'))

In [18]:
# Print out the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               2100      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 12,503
Trainable params: 12,503
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [20]:
# Fit/Train the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Train on 5243 samples
Epoch 1/100
5243/5243 - 2s - loss: 0.5453 - accuracy: 0.7118
Epoch 2/100
5243/5243 - 1s - loss: 0.3809 - accuracy: 0.7887
Epoch 3/100
5243/5243 - 0s - loss: 0.3707 - accuracy: 0.7936
Epoch 4/100
5243/5243 - 1s - loss: 0.3648 - accuracy: 0.7992
Epoch 5/100
5243/5243 - 0s - loss: 0.3626 - accuracy: 0.8068
Epoch 6/100
5243/5243 - 1s - loss: 0.3589 - accuracy: 0.8066
Epoch 7/100
5243/5243 - 0s - loss: 0.3599 - accuracy: 0.8098
Epoch 8/100
5243/5243 - 0s - loss: 0.3563 - accuracy: 0.8104
Epoch 9/100
5243/5243 - 0s - loss: 0.3536 - accuracy: 0.8137
Epoch 10/100
5243/5243 - 0s - loss: 0.3499 - accuracy: 0.8165
Epoch 11/100
5243/5243 - 0s - loss: 0.3514 - accuracy: 0.8161
Epoch 12/100
5243/5243 - 0s - loss: 0.3460 - accuracy: 0.8177
Epoch 13/100
5243/5243 - 0s - loss: 0.3466 - accuracy: 0.8152
Epoch 14/100
5243/5243 - 0s - loss: 0.3454 - accuracy: 0.8203
Epoch 15/100
5243/5243 - 0s - loss: 0.3451 - accuracy: 0.8165
Epoch 16/100
5243/5243 - 1s - loss: 0.3417 - accuracy: 0.

<tensorflow.python.keras.callbacks.History at 0x7fef03166fd0>

In [21]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1748/1 - 0s - loss: 0.3259 - accuracy: 0.8730
Loss: 0.3190134546849493, Accuracy: 0.8729977011680603


In [22]:
# print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
# print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

## Hyperparameter Tuning

In [26]:
# Create the SVC Model
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model

SVC(kernel='linear')

In [27]:
# GridSearchCV: tune model parameters
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [28]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END .................C=1, gamma=0.0001;, score=0.819 total time=   0.5s
[CV 2/5] END .................C=1, gamma=0.0001;, score=0.813 total time=   0.3s
[CV 3/5] END .................C=1, gamma=0.0001;, score=0.815 total time=   0.4s
[CV 4/5] END .................C=1, gamma=0.0001;, score=0.797 total time=   0.4s
[CV 5/5] END .................C=1, gamma=0.0001;, score=0.811 total time=   0.3s
[CV 1/5] END .................C=1, gamma=0.0005;, score=0.819 total time=   0.3s
[CV 2/5] END .................C=1, gamma=0.0005;, score=0.813 total time=   0.3s
[CV 3/5] END .................C=1, gamma=0.0005;, score=0.815 total time=   0.3s
[CV 4/5] END .................C=1, gamma=0.0005;, score=0.797 total time=   0.3s
[CV 5/5] END .................C=1, gamma=0.0005;, score=0.811 total time=   0.2s
[CV 1/5] END ..................C=1, gamma=0.001;, score=0.819 total time=   0.3s
[CV 2/5] END ..................C=1, gamma=0.001;

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=3)

In [29]:
print(grid.best_params_)

{'C': 50, 'gamma': 0.0001}


In [30]:
print(grid.best_score_)

0.8199484787402034


In [31]:
predictions = grid.predict(X_test)

In [32]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["Canidate", "Confirmed", "False Positive"]))

                precision    recall  f1-score   support

      Canidate       0.15      0.18      0.17       411
     Confirmed       0.00      0.00      0.00       484
False Positive       0.43      0.63      0.51       853

      accuracy                           0.35      1748
     macro avg       0.19      0.27      0.23      1748
  weighted avg       0.25      0.35      0.29      1748



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Tune & compare at least 2 different classifiers


## Save the Model

In [None]:
import joblib
filename = 'julia_brunett.sav'
joblib.dump(model, filename)

In [None]:
# Save the model
model.save("exoplanets.h5")