In [1]:
# Update scikit-learn (the PyPI package is "scikit-learn"; "sklearn" is only the import name)
#!pip install scikit-learn --upgrade
# Install joblib
#!pip install joblib

In [2]:
# Import dependencies
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [3]:
# Load in exoplanet data: read the project's CSV file into a DataFrame
planets_df = pd.read_csv("resources/exoplanet_data.csv")
# Display the loaded frame as the cell output
planets_df

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
6988,CANDIDATE,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,...,-220,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


In [4]:
# Show all columns: list every column label in the loaded frame
planets_df.columns

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')

In [5]:
# # Drop unnecessary columns
# planets_df = planets_df[['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
#         'koi_fpflag_ec', 'koi_period','koi_time0bk', 'koi_impact','koi_duration', 'koi_depth', 
#         'koi_prad','koi_teq', 'koi_insol','koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 
#         'koi_slogg', 'koi_srad', 'ra', 'dec', 'koi_kepmag']]


In [6]:
# Remove columns that are entirely null, then every row that still holds a null value
planets_df = (
    planets_df
    .dropna(axis='columns', how='all')
    .dropna(how="any")
)
planets_df

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
6988,CANDIDATE,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,...,-220,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


## Select Features

In [7]:
# Target: the disposition label. Features: every other column of the cleaned frame.
y = planets_df["koi_disposition"]
X = planets_df.drop(columns="koi_disposition")
# X = planets_df[['koi_fpflag_co', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_model_snr', 'koi_prad', 'koi_fpflag_ec']]
print(X.shape, y.shape)

(6991, 40) (6991,)


## Train, Test, Split

In [8]:
# Separate the data into training & testing data.
# random_state is fixed so the same rows land in each split on every run; no test_size
# is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [9]:
# Show rows for training: display the feature rows assigned to the training split
X_train

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
6122,0,0,0,0,6.768901,7.380000e-05,-7.380000e-05,133.077240,0.008440,-0.008440,...,-171,4.327,0.153,-0.187,1.125,0.310,-0.207,294.40472,39.351681,14.725
6370,0,1,0,1,0.733726,6.060000e-06,-6.060000e-06,132.020050,0.007950,-0.007950,...,-175,4.578,0.033,-0.187,0.797,0.211,-0.056,284.50391,42.463860,15.770
2879,1,0,0,0,7.652707,6.540000e-05,-6.540000e-05,134.460380,0.006190,-0.006190,...,-189,4.481,0.050,-0.200,0.963,0.290,-0.097,295.50211,38.983540,13.099
107,0,0,0,0,7.953547,1.910000e-05,-1.910000e-05,174.662240,0.001820,-0.001820,...,-85,4.536,0.056,-0.016,0.779,0.023,-0.049,291.15878,40.750271,15.660
29,0,0,0,0,4.959319,5.150000e-07,-5.150000e-07,172.258529,0.000083,-0.000083,...,-77,4.359,0.110,-0.110,1.082,0.173,-0.130,292.16705,48.727589,15.263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,0,1,0,0,8.268081,6.340000e-07,-6.340000e-07,135.056330,0.000064,-0.000064,...,-190,4.502,0.050,-0.200,0.922,0.273,-0.091,292.53125,46.728699,15.768
5191,0,0,0,0,11.161938,1.677000e-04,-1.677000e-04,133.553800,0.013000,-0.013000,...,-124,4.072,0.188,-0.101,1.640,0.281,-0.343,295.21268,49.562180,13.374
5226,0,1,0,0,6.150251,7.000000e-07,-7.000000e-07,134.422825,0.000088,-0.000088,...,-458,3.896,0.270,-0.180,2.867,0.988,-1.087,297.18176,45.988441,10.622
5390,1,0,0,0,3.343285,4.380000e-05,-4.380000e-05,134.845100,0.011200,-0.011200,...,-197,3.773,0.293,-0.098,2.652,0.433,-0.939,296.86258,41.147419,13.276


## Pre-Processing

### Min Max Scaler (x)

In [10]:
# MinMaxScaler: scale the numerical data
# The scaler is fitted on the training split only
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)

# Apply the same fitted scaler to the train & test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

### One Hot Encoding (y)

In [11]:
# Encode labels for y: "KOI disposition"
# Create the encoder and train it on the training labels in one step
label_encoder = LabelEncoder().fit(y_train)

# Convert the string labels of both splits to integer codes
encoded_y_test = label_encoder.transform(y_test)
encoded_y_train = label_encoder.transform(y_train)

In [12]:
# One hot encoding
# Pin the number of classes to what the fitted encoder knows, so the train and test
# matrices always have the same width even if a split lacks the highest-numbered class.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=num_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=num_classes)

In [14]:
# See the categories: display the one-hot encoded training labels (one row per sample)
y_train_categorical

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [None]:
# Feature selection & remove unnecessary features


### Decision Tree:

In [15]:
from sklearn import tree
# Decision Tree Classifier
# random_state is fixed (same seed as the train/test split) so the fitted tree and the
# scores printed below are reproducible from run to run.
clf = tree.DecisionTreeClassifier(random_state=42)
# Fit the model on the scaled training features and the string labels
clf = clf.fit(X_train_scaled, y_train)
# Score the model (mean accuracy) on both splits
print("Training Score: ", clf.score(X_train_scaled, y_train))
print("Testing Score: ", clf.score(X_test_scaled, y_test))

Training Score:  1.0
Testing Score:  0.8518306636155606


### Random Forest:

In [20]:
from sklearn.ensemble import RandomForestClassifier
# Random Forest Classifier with 500 trees
# random_state is fixed (same seed as the train/test split) so the forest, its scores and
# the feature importances read from it later are reproducible from run to run.
rf = RandomForestClassifier(n_estimators=500, random_state=42)
# Fit the model on the scaled training features and the string labels
rf = rf.fit(X_train_scaled, y_train)
# Score the model (mean accuracy) on both splits
print("Training Score: ", rf.score(X_train_scaled, y_train))
print("Testing Score: ", rf.score(X_test_scaled, y_test))

Training Score:  1.0
Testing Score:  0.9016018306636155


In [21]:
# Pair each random-forest feature importance with its column name and list the pairs
# from the largest importance to the smallest
feature_names = X.columns
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.1089034558042913, 'koi_fpflag_co'),
 (0.10175071189277543, 'koi_fpflag_nt'),
 (0.06960145088905746, 'koi_fpflag_ss'),
 (0.05487657639410657, 'koi_model_snr'),
 (0.047724627471074546, 'koi_prad'),
 (0.03603193663812355, 'koi_fpflag_ec'),
 (0.03385302423004972, 'koi_duration_err2'),
 (0.030377988326039898, 'koi_duration_err1'),
 (0.03034350051427421, 'koi_steff_err1'),
 (0.030130336943208876, 'koi_prad_err1'),
 (0.029198540921958515, 'koi_prad_err2'),
 (0.02575906612772451, 'koi_steff_err2'),
 (0.025583394848260783, 'koi_time0bk_err2'),
 (0.023984878505689103, 'koi_depth'),
 (0.02363750002527533, 'koi_time0bk_err1'),
 (0.021856810605963, 'koi_duration'),
 (0.021844285480678682, 'koi_period'),
 (0.019029881497374394, 'koi_insol_err1'),
 (0.018921687536233814, 'koi_impact'),
 (0.018669264826748275, 'koi_period_err1'),
 (0.017867905139720818, 'koi_period_err2'),
 (0.016046700775154122, 'koi_teq'),
 (0.014570134497740146, 'koi_depth_err2'),
 (0.014312569107146482, 'koi_insol'),
 (0.01341

## Additional Models:

In [None]:
# LASSO model
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Fit the regressor against the one-hot encoded labels
lasso = Lasso(alpha=.01)
lasso.fit(X_train_scaled, y_train_categorical)

# Evaluate on the test split: R2 from the estimator, MSE from its predictions
r2 = lasso.score(X_test_scaled, y_test_categorical)
predictions = lasso.predict(X_test_scaled)
MSE = mean_squared_error(y_test_categorical, predictions)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
# ElasticNet model
from sklearn.linear_model import ElasticNet

# Fit the regressor against the one-hot encoded labels
elasticnet = ElasticNet(alpha=.01)
elasticnet.fit(X_train_scaled, y_train_categorical)

# Evaluate on the test split: R2 from the estimator, MSE from its predictions
r2 = elasticnet.score(X_test_scaled, y_test_categorical)
predictions = elasticnet.predict(X_test_scaled)
MSE = mean_squared_error(y_test_categorical, predictions)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
# Ridge model
from sklearn.linear_model import Ridge

# Fit the regressor against the one-hot encoded labels
ridge = Ridge(alpha=.01)
ridge.fit(X_train_scaled, y_train_categorical)

# Evaluate on the test split: R2 from the estimator, MSE from its predictions
r2 = ridge.score(X_test_scaled, y_test_categorical)
predictions = ridge.predict(X_test_scaled)
MSE = mean_squared_error(y_test_categorical, predictions)

print(f"MSE: {MSE}, R2: {r2}")

### K Nearest Neighbors:

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Loop through different k values to see which has the highest accuracy
# Note: We only use odd numbers because we don't want any ties
k_values = range(1, 20, 2)  # defined once so the loop and both plot calls share the same x values
train_scores = []
test_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so label each line and keep the y-axis title neutral
plt.plot(k_values, train_scores, marker='o', label="Training")
plt.plot(k_values, test_scores, marker="x", label="Testing")
plt.xlabel("K Neighbors")
plt.ylabel("Accuracy Score")
plt.legend()
plt.show()

### Grid Search:

In [None]:
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [None]:
# Use grid search to find best k value
# Candidate neighbour counts are the odd numbers from 1 to 19
knn = KNeighborsClassifier()
parameters = {'n_neighbors': range(1, 20, 2)}
clf = GridSearchCV(estimator=knn, param_grid=parameters, verbose=3)

In [None]:
# Fit the model: run the grid search on the scaled training features and string labels
clf.fit(X_train_scaled, y_train)

In [None]:
# Find best parameters & scores: report the winning n_neighbors value and the
# best score recorded by the grid search
print(clf.best_params_)
print("Best Score: ", clf.best_score_)

In [None]:
# Make predictions based on test data
# The grid search was fitted on MinMax-scaled features, so the test features must go
# through the same scaler; raw X_test would put every column on a different scale.
predictions = clf.predict(X_test_scaled)

In [None]:
# Print out classification report for the predictions made in the previous cell
from sklearn.metrics import classification_report

# Display names for the three dispositions; presumably listed in the sorted order of the
# raw labels (CANDIDATE, CONFIRMED, FALSE POSITIVE) — verify against label_encoder.classes_
target_names = ["Candidate", "Confirmed", "False Positive"]
print(classification_report(y_test, predictions, target_names=target_names, zero_division=1))

In [None]:
# Find accuracy for classifier: train a 15-neighbour KNN and report its test accuracy
knn = KNeighborsClassifier(n_neighbors=15).fit(X_train_scaled, y_train)
knn_test_accuracy = knn.score(X_test_scaled, y_test)
print('k=15 Test Acc: %.3f' % knn_test_accuracy)

### Candidate: 0
### Confirmed: 1
### False Positive: 2

In [None]:
# # Loop through to show original class & label
# for label, original_class in zip(encoded_y, y):
#     print('Original Class: ' + str(original_class))
#     print('Encoded Label: ' + str(label))
#     print('-' * 12)

## Train the Model

### Logistic Regression Model:

In [None]:
from sklearn.linear_model import LogisticRegression
# Create a logistic regression classifier with default settings and display it
classifier = LogisticRegression()
classifier

In [None]:
# Train the logistic regression on the scaled training features and string labels
classifier.fit(X_train_scaled, y_train)

In [None]:
# Report the classifier's score on the training split and on the held-out test split
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

In [None]:
# Predict the test split and show predicted vs. actual labels side by side
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

### Deep Learning Model:

In [None]:
# Create model
model = Sequential()
# First hidden layer: 100 ReLU units; input_dim is set to the number of feature columns in X_train
model.add(Dense(100, activation='relu', input_dim=X_train.shape[1]))
# Second hidden layer: another 100 ReLU units
model.add(Dense(100, activation='relu'))

In [None]:
# Find the shape of the y_train (categorical); its second dimension is used as the
# size of the output layer
y_train_categorical.shape

In [None]:
# Output layer: one softmax unit per column of the one-hot encoded labels
model.add(Dense(y_train_categorical.shape[1], activation='softmax'))

In [None]:
# Print out the model summary for the network assembled in the cells above
model.summary()

In [None]:
# Compile the model: Adam optimizer, categorical cross-entropy loss (the targets used
# for training are one-hot encoded), and accuracy tracked as the metric
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Fit/Train the model on the scaled training features and one-hot labels
# for 100 epochs, shuffling the training data
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

In [None]:
# Evaluate the trained network on the test split; the result is unpacked as the loss
# followed by the accuracy metric configured at compile time
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# A Keras Sequential model has no scikit-learn style `.score` method, and it was trained
# on one-hot targets, so accuracy is obtained from `evaluate` with the categorical labels.
train_loss, train_accuracy = model.evaluate(
    X_train_scaled, y_train_categorical, verbose=0)
test_loss, test_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=0)
print(f"Training Data Score: {train_accuracy}")
print(f"Testing Data Score: {test_accuracy}")

## Hyperparameter Tuning

In [None]:
# Create the SVC Model
# Bound to its own name so the Keras network held in `model` is not overwritten;
# a later cell still calls the Keras-only `model.save(...)`.
from sklearn.svm import SVC
svc_model = SVC(kernel='linear')
svc_model

In [None]:
# GridSearchCV: tune model parameters
# Searches the odd neighbour counts from 1 to 19 for a K-nearest-neighbours classifier
from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier()
parameters = {'n_neighbors': range(1, 20, 2)}
grid = GridSearchCV(estimator=knn, param_grid=parameters, verbose=3)

In [None]:
# Train the model with GridSearch on the scaled training features and string labels
grid.fit(X_train_scaled, y_train)

In [None]:
# Show the parameter combination the grid search selected
print(grid.best_params_)

In [None]:
# Show the best score recorded by the grid search
print(grid.best_score_)

In [None]:
# Predict with the tuned model. It was fitted on MinMax-scaled features, so the test
# features must be the scaled ones as well (raw X_test is on a different scale).
predictions = grid.predict(X_test_scaled)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 for the tuned model's test predictions.
# Label spelling matches the report printed for the earlier grid search ("Candidate").
print(classification_report(y_test, predictions,
                            target_names=["Candidate", "Confirmed", "False Positive"]))

In [None]:
# Tune & compare at least 2 different classifiers


## Save the Model

In [None]:
import joblib
# Persist the tuned scikit-learn estimator: `grid.best_estimator_` is the model that
# GridSearchCV refit with the best parameters it found. The Keras network is saved
# separately through its own `save` method.
filename = 'models/julia_brunett.sav'
joblib.dump(grid.best_estimator_, filename)

In [None]:
# Save the model to an HDF5 file
# NOTE(review): `.save(...)` with an .h5 path is the Keras model API, so `model` must still
# be bound to the Sequential network here — confirm no earlier cell has rebound `model`.
model.save("models/exoplanets.h5")