In [1]:
# Update scikit-learn to prevent version mismatches
# (the PyPI package is named "scikit-learn"; "sklearn" is only the import name)
#%pip install --upgrade scikit-learn

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
#!pip install joblib

In [3]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import os

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load exoplanet_data.csv, then discard every column that is entirely null,
# followed by every row that still contains a missing value.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis="columns", how="all")
    .dropna()
)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [5]:
# Work from exoplanet_data_headers.csv, whose column names are easier to
# interpret (e.g. orbital_period_days) than the original koi_* headers.
# See the link below for a detailed description of each header:
# https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html
#
# Same cleaning as the raw table: drop all-null columns first, then drop any
# row that still has a null value.
df_cleaned_headers = (
    pd.read_csv("exoplanet_data_headers.csv")
    .dropna(axis="columns", how="all")
    .dropna()
)
df_cleaned_headers.head()

Unnamed: 0,koi_disposition,not_transit_like_flag,stellar_eclipse_flag,centroid_offset_flag,ephemeris_match_contamination_flag,orbital_period_days,koi_period_err1,koi_period_err2,transit_epoch,koi_time0bk_err1,...,koi_steff_err2,stellar_surface_gravity,koi_slogg_err1,koi_slogg_err2,stellar_radius,koi_srad_err1,koi_srad_err2,ra,dec,kepler_band_mag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [6]:
# Columns removed from the modelling table: the error-bound columns
# (*_err1 / *_err2) plus 'ra', 'dec' and 'kepler_band_mag'.
columns = ['koi_period_err1', 'koi_period_err2', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact_err1', 'koi_impact_err2', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth_err1', 'koi_depth_err2', 'koi_prad_err1', 'koi_prad_err2', 'koi_insol_err1', 'koi_insol_err2', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'kepler_band_mag']
# Rebind rather than mutate in place, and skip columns that are already gone,
# so re-running this cell on the already-trimmed frame does not raise KeyError.
df_cleaned_headers = df_cleaned_headers.drop(columns=columns, errors="ignore")
df_cleaned_headers.head()

Unnamed: 0,koi_disposition,not_transit_like_flag,stellar_eclipse_flag,centroid_offset_flag,ephemeris_match_contamination_flag,orbital_period_days,transit_epoch,impact_parameter,transit_duration_hours,transit_depth_ppm,planetary_radius,temperature_kelvin,insolation_flux,transit_signal_to_noise,tce_planet_number,stellar_effective_temperature_kelvin,stellar_surface_gravity,stellar_radius
0,CONFIRMED,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,9.11,25.8,2,5455,4.467,0.927
1,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,39.3,76.3,1,5853,4.544,0.868
2,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,505.6,1,5805,4.564,0.791
3,CONFIRMED,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,926.16,40.9,1,6031,4.438,1.046
4,CONFIRMED,0,0,0,0,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,427.65,40.2,2,6046,4.486,0.972


# Select your features (columns)

In [7]:
# Set features. This will also be used as your x values.
# The list keeps every remaining numeric column: the measured quantities
# first, then the four *_flag indicator columns.
feature_columns = [
    'orbital_period_days',
    'transit_epoch',
    'impact_parameter',
    'transit_duration_hours',
    'transit_depth_ppm',
    'planetary_radius',
    'temperature_kelvin',
    'insolation_flux',
    'transit_signal_to_noise',
    'tce_planet_number',
    'stellar_effective_temperature_kelvin',
    'stellar_surface_gravity',
    'stellar_radius',
    'not_transit_like_flag',
    'stellar_eclipse_flag',
    'centroid_offset_flag',
    'ephemeris_match_contamination_flag',
]
selected_features = df_cleaned_headers[feature_columns]
selected_features.head()

Unnamed: 0,orbital_period_days,transit_epoch,impact_parameter,transit_duration_hours,transit_depth_ppm,planetary_radius,temperature_kelvin,insolation_flux,transit_signal_to_noise,tce_planet_number,stellar_effective_temperature_kelvin,stellar_surface_gravity,stellar_radius,not_transit_like_flag,stellar_eclipse_flag,centroid_offset_flag,ephemeris_match_contamination_flag
0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,9.11,25.8,2,5455,4.467,0.927,0,0,0,0
1,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,39.3,76.3,1,5853,4.544,0.868,0,1,0,0
2,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,505.6,1,5805,4.564,0.791,0,1,0,0
3,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,926.16,40.9,1,6031,4.438,1.046,0,0,0,0
4,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,427.65,40.2,2,6046,4.486,0.972,0,0,0,0


In [8]:
# Show the dtype and non-null count of every selected feature column.
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6991 entries, 0 to 6990
Data columns (total 17 columns):
orbital_period_days                     6991 non-null float64
transit_epoch                           6991 non-null float64
impact_parameter                        6991 non-null float64
transit_duration_hours                  6991 non-null float64
transit_depth_ppm                       6991 non-null float64
planetary_radius                        6991 non-null float64
temperature_kelvin                      6991 non-null int64
insolation_flux                         6991 non-null float64
transit_signal_to_noise                 6991 non-null float64
tce_planet_number                       6991 non-null int64
stellar_effective_temperature_kelvin    6991 non-null int64
stellar_surface_gravity                 6991 non-null float64
stellar_radius                          6991 non-null float64
not_transit_like_flag                   6991 non-null int64
stellar_eclipse_flag             

In [9]:
# X holds the selected feature columns; y holds the "koi_disposition" label
# for the same rows. Print both shapes to confirm the row counts agree.
X, y = selected_features, df_cleaned_headers["koi_disposition"]
print(X.shape, y.shape)

(6991, 17) (6991,)


In [10]:
# One-hot encode a copy of the "koi_disposition" labels, producing one
# indicator column per class.
# NOTE: data_binary_encoded is only displayed here; no later cell visible in
# this notebook reads it (the split and the models use the string labels in y).
data = y.copy()
data_binary_encoded = pd.get_dummies(data)
data_binary_encoded.head()

Unnamed: 0,CANDIDATE,CONFIRMED,FALSE POSITIVE
0,0,1,0
1,0,0,1
2,0,0,1
3,0,1,0
4,0,1,0


# Create a Train Test Split

Use `koi_disposition` for the y values

In [11]:
from sklearn.model_selection import train_test_split

print(X.shape, y.shape)
feature_names = X.columns

# Stratify on y so each split keeps the same class proportions; the fixed
# random_state makes the split reproducible.
split_parts = train_test_split(X, y, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

X_train.head()

(6991, 17) (6991,)


Unnamed: 0,orbital_period_days,transit_epoch,impact_parameter,transit_duration_hours,transit_depth_ppm,planetary_radius,temperature_kelvin,insolation_flux,transit_signal_to_noise,tce_planet_number,stellar_effective_temperature_kelvin,stellar_surface_gravity,stellar_radius,not_transit_like_flag,stellar_eclipse_flag,centroid_offset_flag,ephemeris_match_contamination_flag
6080,12.496435,132.0358,1.17,84.32,271.7,55.34,1397,899.44,141.5,1,6821,3.805,2.73,1,0,0,0
3001,11.615625,131.96843,0.977,2.233,191.2,2.84,905,158.69,8.8,2,5332,4.083,1.453,0,0,0,0
570,10.980246,137.137607,0.733,3.74464,50078.0,21.94,821,107.47,1555.4,1,5952,4.462,0.897,0,1,0,0
4897,466.90824,136.3731,0.0868,2.64,660.0,2.19,210,0.46,5.4,1,5340,4.456,0.867,1,0,0,0
625,1.061933,133.850441,0.713,2.1429,133.6,2.29,2508,9391.15,80.2,1,6134,3.975,1.851,0,1,1,1


In [12]:
# Peek at the first training labels (string class names, indexed like X_train).
y_train.head()

6080    FALSE POSITIVE
3001         CANDIDATE
570     FALSE POSITIVE
4897    FALSE POSITIVE
625     FALSE POSITIVE
Name: koi_disposition, dtype: object

In [13]:
# Peek at the first held-out labels (string class names).
y_test.head()

2106         CANDIDATE
2841    FALSE POSITIVE
3377    FALSE POSITIVE
5092    FALSE POSITIVE
47           CANDIDATE
Name: koi_disposition, dtype: object

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [14]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training rows only and reuse that fitted scaler for
# the test rows, so both splits go through the same transformation.
# The labels in y are class names and are left unscaled.
X_minmax = MinMaxScaler()
X_train_minmax = X_minmax.fit_transform(X_train)
X_test_minmax = X_minmax.transform(X_test)

  return self.partial_fit(X, y)


In [15]:
# Support vector machine linear classifier
from sklearn.svm import SVC

# Baseline model: linear-kernel SVC with every other setting left at its
# default, trained on the min-max scaled training features.
model = SVC(kernel='linear').fit(X_train_minmax, y_train)
model

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [16]:
# Accuracy of the baseline model on the rows it was trained on.
train_accuracy = model.score(X_train_minmax, y_train)
print('Train Acc: %.5f' % train_accuracy)

Train Acc: 0.78734


In [17]:
# Model Accuracy
# Accuracy of the baseline model on the held-out (scaled) test rows.
test_accuracy = model.score(X_test_minmax, y_test)
print('Test Acc: %.5f' % test_accuracy)

Test Acc: 0.79291


In [18]:
# Display names handed to classification_report in a later cell.
# NOTE(review): the order here is alphabetical; presumably it must match the
# sorted order of the labels in y for the report rows to be labelled
# correctly - confirm before adding or renaming a class.
target_names = ["CANDIDATE", "CONFIRMED", "FALSE POSITIVE"]

# Train the Model



In [19]:
# Calculate classification report
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the baseline model on the test split.
predictions = model.predict(X_test_minmax)
baseline_report = classification_report(
    y_test,
    predictions,
    target_names=target_names,
)
print(baseline_report)

                precision    recall  f1-score   support

     CANDIDATE       0.89      0.19      0.31       422
     CONFIRMED       0.56      0.96      0.71       450
FALSE POSITIVE       0.99      1.00      0.99       876

     micro avg       0.79      0.79      0.79      1748
     macro avg       0.81      0.72      0.67      1748
  weighted avg       0.85      0.79      0.75      1748



# Hyperparameter Tuning

Use `RandomizedSearchCV` (a randomized alternative to `GridSearchCV`) to tune the model's parameters

In [20]:
# try a random search
from sklearn.model_selection import RandomizedSearchCV

In [21]:
# List the baseline model's current hyperparameters, i.e. the names that can
# be varied in the search space defined next.
model.get_params()

{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto_deprecated',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [22]:
# Candidate values for each SVC hyperparameter explored by the search.
C = [1.0, 2.0, 5.0, 10.0, 25.0, 50.0]
kernel = ["linear", "poly", "rbf", "sigmoid"]
degree = [1, 2, 3, 4, 5]
gamma = [0.0001, 0.001, 0.01, "scale", "auto"]
shrinking = [True, False]
probability = [True, False]
tol = [0.0001, 0.001, 0.01, 0.1]

# Search space keyed by SVC parameter name; passed to the search as
# `param_distributions`.
param_grid = {
    "C": C,
    "kernel": kernel,
    "degree": degree,
    "gamma": gamma,
    "shrinking": shrinking,
    "probability": probability,
    "tol": tol,
}

In [23]:
import time

# Randomized hyperparameter search over `param_grid` with 5-fold
# cross-validation, seeded for reproducibility and run on all cores.
# The variable name `random` is kept because later cells read
# random.best_params_ / random.best_score_ and call random.predict.
# NOTE(review): `iid` no longer exists in newer scikit-learn releases; drop the
# argument when upgrading - confirm against the installed version.
random = RandomizedSearchCV(
    model,
    param_distributions=param_grid,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    iid=False,
)

start_time = time.time()
random_result = random.fit(X_train_minmax, y_train)
# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
elapsed_seconds = time.time() - start_time
print("Execution time: " + str(elapsed_seconds) + ' sec')

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   18.4s finished


Best: 0.803737 using {'tol': 0.01, 'shrinking': False, 'probability': True, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 3, 'C': 25.0}
Execution time: 20.38546633720398 sec


In [24]:
# Best parameter combination found by the search and its mean CV score,
# one per line.
print(random.best_params_, random.best_score_, sep="\n")

{'tol': 0.01, 'shrinking': False, 'probability': True, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 3, 'C': 25.0}
0.8037372925141357


In [25]:
# Make predictions with the hypertuned model
# Calling predict on the fitted search object is what produces these labels
# for the scaled test rows.
predictionsX = random.predict(X_test_minmax)
predictionsX

array(['CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', ...,
       'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED'], dtype=object)

In [26]:
# Calculate classification report
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned model's predictions on the
# test split.
tuned_report = classification_report(
    y_test,
    predictionsX,
    target_names=["CANDIDATE", "CONFIRMED", "FALSE POSITIVE"],
)
print(tuned_report)

                precision    recall  f1-score   support

     CANDIDATE       0.76      0.43      0.55       422
     CONFIRMED       0.62      0.86      0.72       450
FALSE POSITIVE       0.99      1.00      0.99       876

     micro avg       0.82      0.82      0.82      1748
     macro avg       0.79      0.76      0.75      1748
  weighted avg       0.84      0.82      0.81      1748



# Save the Model

In [27]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'jason_klug_SVM.sav'
# Persist the tuned estimator selected by the randomized search, not the
# baseline linear `model`, which was trained before tuning.
# NOTE: the estimator was fitted on min-max scaled features; the fitted scaler
# (X_minmax) is not stored in this file, so anyone loading the model must
# apply the same scaling to new inputs before calling predict.
joblib.dump(random.best_estimator_, filename)

['jason_klug_SVM.sav']