In [1]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
#!pip install joblib

In [3]:
import warnings
warnings.filterwarnings('always')
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [5]:
# rename header to make easier to interpret what each column means
# see link below for detailed description for each header
# https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html
df_cleaned_headers = pd.read_csv("exoplanet_data_headers.csv")
# Drop the null columns where all values are null
df_cleaned_headers = df_cleaned_headers.dropna(axis='columns', how='all')
# Drop the null rows
df_cleaned_headers = df_cleaned_headers.dropna()
df_cleaned_headers.head()

Unnamed: 0,koi_disposition,not_transit_like_flag,stellar_eclipse_flag,centroid_offset_flag,ephemeris_match_contamination_flag,orbital_period_days,koi_period_err1,koi_period_err2,transit_epoch,koi_time0bk_err1,...,koi_steff_err2,stellar_surface_gravity,koi_slogg_err1,koi_slogg_err2,stellar_radius,koi_srad_err1,koi_srad_err2,ra,dec,kepler_band_mag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [6]:
columns = ['koi_period_err1', 'koi_period_err2', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact_err1', 'koi_impact_err2', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth_err1', 'koi_depth_err2', 'koi_prad_err1', 'koi_prad_err2', 'koi_insol_err1', 'koi_insol_err2', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'kepler_band_mag']
df_cleaned_headers.drop(columns, inplace=True, axis=1)
df_cleaned_headers.head()

Unnamed: 0,koi_disposition,not_transit_like_flag,stellar_eclipse_flag,centroid_offset_flag,ephemeris_match_contamination_flag,orbital_period_days,transit_epoch,impact_parameter,transit_duration_hours,transit_depth_ppm,planetary_radius,temperature_kelvin,insolation_flux,transit_signal_to_noise,tce_planet_number,stellar_effective_temperature_kelvin,stellar_surface_gravity,stellar_radius
0,CONFIRMED,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,9.11,25.8,2,5455,4.467,0.927
1,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,39.3,76.3,1,5853,4.544,0.868
2,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,505.6,1,5805,4.564,0.791
3,CONFIRMED,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,926.16,40.9,1,6031,4.438,1.046
4,CONFIRMED,0,0,0,0,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,427.65,40.2,2,6046,4.486,0.972


# Select your features (columns)

In [7]:
# Set features. This will also be used as your x values.
selected_features = df_cleaned_headers[['orbital_period_days', 'transit_epoch', 'impact_parameter', 'transit_duration_hours', 'transit_depth_ppm', 'planetary_radius', 'temperature_kelvin', 'insolation_flux', 'transit_signal_to_noise', 'tce_planet_number', 'stellar_effective_temperature_kelvin', 'stellar_surface_gravity', 'stellar_radius', 'not_transit_like_flag', 'stellar_eclipse_flag', 'centroid_offset_flag', 'ephemeris_match_contamination_flag']]
selected_features.head()

Unnamed: 0,orbital_period_days,transit_epoch,impact_parameter,transit_duration_hours,transit_depth_ppm,planetary_radius,temperature_kelvin,insolation_flux,transit_signal_to_noise,tce_planet_number,stellar_effective_temperature_kelvin,stellar_surface_gravity,stellar_radius,not_transit_like_flag,stellar_eclipse_flag,centroid_offset_flag,ephemeris_match_contamination_flag
0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,9.11,25.8,2,5455,4.467,0.927,0,0,0,0
1,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,39.3,76.3,1,5853,4.544,0.868,0,1,0,0
2,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,505.6,1,5805,4.564,0.791,0,1,0,0
3,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,926.16,40.9,1,6031,4.438,1.046,0,0,0,0
4,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,427.65,40.2,2,6046,4.486,0.972,0,0,0,0


In [8]:
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6991 entries, 0 to 6990
Data columns (total 17 columns):
orbital_period_days                     6991 non-null float64
transit_epoch                           6991 non-null float64
impact_parameter                        6991 non-null float64
transit_duration_hours                  6991 non-null float64
transit_depth_ppm                       6991 non-null float64
planetary_radius                        6991 non-null float64
temperature_kelvin                      6991 non-null int64
insolation_flux                         6991 non-null float64
transit_signal_to_noise                 6991 non-null float64
tce_planet_number                       6991 non-null int64
stellar_effective_temperature_kelvin    6991 non-null int64
stellar_surface_gravity                 6991 non-null float64
stellar_radius                          6991 non-null float64
not_transit_like_flag                   6991 non-null int64
stellar_eclipse_flag             

In [9]:
X = selected_features
y = df_cleaned_headers["koi_disposition"]
print(X.shape, y.shape)

(6991, 17) (6991,)


In [10]:
dataX = X.copy()
dataX_binary_encoded = pd.get_dummies(dataX)
dataX_binary_encoded.head()

Unnamed: 0,orbital_period_days,transit_epoch,impact_parameter,transit_duration_hours,transit_depth_ppm,planetary_radius,temperature_kelvin,insolation_flux,transit_signal_to_noise,tce_planet_number,stellar_effective_temperature_kelvin,stellar_surface_gravity,stellar_radius,not_transit_like_flag,stellar_eclipse_flag,centroid_offset_flag,ephemeris_match_contamination_flag
0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,9.11,25.8,2,5455,4.467,0.927,0,0,0,0
1,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,39.3,76.3,1,5853,4.544,0.868,0,1,0,0
2,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,505.6,1,5805,4.564,0.791,0,1,0,0
3,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,926.16,40.9,1,6031,4.438,1.046,0,0,0,0
4,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,427.65,40.2,2,6046,4.486,0.972,0,0,0,0


In [11]:
#preprocess "koi_disposition" use one hot to make numeric
data = y.copy()
data_binary_encoded = pd.get_dummies(data)
data_binary_encoded.head()

Unnamed: 0,CANDIDATE,CONFIRMED,FALSE POSITIVE
0,0,1,0
1,0,0,1
2,0,0,1
3,0,1,0
4,0,1,0


# Create a Train Test Split

Use `koi_disposition` for the y values

In [12]:
from sklearn.model_selection import train_test_split

y = pd.get_dummies(y)
print(X.shape, y.shape)
feature_names = X.columns
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

X_train.head()

(6991, 17) (6991, 3)


Unnamed: 0,orbital_period_days,transit_epoch,impact_parameter,transit_duration_hours,transit_depth_ppm,planetary_radius,temperature_kelvin,insolation_flux,transit_signal_to_noise,tce_planet_number,stellar_effective_temperature_kelvin,stellar_surface_gravity,stellar_radius,not_transit_like_flag,stellar_eclipse_flag,centroid_offset_flag,ephemeris_match_contamination_flag
5042,1.444486,132.31096,0.305,1.437,980.7,1.73,1051,290.01,12.0,1,4569,4.686,0.559,1,0,0,0
2838,14.265155,143.22024,0.353,3.601,66.3,1.06,853,125.45,13.9,1,5825,4.237,1.291,0,0,0,0
2347,1.746101,131.7921,0.032,1.299,54.2,1.24,1689,1924.75,9.2,2,4854,3.939,1.749,0,0,0,0
6834,1.476231,131.603,0.375,2.032,39.1,0.71,1802,2484.49,9.7,1,6325,4.409,1.112,0,1,0,0
4115,41.166761,158.917777,1.093,2.2133,5386.3,17.82,446,9.35,185.2,1,5193,4.503,0.805,0,1,0,0


In [13]:
y_train.head()

Unnamed: 0,CANDIDATE,CONFIRMED,FALSE POSITIVE
5042,0,0,1
2838,1,0,0
2347,1,0,0
6834,0,0,1
4115,0,0,1


In [14]:
y_test.head()

Unnamed: 0,CANDIDATE,CONFIRMED,FALSE POSITIVE
3737,0,0,1
5464,1,0,0
5031,0,0,1
5240,0,0,1
3509,0,1,0


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [15]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler
X_minmax = MinMaxScaler().fit(X_train)
y_minmax = MinMaxScaler().fit(y_train)

X_train_minmax = X_minmax.transform(X_train)
X_test_minmax = X_minmax.transform(X_test)
y_train_minmax = y_minmax.transform(y_train)
y_test_minmax = y_minmax.transform(y_test)

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [16]:
from sklearn import tree
# decision tree model
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_minmax, y_train_minmax)
clf.score(X_test_minmax, y_test_minmax)

0.8518306636155606

In [17]:
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [18]:
sorted(zip(clf.feature_importances_, feature_names), reverse=True)

[(0.1850671657800496, 'not_transit_like_flag'),
 (0.18451426519832592, 'centroid_offset_flag'),
 (0.1777361275931974, 'stellar_eclipse_flag'),
 (0.1540030292664817, 'transit_signal_to_noise'),
 (0.04854855579725562, 'impact_parameter'),
 (0.04350718714026859, 'orbital_period_days'),
 (0.033529508944781634, 'ephemeris_match_contamination_flag'),
 (0.02970795846452422, 'transit_duration_hours'),
 (0.029125605044770363, 'planetary_radius'),
 (0.01987632981308068, 'transit_epoch'),
 (0.018912733813228957, 'stellar_effective_temperature_kelvin'),
 (0.016108731996336757, 'transit_depth_ppm'),
 (0.015375917156604605, 'stellar_radius'),
 (0.014331116330143912, 'temperature_kelvin'),
 (0.012337606618429295, 'stellar_surface_gravity'),
 (0.009350986992751407, 'tce_planet_number'),
 (0.00796717404976924, 'insolation_flux')]

# Train the Model



In [19]:
# train the model
print(f"Training Data Score: {clf.score(X_train_minmax, y_train_minmax)}")
print(f"Testing Data Score: {clf.score(X_test_minmax, y_test_minmax)}")

Training Data Score: 1.0
Testing Data Score: 0.8518306636155606


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [20]:
# see what parameters are available to hypertune
clf.get_params()

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': None,
 'splitter': 'best'}

In [21]:
# try a random search
from sklearn.model_selection import RandomizedSearchCV

In [22]:
criterion = ["gini", "entropy"]
max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]
max_features = ['auto', 'sqrt']
min_samples_leaf = [1, 2, 4]
min_samples_split = [2, 5, 10]
splitter = ["best", "random"]
param_grid = dict(criterion=criterion, max_depth=max_depth, max_features=max_features, min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split, splitter=splitter)

In [23]:
import time
random = RandomizedSearchCV(clf, param_distributions=param_grid, cv = 5, verbose=2, random_state=42, n_jobs=-1)  

start_time = time.time()
random_result = random.fit(X_train_minmax, y_train_minmax)
# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Best: 0.831394 using {'splitter': 'best', 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 90, 'criterion': 'entropy'}
Execution time: 2.367894411087036 sec


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  35 out of  50 | elapsed:    2.2s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    2.2s finished


In [24]:
print(random.best_params_)
print(random.best_score_)

{'splitter': 'best', 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 90, 'criterion': 'entropy'}
0.8313942399389662


In [25]:
# Make predictions with the hypertuned model
predictionsX = random.predict(X_test_minmax)
predictionsX

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [26]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test_minmax, predictionsX,
                            target_names=["CANDIDATE", "CONFIRMED", "FALSE POSITIVE"]))

                precision    recall  f1-score   support

     CANDIDATE       0.75      0.71      0.73       422
     CONFIRMED       0.75      0.75      0.75       450
FALSE POSITIVE       0.99      0.97      0.98       876

     micro avg       0.87      0.85      0.86      1748
     macro avg       0.83      0.81      0.82      1748
  weighted avg       0.87      0.85      0.86      1748
   samples avg       0.85      0.85      0.85      1748



  'precision', 'predicted', average, warn_for)


In [27]:
from sklearn.metrics import f1_score
print(f1_score(y_test_minmax, predictionsX, average='weighted', labels=np.unique(predictionsX)))

0.7380721444811349


# Save the Model

In [28]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'jason_klug_decision_tree.sav'
joblib.dump(clf, filename)

['jason_klug_decision_tree.sav']