In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [3]:
df.columns

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')

In [4]:
# Set features. This will also be used as your x values.

selected_features = df[[
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
    'koi_period',
    'koi_impact',
    'koi_duration',
    'koi_prad',
    'koi_model_snr',
    'ra',
    'koi_kepmag'
]]
selected_features.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_impact,koi_duration,koi_prad,koi_model_snr,ra,koi_kepmag
0,0,0,0,0,54.418383,0.586,4.507,2.83,25.8,291.93423,15.347
1,0,1,0,0,19.89914,0.969,1.7822,14.6,76.3,297.00482,15.436
2,0,1,0,0,1.736952,1.276,2.40641,33.46,505.6,285.53461,15.597
3,0,0,0,0,2.525592,0.701,1.6545,2.75,40.9,288.75488,15.509
4,0,0,0,0,4.134435,0.762,3.1402,2.77,40.2,296.28613,15.714


# Create a Train Test Split

Use `koi_disposition` for the y values

In [5]:
X = df.drop('koi_disposition', axis=1)
y = df['koi_disposition']
print(X.shape, y.shape)

(6991, 40) (6991,)


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [7]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
6080,1,0,0,0,12.496435,0.0002213,-0.0002213,132.0358,0.0143,-0.0143,...,-286,3.805,0.39,-0.13,2.73,0.535,-1.248,289.2308,44.412483,13.054
3001,0,0,0,0,11.615625,0.0001528,-0.0001528,131.96843,0.00823,-0.00823,...,-72,4.083,0.368,-0.092,1.453,0.218,-0.51,293.52756,41.111439,15.162
570,0,1,0,0,10.980246,6.93e-07,-6.93e-07,137.137607,5.3e-05,-5.3e-05,...,-159,4.462,0.098,-0.182,0.897,0.238,-0.119,282.79764,43.578129,14.212
4897,1,0,0,0,466.90824,0.01194,-0.01194,136.3731,0.019,-0.019,...,-146,4.456,0.102,-0.361,0.867,0.448,-0.103,297.65436,43.178551,15.202
625,0,1,1,1,1.061933,1.25e-06,-1.25e-06,133.850441,0.000978,-0.000978,...,-167,3.975,0.259,-0.111,1.851,0.383,-0.575,288.90253,44.632992,12.953


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [8]:
from numpy import asarray
from sklearn.preprocessing import MinMaxScaler

# define min max scaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
from sklearn import tree
from sklearn.metrics import classification_report

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_scaled, y_train)
clf.score(X_test_scaled, y_test)

0.8495423340961098

# Train the Model

In [10]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators=200)
model_rf = model_rf.fit(X_train_scaled, y_train)

In [11]:
print(f"Training Data Score: {model_rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_rf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.8958810068649885


In [12]:
sorted(zip(model_rf.feature_importances_, selected_features), reverse=True)

[(0.1059143287532017, 'koi_fpflag_co'),
 (0.09191250360176863, 'koi_fpflag_nt'),
 (0.07373142998853616, 'koi_fpflag_ss'),
 (0.03540534711133361, 'koi_fpflag_ec'),
 (0.023642929881681138, 'ra'),
 (0.02206813354143246, 'koi_period'),
 (0.019627835223467095, 'koi_model_snr'),
 (0.01906735998413219, 'koi_kepmag'),
 (0.01791514005224081, 'koi_duration'),
 (0.01695081772320626, 'koi_impact'),
 (0.014046896178995133, 'koi_prad')]

In [13]:
print(classification_report(model_rf.predict(X_test_scaled), y_test))

                precision    recall  f1-score   support

     CANDIDATE       0.74      0.83      0.78       373
     CONFIRMED       0.84      0.81      0.82       472
FALSE POSITIVE       1.00      0.97      0.98       903

      accuracy                           0.90      1748
     macro avg       0.86      0.87      0.86      1748
  weighted avg       0.90      0.90      0.90      1748



In [14]:
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(model_rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [15]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [110, 150, 200, 250],
    'n_estimators': [250, 300, 350, 500]
}
grid_search = GridSearchCV(model_rf, param_grid,  cv = 3, n_jobs = -1, verbose = 2)

In [16]:
# Fit the grid search to the data
grid_search.fit(X_train_scaled, y_train)
print(grid_search.best_params_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   49.9s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.2min finished


{'max_depth': 110, 'n_estimators': 350}


In [17]:
print(grid_search.best_score_)

0.8884226587831394


In [18]:
grid_search.score(X_train_scaled, y_train)

1.0

In [19]:
grid_search.score(X_test_scaled, y_test)

0.9044622425629291

In [20]:
# Make predictions with the hypertuned model
predictions = grid_search.predict(X_test_scaled)

# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.84      0.76      0.80       422
     CONFIRMED       0.82      0.86      0.84       450
FALSE POSITIVE       0.97      1.00      0.99       876

      accuracy                           0.90      1748
     macro avg       0.88      0.87      0.87      1748
  weighted avg       0.90      0.90      0.90      1748



# Save the Model

In [21]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'harini_random_forest_model.sav'
joblib.dump(model_rf, filename)

['harini_random_forest_model.sav']