In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [3]:
# Set features. This will also be used as your x values.
selected_features = df.drop(['koi_disposition'], axis=1)

# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
target = df["koi_disposition"]

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(selected_features, target, random_state=42)

In [6]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
6122,0,0,0,0,6.768901,7.38e-05,-7.38e-05,133.07724,0.00844,-0.00844,...,-171,4.327,0.153,-0.187,1.125,0.31,-0.207,294.40472,39.351681,14.725
6370,0,1,0,1,0.733726,6.06e-06,-6.06e-06,132.02005,0.00795,-0.00795,...,-175,4.578,0.033,-0.187,0.797,0.211,-0.056,284.50391,42.46386,15.77
2879,1,0,0,0,7.652707,6.54e-05,-6.54e-05,134.46038,0.00619,-0.00619,...,-189,4.481,0.05,-0.2,0.963,0.29,-0.097,295.50211,38.98354,13.099
107,0,0,0,0,7.953547,1.91e-05,-1.91e-05,174.66224,0.00182,-0.00182,...,-85,4.536,0.056,-0.016,0.779,0.023,-0.049,291.15878,40.750271,15.66
29,0,0,0,0,4.959319,5.15e-07,-5.15e-07,172.258529,8.3e-05,-8.3e-05,...,-77,4.359,0.11,-0.11,1.082,0.173,-0.13,292.16705,48.727589,15.263


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [7]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)

In [8]:
# Transform the training and testing data using the X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Train the Model - Random Forest 



In [9]:
# Using Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)

In [10]:
# Fit the model
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.8998855835240275

In [11]:
# Validate the model using the test data
print(f"Training Data Score: {rf.score(X_train, y_train)}")
print(f"Testing Data Score: {rf.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.8998855835240275


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [12]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [13]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# define the parameter values that should be searched
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
print(param_grid)


{'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy']}


In [14]:
# Train the model with GridSe
# instantiate the grid
grid = GridSearchCV(rf, param_grid, cv=3, verbose=1, n_jobs=-1)

# Fit the model using the grid search estimator. 
grid.fit(X_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  6.1min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(n_estimators=200),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]},
             verbose=1)

In [15]:
# List the best parameters for this dataset
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

{'criterion': 'entropy', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 200}
0.8874671060818219
RandomForestClassifier(criterion='entropy', max_depth=8, n_estimators=200)


In [16]:
# Testing model
model3 = RandomForestClassifier(criterion='entropy', max_depth=8, n_estimators=200)
model3.fit(X_train_scaled, y_train)


RandomForestClassifier(criterion='entropy', max_depth=8, n_estimators=200)

In [17]:
print(f"Testing Data Score: {model3.score(X_test_scaled, y_test)}")

Testing Data Score: 0.8890160183066361


## Feature Selection with Feature Importances

In [18]:
# Use feature importances to get the insignificant variables
importances = model3.feature_importances_
print(importances)

[0.1348324  0.10649975 0.14009659 0.0519333  0.02001932 0.01483964
 0.0134889  0.00591588 0.02491383 0.01944451 0.01230527 0.0041311
 0.0029487  0.010005   0.03437198 0.03416693 0.01669416 0.00960755
 0.00757794 0.05770413 0.0399819  0.03069751 0.01252582 0.01084232
 0.01606823 0.00985144 0.06377284 0.00139651 0.00317524 0.03132
 0.02660286 0.00350536 0.00270711 0.00411803 0.00388169 0.00657485
 0.00300824 0.00353388 0.00237176 0.00256754]


In [20]:
# Sort the features with the most important ones on the top
feature_names = df.columns
feature_importance = sorted(zip(rf.feature_importances_, feature_names), reverse=True)
feature_importance

[(0.10969206253939365, 'koi_fpflag_ss'),
 (0.09919682212677283, 'koi_disposition'),
 (0.06798665990110278, 'koi_fpflag_nt'),
 (0.0540933686808439, 'koi_insol_err2'),
 (0.04563358243272647, 'koi_depth_err2'),
 (0.03529373075550331, 'koi_prad'),
 (0.03476563755847626, 'koi_fpflag_co'),
 (0.03431342113450309, 'koi_duration_err1'),
 (0.03302655462258555, 'koi_prad_err1'),
 (0.032697126762067145, 'koi_duration'),
 (0.032170828723411475, 'koi_steff'),
 (0.027236867973039862, 'koi_steff_err1'),
 (0.02471045561290443, 'koi_time0bk_err1'),
 (0.023248927373629283, 'koi_time0bk'),
 (0.02266749931811483, 'koi_impact_err2'),
 (0.02254597657106705, 'koi_fpflag_ec'),
 (0.020934564081467, 'koi_duration_err2'),
 (0.018899733591843146, 'koi_time0bk_err2'),
 (0.017076839005701113, 'koi_period_err1'),
 (0.01663751439810807, 'koi_insol'),
 (0.01616792942142308, 'koi_period'),
 (0.01598658373144781, 'koi_teq'),
 (0.015560797246980451, 'koi_insol_err1'),
 (0.015280387407704757, 'koi_prad_err2'),
 (0.01339388

In [21]:
# Get a new list of features with importance > 0.01
fi_selected_features = []
for importance in feature_importance:
    if importance[0] > 0.01:
        fi_selected_features.append(importance[1])

print(fi_selected_features)

['koi_fpflag_ss', 'koi_disposition', 'koi_fpflag_nt', 'koi_insol_err2', 'koi_depth_err2', 'koi_prad', 'koi_fpflag_co', 'koi_duration_err1', 'koi_prad_err1', 'koi_duration', 'koi_steff', 'koi_steff_err1', 'koi_time0bk_err1', 'koi_time0bk', 'koi_impact_err2', 'koi_fpflag_ec', 'koi_duration_err2', 'koi_time0bk_err2', 'koi_period_err1', 'koi_insol', 'koi_period', 'koi_teq', 'koi_insol_err1', 'koi_prad_err2', 'koi_depth', 'koi_depth_err1', 'koi_period_err2', 'koi_srad_err2', 'koi_srad', 'koi_impact', 'ra', 'koi_impact_err1', 'dec']


In [26]:
# Get a new df with selected features
new_selected_features = df[fi_selected_features].drop(["koi_disposition"], axis=1)

# Re split the samples
X_train, X_test, y_train, y_test = train_test_split(new_selected_features, target, random_state=42)

X_train.head()


Unnamed: 0,koi_fpflag_ss,koi_fpflag_nt,koi_insol_err2,koi_depth_err2,koi_prad,koi_fpflag_co,koi_duration_err1,koi_prad_err1,koi_duration,koi_steff,...,koi_prad_err2,koi_depth,koi_depth_err1,koi_period_err2,koi_srad_err2,koi_srad,koi_impact,ra,koi_impact_err1,dec
6122,0,0,-103.87,-13.2,1.24,0,0.306,0.34,3.616,5737,...,-0.23,123.1,13.2,-7.38e-05,-0.207,1.125,0.15,294.40472,0.305,39.351681
6370,1,0,-677.78,-9.3,0.86,0,0.282,0.23,2.309,5855,...,-0.06,114.6,9.3,-6.06e-06,-0.056,0.797,0.291,284.50391,0.193,42.46386
2879,0,1,-64.34,-42.9,3.21,0,0.0,0.97,79.8969,6328,...,-0.32,641.1,42.9,-6.54e-05,-0.097,0.963,0.97,295.50211,0.879,38.98354
107,0,0,-10.12,-25.3,2.25,0,0.0595,0.07,2.6312,4768,...,-0.14,875.4,25.3,-1.91e-05,-0.049,0.779,0.3,291.15878,0.145,40.750271
29,0,0,-93.21,-20.0,12.21,0,0.0075,1.96,2.22739,5712,...,-1.46,9802.0,20.0,-5.15e-07,-0.13,1.082,0.831,292.16705,0.016,48.727589


In [27]:
# Scale the data
X_scaler = MinMaxScaler().fit(X_train)

# Transform the training and testing data using the X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [28]:
# Testing model
fi_model3 = RandomForestClassifier(criterion='entropy', max_depth=8, n_estimators=200)
fi_model3.fit(X_train_scaled, y_train)

RandomForestClassifier(criterion='entropy', max_depth=8, n_estimators=200)

In [29]:
print(f"Training Data Score: {fi_model3.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {fi_model3.score(X_test_scaled, y_test)}")

Training Data Score: 0.8989128361625024
Testing Data Score: 0.8718535469107551


# Save the Model

In [30]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'model_rf.sav'
joblib.dump(model3, filename)

['model_rf.sav']