# Necessary Upgrades
Run the cells in this section to make sure you have the latest version of sklearn and joblib.

Restart your kernel after installing.

In [1]:
## Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
## install joblib. This will be used to save your model. 
# !pip install joblib
## Restart your kernel after installing 

# Import Dependencies

In [3]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import joblib

# Read the CSV and Perform Basic Data Cleaning

In [12]:
#read csv files in

df2019 = pd.read_csv("NBA_data/NBA_Season_avgs_2019_20.csv")
df2018 = pd.read_csv("NBA_data/NBA_Season_avgs_2018_19.csv")
df2017 = pd.read_csv("NBA_data/NBA_Season_avgs_2017_18.csv")
df2016 = pd.read_csv("NBA_data/NBA_Season_avgs_2016_17.csv")
df2015 = pd.read_csv("NBA_data/NBA_Season_avgs_2015_16.csv")
df2014 = pd.read_csv("NBA_data/NBA_Season_avgs_2014_15.csv")

# add year columns to each

df2019['Year'] = "2019"
df2018['Year'] = "2018"
df2017['Year'] = "2017"
df2016['Year'] = "2016"
df2015['Year'] = "2015"
df2014['Year'] = "2014"

In [13]:
final_df = pd.concat([df2019, df2018, df2017, df2016, df2015, df2014], axis=0)
final_df = final_df.dropna()
final_df["playoffs_y_n"] = np.where(final_df['Rk'] <= 16, 1, 0)

In [14]:
final_df_2018 = final_df.loc[final_df["Year"] == '2018']
final_df_2017 = final_df.loc[final_df["Year"] == '2017']
final_df_2016 = final_df.loc[final_df["Year"] == '2016']
final_df_2015 = final_df.loc[final_df["Year"] == '2015']

In [15]:
final_df

Unnamed: 0,Rk,Team,G,W,L,MP,FG,FGA,FG%,3P,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,playoffs_y_n
0,1.0,Dallas Mavericks,67.0,53.0,12.0,241.5,41.6,90.0,0.462,15.3,...,36.4,47.0,24.5,6.3,5.0,12.8,19.0,116.4,2019,1
1,2.0,Milwaukee Bucks*,65.0,49.0,14.0,240.8,43.5,91.2,0.477,13.7,...,42.2,51.7,25.9,7.4,6.0,14.9,19.2,118.6,2019,1
2,3.0,Houston Rockets,64.0,44.0,20.0,241.2,41.1,90.7,0.454,15.4,...,34.6,44.9,21.5,8.5,5.1,14.7,21.6,118.1,2019,1
3,4.0,Portland Trail Blazers,66.0,46.0,18.0,240.8,41.9,90.9,0.461,12.6,...,35.4,45.5,20.2,6.1,6.2,13.0,21.4,113.6,2019,1
4,5.0,Atlanta Hawks,67.0,40.0,27.0,243.0,40.6,90.6,0.449,12.0,...,33.4,43.3,24.0,7.8,5.1,16.2,23.1,111.8,2019,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,26.0,Utah Jazz,82.0,25.0,57.0,240.3,35.4,79.2,0.447,7.4,...,31.9,44.0,19.9,7.6,6.0,15.3,19.3,95.1,2014,0
26,27.0,Miami Heat,82.0,21.0,61.0,240.6,35.2,77.2,0.456,6.8,...,30.0,39.1,19.8,7.8,4.5,14.8,20.0,94.7,2014,0
27,28.0,Charlotte Hornets,82.0,16.0,66.0,242.7,35.5,84.5,0.420,6.1,...,34.1,44.1,20.2,6.1,5.5,11.9,18.2,94.2,2014,0
28,29.0,Philadelphia 76ers,82.0,18.0,64.0,241.5,33.7,82.6,0.408,8.4,...,30.9,42.9,20.5,9.6,5.9,17.7,21.7,92.0,2014,0


# Select your features (columns) and y-variable.
Column definitions are available at the end of this notebook.

Use `koi_disposition` for the y values

In [6]:
y = df[['koi_disposition']]

In [7]:
# test with a subset
# selected_features = df[['koi_fpflag_nt', 'koi_prad', 'koi_teq', 'koi_srad']]

# all the columns minus y
selected_features = df.drop(columns=["koi_disposition"])
feature_names = selected_features.columns
# selected_features.head()

# Create a Train Test Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(selected_features, y, random_state=42, stratify = y)
# X_train.head()

# Pre-processing

Scale the data and perform some feature selection

In [9]:
# Scale your data

X_standard_scaler = StandardScaler().fit(X_train)
X_minmax_scaler = MinMaxScaler().fit(X_train)

X_train_minmax_scaled = X_minmax_scaler.transform(X_train)
X_test_minmax_scaled = X_minmax_scaler.transform(X_test)

X_train_standard_scaled = X_standard_scaler.transform(X_train)
X_test_standard_scaled = X_standard_scaler.transform(X_test)

# Train the Model



In [10]:
rfc_minmax = RandomForestClassifier()
rfc_minmax.fit(X_train_minmax_scaled, y_train)

rfc_standard = RandomForestClassifier()
rfc_standard.fit(X_train_standard_scaled, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [11]:
print(f"Minmax Training Data Score: {rfc_minmax.score(X_train_minmax_scaled, y_train)}")
print(f"Minmax Testing Data Score: {rfc_minmax.score(X_test_minmax_scaled, y_test)}")

minmax_predictions = rfc_minmax.predict(X_test_minmax_scaled)
print(classification_report(y_test, minmax_predictions))

print ("---")

print(f"Standard Training Data Score: {rfc_standard.score(X_train_standard_scaled, y_train)}")
print(f"Standard Testing Data Score: {rfc_standard.score(X_test_standard_scaled, y_test)}")

standard_predictions = rfc_standard.predict(X_test_standard_scaled)
print(classification_report(y_test, standard_predictions))

Minmax Training Data Score: 1.0
Minmax Testing Data Score: 0.9067505720823799
                precision    recall  f1-score   support

     CANDIDATE       0.84      0.78      0.81       422
     CONFIRMED       0.81      0.84      0.83       450
FALSE POSITIVE       0.99      1.00      0.99       876

      accuracy                           0.91      1748
     macro avg       0.88      0.88      0.88      1748
  weighted avg       0.91      0.91      0.91      1748

---
Standard Training Data Score: 1.0
Standard Testing Data Score: 0.9078947368421053
                precision    recall  f1-score   support

     CANDIDATE       0.84      0.79      0.81       422
     CONFIRMED       0.82      0.84      0.83       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.91      1748
     macro avg       0.88      0.88      0.88      1748
  weighted avg       0.91      0.91      0.91      1748



In [12]:
# minmax scaler importance
# print ("minmax scaler importance")
# importances = rfc_minmax.feature_importances_
# sorted(zip(importances, feature_names), reverse=True)

In [13]:
# standard scaler importance
# print ("standard scaler importance")
# importances = rfc_standard.feature_importances_
# sorted(zip(importances, feature_names), reverse=True)

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [14]:
# Create the GridSearchCV model

param_grid = { 
    'n_estimators': [200],
    "min_samples_split": [2, 5, 10, 15, 100],
    'max_depth' : [5, 8, 15, 25, 30],
    "min_samples_leaf": [1, 2, 5, 10] 
}

rfc_minmax_search = RandomForestClassifier()
rfc_grid_minmax = GridSearchCV(estimator=rfc_minmax_search, param_grid=param_grid, cv= 5, verbose = 2)

rfc_standard_search = RandomForestClassifier()
rfc_grid_standard = GridSearchCV(estimator=rfc_standard_search, param_grid=param_grid, cv= 5, verbose = 2)

In [15]:
# Train the model with GridSearch
best_rfc_model_minmax = rfc_grid_minmax.fit(X_train_minmax_scaled, y_train)
best_rfc_model_standard = rfc_grid_standard.fit(X_train_standard_scaled, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   1.1s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_es

[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.9s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.7s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=5, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=5, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=5, n_es

[CV]  max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.8s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=1, min_samples_spli

[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=5, min_samples_spl

[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.9s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.9s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.9s
[CV] max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   1.3s
[CV] max_depth=15, min_samples_lea

[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.4s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.3s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.1s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.2s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.2s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.1s
[CV] max_depth=15, min_samples_le

[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.3s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   1.1s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   1.1s
[CV] max_depth=15, min_sam

[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   1.5s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.4s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.5s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.9s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.5s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.5s
[CV] max_depth=25, min_samples_leaf=2, mi

[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.3s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.2s
[CV] max_depth=25, min_samples

[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   1.4s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   1.5s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   1.4s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   1.4s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   1.4s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.4s
[CV] max_depth=30, min_samples_leaf=2, mi

[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=5, n_estimators=200, total=   1.3s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=5, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=5, n_estimators=200, total=   1.2s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.3s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.3s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.3s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.2s
[CV] max_depth=30, min_samples_le

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  9.6min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.9s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.9s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.9s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.9s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.9s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200, total=   0.9s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200, total=   0.9s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_es

[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=5, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=5, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=5, n_es

[CV]  max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.8s
[CV] max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.8s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=1, min_samples_spli

[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=5, min_samples_spl

[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.9s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   1.0s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.9s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.9s
[CV] max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   1.3s
[CV] max_depth=15, min_samples_lea

[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.3s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.4s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.2s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.2s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.2s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   1.2s
[CV] max_depth=15, min_samples_le

[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.3s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.3s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.3s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   1.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   1.2s
[CV] max_depth=15, min_sam

[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   1.4s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.3s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.4s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.4s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.4s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.4s
[CV] max_depth=25, min_samples_leaf=2, mi

[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.3s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.3s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.3s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   1.3s
[CV] max_depth=25, min_samples

[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   1.4s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   1.4s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   1.4s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   1.4s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   1.4s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   1.4s
[CV] max_depth=30, min_samples_leaf=2, mi

[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=5, n_estimators=200, total=   1.3s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=5, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=5, n_estimators=200, total=   1.3s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.3s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.3s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.3s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   1.3s
[CV] max_depth=30, min_samples_le

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  9.7min finished


In [16]:
print(f"Minmax best params: {best_rfc_model_minmax.best_params_}")
print(f"Minmax best score: {best_rfc_model_minmax.best_score_}")
print(f"Minmax best best estimator: {best_rfc_model_minmax.best_estimator_}")
print ("---")
print(f"Standard best params: {best_rfc_model_standard.best_params_}")
print(f"Standard best score: {best_rfc_model_standard.best_score_}")
print(f"Standard best best estimator: {best_rfc_model_standard.best_estimator_}")

Minmax best params: {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Minmax best score: 0.897194529140803
Minmax best best estimator: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=30, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
---
Standard best params: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Standard best score: 0.8970042352222036
Standard best best estimator: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criter

# Test Model

In [17]:
print(f"Minmax Training Data Score: {best_rfc_model_minmax.score(X_train_minmax_scaled, y_train)}")
print(f"Minmax Testing Data Score: {best_rfc_model_minmax.score(X_test_minmax_scaled, y_test)}")

minmax_predictions = best_rfc_model_minmax.predict(X_test_minmax_scaled)
print(classification_report(y_test, minmax_predictions))

print ("---")

print(f"Standard Training Data Score: {best_rfc_model_standard.score(X_train_standard_scaled, y_train)}")
print(f"Standard Testing Data Score: {best_rfc_model_standard.score(X_test_standard_scaled, y_test)}")

standard_predictions = best_rfc_model_standard.predict(X_test_standard_scaled)
print(classification_report(y_test, standard_predictions))

Minmax Training Data Score: 0.9853137516688919
Minmax Testing Data Score: 0.9038901601830663
                precision    recall  f1-score   support

     CANDIDATE       0.84      0.77      0.80       422
     CONFIRMED       0.81      0.84      0.83       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.90      1748
     macro avg       0.88      0.87      0.87      1748
  weighted avg       0.90      0.90      0.90      1748

---
Standard Training Data Score: 0.9975205035285142
Standard Testing Data Score: 0.9056064073226545
                precision    recall  f1-score   support

     CANDIDATE       0.84      0.77      0.81       422
     CONFIRMED       0.81      0.85      0.83       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.91      1748
     macro avg       0.88      0.87      0.88      1748
  weighted avg       0.90      0.91      0.90      1748



In [18]:
# see how it compares
minmax_predicted = minmax_predictions[:20]
standard_predicted = standard_predictions[:20]
actual = y_test["koi_disposition"][:20].tolist()
pd.DataFrame({"Actual": actual, "minmax scaler": minmax_predicted, "standard scaler": standard_predicted}).reset_index(drop=True)

minmax scaler


Unnamed: 0,Actual,minmax scaler,standard scaler
0,CANDIDATE,CANDIDATE,CANDIDATE
1,FALSE POSITIVE,FALSE POSITIVE,FALSE POSITIVE
2,FALSE POSITIVE,FALSE POSITIVE,FALSE POSITIVE
3,FALSE POSITIVE,FALSE POSITIVE,FALSE POSITIVE
4,CANDIDATE,CONFIRMED,CONFIRMED
5,FALSE POSITIVE,FALSE POSITIVE,FALSE POSITIVE
6,CANDIDATE,CANDIDATE,CANDIDATE
7,FALSE POSITIVE,FALSE POSITIVE,FALSE POSITIVE
8,FALSE POSITIVE,FALSE POSITIVE,FALSE POSITIVE
9,FALSE POSITIVE,FALSE POSITIVE,FALSE POSITIVE


# Save the Model

In [21]:
minmax_rfc = 'models/rfc_minmax.h5'
joblib.dump(best_rfc_model_minmax, minmax_rfc)

standard_rfc = 'models/rfc_standard.h5'
joblib.dump(best_rfc_model_standard, standard_rfc)

['models/rfc_standard.h5']

In [22]:
loaded_model = joblib.load("models/rfc_minmax.h5")
print(f"{loaded_model.score(X_test_minmax_scaled, y_test)}")

loaded_model = joblib.load("models/rfc_standard.h5")
print(f"{loaded_model.score(X_test_standard_scaled, y_test)}")

0.9038901601830663
0.9056064073226545


# Column Details

### Exoplanet Archive Information
* 'koi_disposition': The pipeline flag that designates the most probable physical explanation of the KOI

### Project Disposition Columns
* 'koi_fpflag_nt': A KOI whose light curve is not consistent with that of a transiting planet.
* 'koi_fpflag_ss': A KOI that is observed to have a significant secondary event, transit shape, or out-of-eclipse variability, which indicates that the transit-like event is most likely caused by an eclipsing binary.
* 'koi_fpflag_co': The source of the signal is from a nearby star, as inferred by measuring the centroid location of the image both in and out of transit, or by the strength of the transit signal in the target's outer (halo) pixels as compared to the transit signal from the pixels in the optimal (or core) aperture.
* 'koi_fpflag_ec': The KOI shares the same period and epoch as another object and is judged to be the result of flux contamination in the aperture or electronic crosstalk.

### Transit Properties
Transit parameters delivered by the Kepler Project are typically best-fit parameters produced by a Mandel-Agol (2002) fit to a multi-quarter Kepler light curve, assuming a linear orbital ephemeris. Some of the parameters listed below are fit directly, other are derived from the best-fit parameters. Limb-darkening coefficients are fixed and pre-calculated from host star properties. Orbital Period, Transit Epoch, Planet-Star Radius Ratio, Planet-Star Distance over Star Radius and Impact Parameter are the free parameters in the fit. Matrix covariances are adopted as errors to the fit parameters; they therefore ignore the effects of correlation between the fit parameters and are likely to be underestimates.

See the links in the Purpose of KOI document for each activity table for more details on the fits for each delivery.

Scaled planetary parameters combine the dimensionless fit parameters with physical stellar parameters to produce planet characteristics in physical units.

Best-fit planetary transit parameters are typically normalized to the size of the host star. Physical planet parameters may be derived by scaling to the star's size and temperature. Transit parameters also depend weakly upon the limb-darkening coefficients which are derived from the stellar parameters (e.g., Claret and Bloemen 2011).

* 'koi_period': The interval between consecutive planetary transits.
* 'koi_time0bk': The time corresponding to the center of the first detected transit in Barycentric Julian Day (BJD) minus a constant offset of 2,454,833.0 days. The offset corresponds to 12:00 on Jan 1, 2009 UTC.
* 'koi_impact': The sky-projected distance between the center of the stellar disc and the center of the planet disc at conjunction, normalized by the stellar radius
* 'koi_duration': The duration of the observed transits. Duration is measured from first contact between the planet and star until last contact. Contact times are typically computed from a best-fit model produced by a Mandel-Agol (2002) model fit to a multi-quarter Kepler light curve, assuming a linear orbital ephemeris.
* 'koi_depth': The fraction of stellar flux lost at the minimum of the planetary transit. Transit depths are typically computed from a best-fit model produced by a Mandel-Agol (2002) model fit to a multi-quarter Kepler light curve, assuming a linear orbital ephemeris.
* 'koi_prad': The radius of the planet. Planetary radius is the product of the planet star radius ratio and the stellar radius.
* 'koi_teq': Approximation for the temperature of the planet. 
* 'koi_insol': Insolation flux is another way to give the equilibrium temperature.

### Threshold-Crossing Event (TCE) Information
The Transiting Planet Search (TPS) module of the Kepler data analysis pipeline performs a detection test for planet transits in the multi-quarter, gap-filled flux time series. The TPS module detrends each quarterly PDC light curve to remove edge effects around data gaps and then combines the data segments together, filling gaps with interpolated data so as to condition the flux time series for a matched filter. The module applies an adaptive, wavelet-based matched filter (Jenkins 2002, Jenkins et al. 2010 and Tenenbaum et al. (2012)) to perform a joint characterization of observation noise and detection of transit-like features in the light curve.

The TPS module estimates the Power Spectral Density of the flux time series as a function in time. This provides coefficients for a whitening filter to accommodate non-stationary, non-white noise and yields Single Event Statistic (SES) time series components. These can be interpreted as measurements of the statistical significance of the presence of a transit of trial duration at each point in the time series.

Single Event Statistics are folded at each trial orbital period and the maximum Multiple Event Statistic (MES) is obtained over all trial periods and phases. The MES estimates the signal to noise ratio of the putative transit-like sequence against the measurement noise. The MES threshold for defining the sample of Threshold Crossing Events (TCEs) is provided within the Release Notes. For reference, a lower MES threshold of 7.1σ limits the number of false positives in the TCE sample due to statistical random noise to less than 1 over the primary mission (Jenkins, Caldwell and Borucki 2002).

* 'koi_model_snr': Transit depth normalized by the mean uncertainty in the flux during the transits.
* 'koi_tce_plnt_num': TCE Planet Number federated to the KOI.

### Stellar Parameters
Stellar effective temperature, surface gravity, metallicity, radius, mass, and age should comprise a consistent set. Associated error estimates are 1-σ uncertainties.

* 'koi_steff': The photospheric temperature of the star.
* 'koi_slogg': The base-10 logarithm of the acceleration due to gravity at the surface of the star.
* 'koi_srad': The photospheric radius of the star

### KIC Parameters
* 'ra': KIC Right Ascension
* 'dec': KIC Declination
* 'koi_kepmag': Kepler-band (mag)