In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [1]:
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier
import joblib

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()

In [3]:
# Set features. This will also be used as your x values.
X = df.drop('koi_disposition', axis=1)
y = df['koi_disposition'].values.reshape(-1, 1)
print(X.shape, y.shape)

(6991, 40) (6991, 1)


In [4]:
# Step 1: Label-encode data set
label_encoder = OneHotEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)

encoded_y[1]

<1x3 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [5]:
clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(X, encoded_y.toarray())
clf.feature_importances_  

model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
X_new.shape               

(6991, 7)

In [6]:
# Scale your data


scaler = MinMaxScaler()

X_scaler = scaler.fit_transform(X_new)

In [7]:


X_train, X_test, y_train, y_test = train_test_split(X_new, encoded_y, random_state=42)

In [8]:
X_train

array([[ 0.000e+00,  0.000e+00,  0.000e+00, ...,  3.060e-01, -3.060e-01,
         1.080e+01],
       [ 0.000e+00,  1.000e+00,  0.000e+00, ...,  2.820e-01, -2.820e-01,
         1.380e+01],
       [ 1.000e+00,  0.000e+00,  0.000e+00, ...,  0.000e+00,  0.000e+00,
         2.543e+02],
       ...,
       [ 0.000e+00,  1.000e+00,  0.000e+00, ...,  3.060e-03, -3.060e-03,
         4.710e+02],
       [ 1.000e+00,  0.000e+00,  0.000e+00, ...,  4.600e+00, -4.600e+00,
         7.950e+01],
       [ 0.000e+00,  0.000e+00,  1.000e+00, ...,  1.760e-01, -1.760e-01,
         3.680e+01]])

# Train the Model



In [9]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train.toarray())
clf.score(X_test, y_test.toarray())

0.818649885583524

In [10]:


rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train.toarray())
rf.score(X_test, y_test.toarray())

0.8581235697940504

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [11]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust


t = tree.DecisionTreeClassifier()

param_grid = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'max_leaf_nodes': [13, 47, 69, 420]}
grid_clf = GridSearchCV(t, param_grid, verbose=3)

grid_clf.fit(X_train, y_train.toarray())

grid_clf.score(X_test, y_test.toarray())

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] criterion=gini, max_leaf_nodes=13, splitter=best ................
[CV]  criterion=gini, max_leaf_nodes=13, splitter=best, score=0.887, total=   0.0s
[CV] criterion=gini, max_leaf_nodes=13, splitter=best ................
[CV]  criterion=gini, max_leaf_nodes=13, splitter=best, score=0.879, total=   0.0s
[CV] criterion=gini, max_leaf_nodes=13, splitter=best ................
[CV]  criterion=gini, max_leaf_nodes=13, splitter=best, score=0.883, total=   0.0s
[CV] criterion=gini, max_leaf_nodes=13, splitter=best ................
[CV]  criterion=gini, max_leaf_nodes=13, splitter=best, score=0.865, total=   0.0s
[CV] criterion=gini, max_leaf_nodes=13, splitter=best ................
[CV]  criterion=gini, max_leaf_nodes=13, splitter=best, score=0.864, total=   0.0s
[CV] criterion=gini, max_leaf_nodes=13, splitter=random ..............
[CV]  criterion=gini, max_leaf_nodes=13, splitter=random, score=0.871, total=   0.0s
[CV] criterio

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  criterion=gini, max_leaf_nodes=69, splitter=random, score=0.866, total=   0.0s
[CV] criterion=gini, max_leaf_nodes=69, splitter=random ..............
[CV]  criterion=gini, max_leaf_nodes=69, splitter=random, score=0.859, total=   0.0s
[CV] criterion=gini, max_leaf_nodes=420, splitter=best ...............
[CV]  criterion=gini, max_leaf_nodes=420, splitter=best, score=0.846, total=   0.0s
[CV] criterion=gini, max_leaf_nodes=420, splitter=best ...............
[CV]  criterion=gini, max_leaf_nodes=420, splitter=best, score=0.815, total=   0.0s
[CV] criterion=gini, max_leaf_nodes=420, splitter=best ...............
[CV]  criterion=gini, max_leaf_nodes=420, splitter=best, score=0.847, total=   0.0s
[CV] criterion=gini, max_leaf_nodes=420, splitter=best ...............
[CV]  criterion=gini, max_leaf_nodes=420, splitter=best, score=0.844, total=   0.0s
[CV] criterion=gini, max_leaf_nodes=420, splitter=best ...............
[CV]  criterion=gini, max_leaf_nodes=420, splitter=best, score=0.847

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    0.6s finished


0.8861556064073226

In [12]:


kn = KNeighborsClassifier()

param_grid = {'n_neighbors': [1, 3, 5, 7],
              'metric': ['minkowski', 'euclidean', 'manhattan']}

grid_clf = GridSearchCV(kn, param_grid, verbose=3)

grid_clf.fit(X_train, y_train.toarray())

grid_clf.score(X_test, y_test.toarray())

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] metric=minkowski, n_neighbors=1 .................................
[CV] ..... metric=minkowski, n_neighbors=1, score=0.795, total=   0.1s
[CV] metric=minkowski, n_neighbors=1 .................................
[CV] ..... metric=minkowski, n_neighbors=1, score=0.766, total=   0.1s
[CV] metric=minkowski, n_neighbors=1 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ..... metric=minkowski, n_neighbors=1, score=0.786, total=   0.1s
[CV] metric=minkowski, n_neighbors=1 .................................
[CV] ..... metric=minkowski, n_neighbors=1, score=0.785, total=   0.1s
[CV] metric=minkowski, n_neighbors=1 .................................
[CV] ..... metric=minkowski, n_neighbors=1, score=0.786, total=   0.1s
[CV] metric=minkowski, n_neighbors=3 .................................
[CV] ..... metric=minkowski, n_neighbors=3, score=0.809, total=   0.1s
[CV] metric=minkowski, n_neighbors=3 .................................
[CV] ..... metric=minkowski, n_neighbors=3, score=0.801, total=   0.1s
[CV] metric=minkowski, n_neighbors=3 .................................
[CV] ..... metric=minkowski, n_neighbors=3, score=0.813, total=   0.1s
[CV] metric=minkowski, n_neighbors=3 .................................
[CV] ..... metric=minkowski, n_neighbors=3, score=0.796, total=   0.1s
[CV] metric=minkowski, n_neighbors=3 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    4.2s finished


0.7974828375286042

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash

filename = 'your_name.sav'
joblib.dump(your_model, filename)