In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1316 sha256=3edc9e65f1ada7fa5e24b31363f0f29d5c047eb286b7129a4d2b4ca674e49a70
  Stored in directory: /Users/jlh/Library/Caches/pip/wheels/22/0b/40/fd3f795caaa1fb4c6cb738bc1f56100be1e57da95849bfc897
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [1]:
# Import our dependencies
import matplotlib.pyplot as plt
import pandas as pd
import os

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("cumulative.csv")
# Drop the null columns where all values are null
df = df.drop(columns=["kepler_name"])


In [3]:
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()


In [4]:
df = df.drop(columns=["rowid", "kepid", "koi_pdisposition", "koi_score", "koi_tce_delivname"])


# Select your features (columns)

In [5]:
# Set features. This will also be used as your x values.
#selected_features = df[['koi_disposition', 'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_steff', 'koi_slogg', 'koi_srad']]
#selected_features
X = df.drop(columns=["koi_disposition", "kepoi_name"], axis=1)

In [6]:
#dropped features based on importance
X = df.drop(columns=["koi_disposition", "kepoi_name", "koi_srad_err1", 'koi_teq', 'koi_period_err2', 'koi_time0bk_err2', 'koi_insol_err2', 'koi_depth_err2', 'koi_srad', 'koi_depth_err1', 'koi_tce_plnt_num', 'koi_prad_err2'], axis=1)

# Create a Train Test Split

Use `koi_disposition` for the y values

In [7]:
# choose our y variable
y = df["koi_disposition"]

In [8]:
# import our train_test_split dependency
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
X_train.head()
y_train.head()

7556         CANDIDATE
2272         CANDIDATE
5020         CONFIRMED
7434    FALSE POSITIVE
5058         CANDIDATE
Name: koi_disposition, dtype: object

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [10]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
label_encoder.fit(y_test)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [11]:
from keras.utils import to_categorical

# Step 2: One-hot encoding
hot_y_train = to_categorical(encoded_y_train)
hot_y_test = to_categorical(encoded_y_test)
hot_y_train

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]], dtype=float32)

In [12]:
# scale the data
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
# review the accuracy of the machine learning model
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_scaled, hot_y_train)
clf.score(X_test_scaled, hot_y_test)

0.8416196822142491

In [14]:
# This model does not exceed an 85% accuracy rating. How can we improve this model?
model1 = clf
model1

DecisionTreeClassifier()

In [15]:
feature_names = X.columns

In [16]:
# Sort the columns by their feature importance. By culling some of the less important columns we can reduce the work done by our computer
sorted(zip(clf.feature_importances_, feature_names), reverse=True)

[(0.20474763266167573, 'koi_fpflag_ss'),
 (0.18787617607797952, 'koi_fpflag_co'),
 (0.15020447861738895, 'koi_fpflag_nt'),
 (0.12373917060698204, 'koi_model_snr'),
 (0.031078683680212455, 'koi_fpflag_ec'),
 (0.027336663810552966, 'koi_impact'),
 (0.01872012300870629, 'koi_duration'),
 (0.018495236122294294, 'koi_prad_err1'),
 (0.01629322994792188, 'koi_prad'),
 (0.014994753968054317, 'dec'),
 (0.014734827502718347, 'koi_kepmag'),
 (0.014675173027460098, 'koi_slogg_err2'),
 (0.013196059100492258, 'koi_time0bk'),
 (0.012795742056719106, 'koi_slogg'),
 (0.012779439720260388, 'ra'),
 (0.01225171616977904, 'koi_depth'),
 (0.011916029016619423, 'koi_srad_err2'),
 (0.011038420216855664, 'koi_steff'),
 (0.010591811806552229, 'koi_impact_err1'),
 (0.010416454969314556, 'koi_period'),
 (0.010136178473875409, 'koi_steff_err1'),
 (0.009468561095833564, 'koi_impact_err2'),
 (0.00905404626918839, 'koi_period_err1'),
 (0.008900239474351597, 'koi_insol_err1'),
 (0.008773190206134237, 'koi_steff_err2')

In [17]:
# Below are the least important columns. Un comment them and paste them as our X variable
#X = df.drop(columns=["koi_disposition", "kepoi_name", "koi_srad_err1", 'koi_teq', 'koi_period_err2', 'koi_time0bk_err2', 'koi_insol_err2', 'koi_depth_err2', 'koi_srad', 'koi_depth_err1', 'koi_tce_plnt_num', 'koi_prad_err2'], axis=1)

# Tune the Model



In [18]:
# To increase the accuracy of the model, we'll need to tune the hyperparameters
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [19]:
DecisionTreeClassifier.get_params(DecisionTreeClassifier)



{'ccp_alpha': None,
 'class_weight': None,
 'criterion': None,
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': None,
 'min_impurity_split': None,
 'min_samples_leaf': None,
 'min_samples_split': None,
 'min_weight_fraction_leaf': None,
 'presort': None,
 'random_state': None,
 'splitter': None}

In [33]:
# Decision Tree hyper parameters include the parameters below. We're going to use GridSearchCV to choose the best ones for us
scoring = {'AUC': 'roc_auc'}
param_grid = {'criterion':['gini', 'entropy'], 'max_depth':[5, 7, 10, 15, 25, 40, 65, 95, 150]}

gs = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
gs.fit(X_train_scaled, hot_y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 7, 10, 15, 25, 40, 65, 95, 150]})

In [34]:
print(gs.best_params_)
print(gs.best_score_)


{'criterion': 'entropy', 'max_depth': 7}
0.8650041238768822


In [35]:
predictions = gs.predict(X_test_scaled)

In [37]:
# Success! We've improved our accuracy from 84.1% to 87.5%, 3.5% increase. 
gs.score(X_test_scaled, hot_y_test)

0.8754484879548949

# Save the Model

In [25]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
#https://stackoverflow.com/questions/38709690/scikit-learn-using-gridsearchcv-on-decisiontreeclassifier
import joblib
filename = 'DecisionTree.sav'
joblib.dump(clf, filename)

['DecisionTree.sav']