# Exoplanet Machine Learning Dataset

In [1]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

In [2]:
# install joblib. 
# !pip install joblib

In [3]:
import pandas as pd

## Data Exploration and Cleaning

In [4]:
# Reading the provided dataset
df = pd.read_csv("exoplanet_data.csv")
df

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
6988,CANDIDATE,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,...,-220,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


In [5]:
df.dtypes
# all are numbers expect koi_disp

koi_disposition       object
koi_fpflag_nt          int64
koi_fpflag_ss          int64
koi_fpflag_co          int64
koi_fpflag_ec          int64
koi_period           float64
koi_period_err1      float64
koi_period_err2      float64
koi_time0bk          float64
koi_time0bk_err1     float64
koi_time0bk_err2     float64
koi_impact           float64
koi_impact_err1      float64
koi_impact_err2      float64
koi_duration         float64
koi_duration_err1    float64
koi_duration_err2    float64
koi_depth            float64
koi_depth_err1       float64
koi_depth_err2       float64
koi_prad             float64
koi_prad_err1        float64
koi_prad_err2        float64
koi_teq                int64
koi_insol            float64
koi_insol_err1       float64
koi_insol_err2       float64
koi_model_snr        float64
koi_tce_plnt_num       int64
koi_steff              int64
koi_steff_err1         int64
koi_steff_err2         int64
koi_slogg            float64
koi_slogg_err1       float64
koi_slogg_err2

In [6]:
# checking unique values 
types = df["koi_disposition"].unique()
types

array(['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE'], dtype=object)

In [7]:
# checking for nans
df.isnull().sum()

koi_disposition      0
koi_fpflag_nt        0
koi_fpflag_ss        0
koi_fpflag_co        0
koi_fpflag_ec        0
koi_period           0
koi_period_err1      0
koi_period_err2      0
koi_time0bk          0
koi_time0bk_err1     0
koi_time0bk_err2     0
koi_impact           0
koi_impact_err1      0
koi_impact_err2      0
koi_duration         0
koi_duration_err1    0
koi_duration_err2    0
koi_depth            0
koi_depth_err1       0
koi_depth_err2       0
koi_prad             0
koi_prad_err1        0
koi_prad_err2        0
koi_teq              0
koi_insol            0
koi_insol_err1       0
koi_insol_err2       0
koi_model_snr        0
koi_tce_plnt_num     0
koi_steff            0
koi_steff_err1       0
koi_steff_err2       0
koi_slogg            0
koi_slogg_err1       0
koi_slogg_err2       0
koi_srad             0
koi_srad_err1        0
koi_srad_err2        0
ra                   0
dec                  0
koi_kepmag           0
dtype: int64

## Select your features 

In [8]:
# Set x as every column except koi_dis
X = df.drop("koi_disposition", axis=1)
# Set y to koi_dis
y = df["koi_disposition"]

In [9]:
# Set features. This will also be used as your x values.
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X,y)

# Random Forests in sklearn will automatically calculate feature importance
# in this case it would be the petal 
importances = rf.feature_importances_

# We can sort the features by their importance
sorted(zip(rf.feature_importances_, X.columns), reverse=True)

[(0.11244928956525081, 'koi_fpflag_co'),
 (0.09539988189020576, 'koi_fpflag_nt'),
 (0.0624487180313217, 'koi_model_snr'),
 (0.06201620743395646, 'koi_fpflag_ss'),
 (0.054746112249074386, 'koi_prad'),
 (0.03722462340015902, 'koi_prad_err1'),
 (0.036351055491291785, 'koi_fpflag_ec'),
 (0.03422835698355081, 'koi_duration_err1'),
 (0.033693823154724585, 'koi_duration_err2'),
 (0.03262291367585332, 'koi_prad_err2'),
 (0.03244824889516049, 'koi_steff_err1'),
 (0.02691942064734593, 'koi_steff_err2'),
 (0.022855778588003264, 'koi_duration'),
 (0.021696377586824366, 'koi_time0bk_err2'),
 (0.01969202667773827, 'koi_period'),
 (0.019323746818660135, 'koi_depth'),
 (0.018538934912990246, 'koi_time0bk_err1'),
 (0.018511582679384294, 'koi_insol_err1'),
 (0.01769132704767722, 'koi_impact'),
 (0.016616613837322526, 'koi_period_err2'),
 (0.01578870445841781, 'koi_period_err1'),
 (0.015715560561119887, 'koi_teq'),
 (0.015709714183829115, 'koi_insol'),
 (0.014046270104407883, 'koi_insol_err2'),
 (0.01359

In [10]:
# chosing the top important features
X = df[['koi_fpflag_co', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_model_snr', 'koi_prad']]

## Create a Train Test Split

Use `koi_disposition` for the y values

In [11]:
# split the data up between training and testing data (3/4:1/4)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
X_train.head()

Unnamed: 0,koi_fpflag_co,koi_fpflag_nt,koi_fpflag_ss,koi_model_snr,koi_prad
6122,0,0,0,10.8,1.24
6370,0,0,1,13.8,0.86
2879,0,1,0,254.3,3.21
107,0,0,0,38.4,2.25
29,0,0,0,696.5,12.21


## Pre-Processing

In [13]:
# Scale your data
# do not scale Y because it is categorical 
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


## Train the Model & Tune Hyperparameters



### Logistic Regression Model

Logistic Regression is used when the dependent variable(target) is categorical.

In [14]:
# Create a Logistic Regression Model
from sklearn.linear_model import LogisticRegression
logistic_classifier = LogisticRegression(max_iter = 1000)
logistic_classifier

LogisticRegression(max_iter=1000)

In [15]:
# Set up hyperparameter grid 
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'penalty': ('l2', 'none')}

# Set up the grid search
logistic = GridSearchCV(logistic_classifier, param_grid, verbose=3)

In [16]:
# Fit the model using the grid search estimator. 
logistic.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] C=1, penalty=l2 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..................... C=1, penalty=l2, score=0.785, total=   1.1s
[CV] C=1, penalty=l2 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV] ..................... C=1, penalty=l2, score=0.776, total=   1.1s
[CV] C=1, penalty=l2 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.2s remaining:    0.0s


[CV] ..................... C=1, penalty=l2, score=0.761, total=   0.9s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.778, total=   0.9s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.774, total=   0.3s
[CV] C=1, penalty=none ...............................................
[CV] ................... C=1, penalty=none, score=0.788, total=   0.5s
[CV] C=1, penalty=none ...............................................
[CV] ................... C=1, penalty=none, score=0.777, total=   0.5s
[CV] C=1, penalty=none ...............................................
[CV] ................... C=1, penalty=none, score=0.763, total=   0.5s
[CV] C=1, penalty=none ...............................................
[CV] ................... C=1, penalty=none, score=0.781, total=   1.0s
[CV] C=1, penalty=none ...............................................
[CV] .

  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] ................... C=5, penalty=none, score=0.788, total=   0.9s
[CV] C=5, penalty=none ...............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] ................... C=5, penalty=none, score=0.777, total=   0.5s
[CV] C=5, penalty=none ...............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] ................... C=5, penalty=none, score=0.763, total=   0.5s
[CV] C=5, penalty=none ...............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] ................... C=5, penalty=none, score=0.781, total=   1.1s
[CV] C=5, penalty=none ...............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] ................... C=5, penalty=none, score=0.775, total=   0.9s
[CV] C=10, penalty=l2 ................................................
[CV] .................... C=10, penalty=l2, score=0.789, total=   1.1s
[CV] C=10, penalty=l2 ................................................
[CV] .................... C=10, penalty=l2, score=0.776, total=   0.4s
[CV] C=10, penalty=l2 ................................................
[CV] .................... C=10, penalty=l2, score=0.764, total=   0.7s
[CV] C=10, penalty=l2 ................................................
[CV] .................... C=10, penalty=l2, score=0.778, total=   0.5s
[CV] C=10, penalty=l2 ................................................
[CV] .................... C=10, penalty=l2, score=0.775, total=   0.3s
[CV] C=10, penalty=none ..............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] .................. C=10, penalty=none, score=0.788, total=   0.4s
[CV] C=10, penalty=none ..............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] .................. C=10, penalty=none, score=0.777, total=   0.5s
[CV] C=10, penalty=none ..............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] .................. C=10, penalty=none, score=0.763, total=   0.6s
[CV] C=10, penalty=none ..............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] .................. C=10, penalty=none, score=0.781, total=   1.3s
[CV] C=10, penalty=none ..............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] .................. C=10, penalty=none, score=0.775, total=   0.5s
[CV] C=50, penalty=l2 ................................................
[CV] .................... C=50, penalty=l2, score=0.786, total=   1.5s
[CV] C=50, penalty=l2 ................................................
[CV] .................... C=50, penalty=l2, score=0.776, total=   0.4s
[CV] C=50, penalty=l2 ................................................
[CV] .................... C=50, penalty=l2, score=0.764, total=   0.8s
[CV] C=50, penalty=l2 ................................................
[CV] .................... C=50, penalty=l2, score=0.778, total=   0.9s
[CV] C=50, penalty=l2 ................................................
[CV] .................... C=50, penalty=l2, score=0.774, total=   0.4s
[CV] C=50, penalty=none ..............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] .................. C=50, penalty=none, score=0.788, total=   0.6s
[CV] C=50, penalty=none ..............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] .................. C=50, penalty=none, score=0.777, total=   0.5s
[CV] C=50, penalty=none ..............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] .................. C=50, penalty=none, score=0.763, total=   0.9s
[CV] C=50, penalty=none ..............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] .................. C=50, penalty=none, score=0.781, total=   1.1s
[CV] C=50, penalty=none ..............................................


  "Setting penalty='none' will ignore the C and l1_ratio "


[CV] .................. C=50, penalty=none, score=0.775, total=   0.5s


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   28.4s finished


GridSearchCV(estimator=LogisticRegression(max_iter=1000),
             param_grid={'C': [1, 5, 10, 50], 'penalty': ('l2', 'none')},
             verbose=3)

In [17]:
# List the best parameters for this dataset
print(logistic.best_params_)

# List the best score
print(logistic.best_score_)

{'C': 1, 'penalty': 'none'}
0.7766549749306865


In [18]:
# Score the model
logistic_model = logistic.best_estimator_
print(f"Logistic Model Training Data Score: {logistic_model.score(X_train_scaled, y_train)}")
print(f"Logistic Model Testing Data Score: {logistic_model.score(X_test_scaled, y_test)}")

Logistic Model Training Data Score: 0.7379362960137326
Logistic Model Testing Data Score: 0.7437070938215103


### Save the Model

In [19]:
import joblib
filename = 'logistic_model.sav'
joblib.dump(logistic_model, filename)

['logistic_model.sav']

### SVM Model

"In the SVM algorithm, we plot each data item as a point in n-dimensional space (where n is number of features you have) with the value of each feature being the value of a particular coordinate. Then, we perform classification by finding the hyper-plane that differentiates the two classes very well." - www.analyticsvidhya.com

In [20]:
from sklearn.svm import SVC

# creates SVC model
svm_model = SVC(kernel='linear')
svm_model

SVC(kernel='linear')

In [21]:
# sets up hyperparameter grid
param_grid = {'C': [1, 5, 10, 50], 
             'gamma': [1,5,10,50,100]}

# grid search
svm_grid = GridSearchCV(svm_model, param_grid, verbose=3)

In [22]:
# fits the model
svm_grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] C=1, gamma=1 ....................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........................ C=1, gamma=1, score=0.742, total=   0.3s
[CV] C=1, gamma=1 ....................................................
[CV] ........................ C=1, gamma=1, score=0.733, total=   0.2s
[CV] C=1, gamma=1 ....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] ........................ C=1, gamma=1, score=0.743, total=   0.2s
[CV] C=1, gamma=1 ....................................................
[CV] ........................ C=1, gamma=1, score=0.735, total=   0.1s
[CV] C=1, gamma=1 ....................................................
[CV] ........................ C=1, gamma=1, score=0.738, total=   0.2s
[CV] C=1, gamma=5 ....................................................
[CV] ........................ C=1, gamma=5, score=0.742, total=   0.1s
[CV] C=1, gamma=5 ....................................................
[CV] ........................ C=1, gamma=5, score=0.733, total=   0.4s
[CV] C=1, gamma=5 ....................................................
[CV] ........................ C=1, gamma=5, score=0.743, total=   0.3s
[CV] C=1, gamma=5 ....................................................
[CV] ........................ C=1, gamma=5, score=0.735, total=   0.1s
[CV] C=1, gamma=5 ....................................................
[CV] .

[CV] ...................... C=10, gamma=10, score=0.735, total=   0.1s
[CV] C=10, gamma=10 ..................................................
[CV] ...................... C=10, gamma=10, score=0.765, total=   0.1s
[CV] C=10, gamma=10 ..................................................
[CV] ...................... C=10, gamma=10, score=0.731, total=   0.1s
[CV] C=10, gamma=10 ..................................................
[CV] ...................... C=10, gamma=10, score=0.740, total=   0.1s
[CV] C=10, gamma=50 ..................................................
[CV] ...................... C=10, gamma=50, score=0.741, total=   0.5s
[CV] C=10, gamma=50 ..................................................
[CV] ...................... C=10, gamma=50, score=0.735, total=   0.6s
[CV] C=10, gamma=50 ..................................................
[CV] ...................... C=10, gamma=50, score=0.765, total=   0.5s
[CV] C=10, gamma=50 ..................................................
[CV] .

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   24.4s finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50], 'gamma': [1, 5, 10, 50, 100]},
             verbose=3)

In [23]:
# best parameters for the grid search
print(svm_grid.best_params_)
print(svm_grid.best_score_)

{'C': 5, 'gamma': 1}
0.7547176882381621


In [24]:
# SVM Score
svm_model = svm_grid.best_estimator_
print(f"SVM Training Data Score: {svm_model.score(X_train_scaled, y_train)}")
print(f"SVM Testing Data Score: {svm_model.score(X_test_scaled, y_test)}")

SVM Training Data Score: 0.743467480450124
SVM Testing Data Score: 0.7145308924485125


### Random Forest Model

"Random forest, like its name implies, consists of a large number of individual decision trees that operate as an ensemble. Each individual tree in the random forest spits out a class prediction and the class with the most votes becomes our model’s prediction." - TDS

In [25]:
# set up random forest model
rf_model = RandomForestClassifier()
rf_model

RandomForestClassifier()

In [26]:
# sets up hyperparameter grid
param_grid = {'max_depth': [1, 5, 50], 
             "n_estimators": [250, 500, 1000, 1500],
             "min_samples_leaf": [1, 2, 5, 10]}

# sets up the random forest grid search
rf_grid = GridSearchCV(rf_model, param_grid, verbose=3, cv=3)

In [27]:
# fits the random forest
rf_grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] max_depth=1, min_samples_leaf=1, n_estimators=250 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=1, min_samples_leaf=1, n_estimators=250, score=0.505, total=   0.8s
[CV] max_depth=1, min_samples_leaf=1, n_estimators=250 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV]  max_depth=1, min_samples_leaf=1, n_estimators=250, score=0.506, total=   0.7s
[CV] max_depth=1, min_samples_leaf=1, n_estimators=250 ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.5s remaining:    0.0s


[CV]  max_depth=1, min_samples_leaf=1, n_estimators=250, score=0.506, total=   0.7s
[CV] max_depth=1, min_samples_leaf=1, n_estimators=500 ...............
[CV]  max_depth=1, min_samples_leaf=1, n_estimators=500, score=0.505, total=   1.4s
[CV] max_depth=1, min_samples_leaf=1, n_estimators=500 ...............
[CV]  max_depth=1, min_samples_leaf=1, n_estimators=500, score=0.506, total=   1.4s
[CV] max_depth=1, min_samples_leaf=1, n_estimators=500 ...............
[CV]  max_depth=1, min_samples_leaf=1, n_estimators=500, score=0.506, total=   1.3s
[CV] max_depth=1, min_samples_leaf=1, n_estimators=1000 ..............
[CV]  max_depth=1, min_samples_leaf=1, n_estimators=1000, score=0.505, total=   2.6s
[CV] max_depth=1, min_samples_leaf=1, n_estimators=1000 ..............
[CV]  max_depth=1, min_samples_leaf=1, n_estimators=1000, score=0.506, total=   2.5s
[CV] max_depth=1, min_samples_leaf=1, n_estimators=1000 ..............
[CV]  max_depth=1, min_samples_leaf=1, n_estimators=1000, score=0.50

[CV]  max_depth=5, min_samples_leaf=1, n_estimators=1000, score=0.870, total=   6.4s
[CV] max_depth=5, min_samples_leaf=1, n_estimators=1000 ..............
[CV]  max_depth=5, min_samples_leaf=1, n_estimators=1000, score=0.856, total=   3.8s
[CV] max_depth=5, min_samples_leaf=1, n_estimators=1500 ..............
[CV]  max_depth=5, min_samples_leaf=1, n_estimators=1500, score=0.887, total=   6.0s
[CV] max_depth=5, min_samples_leaf=1, n_estimators=1500 ..............
[CV]  max_depth=5, min_samples_leaf=1, n_estimators=1500, score=0.870, total=   5.8s
[CV] max_depth=5, min_samples_leaf=1, n_estimators=1500 ..............
[CV]  max_depth=5, min_samples_leaf=1, n_estimators=1500, score=0.856, total=   5.6s
[CV] max_depth=5, min_samples_leaf=2, n_estimators=250 ...............
[CV]  max_depth=5, min_samples_leaf=2, n_estimators=250, score=0.886, total=   0.9s
[CV] max_depth=5, min_samples_leaf=2, n_estimators=250 ...............
[CV]  max_depth=5, min_samples_leaf=2, n_estimators=250, score=0.

[CV]  max_depth=50, min_samples_leaf=2, n_estimators=250, score=0.876, total=   1.3s
[CV] max_depth=50, min_samples_leaf=2, n_estimators=250 ..............
[CV]  max_depth=50, min_samples_leaf=2, n_estimators=250, score=0.864, total=   1.1s
[CV] max_depth=50, min_samples_leaf=2, n_estimators=250 ..............
[CV]  max_depth=50, min_samples_leaf=2, n_estimators=250, score=0.851, total=   1.1s
[CV] max_depth=50, min_samples_leaf=2, n_estimators=500 ..............
[CV]  max_depth=50, min_samples_leaf=2, n_estimators=500, score=0.874, total=   6.8s
[CV] max_depth=50, min_samples_leaf=2, n_estimators=500 ..............
[CV]  max_depth=50, min_samples_leaf=2, n_estimators=500, score=0.863, total=   2.9s
[CV] max_depth=50, min_samples_leaf=2, n_estimators=500 ..............
[CV]  max_depth=50, min_samples_leaf=2, n_estimators=500, score=0.852, total=   2.6s
[CV] max_depth=50, min_samples_leaf=2, n_estimators=1000 .............
[CV]  max_depth=50, min_samples_leaf=2, n_estimators=1000, score

[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed:  9.0min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [1, 5, 50],
                         'min_samples_leaf': [1, 2, 5, 10],
                         'n_estimators': [250, 500, 1000, 1500]},
             verbose=3)

In [28]:
# prints out the best parameters 
print(rf_grid.best_params_)
print(rf_grid.best_score_)

{'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 1500}
0.8712540665768101


In [29]:
# Random Forest Model Scoring
rf_model = rf_grid.best_estimator_
print(f"RF Training Data Score: {rf_model.score(X_train_scaled, y_train)}")
print(f"RF Testing Data Score: {rf_model.score(X_test_scaled, y_test)}")

RF Training Data Score: 0.8743086019454511
RF Testing Data Score: 0.8787185354691075


### KNN

"The KNN algorithm assumes that similar things exist in close proximity. In other words, similar things are near to each other." - TDS

In [30]:
# import the model
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

In [31]:
# creates k nearest neighbor model
knn_model = KNeighborsClassifier(n_neighbors=5)

In [32]:
# sets up hyperparameter grid
param_grid = {"leaf_size": [1, 10, 100, 200],
                "n_neighbors": list(range(1, 20, 2))}

# sets up the grid search
knn_grid = GridSearchCV(knn_model, param_grid, verbose=3, cv=3)

In [33]:
# fits the k nearest neighbor/grid search model to X and y
knn_grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits
[CV] leaf_size=1, n_neighbors=1 ......................................
[CV] .......... leaf_size=1, n_neighbors=1, score=0.808, total=   0.1s
[CV] leaf_size=1, n_neighbors=1 ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] .......... leaf_size=1, n_neighbors=1, score=0.808, total=   0.2s
[CV] leaf_size=1, n_neighbors=1 ......................................
[CV] .......... leaf_size=1, n_neighbors=1, score=0.811, total=   0.1s
[CV] leaf_size=1, n_neighbors=3 ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] .......... leaf_size=1, n_neighbors=3, score=0.845, total=   0.2s
[CV] leaf_size=1, n_neighbors=3 ......................................
[CV] .......... leaf_size=1, n_neighbors=3, score=0.835, total=   0.1s
[CV] leaf_size=1, n_neighbors=3 ......................................
[CV] .......... leaf_size=1, n_neighbors=3, score=0.835, total=   0.2s
[CV] leaf_size=1, n_neighbors=5 ......................................
[CV] .......... leaf_size=1, n_neighbors=5, score=0.859, total=   0.1s
[CV] leaf_size=1, n_neighbors=5 ......................................
[CV] .......... leaf_size=1, n_neighbors=5, score=0.848, total=   0.1s
[CV] leaf_size=1, n_neighbors=5 ......................................
[CV] .......... leaf_size=1, n_neighbors=5, score=0.843, total=   0.2s
[CV] leaf_size=1, n_neighbors=7 ......................................
[CV] .......... leaf_size=1, n_neighbors=7, score=0.856, total=   0.2s
[CV] leaf_size=1, n_neighbors=7 ......................................
[CV] .

[CV] ........ leaf_size=100, n_neighbors=3, score=0.835, total=   0.1s
[CV] leaf_size=100, n_neighbors=3 ....................................
[CV] ........ leaf_size=100, n_neighbors=3, score=0.835, total=   0.1s
[CV] leaf_size=100, n_neighbors=5 ....................................
[CV] ........ leaf_size=100, n_neighbors=5, score=0.858, total=   0.1s
[CV] leaf_size=100, n_neighbors=5 ....................................
[CV] ........ leaf_size=100, n_neighbors=5, score=0.848, total=   0.1s
[CV] leaf_size=100, n_neighbors=5 ....................................
[CV] ........ leaf_size=100, n_neighbors=5, score=0.843, total=   0.1s
[CV] leaf_size=100, n_neighbors=7 ....................................
[CV] ........ leaf_size=100, n_neighbors=7, score=0.856, total=   0.1s
[CV] leaf_size=100, n_neighbors=7 ....................................
[CV] ........ leaf_size=100, n_neighbors=7, score=0.852, total=   0.1s
[CV] leaf_size=100, n_neighbors=7 ....................................
[CV] .

[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   14.1s finished


GridSearchCV(cv=3, estimator=KNeighborsClassifier(),
             param_grid={'leaf_size': [1, 10, 100, 200],
                         'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]},
             verbose=3)

In [34]:
# prints out the best parameters 
print(knn_grid.best_params_)
print(knn_grid.best_score_)

{'leaf_size': 1, 'n_neighbors': 19}
0.8588565469321496


In [35]:
# K Nearest Neighbor Model Scoring
knn_model = knn_grid.best_estimator_
print(f"KNN Training Data Score: {knn_model.score(X_train_scaled, y_train)}")
print(f"KNN Testing Data Score: {knn_model.score(X_test_scaled, y_test)}")

KNN Training Data Score: 0.8695403395002861
KNN Testing Data Score: 0.8655606407322655


## Summary of Models

In [36]:
print(f"Logistic Model Training Data Score: {logistic_model.score(X_train_scaled, y_train)}")
print(f"Logistic Model Testing Data Score: {logistic_model.score(X_test_scaled, y_test)}")
print("---------------------------------------------------------------")

print(f"SVM Training Data Score: {svm_model.score(X_train_scaled, y_train)}")
print(f"SVM Testing Data Score: {svm_model.score(X_test_scaled, y_test)}")

print("---------------------------------------------------------------")
print(f"RF Training Data Score: {rf_model.score(X_train_scaled, y_train)}")
print(f"RF Testing Data Score: {rf_model.score(X_test_scaled, y_test)}")

print("---------------------------------------------------------------")
print(f"KNN Training Data Score: {knn_model.score(X_train_scaled, y_train)}")
print(f"KNN Testing Data Score: {knn_model.score(X_test_scaled, y_test)}")


Logistic Model Training Data Score: 0.7379362960137326
Logistic Model Testing Data Score: 0.7437070938215103
---------------------------------------------------------------
SVM Training Data Score: 0.743467480450124
SVM Testing Data Score: 0.7145308924485125
---------------------------------------------------------------
RF Training Data Score: 0.8743086019454511
RF Testing Data Score: 0.8787185354691075
---------------------------------------------------------------
KNN Training Data Score: 0.8695403395002861
KNN Testing Data Score: 0.8655606407322655
