In [3]:
# Import dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from xgboost.sklearn import XGBClassifier  
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# File names of the prepared feature table and target table
# (relative paths, resolved against the notebook's working directory)
feature_csv, target_csv = "feature_df.csv", "target_df.csv"

# Read both CSV files into DataFrames: features first, then the target
feature_csv_df, target_csv_df = map(pd.read_csv, (feature_csv, target_csv))

In [5]:
# Preview the first rows of the feature table to confirm it loaded as expected
feature_csv_df.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_steff_err1,koi_model_snr
0,0,0,0,81,25.8
1,0,1,0,158,76.3
2,0,1,0,157,505.6
3,0,0,0,169,40.9
4,0,0,0,189,40.2


In [6]:
# Check the dimensions (rows, columns) of the feature table
feature_csv_df.shape

(6991, 5)

In [7]:
# Preview the first rows of the target table to confirm it loaded as expected
target_csv_df.head()

Unnamed: 0,koi_disposition
0,CONFIRMED
1,FALSE POSITIVE
2,FALSE POSITIVE
3,CONFIRMED
4,CONFIRMED


In [8]:
# Check the dimensions (rows, columns) of the target table; the row count
# should match the feature table since the two are paired by row position
target_csv_df.shape

(6991, 1)

# Split, normalize, and encode the data

In [9]:
# Assign the features and target to X and y and check assignments
raw_feature_data = feature_csv_df.values
raw_target_data = target_csv_df.values

# Use every feature column. A hard-coded `0:6` bound names more columns than the
# 5-column feature table has; NumPy clamps out-of-range slice bounds silently,
# which hides the mismatch, so take all columns explicitly instead.
X = raw_feature_data[:, :]
# The target table has a single column; take it as a 1-D array of labels
y = raw_target_data[:, 0]

# Features and labels are paired purely by row position, so fail loudly if the
# two files ever get out of sync
assert X.shape[0] == y.shape[0], "feature and target tables have different row counts"

print(X, y)

[[  0.    0.    0.   81.   25.8]
 [  0.    1.    0.  158.   76.3]
 [  0.    1.    0.  157.  505.6]
 ...
 [  0.    0.    0.  165.   10.6]
 [  0.    0.    1.  193.   12.3]
 [  0.    0.    1.  158.    8.2]] ['CONFIRMED' 'FALSE POSITIVE' 'FALSE POSITIVE' ... 'CANDIDATE'
 'FALSE POSITIVE' 'FALSE POSITIVE']


In [10]:
# Create the train and test sets for the features and target.
# random_state is fixed so the same rows land in each split on every run;
# no test_size is given, so the library default split ratio is used.
# NOTE(review): the split is not stratified - if the disposition classes are
# imbalanced, consider passing stratify=y (this would change downstream scores).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
# Rescale the features with a MinMax scaler. The scaler is fitted on the
# training split only, so the test split has no influence on the scaling,
# and the same fitted transform is then applied to both splits.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = X_scaler.transform(X_train), X_scaler.transform(X_test)

In [12]:
# Integer-encode the class labels so they can be fed to the model.
# The encoder is fitted on the training labels only, and the same mapping is
# then applied to both the train and test labels.
label_encoder = LabelEncoder().fit(y_train)

encoded_y_train, encoded_y_test = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)

# Display the encoded training labels as a quick check
encoded_y_train

array([0, 2, 2, ..., 2, 2, 2])

In [13]:
# Convert encoded targets to one-hot-encoding and check it.
# The class count is passed explicitly: when num_classes is omitted,
# to_categorical infers the width from the largest label present in the array
# it is given, so a split that happens to lack the highest-numbered class would
# produce a narrower matrix than the other split.
y_train_categorical = to_categorical(encoded_y_train, num_classes=len(label_encoder.classes_))
y_test_categorical = to_categorical(encoded_y_test, num_classes=len(label_encoder.classes_))

y_train_categorical

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [14]:
# Import the grid search and k-nearest-neighbours classifier used to build and tune the model
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# HYPERPARAMETER TUNING

In [36]:
# Create a k-nearest-neighbours classifier with default settings and list its
# parameter names so I can see what can be tuned
knn = KNeighborsClassifier()

knn.get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [37]:
# Create param grid
param_grid = {'n_neighbors': [x for x in range(1, 102, 2)],
               'metric': ['minkowski','euclidean','manhattan'],
               'weights': ['uniform','distance']}

# Assign the optimizer grid search parameters, fit, and run the model
grid = GridSearchCV(estimator=knn, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train_scaled, y_train_categorical)



In [38]:
# Print the score and best params for the KNN model
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.840931 using {'metric': 'minkowski', 'n_neighbors': 21, 'weights': 'uniform'}


In [39]:
# Print the score for teh traning vs. test data
print(f"Training Data Score: {grid_result.score(X_train_scaled, y_train_categorical)}")
print(f"Testing Data Score: {grid_result.score(X_test_scaled, y_test_categorical)}")

Training Data Score: 0.8502765592218196
Testing Data Score: 0.8323798627002288


# SAVE THE MODEL

In [40]:
# Save the tuned model for future use.
# grid_result is the object returned by grid.fit, so the whole grid search is
# written to disk, not only the best estimator.
# NOTE(review): X_scaler and label_encoder are not saved here; new data must be
# scaled with the same fitted scaler before predicting - confirm they are
# persisted elsewhere or save them alongside the model.
# joblib files are pickle-based: only load this file from a trusted source.
import joblib
filename = 'Howard_Mitchell_KNN.sav'
joblib.dump(grid_result, filename)

['Howard_Mitchell_KNN.sav']