In [1]:
# Import dependencies
import pandas as pd
import numpy as np

In [2]:
# Read the prepared feature and target tables from disk into DataFrames.
# Both paths are relative, so they resolve against the current working directory.
feature_csv, target_csv = "feature_df.csv", "target_df.csv"

feature_csv_df = pd.read_csv(filepath_or_buffer=feature_csv)
target_csv_df = pd.read_csv(filepath_or_buffer=target_csv)

In [3]:
# Preview the first rows of the feature DataFrame (head() shows five by default)
# to confirm the expected columns were loaded.
feature_csv_df.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_steff_err1,koi_model_snr
0,0,0,0,81,25.8
1,0,1,0,158,76.3
2,0,1,0,157,505.6
3,0,0,0,169,40.9
4,0,0,0,189,40.2


In [4]:
# Report the feature DataFrame's dimensions as (rows, columns).
feature_csv_df.shape

(6991, 5)

In [5]:
# Preview the first rows of the target DataFrame (head() shows five by default)
# to confirm the disposition labels were loaded.
target_csv_df.head()

Unnamed: 0,koi_disposition
0,CONFIRMED
1,FALSE POSITIVE
2,FALSE POSITIVE
3,CONFIRMED
4,CONFIRMED


In [6]:
# Report the target DataFrame's dimensions as (rows, columns); the row count
# should match the feature DataFrame's.
target_csv_df.shape

(6991, 1)

# SPLIT, NORMALIZE, AND ENCODE THE DATA

In [7]:
# Assign the features and target to X and y, then check the assignments.
raw_feature_data = feature_csv_df.values
raw_target_data = target_csv_df.values

# Every column of the feature frame is a feature, so take the array whole.
# The previous hard-coded `0:6` slice overshot the five columns actually
# present and only worked because NumPy clamps out-of-range slice ends.
X = raw_feature_data
# The target frame holds a single column; index it to get a 1-D label vector.
y = raw_target_data[:, 0]

# Guard against feature/target files that have drifted out of sync.
assert X.shape[0] == y.shape[0], "feature and target row counts differ"

print(X, y)

[[  0.    0.    0.   81.   25.8]
 [  0.    1.    0.  158.   76.3]
 [  0.    1.    0.  157.  505.6]
 ...
 [  0.    0.    0.  165.   10.6]
 [  0.    0.    1.  193.   12.3]
 [  0.    0.    1.  158.    8.2]] ['CONFIRMED' 'FALSE POSITIVE' 'FALSE POSITIVE' ... 'CANDIDATE'
 'FALSE POSITIVE' 'FALSE POSITIVE']


In [8]:
from sklearn.model_selection import train_test_split

# Partition the features and labels into training and held-out test sets,
# using scikit-learn's default split proportions.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    random_state=42,  # fixed seed so the shuffle (and the split) is reproducible
)

In [9]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Rescale each feature with a MinMaxScaler (default settings). The scaler is
# fitted on the training split only, and that same fitted transform is then
# applied to the test split, so no test-set statistics feed into preprocessing.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Map the string disposition labels to integer class codes for the classifier.
# The encoder learns its classes from the training labels only; the test
# labels are then encoded with that same mapping.
label_encoder = LabelEncoder().fit(y_train)

encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Display the encoded training labels as a sanity check.
encoded_y_train

array([0, 2, 2, ..., 2, 2, 2])

# HYPERPARAMETER TUNING

In [11]:
# Import the modules to build my model
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

In [12]:
# Instantiate a Gaussian Naive Bayes classifier with its default settings and
# list its parameter names to see what is available for tuning.
model = GaussianNB()

model.get_params().keys()

dict_keys(['priors', 'var_smoothing'])

In [13]:
# Fit the classifier on the scaled training features and the encoded training labels.
model.fit(X_train_scaled, encoded_y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [14]:
# Report the model's score on the training split versus the held-out test
# split; a large gap between the two would point to overfitting.
print(
    f"Training Data Score: {model.score(X_train_scaled, encoded_y_train)}",
    f"Testing Data Score: {model.score(X_test_scaled, encoded_y_test)}",
    sep="\n",
)

Training Data Score: 0.7337402250619874
Testing Data Score: 0.7059496567505721


# SAVE THE MODEL

In [15]:
# Persist the fitted model to disk for later reuse. Note that the model was
# fitted with its default hyperparameters; no parameter search was run on it.
# joblib files are pickle-based, so only load them back from a trusted source.
import joblib

filename = 'Howard_Mitchell_Gaussian_NB.sav'
joblib.dump(value=model, filename=filename)

['Howard_Mitchell_Gaussian_NB.sav']