# Necessary Upgrades
Uncomment and run the cells in this section to make sure you have the latest versions of scikit-learn (imported as `sklearn`) and joblib.

Restart your kernel after installing.

In [34]:
## Update scikit-learn (imported as `sklearn`) to prevent version mismatches.
## The PyPI package is named `scikit-learn`; `%pip` installs into this kernel's environment.
# %pip install --upgrade scikit-learn

In [35]:
## Install joblib, which is used to save and load the fitted models.
# %pip install joblib
## Restart your kernel after installing.

# Import Dependencies

In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import joblib

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the raw table and clean it in a single chain:
#   1. drop columns in which every value is null,
#   2. drop any row that still contains a null,
#   3. drop every column whose name contains "_err".
df = (
    pd.read_csv("data/exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
    .loc[:, lambda frame: ~frame.columns.str.contains('_err')]
)

# Show the remaining column names.
# Column definitions are available at the end of this notebook.
df.columns

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_time0bk', 'koi_impact',
       'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_slogg',
       'koi_srad', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')

# Select your features (columns) and y-variable.
Column definitions are available at the end of this notebook.

Use `koi_disposition` for the y values

In [3]:
# Target: the disposition label, kept as a single-column DataFrame.
y = df.loc[:, ['koi_disposition']]

# Features: every column except the target.
selected_features = df.drop("koi_disposition", axis="columns")
feature_names = selected_features.columns

# Create a Train Test Split

In [4]:
# Split features and target into train and test sets (default split size).
# The split is stratified on the target, and the seed is fixed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    selected_features,
    y,
    stratify=y,
    random_state=42,
)

# Pre-processing

Scale the data and perform some feature selection

In [5]:
# Fit each scaler on the training split only, then apply that fitted scaler
# to both the training and the test split.
X_minmax_scaler = MinMaxScaler().fit(X_train)
X_train_minmax_scaled, X_test_minmax_scaled = (
    X_minmax_scaler.transform(split) for split in (X_train, X_test)
)

X_standard_scaler = StandardScaler().fit(X_train)
X_train_standard_scaled, X_test_standard_scaled = (
    X_standard_scaler.transform(split) for split in (X_train, X_test)
)

# Load Models

In [9]:
# Load the previously saved models, one per (model type, scaler) pair.
# NOTE(review): joblib.load unpickles its input, so only load files from a trusted source.
# The order of the two lists below must match `model_order`.
model_order = ["logistic", "rfc", "svc", "knn"]

minmax_models = [
    joblib.load(model_path)
    for model_path in (
        "models/logistic_minmax.h5",
        "models/rfc_minmax.h5",
        "models/SVC_minmax.h5",
        "models/knn_minmax.h5",
    )
]
minmax_logistic, minmax_rfc, minmax_svc, minmax_knn = minmax_models

standard_models = [
    joblib.load(model_path)
    for model_path in (
        "models/logistic_standard.h5",
        "models/rfc_standard.h5",
        "models/SVC_standard.h5",
        "models/knn_standard.h5",
    )
]
standard_logistic, standard_rfc, standard_svc, standard_knn = standard_models

# Compare Models

In [22]:
# Evaluate every loaded model on the test split transformed by its own scaler.
# Scores and raw predictions are both kept; the predictions feed the
# classification reports printed in a later cell.
minmax_scores = [
    estimator.score(X_test_minmax_scaled, y_test) for estimator in minmax_models
]
minmax_predictions = [
    estimator.predict(X_test_minmax_scaled) for estimator in minmax_models
]

standard_scores = [
    estimator.score(X_test_standard_scaled, y_test) for estimator in standard_models
]
standard_predictions = [
    estimator.predict(X_test_standard_scaled) for estimator in standard_models
]

# Side-by-side test scores, one row per model type.
pd.DataFrame(
    {
        "model type": model_order,
        "minmax scaler": minmax_scores,
        "standard scaler": standard_scores,
    }
).reset_index(drop=True)

Unnamed: 0,model type,minmax scaler,standard scaler
0,logistic,0.828375,0.834096
1,rfc,0.90389,0.905606
2,svc,0.802059,0.83238
3,knn,0.795195,0.815217


In [33]:
# For each model type, print the test score and the per-class classification
# report, first for the minmax-scaled variant and then for the standard-scaled one.
for model_name, mm_score, mm_pred, std_score, std_pred in zip(
    model_order,
    minmax_scores,
    minmax_predictions,
    standard_scores,
    standard_predictions,
):
    print(f"MODEL TYPE: {model_name}")
    print()
    for scaler_label, test_score, predicted in (
        ("minmax scaler", mm_score, mm_pred),
        ("standard scaler", std_score, std_pred),
    ):
        print("---")
        print(scaler_label)
        print("---")
        print(f"test score: {test_score}")
        print(classification_report(y_test, predicted))
    print("--------------------------------------------------------------------")

MODEL TYPE: logistic

---
minmax scaler
---
test score: 0.8283752860411899
                precision    recall  f1-score   support

     CANDIDATE       0.68      0.59      0.63       422
     CONFIRMED       0.66      0.72      0.69       450
FALSE POSITIVE       0.99      1.00      0.99       876

      accuracy                           0.83      1748
     macro avg       0.77      0.77      0.77      1748
  weighted avg       0.83      0.83      0.83      1748

---
standard scaler
---
test score: 0.834096109839817
                precision    recall  f1-score   support

     CANDIDATE       0.69      0.60      0.65       422
     CONFIRMED       0.66      0.73      0.70       450
FALSE POSITIVE       0.99      1.00      0.99       876

      accuracy                           0.83      1748
     macro avg       0.78      0.78      0.78      1748
  weighted avg       0.83      0.83      0.83      1748

--------------------------------------------------------------------
MODEL TYPE: 