# Necessary Upgrades
Run the cells in this section to make sure you have the latest version of sklearn and joblib.

Restart your kernel after installing.

In [34]:
## Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [35]:
## install joblib. This will be used to save your model. 
# !pip install joblib
## Restart your kernel after installing 

# Import Dependencies

In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import joblib

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("data/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
# df.head()

# drop the error columns
df = df.loc[:, ~df.columns.str.contains('_err')]
df.columns

# column definitions is available at the end of this notebook.

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_time0bk', 'koi_impact',
       'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_slogg',
       'koi_srad', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')

# Select your features (columns) and y-variable.
Column definitions are available at the end of this notebook.

Use `koi_disposition` for the y values

In [3]:
y = df[['koi_disposition']]

# all the columns minus y
selected_features = df.drop(columns=["koi_disposition"])
feature_names = selected_features.columns
# selected_features.head()

# Create a Train Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(selected_features, y, random_state=42, stratify = y)
# X_train.head()

# Pre-processing

Scale the data and perform some feature selection

In [5]:
# Scale your data
scaler = QuantileTransformer(output_distribution='normal')
normalizer = Normalizer()

X_train_s = scaler.fit_transform(X_train)
X_train_sn= normalizer.fit_transform(X_train_s)

X_test_s = scaler.fit_transform(X_test)
X_test_sn = normalizer.fit_transform(X_test_s)

# Load Models

In [7]:
logistic = joblib.load("models/log.h5")

rfc = joblib.load("models/rfc.h5")

svc = joblib.load("models/svc.h5")

knn = joblib.load("models/knn.h5")

models = [logistic, rfc, svc, knn]
model_order = ["logistic", "rfc", "svc", "knn"]

# Compare Models

## Observe Model Scores
From observing the model scores, rfc (random forest classifer) with standard scaler has the best fit to the data.

In [8]:
scores = []
predictions = []

for i in range(len(model_order)):
    score = models[i].score(X_test_sn, y_test)
    scores.append(score)
    
    prediction = models[i].predict(X_test_sn)
    predictions.append(prediction)
    

pd.DataFrame({"model type": model_order, "score": scores}).reset_index(drop=True)

Unnamed: 0,model type,score
0,logistic,0.8873
1,rfc,0.899314
2,svc,0.882151
3,knn,0.892449


## Observe Classification Reports
The classification report shows the accuracy for each of the outcomes. As expected, rfc with standard scaler has the best precision.

In [9]:
for i in range(len(model_order)):
    print (f"MODEL TYPE: {model_order[i]}")
    print()
    print ("---")
    print(f"test score: {scores[i]}")
    print(classification_report(y_test, predictions[i]))
    print ("--------------------------------------------------------------------")

MODEL TYPE: logistic

---
test score: 0.8872997711670481
                precision    recall  f1-score   support

     CANDIDATE       0.82      0.72      0.76       422
     CONFIRMED       0.76      0.83      0.79       450
FALSE POSITIVE       0.99      1.00      0.99       876

      accuracy                           0.89      1748
     macro avg       0.86      0.85      0.85      1748
  weighted avg       0.89      0.89      0.89      1748

--------------------------------------------------------------------
MODEL TYPE: rfc

---
test score: 0.8993135011441648
                precision    recall  f1-score   support

     CANDIDATE       0.86      0.73      0.79       422
     CONFIRMED       0.80      0.87      0.83       450
FALSE POSITIVE       0.97      1.00      0.99       876

      accuracy                           0.90      1748
     macro avg       0.88      0.86      0.87      1748
  weighted avg       0.90      0.90      0.90      1748

--------------------------------