## Model 1: Random Forest / Decision Tree

In [None]:
# Update scikit-learn (the PyPI package is "scikit-learn"; "sklearn" is a deprecated alias)
#!pip install scikit-learn --upgrade
# Install joblib
#!pip install joblib

In [None]:
# Import dependencies
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Read the exoplanet dataset from disk, then display the resulting frame
exoplanet_csv = "resources/exoplanet_data.csv"
planets_df = pd.read_csv(exoplanet_csv)
planets_df

In [None]:
# Show all column names of the loaded data
planets_df.columns

In [None]:
# Remove columns that are entirely null, then any row that still has a null
planets_df = (
    planets_df
    .dropna(axis="columns", how="all")
    .dropna(axis="index", how="any")
)
planets_df

## Select Features

In [None]:
# Target (y): the disposition column
y = planets_df["koi_disposition"]
# Features (X): every column except the target
X = planets_df.drop(columns="koi_disposition")
print(X.shape, y.shape)

In [None]:
# Feature selection: rebuild X without the target and the features judged unnecessary.
# NOTE(review): the original note said "feature importance below 0", but forest
# importances are non-negative -- confirm the cutoff that was actually intended.
columns_to_drop = [
    'koi_disposition',
    'koi_srad',
    'koi_steff',
    'koi_slogg',
    'koi_slogg_err1',
    'koi_srad_err2',
    'koi_tce_plnt_num',
]
X = planets_df.drop(columns=columns_to_drop)
print(X.shape)


## Pre-Processing

## Train, Test, Split

In [None]:
# Split features and target into training & testing sets (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# Show the training feature rows
X_train

### Min Max Scaler (x)

In [None]:
# MinMaxScaler: rescale the numerical features
# Create the scaler and learn the scaling from the training split only
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)

# Apply the fitted scaler to the train & test features
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### One Hot Encoding (y)

### Candidate: 0
### Confirmed: 1
### False Positive: 2

In [None]:
# Encode labels for y: "KOI disposition"
# Build the encoder and learn the classes from the training targets
label_encoder = LabelEncoder().fit(y_train)

# Integer-encode the train & test targets with the fitted encoder
encoded_y_train, encoded_y_test = map(label_encoder.transform, (y_train, y_test))

In [None]:
# One hot encoding of the integer-encoded train & test labels
y_train_categorical, y_test_categorical = (
    to_categorical(encoded_labels)
    for encoded_labels in (encoded_y_train, encoded_y_test)
)

In [None]:
# Inspect the one-hot encoded training labels
y_train_categorical

### Random Forest:

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random Forest Classifier.
# random_state is fixed (same seed as the train/test split) so that the scores
# and the feature importances are reproducible after a kernel restart; without
# it every run grows a different forest and reports different numbers.
rf = RandomForestClassifier(n_estimators=400, random_state=42)
# Fit the model on the scaled training features and the un-encoded labels
rf = rf.fit(X_train_scaled, y_train)
# Score the model on both splits (compare the two to gauge overfitting)
print("Training Score: ", rf.score(X_train_scaled, y_train))
print("Testing Score: ", rf.score(X_test_scaled, y_test))

In [None]:
# Rank the features by the fitted forest's importance score, highest first
feature_names = X.columns
importance_by_feature = zip(rf.feature_importances_, feature_names)
sorted(importance_by_feature, reverse=True)

In [None]:
# Disabled debugging aid, kept for reference.
# NOTE(review): `encoded_y` is not defined in the cells shown; the encoded arrays
# are `encoded_y_train` / `encoded_y_test`, so adjust the names before re-enabling.
# # Loop through to show original class & label
# for label, original_class in zip(encoded_y, y):
#     print('Original Class: ' + str(original_class))
#     print('Encoded Label: ' + str(label))
#     print('-' * 12)

## Train the Model

## Hyperparameter Tuning

In [None]:
# GridSearchCV: search over the number of trees used by the forest
from sklearn.model_selection import GridSearchCV

parameters = {
    "n_estimators": [100, 200, 300, 400, 500],
}
grid = GridSearchCV(estimator=rf, param_grid=parameters, verbose=3)

In [None]:
# Train the model with GridSearch
# Tries every n_estimators candidate under cross-validation; verbose=3 logs each fit
grid.fit(X_train_scaled, y_train)

In [None]:
# Report the winning parameter set and its cross-validated score.
# The two spaces after each colon reproduce the original print() output exactly.
print(f"Best Parameters:  {grid.best_params_!s}")
print(f"Best Score:  {grid.best_score_!s}")

In [None]:
# Predict with test data
# (the scaled test features are passed to the fitted grid search object)
predictions = grid.predict(X_test_scaled)

In [None]:
# Classification Report: per-class metrics for the test predictions
from sklearn.metrics import classification_report

class_names = ["Candidate", "Confirmed", "False Positive"]
print(classification_report(y_true=y_test, y_pred=predictions, target_names=class_names))

## Save the Model

In [None]:
import os

import joblib

filename = 'models/julia_brunett.sav'
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
os.makedirs(os.path.dirname(filename), exist_ok=True)
# Persist the fitted grid search object.
# NOTE(review): the model was trained on scaled features, but X_scaler is not
# saved in the cells shown -- anyone reusing this file needs the same scaling.
joblib.dump(grid, filename)

In [None]:
# Load the model back from the path it was saved to
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source
exoplanets_model = joblib.load("models/julia_brunett.sav")

In [None]:
# Score the model reloaded from disk on the scaled test set
exoplanets_model.score(X_test_scaled, y_test)

In [None]:
# Score the in-memory grid search object on the same test set, for comparison
grid.score(X_test_scaled, y_test)

In [None]:
# Score the original forest (n_estimators=400) on the same test set, for comparison
rf.score(X_test_scaled, y_test)