In [1]:
# # Update scikit-learn to prevent version mismatches
# !pip install scikit-learn --upgrade

In [2]:
# # install joblib. This will be used to save your model. 
# # Restart your kernel after installing 
# !pip install joblib

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## Read the CSV and Perform Basic Data Cleaning

In [None]:
# Load the raw table, discard columns that hold no data at all, and then
# discard every row that still contains at least one missing value.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
# Preview the first rows of the cleaned frame.
df.head()

In [None]:
# Show pandas' summary statistics for the cleaned frame `df`.
df.describe()

## Select features (columns)

In [None]:
# Split the cleaned frame into the feature columns and the label column.
data = df.drop(columns="koi_disposition")
target = df["koi_disposition"]
# Keep the feature column labels; they are paired with the model's feature
# importances further down.
feature_names = data.columns
# Preview the feature frame (everything except `koi_disposition`).
data.head()

## Create a Train Test Split

Use `koi_disposition` for the y values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test split with a fixed seed so the split is reproducible.
# `stratify=target` keeps the proportion of each `koi_disposition` class the
# same in the train and test splits, so the test metrics are not skewed by an
# unlucky class mix.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, random_state=42, stratify=target
)

In [None]:
# Preview the first rows of the (unscaled) training features.
X_train.head()

## Pre-processing

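Scale the data using the `MinMaxScaler`. The scaler is fit on the training split only and then applied to both splits. (No feature selection is performed in this notebook; all feature columns are kept.)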

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling from the training split only and scale that split in the
# same step; the fitted scaler stays available as `X_minmax`.
X_minmax = MinMaxScaler()
X_train_minmax = X_minmax.fit_transform(X_train)

# Apply the training-split scaling to the held-out split.
X_test_minmax = X_minmax.transform(X_test)

## Train the Model (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyperparameters.
# A fixed `random_state` makes the forest (and therefore the scores, feature
# importances and grid search that build on `rf`) reproducible across
# Restart & Run All, matching the seeded train/test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_minmax, y_train)

In [None]:
# Report the baseline forest's score on the scaled training and test splits,
# one line each.
print(
    f"Training Data Score: {rf.score(X_train_minmax, y_train)}",
    f"Testing Data Score: {rf.score(X_test_minmax, y_test)}",
    sep="\n",
)

In [None]:
# Pair each importance value from the fitted forest with its feature name and
# list the pairs from the largest importance to the smallest.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

## Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# Candidate values to try for each random-forest hyperparameter.
param_grid = {
    'n_estimators': [250, 300, 350],
    'max_depth': [125, 150, 175],
}

# Wrap the baseline forest `rf` in a grid search; verbose=3 requests detailed
# progress messages while the search is fitting.
grid = GridSearchCV(estimator=rf, param_grid=param_grid, verbose=3)

In [None]:
# Train the model with GridSearch: every combination in `param_grid` is
# evaluated on the scaled training split. This is the slowest cell of the
# notebook (9 parameter combinations, each fitted several times).
grid.fit(X_train_minmax, y_train)

In [None]:
# Show the winning hyperparameter combination, then its mean
# cross-validated score, on separate lines.
print(grid.best_params_, grid.best_score_, sep="\n")

In [None]:
# Training score: score of the grid search's selected model on the scaled
# training split (the data it was fitted on).
grid.score(X_train_minmax, y_train)

In [None]:
# Testing score: score of the grid search's selected model on the scaled
# held-out split; compare with the training score above to judge overfitting.
grid.score(X_test_minmax, y_test)

In [None]:
# Make prediction and save to variable for report.
# Predictions come from the grid search's selected model on the scaled test split.
predictions = grid.predict(X_test_minmax)

In [None]:
# Print Classification Report.
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned model on the held-out split.
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# %matplotlib notebook
import os

from yellowbrick.classifier import ClassificationReport

# The figure is written into "Output/"; create the folder if it is missing so
# the save does not fail on a fresh checkout.
os.makedirs("Output", exist_ok=True)

# Visualise the tuned model selected by the grid search (not a fresh,
# default-parameter forest), so the saved figure matches the classification
# report printed above.
# NOTE(review): yellowbrick's default `is_fitted='auto'` is expected to skip
# re-fitting an already-fitted estimator in `viz.fit` — confirm for the
# installed yellowbrick version.
viz = ClassificationReport(grid.best_estimator_)
viz.fit(X_train_minmax, y_train)
viz.score(X_test_minmax, y_test)
viz.finalize()
viz.show(outpath="Output/random_forest_classification_report.png")

In [None]:
import os

from yellowbrick.model_selection import FeatureImportances
from yellowbrick.style import set_palette
from yellowbrick.features import RadViz

# The figure is written into "Output/"; create the folder if it is missing so
# the save does not fail on a fresh checkout.
os.makedirs("Output", exist_ok=True)

set_palette('yellowbrick')
# `X_train_minmax` is a NumPy array (output of the scaler), so it carries no
# column names. Pass the original column names explicitly so the bars are
# labelled with feature names instead of positional indices.
viz = FeatureImportances(rf, labels=list(feature_names), size=(500, 500))
viz.fit(X_train_minmax, y_train)
viz.show(outpath="Output/feature_importance.png")

## Save the Model

In [None]:
import os

import joblib

filename = 'Models/exoplanet_exploration_random_Forest.sav'

# Create the target folder if needed so the dump does not fail on a fresh
# checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Persist the tuned forest chosen by the grid search (refit on the full
# training split), not the untuned baseline `rf`.
# The model was trained on MinMax-scaled features; the fitted scaler
# (`X_minmax`) is NOT stored in this file, so new inputs must be scaled the
# same way before calling `predict`.
joblib.dump(grid.best_estimator_, filename)