# Exoplanet Machine Learning Dataset

In [None]:
# Update scikit-learn to prevent version mismatches
# (the PyPI package is named "scikit-learn"; "sklearn" is only the import name)
# %pip install --upgrade scikit-learn

In [None]:
# install joblib. 
# !pip install joblib

In [1]:
import pandas as pd

## Data Exploration and Cleaning

In [3]:
# Load the provided exoplanet dataset and preview its first five rows
df = pd.read_csv(filepath_or_buffer="data/exoplanet_data.csv")
df.head(n=5)

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [4]:
# Inspect the column dtypes: every column is numeric except koi_disposition
df.dtypes

koi_disposition       object
koi_fpflag_nt          int64
koi_fpflag_ss          int64
koi_fpflag_co          int64
koi_fpflag_ec          int64
koi_period           float64
koi_period_err1      float64
koi_period_err2      float64
koi_time0bk          float64
koi_time0bk_err1     float64
koi_time0bk_err2     float64
koi_impact           float64
koi_impact_err1      float64
koi_impact_err2      float64
koi_duration         float64
koi_duration_err1    float64
koi_duration_err2    float64
koi_depth            float64
koi_depth_err1       float64
koi_depth_err2       float64
koi_prad             float64
koi_prad_err1        float64
koi_prad_err2        float64
koi_teq                int64
koi_insol            float64
koi_insol_err1       float64
koi_insol_err2       float64
koi_model_snr        float64
koi_tce_plnt_num       int64
koi_steff              int64
koi_steff_err1         int64
koi_steff_err2         int64
koi_slogg            float64
koi_slogg_err1       float64
koi_slogg_err2

In [5]:
# Distinct class labels found in the target column
types = df.koi_disposition.unique()
types

array(['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE'], dtype=object)

In [6]:
# Count the missing values in each column
df.isna().sum()

koi_disposition      0
koi_fpflag_nt        0
koi_fpflag_ss        0
koi_fpflag_co        0
koi_fpflag_ec        0
koi_period           0
koi_period_err1      0
koi_period_err2      0
koi_time0bk          0
koi_time0bk_err1     0
koi_time0bk_err2     0
koi_impact           0
koi_impact_err1      0
koi_impact_err2      0
koi_duration         0
koi_duration_err1    0
koi_duration_err2    0
koi_depth            0
koi_depth_err1       0
koi_depth_err2       0
koi_prad             0
koi_prad_err1        0
koi_prad_err2        0
koi_teq              0
koi_insol            0
koi_insol_err1       0
koi_insol_err2       0
koi_model_snr        0
koi_tce_plnt_num     0
koi_steff            0
koi_steff_err1       0
koi_steff_err2       0
koi_slogg            0
koi_slogg_err1       0
koi_slogg_err2       0
koi_srad             0
koi_srad_err1        0
koi_srad_err2        0
ra                   0
dec                  0
koi_kepmag           0
dtype: int64

## Select your features 

In [7]:
# Target: the koi_disposition label column
y = df["koi_disposition"]
# Features: every column other than the target
X = df.drop(columns=["koi_disposition"])

In [8]:
# Rank every column by random-forest feature importance to guide feature selection.
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the importance ranking is identical on every Restart & Run All
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X, y)

# Importance score of each column, in the same order as X.columns
importances = rf.feature_importances_

# NOTE(review): this ranking is fitted on the full dataset, before the train/test
# split is made, so the held-out rows also influence which features get selected.

# Pair each score with its column name and list the most important first
sorted(zip(importances, X.columns), reverse=True)

[(0.10939353197544172, 'koi_fpflag_co'),
 (0.10008082512238897, 'koi_fpflag_nt'),
 (0.06419056023606082, 'koi_fpflag_ss'),
 (0.052684753334835854, 'koi_model_snr'),
 (0.04575429574223963, 'koi_prad'),
 (0.038166699138336466, 'koi_prad_err1'),
 (0.0380261966149015, 'koi_prad_err2'),
 (0.03743492217871771, 'koi_fpflag_ec'),
 (0.03712087881874025, 'koi_duration_err2'),
 (0.03462981362060653, 'koi_duration_err1'),
 (0.02764542599573727, 'koi_steff_err1'),
 (0.02636664854004471, 'koi_steff_err2'),
 (0.023584268300264853, 'koi_duration'),
 (0.02330919808713343, 'koi_time0bk_err2'),
 (0.021677139088098783, 'koi_time0bk_err1'),
 (0.021608510484762068, 'koi_period'),
 (0.01918744440959643, 'koi_depth'),
 (0.018344683313247706, 'koi_impact'),
 (0.018337828336907405, 'koi_insol_err1'),
 (0.01676233101128894, 'koi_period_err1'),
 (0.016102567392253152, 'koi_period_err2'),
 (0.015387993859888562, 'koi_teq'),
 (0.015372740804538144, 'koi_insol_err2'),
 (0.014876972597654374, 'koi_insol'),
 (0.014006

In [10]:
# Keep only the five highest-ranked features from the importance listing
X = df[
    [
        "koi_fpflag_co",
        "koi_fpflag_nt",
        "koi_fpflag_ss",
        "koi_model_snr",
        "koi_prad",
    ]
]

## Create a Train Test Split

Use `koi_disposition` for the y values

In [None]:
# Hold out a quarter of the rows for testing (3/4 train : 1/4 test);
# test_size=0.25 is the library default, spelled out here for the reader
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Preview the first five training rows
X_train.head(n=5)

## Pre-Processing

In [None]:
# Rescale the features with a min-max scaler.
# The scaler is fitted on the training rows only and then reused for the test rows.
# y is left untouched because it is categorical.
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


## Train the Model & Tune Hyperparameters



### Logistic Regression Model

Logistic regression is used when the dependent variable (the target) is categorical.

In [None]:
# Create the base logistic regression estimator with the iteration cap set to 1000;
# the trailing expression displays the estimator's parameters
from sklearn.linear_model import LogisticRegression
logistic_classifier = LogisticRegression(max_iter = 1000)
logistic_classifier

In [None]:
# Candidate hyperparameters: regularisation strength C and penalty type
from sklearn.model_selection import GridSearchCV
# NOTE(review): newer scikit-learn releases spell the unpenalised option as None
# rather than the string 'none' -- confirm against the installed version.
param_grid = {
    "C": [1, 5, 10, 50],
    "penalty": ["l2", "none"],
}

# Cross-validated search over every combination in the grid
logistic = GridSearchCV(
    estimator=logistic_classifier,
    param_grid=param_grid,
    verbose=3,
)

In [None]:
# Fit the grid search on the *scaled* training features.
# The model is later scored on X_train_scaled / X_test_scaled, so it must be
# trained on data in that same scale; fitting on the raw X_train would make
# those scores meaningless.
logistic.fit(X_train_scaled, y_train)

In [None]:
# Report the winning parameter combination, then its mean cross-validated score
print(logistic.best_params_, logistic.best_score_, sep="\n")

In [None]:
# Take the best estimator found by the search and report its accuracy
# on the scaled training and testing sets
logistic_model = logistic.best_estimator_
print(
    f"Logistic Model Training Data Score: {logistic_model.score(X_train_scaled, y_train)}"
)
print(
    f"Logistic Model Testing Data Score: {logistic_model.score(X_test_scaled, y_test)}"
)

### Save the Model

In [None]:
# Persist the tuned logistic regression model to disk
import joblib
filename = 'models/logistic_model.sav'
joblib.dump(value=logistic_model, filename=filename)

### SVM Model

"In the SVM algorithm, we plot each data item as a point in n-dimensional space (where n is number of features you have) with the value of each feature being the value of a particular coordinate. Then, we perform classification by finding the hyper-plane that differentiates the two classes very well." - www.analyticsvidhya.com

In [None]:
from sklearn.svm import SVC

# Create the base support vector classifier with a linear kernel;
# the trailing expression displays the estimator's parameters
svm_model = SVC(kernel='linear')
svm_model

In [None]:
# Hyperparameter grid for the linear-kernel SVC.
# Only C is searched: gamma is a coefficient of the rbf/poly/sigmoid kernels and
# has no effect with kernel='linear', so searching it would multiply the number
# of fits without changing any score.
param_grid = {'C': [1, 5, 10, 50]}

# Cross-validated search over the grid
svm_grid = GridSearchCV(svm_model, param_grid, verbose=3)

In [None]:
# Run the SVM grid search on the scaled training data
svm_grid.fit(X=X_train_scaled, y=y_train)

In [None]:
# Report the winning parameter combination, then its mean cross-validated score
print(svm_grid.best_params_, svm_grid.best_score_, sep="\n")

In [None]:
# Replace the base SVC with the best estimator from the search and report
# its accuracy on the scaled training and testing sets
svm_model = svm_grid.best_estimator_
print(
    f"SVM Training Data Score: {svm_model.score(X_train_scaled, y_train)}"
)
print(
    f"SVM Testing Data Score: {svm_model.score(X_test_scaled, y_test)}"
)

In [None]:
# Persist the tuned SVM to disk.
# The object written must be svm_model; dumping logistic_model here would store
# a logistic regression under the SVM file name.
import joblib
filename = 'models/svm.sav'
joblib.dump(svm_model, filename)

### Random Forest Model

"Random forest, like its name implies, consists of a large number of individual decision trees that operate as an ensemble. Each individual tree in the random forest spits out a class prediction and the class with the most votes becomes our model’s prediction." - TDS

In [None]:
# Base random forest estimator for the grid search.
# A fixed seed makes the search results and the final scores repeatable
# across notebook re-runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model

In [None]:
# Candidate hyperparameters: tree depth, forest size and minimum leaf size
param_grid = {
    "max_depth": [1, 5, 50],
    "n_estimators": [250, 500, 1000, 1500],
    "min_samples_leaf": [1, 2, 5, 10],
}

# 3-fold cross-validated search over every combination in the grid
rf_grid = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,
    verbose=3,
)

In [None]:
# Run the random forest grid search on the scaled training data
rf_grid.fit(X=X_train_scaled, y=y_train)

In [None]:
# Report the winning parameter combination, then its mean cross-validated score
print(rf_grid.best_params_, rf_grid.best_score_, sep="\n")

In [None]:
# Replace the base forest with the best estimator from the search and report
# its accuracy on the scaled training and testing sets
rf_model = rf_grid.best_estimator_
print(
    f"RF Training Data Score: {rf_model.score(X_train_scaled, y_train)}"
)
print(
    f"RF Testing Data Score: {rf_model.score(X_test_scaled, y_test)}"
)

In [None]:
# Persist the tuned random forest to its own file.
# Writing logistic_model to 'models/logistic_model.sav' here would only re-save
# the logistic regression and leave the forest unsaved.
import joblib
filename = 'models/random_forest.sav'
joblib.dump(rf_model, filename)

### KNN

"The KNN algorithm assumes that similar things exist in close proximity. In other words, similar things are near to each other." - TDS

In [None]:
# import the model
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

In [None]:
# Create the base k-nearest-neighbours classifier; n_neighbors=5 is only a
# starting value, since the grid search that follows tries its own n_neighbors list
knn_model = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Candidate hyperparameters: tree leaf size and odd neighbour counts from 1 to 19
# NOTE(review): leaf_size is documented as a speed/memory setting of the
# neighbour search structure -- confirm it is worth searching for accuracy.
param_grid = {
    "leaf_size": [1, 10, 100, 200],
    "n_neighbors": [k for k in range(1, 20, 2)],
}

# 3-fold cross-validated search over every combination in the grid
knn_grid = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid,
    cv=3,
    verbose=3,
)

In [None]:
# Run the k-nearest-neighbours grid search on the scaled training data
knn_grid.fit(X=X_train_scaled, y=y_train)

In [None]:
# Report the winning parameter combination, then its mean cross-validated score
print(knn_grid.best_params_, knn_grid.best_score_, sep="\n")

In [None]:
# Replace the base classifier with the best estimator from the search and
# report its accuracy on the scaled training and testing sets
knn_model = knn_grid.best_estimator_
print(
    f"KNN Training Data Score: {knn_model.score(X_train_scaled, y_train)}"
)
print(
    f"KNN Testing Data Score: {knn_model.score(X_test_scaled, y_test)}"
)

## Summary of Models

In [None]:
# Side-by-side accuracy of the four tuned models on the scaled train/test sets

# Logistic regression
print(
    f"Logistic Model Training Data Score: {logistic_model.score(X_train_scaled, y_train)}"
)
print(
    f"Logistic Model Testing Data Score: {logistic_model.score(X_test_scaled, y_test)}"
)
print("---------------------------------------------------------------")

# Support vector machine
print(
    f"SVM Training Data Score: {svm_model.score(X_train_scaled, y_train)}"
)
print(
    f"SVM Testing Data Score: {svm_model.score(X_test_scaled, y_test)}"
)
print("---------------------------------------------------------------")

# Random forest
print(
    f"RF Training Data Score: {rf_model.score(X_train_scaled, y_train)}"
)
print(
    f"RF Testing Data Score: {rf_model.score(X_test_scaled, y_test)}"
)
print("---------------------------------------------------------------")

# K nearest neighbours
print(
    f"KNN Training Data Score: {knn_model.score(X_train_scaled, y_train)}"
)
print(
    f"KNN Testing Data Score: {knn_model.score(X_test_scaled, y_test)}"
)
