# Exoplanet Machine Learning Dataset

In [None]:
# Update sklearn to prevent version mismatches
# %pip install --upgrade scikit-learn

In [None]:
# install joblib. 
# !pip install joblib

In [None]:
import pandas as pd

## Data Exploration and Cleaning

In [None]:
# Load the exoplanet table from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="exoplanet_data.csv")
# Bare last expression so the notebook renders the loaded frame
df

In [None]:
# Inspect the dtype pandas assigned to each column
df.dtypes
# Author's note: all columns are numeric except koi_disposition

In [None]:
# Distinct labels found in the target column
types = df.loc[:, "koi_disposition"].unique()
types

In [None]:
# Count the missing values in each column (isna is the same check as isnull)
df.isna().sum()

## Select your features 

In [None]:
# Target vector: the koi_disposition label of each row
y = df["koi_disposition"]
# Feature matrix: every column other than the target
X = df.drop(columns=["koi_disposition"])

In [None]:
# Rank every candidate feature with a quick random forest fit.
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the importance ranking is the same on every
# Restart & Run All; an unseeded forest draws new bootstrap samples and
# feature subsets on each fit, so the ordering can shift between runs.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X, y)

# A fitted forest exposes one importance value per input column
importances = rf.feature_importances_

# NOTE(review): this ranking is computed on the full dataset, i.e. before the
# train/test split further down, so the held-out rows influence which features
# are selected — confirm this is acceptable for the reported test scores.

# Pair each importance with its column name, most important first
sorted(zip(importances, X.columns), reverse=True)

In [None]:
# Keep only the top-ranked columns from the importance listing
selected_features = [
    'koi_fpflag_co',
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_model_snr',
    'koi_prad',
]
X = df[selected_features]

## Create a Train Test Split

Use `koi_disposition` for the y values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing (0.25 is the library default,
# spelled out here); the fixed seed keeps the partition stable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Preview the first rows of the training features
X_train.head()

## Pre-Processing

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling from the training features only, then apply that same
# fitted scaler to both partitions. The labels are categorical and are left as-is.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)


## Train the Model & Tune Hyperparameters



### Logistic Regression Model

Logistic Regression is used when the dependent variable (target) is categorical.

In [None]:
from sklearn.linear_model import LogisticRegression

# Base estimator handed to the grid search; the iteration cap is set to 1000
logistic_classifier = LogisticRegression(
    max_iter=1000,
)
# Bare last expression so the notebook shows the estimator's repr
logistic_classifier

In [None]:
# Set up hyperparameter grid
from sklearn.model_selection import GridSearchCV

# "No regularization" is spelled with the object None, not the string 'none':
# the string form was deprecated in scikit-learn 1.2 and later removed, so on
# an upgraded install every 'none' candidate fails to fit.
# NOTE(review): None requires scikit-learn >= 1.2.
param_grid = {'C': [1, 5, 10, 50],
              'penalty': ['l2', None]}

# Set up the grid search around the base logistic regression estimator
logistic = GridSearchCV(logistic_classifier, param_grid, verbose=3)

In [None]:
# Fit the grid search on the *scaled* training features. The tuned model is
# scored on X_train_scaled / X_test_scaled, so it must be trained in that same
# feature space; fitting on the raw X_train makes those scores meaningless.
logistic.fit(X_train_scaled, y_train)

In [None]:
# Report the winning parameter combination, then its cross-validated score,
# one per line
print(logistic.best_params_, logistic.best_score_, sep="\n")

In [None]:
# Take the refit best estimator out of the search and report its accuracy on
# the scaled training and testing partitions
logistic_model = logistic.best_estimator_
print(
    f"Logistic Model Training Data Score: {logistic_model.score(X_train_scaled, y_train)}",
    f"Logistic Model Testing Data Score: {logistic_model.score(X_test_scaled, y_test)}",
    sep="\n",
)

### Save the Model

In [None]:
import joblib

# Persist the tuned logistic regression estimator to disk
filename = 'logistic_model.sav'
joblib.dump(value=logistic_model, filename=filename)

### SVM Model

"In the SVM algorithm, we plot each data item as a point in n-dimensional space (where n is number of features you have) with the value of each feature being the value of a particular coordinate. Then, we perform classification by finding the hyper-plane that differentiates the two classes very well." - www.analyticsvidhya.com

In [None]:
from sklearn.svm import SVC

# Support vector classifier using the linear kernel
svm_model = SVC(
    kernel="linear",
)
# Bare last expression so the notebook shows the estimator's repr
svm_model

In [None]:
# Hyperparameter grid for the SVM search.
# Only C is searched: svm_model uses kernel='linear', and gamma is a kernel
# coefficient that applies to the rbf/poly/sigmoid kernels only. Searching it
# here multiplied the number of fits by five without changing any score.
param_grid = {'C': [1, 5, 10, 50]}

# Grid search over the linear SVC
svm_grid = GridSearchCV(svm_model, param_grid, verbose=3)

In [None]:
# Run the SVM grid search on the scaled training features
svm_grid.fit(X=X_train_scaled, y=y_train)

In [None]:
# Winning parameter combination and its cross-validated score, one per line
print(svm_grid.best_params_, svm_grid.best_score_, sep="\n")

In [None]:
# Rebind svm_model to the refit best estimator and report its accuracy on the
# scaled training and testing partitions
svm_model = svm_grid.best_estimator_
print(
    f"SVM Training Data Score: {svm_model.score(X_train_scaled, y_train)}",
    f"SVM Testing Data Score: {svm_model.score(X_test_scaled, y_test)}",
    sep="\n",
)

### Random Forest Model

"Random forest, like its name implies, consists of a large number of individual decision trees that operate as an ensemble. Each individual tree in the random forest spits out a class prediction and the class with the most votes becomes our model’s prediction." - TDS

In [None]:
# Base random forest for the grid search.
# Seeded so that the search winner and the reported scores are the same on
# every Restart & Run All; an unseeded forest is re-randomized on each fit.
rf_model = RandomForestClassifier(random_state=42)
# Bare last expression so the notebook shows the estimator's repr
rf_model

In [None]:
# Candidate values for tree depth, forest size and minimum leaf size
param_grid = dict(
    max_depth=[1, 5, 50],
    n_estimators=[250, 500, 1000, 1500],
    min_samples_leaf=[1, 2, 5, 10],
)

# Grid search over the random forest with 3-fold cross-validation
rf_grid = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,
    verbose=3,
)

In [None]:
# Run the random forest grid search on the scaled training features
rf_grid.fit(X=X_train_scaled, y=y_train)

In [None]:
# Winning parameter combination and its cross-validated score, one per line
print(rf_grid.best_params_, rf_grid.best_score_, sep="\n")

In [None]:
# Rebind rf_model to the refit best estimator and report its accuracy on the
# scaled training and testing partitions
rf_model = rf_grid.best_estimator_
print(
    f"RF Training Data Score: {rf_model.score(X_train_scaled, y_train)}",
    f"RF Testing Data Score: {rf_model.score(X_test_scaled, y_test)}",
    sep="\n",
)

### KNN

"The KNN algorithm assumes that similar things exist in close proximity. In other words, similar things are near to each other." - TDS

In [None]:
# import the model
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

In [None]:
# Base k-nearest-neighbours classifier handed to the grid search below.
# The n_neighbors=5 given here is only a starting value: the search grid
# supplies its own n_neighbors candidates.
knn_model = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Candidate leaf sizes, and odd neighbour counts from 1 through 19.
# NOTE(review): leaf_size is documented as a speed/memory setting for the
# tree-based neighbour search — confirm it is worth searching for accuracy.
param_grid = dict(
    leaf_size=[1, 10, 100, 200],
    n_neighbors=list(range(1, 20, 2)),
)

# Grid search over the KNN classifier with 3-fold cross-validation
knn_grid = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid,
    cv=3,
    verbose=3,
)

In [None]:
# Run the KNN grid search on the scaled training features
knn_grid.fit(X=X_train_scaled, y=y_train)

In [None]:
# Winning parameter combination and its cross-validated score, one per line
print(knn_grid.best_params_, knn_grid.best_score_, sep="\n")

In [None]:
# Rebind knn_model to the refit best estimator and report its accuracy on the
# scaled training and testing partitions
knn_model = knn_grid.best_estimator_
print(
    f"KNN Training Data Score: {knn_model.score(X_train_scaled, y_train)}",
    f"KNN Testing Data Score: {knn_model.score(X_test_scaled, y_test)}",
    sep="\n",
)

## Summary of Models

In [None]:
# Recap of all four tuned models: training and testing accuracy on the scaled
# partitions, with a dashed rule between consecutive models.
print(
    f"Logistic Model Training Data Score: {logistic_model.score(X_train_scaled, y_train)}",
    f"Logistic Model Testing Data Score: {logistic_model.score(X_test_scaled, y_test)}",
    "---------------------------------------------------------------",
    sep="\n",
)

print(
    f"SVM Training Data Score: {svm_model.score(X_train_scaled, y_train)}",
    f"SVM Testing Data Score: {svm_model.score(X_test_scaled, y_test)}",
    "---------------------------------------------------------------",
    sep="\n",
)

print(
    f"RF Training Data Score: {rf_model.score(X_train_scaled, y_train)}",
    f"RF Testing Data Score: {rf_model.score(X_test_scaled, y_test)}",
    "---------------------------------------------------------------",
    sep="\n",
)

print(
    f"KNN Training Data Score: {knn_model.score(X_train_scaled, y_train)}",
    f"KNN Testing Data Score: {knn_model.score(X_test_scaled, y_test)}",
    sep="\n",
)
