In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)


In [1]:
# install joblib for saving
# Restart kernel after installing 
!pip install joblib



In [2]:
import pandas as pd

## Background

Over a period of nine years in deep space, the NASA Kepler space telescope has been out on a planet-hunting mission to discover hidden planets outside of our solar system.

Below is a machine learning models capable of classifying candidate exoplanets from the raw dataset

Data from [NASA Exoplanet Archive](https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=koi)

### Read the CSV and Perform Basic Data Cleaning

In [8]:
df = pd.read_csv("exoplanet_data.csv")

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()
df.columns

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')

In [64]:
df["koi_disposition"].unique()

array(['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE'], dtype=object)

## Explore the Data

This dataset is a cumulative record of all observed Kepler "objects of interest" and contains an extensive data directory. 

Columns of note:

- **koi_disposition**: The disposition in the literature towards this exoplanet candidate. One of CANDIDATE, FALSE POSITIVE, NOT DISPOSITIONED or CONFIRMED.
    
- **koi_score**: A value between 0 and 1 that indicates the confidence in the KOI disposition. For CANDIDATEs, a higher value indicates more confidence in its disposition, while for FALSE POSITIVEs, a higher value indicates less confidence in that disposition.

[Full Directory of Data Columns Definitions](https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html)


# STEP 1: Select Features

In [15]:
# STEP 1: Assign X(features) and y (target)
X = df.drop("koi_disposition", axis = 1)
y = df["koi_disposition"]
print(X.shape, y.shape)

(6991, 40) (6991,)


# STEP 2: Split into Test & Train datasets

In [57]:
# STEP 2: split the data into testing and training
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
6122,0,0,0,0,6.768901,7.38e-05,-7.38e-05,133.07724,0.00844,-0.00844,...,-171,4.327,0.153,-0.187,1.125,0.31,-0.207,294.40472,39.351681,14.725
6370,0,1,0,1,0.733726,6.06e-06,-6.06e-06,132.02005,0.00795,-0.00795,...,-175,4.578,0.033,-0.187,0.797,0.211,-0.056,284.50391,42.46386,15.77
2879,1,0,0,0,7.652707,6.54e-05,-6.54e-05,134.46038,0.00619,-0.00619,...,-189,4.481,0.05,-0.2,0.963,0.29,-0.097,295.50211,38.98354,13.099
107,0,0,0,0,7.953547,1.91e-05,-1.91e-05,174.66224,0.00182,-0.00182,...,-85,4.536,0.056,-0.016,0.779,0.023,-0.049,291.15878,40.750271,15.66
29,0,0,0,0,4.959319,5.15e-07,-5.15e-07,172.258529,8.3e-05,-8.3e-05,...,-77,4.359,0.11,-0.11,1.082,0.173,-0.13,292.16705,48.727589,15.263


# STEP 3: Pre-processing
### MinMaxScalar to fit and transform X features

In [28]:
#Fit Transform using MinMaxScalar for X features
from sklearn.preprocessing import MinMaxScaler
X_minmax = MinMaxScaler().fit(X_train)
X_train_minmax = X_minmax.transform(X_train)
X_test_minmax = X_minmax.transform(X_test)

### Label Encoding for target (y) value

In [73]:
# Visualize Label Encoding
from sklearn.preprocessing import LabelEncoder
disposition_types = ('CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE' )
disposition_df = pd.DataFrame(disposition_types, columns=['disposition_types'])# converting type of columns to 'category'

# creating instance of labelencoder
labelencoder = LabelEncoder()

# Assigning numerical values and storing in another column
disposition_df['disposition_types_cat'] = labelencoder.fit_transform(disposition_df['disposition_types'])

disposition_df

Unnamed: 0,disposition_types,disposition_types_cat
0,CONFIRMED,1
1,FALSE POSITIVE,2
2,CANDIDATE,0


In [74]:
# Perform Label encoding on train and test data set for y
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [75]:
# Create one-hot encoding for downstream comparison
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

# STEP 4: Create a Support Vector Machine Linear Classifier

In [51]:
#STEP 4: Create the SVC Model
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model

SVC(kernel='linear')

In [62]:
model.fit(X_train_minmax, y_train)
predictions = grid.predict(X_test_minmax)
from sklearn.metrics import classification_report
print(classification_report(encoded_y_test, predictions,
                            target_names=["CONFIRMED", "FALSE POSITIVE", "CANDIDATE"]))

                precision    recall  f1-score   support

     CONFIRMED       0.81      0.67      0.73       411
FALSE POSITIVE       0.76      0.85      0.80       484
     CANDIDATE       0.98      1.00      0.99       853

      accuracy                           0.88      1748
     macro avg       0.85      0.84      0.84      1748
  weighted avg       0.88      0.88      0.88      1748



In [None]:
print(f"Training Data Score: {model2.score(X_train_minmax, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_minmax, y_test)}")

In [50]:
import matplotlib.pyplot as plt
plt.scatter(model.predict(X_train_minmax, model.predict(X_train_minmax) - y_train_categorical, c="blue", label="Training Data"))
plt.scatter(model.predict(X_test_minmax), model.predict(X_test_minmax) - y_test_categorical, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y_test_categorical.min(), xmax=y_test_categorical.max())
plt.title("Residual Plot")
plt.show()

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

# Train the Model



# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [None]:
# 

In [56]:
#STEP 6: Fit the Model using the grid search estimator
grid.fit(X_train_minmax, encoded_y_train)

print(grid.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.858, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.833, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.855, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.832, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.835, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.858, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.833, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.855, total=   0.2s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.832, total=   0.2s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[CV] .................. C=50, gamma=0.0001, score=0.896, total=   0.3s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.879, total=   0.3s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.891, total=   0.4s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.872, total=   0.3s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.873, total=   0.3s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.896, total=   0.3s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.879, total=   0.3s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   22.2s finished


{'C': 50, 'gamma': 0.0001}


In [None]:
print(grid2.best_params_)
print(grid2.best_score_)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)