In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Collecting sklearn
  Using cached sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1315 sha256=7cd8288f9da7c9c0c5b61f3a5b4fb8e31c7ef61948714d70bb7a7f0f08e7a6f3
  Stored in directory: /Users/laurenstein/Library/Caches/pip/wheels/23/9d/42/5ec745cbbb17517000a53cecc49d6a865450d1f5cb16dc8a9c
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


In [2]:
# install joblib for saving
# Restart kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

## Background

Over a period of nine years in deep space, the NASA Kepler space telescope has been out on a planet-hunting mission to discover hidden planets outside of our solar system.

Below is a machine learning models capable of classifying candidate exoplanets from the raw dataset

Data from [NASA Exoplanet Archive](https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=koi)

### Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("exoplanet_data.csv")

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()
df.columns

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')

## Explore the Data

This dataset is a cumulative record of all observed Kepler "objects of interest" and contains an extensive data directory. 

**Exoplanet Achive Information**: The disposition or label in the literature for the exoplanet candidate. One of CANDIDATE, FALSE POSITIVE, NOT DISPOSITIONED or CONFIRMED. (**koi_disposition**)

**Project Disposition Columns**: NASA flags used to identify or assign the foreign body. Labeled with _flag_ and not useful for generating a model.

**Transit Properties**: Calculated parameters of the object such as  Orbital Period, Transit Epoch, Planet-Star Radius Ratio, Planet-Star Distance over Star Radius and Impact Parameter. _Transit properties contain uncertainty values and are identified with a suffix _err. The margin of error is NOT included in the model_

**Stellar Parameters**: Stellar parameters are observational data used to determine stellar physics. These include effective temperature, surface gravity, metallicity, radius, mass, and ageCalculated parameters of the object such as  Orbital Period, Transit Epoch, Planet-Star Radius Ratio, Planet-Star Distance over Star Radius and Impact Parameter. _Stellar properties contain uncertainty values and are identified with a suffix _err. The margin of error is NOT included in the model_

**KIC Parameters**: Physical properties and target identifier.

[Full Directory of Data Columns Definitions](https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html)


In [5]:
# Removing the uncertainty columns {suffix _err}
#flags are used as project disposition binary - removed for the time being 
# 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co','koi_fpflag_ec'
# 'koi_tce_plnt_num': discrete value TCE Planet Number federated to the KOI
exo_df = df[['koi_period', 'koi_time0bk', 'koi_impact','koi_duration', 'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol','koi_model_snr', 'koi_steff', 'koi_srad','ra', 'dec',
       'koi_kepmag']]
exo_df.head()

Unnamed: 0,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_srad,ra,dec,koi_kepmag
0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,9.11,25.8,5455,0.927,291.93423,48.141651,15.347
1,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,39.3,76.3,5853,0.868,297.00482,48.134129,15.436
2,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,505.6,5805,0.791,285.53461,48.28521,15.597
3,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,926.16,40.9,6031,1.046,288.75488,48.2262,15.509
4,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,427.65,40.2,6046,0.972,296.28613,48.22467,15.714


In [6]:
# Find the classifiers for the koi_disposition
df["koi_disposition"].unique()

array(['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE'], dtype=object)

# STEP 1: Select Features

In [7]:
# STEP 1: Assign X(features) and y (target)
X = exo_df
y = df["koi_disposition"]
print(X.shape, y.shape)

(6991, 14) (6991,)


# STEP 2: Split into Test & Train datasets

In [8]:
# STEP 2: split the data into testing and training
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.head()

Unnamed: 0,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_srad,ra,dec,koi_kepmag
6122,6.768901,133.07724,0.15,3.616,123.1,1.24,1017,253.3,10.8,5737,1.125,294.40472,39.351681,14.725
6370,0.733726,132.02005,0.291,2.309,114.6,0.86,1867,2891.64,13.8,5855,0.797,284.50391,42.46386,15.77
2879,7.652707,134.46038,0.97,79.8969,641.1,3.21,989,226.81,254.3,6328,0.963,295.50211,38.98354,13.099
107,7.953547,174.66224,0.3,2.6312,875.4,2.25,696,55.37,38.4,4768,0.779,291.15878,40.750271,15.66
29,4.959319,172.258529,0.831,2.22739,9802.0,12.21,1103,349.4,696.5,5712,1.082,292.16705,48.727589,15.263


# STEP 3: Pre-processing
### MinMaxScalar to fit and transform X features

In [31]:
#Fit Transform using MinMaxScalar for X features
from sklearn.preprocessing import MinMaxScaler
X_minmax = MinMaxScaler().fit(X_train)

X_train_minmax = X_minmax.transform(X_train)
X_test_minmax = X_minmax.transform(X_test)

### Label Encoding for target (y) value

In [33]:
# Visualize Label Encoding
from sklearn.preprocessing import LabelEncoder
disposition_types = ('CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE')
disposition_df = pd.DataFrame(disposition_types, columns=['disposition_types'])# converting type of columns to 'category'

# creating instance of labelencoder
labelencoder = LabelEncoder()

# Assigning numerical values and storing in another column
disposition_df['disposition_types_cat'] = labelencoder.fit_transform(disposition_df['disposition_types'])
disposition_df

Unnamed: 0,disposition_types,disposition_types_cat
0,CANDIDATE,0
1,CONFIRMED,1
2,FALSE POSITIVE,2


In [34]:
# Perform Label encoding on train and test data set for y
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [35]:
# Create one-hot encoding for downstream comparison
from tensorflow.keras.utils import to_categorical
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

# STEP 4: Create a Support Vector Machine Linear Classifier

In [36]:
#STEP 4: Create the SVC Model
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_minmax, y_train)

SVC(kernel='linear')

In [37]:
print(f"Training Data Score: {model.score(X_train_minmax, y_train)}")
print(f"Testing Data Score: {model.score(X_test_minmax, y_test)}")

Training Data Score: 0.6021361815754339
Testing Data Score: 0.5886727688787186


# STEP 5: Hypertune SVM using GridSearch CV

In [38]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [39]:
# Fit the Model using the grid search estimator
grid.fit(X_train_minmax, encoded_y_train)
print(grid.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.591, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.595, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.602, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.590, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.592, total=   0.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.591, total=   0.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.595, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.602, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.590, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[CV] .................. C=50, gamma=0.0001, score=0.662, total=   0.4s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.659, total=   0.4s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.632, total=   0.4s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.645, total=   0.4s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.657, total=   0.4s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.662, total=   0.5s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.659, total=   0.5s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   28.8s finished


{'C': 50, 'gamma': 0.0001}


In [40]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 50, 'gamma': 0.0001}
0.6509632947409019


In [41]:
#Create and fit model
model2 = SVC(C=50, gamma= 0.0001, kernel='linear')
model2.fit(X_train_minmax, encoded_y_train)

SVC(C=50, gamma=0.0001, kernel='linear')

In [56]:
print(f"Training Data Score: {model2.score(X_train_minmax, encoded_y_train)}")
print(f"Testing Data Score: {model2.score(X_test_minmax, encoded_y_test)}")

Training Data Score: 0.6582109479305741
Testing Data Score: 0.6361556064073226


In [43]:
# Calculate classification report
predictions = grid.predict(X_test_minmax)

from sklearn.metrics import classification_report
print(classification_report(encoded_y_test, predictions, target_names = ["CANDIDATE", "CONFIRMED", "FALSE POSITIVE"]))

                precision    recall  f1-score   support

     CANDIDATE       0.41      0.32      0.36       411
     CONFIRMED       0.58      0.62      0.60       484
FALSE POSITIVE       0.75      0.80      0.77       853

      accuracy                           0.64      1748
     macro avg       0.58      0.58      0.58      1748
  weighted avg       0.62      0.64      0.63      1748



# Save the Model

In [44]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'lmstein.sav'
joblib.dump("lmstein", filename)

['lmstein.sav']

# Random Forest 

In [60]:
from sklearn.ensemble import RandomForestClassifier

# STEP 1: Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train_minmax, y_train)
rf.score(X_test_minmax, y_test)

# STEP 2: Auto calculate feature importance
importances = rf.feature_importances_
importances

# Sort the features by their importance
sorted(zip(rf.feature_importances_, X.columns), reverse=True)

[(0.1445899087775445, 'koi_model_snr'),
 (0.12235024783032629, 'koi_prad'),
 (0.08550647735916816, 'koi_period'),
 (0.08273198850504003, 'koi_depth'),
 (0.08231836486500156, 'koi_impact'),
 (0.07324427099792323, 'koi_duration'),
 (0.06426690860665622, 'koi_teq'),
 (0.058984654536360794, 'koi_time0bk'),
 (0.05886575329924233, 'koi_insol'),
 (0.049619810587069046, 'ra'),
 (0.04752960878361672, 'koi_steff'),
 (0.04631402768666161, 'koi_srad'),
 (0.042109149492268294, 'koi_kepmag'),
 (0.04156882867312136, 'dec')]