In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save model. 
# Restart your kernel after installing 
!pip install joblib

In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Steps for read and initial cleaning from starter files
df = pd.read_csv("exoplanet_data.csv")

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [3]:
# View columns of the df
df.columns

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')

# Select your features (columns)

In [4]:
# Set features, determining features to use for X 

# Want to drop the y target and one additional column (TCE Planet Number)
X = df.drop(columns=['koi_disposition', 'koi_tce_plnt_num']
           )
X

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,-0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,-0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,-0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,-0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,-0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
6988,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,-0.007690,...,-220,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
6989,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,-0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


In [5]:
# Determining features to use for X (dropping y target and other columns which were the uncertainties)

# X = df.drop(columns=['koi_disposition', 'koi_tce_plnt_num', 'koi_period_err1', 'koi_period_err2', 
#             'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact_err1', 'koi_impact_err2',
#             'koi_depth_err1', 'koi_depth_err2', 'koi_duration_err1', 'koi_duration_err2',
#             'koi_prad_err1', 'koi_prad_err2', 'koi_insol_err1', 'koi_insol_err2', 
#             'koi_steff_err1', 'koi_steff_err2', 'koi_slogg_err1', 'koi_slogg_err2',
#             'koi_srad_err1', 'koi_srad_err2']
#            )
# X

# Results not as strong once the uncertainties were droppped

# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
y=df.koi_disposition
y

0            CONFIRMED
1       FALSE POSITIVE
2       FALSE POSITIVE
3            CONFIRMED
4            CONFIRMED
             ...      
6986    FALSE POSITIVE
6987    FALSE POSITIVE
6988         CANDIDATE
6989    FALSE POSITIVE
6990    FALSE POSITIVE
Name: koi_disposition, Length: 6991, dtype: object

In [7]:
y.value_counts()

FALSE POSITIVE    3504
CONFIRMED         1800
CANDIDATE         1687
Name: koi_disposition, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
X_train.head(20)

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3563,0,0,0,0,10.548413,5.47e-05,-5.47e-05,139.06402,0.00411,-0.00411,...,-133,4.387,0.066,-0.123,1.092,0.181,-0.097,298.09543,44.737061,13.204
4099,0,0,0,0,24.754385,0.0001365,-0.0001365,140.20732,0.00446,-0.00446,...,-144,4.519,0.078,-0.052,0.804,0.056,-0.076,295.73535,42.576248,15.514
5460,0,0,0,0,1.057336,1.23e-07,-1.23e-07,131.792007,9.6e-05,-9.6e-05,...,-140,4.594,0.054,-0.027,0.683,0.054,-0.06,292.18417,49.31004,15.414
1091,0,0,0,0,201.118319,0.001461,-0.001461,187.56986,0.00529,-0.00529,...,-112,4.447,0.072,-0.108,0.954,0.135,-0.083,283.11377,48.13139,13.328
5999,0,0,0,0,91.649983,0.003181,-0.003181,175.7156,0.0286,-0.0286,...,-233,4.145,0.164,-0.164,1.608,0.905,-0.383,294.93198,39.81242,12.964
2637,0,0,0,0,9.940666,6.98e-06,-6.98e-06,135.21559,0.000582,-0.000582,...,-136,4.305,0.08,-0.12,1.259,0.219,-0.128,284.55573,46.063831,12.663
6889,1,0,0,0,358.74318,0.04849,-0.04849,150.554,0.0643,-0.0643,...,-147,4.566,0.038,-0.152,0.826,0.188,-0.063,291.28595,46.920551,14.806
192,0,0,0,0,10.848452,1.63e-05,-1.63e-05,171.80584,0.00122,-0.00122,...,-416,4.682,0.088,-0.072,0.591,0.109,-0.109,295.57935,44.545959,15.058
3833,0,0,0,0,214.312044,0.0001427,-0.0001427,336.330158,0.000427,-0.000427,...,-141,4.287,0.063,-0.117,1.357,0.242,-0.138,290.97394,48.178162,12.04
2917,0,0,0,0,20.485389,0.0003165,-0.0003165,146.9183,0.0118,-0.0118,...,-79,3.45,0.143,-0.13,3.324,0.613,-0.551,290.62183,51.057301,13.007


# Pre-processing

Scale the data using the MinMaxScaler

In [10]:
# Scale data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Look at a row of the results to see the array
X_train_scaled[0]

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       9.56989489e-03, 4.08052303e-04, 9.99591948e-01, 1.36824641e-02,
       7.20801882e-03, 9.92791981e-01, 1.00886852e-02, 5.99107712e-04,
       9.86749831e-01, 1.43641964e-02, 7.07920792e-03, 9.92920792e-01,
       6.18920878e-05, 2.67627380e-05, 9.99973237e-01, 1.87176316e-05,
       3.71853547e-05, 9.99995595e-01, 5.53687822e-02, 1.41419711e-05,
       2.01209102e-05, 9.99993057e-01, 1.11563995e-03, 3.89374425e-01,
       1.77514793e-01, 9.23254472e-01, 8.16249765e-01, 4.48369565e-02,
       8.48522167e-01, 5.46402526e-03, 6.97333950e-03, 9.99065736e-01,
       8.34972972e-01, 5.17791237e-01, 5.15579800e-01])

# Train the Model



In [12]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

SVC(kernel='linear')

In [22]:
# Model Accuracy
print(f'Training Data Score: {model.score(X_train_scaled, y_train)}')
print(f'Testing Data Score: {model.score(X_test_scaled, y_test)}')

Training Data Score: 0.8371161548731643
Testing Data Score: 0.8564073226544623


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [14]:
# Create the GridSearchCV model
# Use shift tab to explore more of the options that can be updated here
from sklearn.model_selection import GridSearchCV

In [15]:
param_grid = {'C':[1,5,10,15], 'tol':[.01, .001, .0001]}
grid = GridSearchCV(model,param_grid,verbose=3)

In [16]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END .....................C=1, tol=0.01;, score=0.848 total time=   0.1s
[CV 2/5] END .....................C=1, tol=0.01;, score=0.845 total time=   0.1s
[CV 3/5] END .....................C=1, tol=0.01;, score=0.820 total time=   0.1s
[CV 4/5] END .....................C=1, tol=0.01;, score=0.818 total time=   0.1s
[CV 5/5] END .....................C=1, tol=0.01;, score=0.832 total time=   0.1s
[CV 1/5] END ....................C=1, tol=0.001;, score=0.848 total time=   0.1s
[CV 2/5] END ....................C=1, tol=0.001;, score=0.844 total time=   0.1s
[CV 3/5] END ....................C=1, tol=0.001;, score=0.820 total time=   0.1s
[CV 4/5] END ....................C=1, tol=0.001;, score=0.818 total time=   0.1s
[CV 5/5] END ....................C=1, tol=0.001;, score=0.832 total time=   0.1s
[CV 1/5] END ...................C=1, tol=0.0001;, score=0.848 total time=   0.2s
[CV 2/5] END ...................C=1, tol=0.0001;

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 15], 'tol': [0.01, 0.001, 0.0001]},
             verbose=3)

In [17]:
# print(grid.best_params_)
print('Best C:', grid.best_estimator_.get_params()['C'])
print(grid.best_score_)

Best C: 15
0.8701093007517156


In [18]:
y.value_counts()

FALSE POSITIVE    3504
CONFIRMED         1800
CANDIDATE         1687
Name: koi_disposition, dtype: int64

In [19]:
target_names = ['FALSE POSITIVE', 'CONFIRMED', 'CANDIDATE']
# Calculate classification report
from sklearn.metrics import classification_report

# From the model, before hyperparameter tuning
predictions = model.predict(X_test_scaled)
print(classification_report(y_test, predictions,
                            target_names=target_names))

                precision    recall  f1-score   support

FALSE POSITIVE       0.76      0.58      0.66       404
     CONFIRMED       0.68      0.82      0.74       435
     CANDIDATE       0.99      1.00      0.99       909

      accuracy                           0.86      1748
     macro avg       0.81      0.80      0.80      1748
  weighted avg       0.86      0.86      0.85      1748



In [20]:
# Calculate classification report, the model "grid"/after hyperparameter tuning
predictions = grid.predict(X_test_scaled)
print(classification_report(y_test, predictions,
                            target_names=target_names))

                precision    recall  f1-score   support

FALSE POSITIVE       0.89      0.61      0.72       404
     CONFIRMED       0.72      0.91      0.80       435
     CANDIDATE       0.99      1.00      0.99       909

      accuracy                           0.89      1748
     macro avg       0.86      0.84      0.84      1748
  weighted avg       0.90      0.89      0.88      1748



# Save the Model

In [21]:
# save model for submitting to BCS
# This model is the better when compared to Linear Regression model
import joblib
filename = 'SVC.sav'
joblib.dump(grid, filename)

['SVC.sav']