In [3]:
# Import dependencies
import pandas as pd

In [4]:
# Read in data
df = pd.read_csv("exoplanet_data.csv")
df = df.dropna(axis='columns', how='all')
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [5]:
# Set target
y = df['koi_disposition']
X = df.drop('koi_disposition', axis=1)

In [6]:
# Use feature selection to use features with greatest effect
from sklearn.feature_selection import SelectKBest, f_classif

test = SelectKBest(score_func=f_classif, k=5)
fit = test.fit(X, y)

counter = 0
for score in fit.scores_:
    print(f'{counter}: {score}')
    counter += 1
    
# Top 8 scores (in order) are features with index 1,2,0,29,3,30,16,33
data = X.iloc[:, [0,1,2,3,16,29,30,33]]
data.head()

0: 736.896270641353
1: 1432.7062560928198
2: 1159.4586151359322
3: 576.0603404533031
4: 79.35777730967109
5: 67.82079389842501
6: 67.82079389842501
7: 19.009737851739363
8: 82.22181333874813
9: 82.22181333874813
10: 22.32908033535862
11: 46.4112164405416
12: 0.6840517334945824
13: 96.13982411889855
14: 115.76359288790788
15: 115.76359288790788
16: 278.0627029090088
17: 1.6087287313686316
18: 1.6087287313686316
19: 3.700525428853396
20: 5.723614204673926
21: 2.882359433277504
22: 276.73636382474126
23: 3.354471957445768
24: 6.785149656756396
25: 2.5370368920770368
26: 231.8143285815651
27: 167.70726831341838
28: 133.1656225382907
29: 649.4217951767387
30: 499.94231251299846
31: 91.47432933981052
32: 87.10607432897139
33: 234.04982829764415
34: 19.682074217660542
35: 65.04908618751446
36: 24.895523053387564
37: 104.24603048844547
38: 42.67191056192708
39: 11.939253227022332


Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_depth,koi_steff_err1,koi_steff_err2,koi_slogg_err2
0,0,0,0,0,874.8,81,-81,-0.096
1,0,1,0,0,10829.0,158,-176,-0.176
2,0,1,0,0,8079.2,157,-174,-0.168
3,0,0,0,0,603.3,169,-211,-0.21
4,0,0,0,0,686.0,189,-232,-0.229


In [7]:
# Split data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, y, random_state=42)

In [8]:
# Scale data with MinMaxScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler,StandardScaler
from tensorflow.keras.utils import to_categorical

# scale the data
X_scaler =  StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [11]:
# Train the model
from sklearn.svm import SVC 
svm = SVC(kernel='linear')
svm.fit(X_train_scaled, encoded_y_train)
predictions = svm.predict(X_test_scaled)
svm

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [12]:
# Model accuracy
print('Test Acc: %.3f' % svm.score(X_train_scaled, encoded_y_train))
print('Test Acc: %.3f' % svm.score(X_test_scaled, encoded_y_test))

Test Acc: 0.820
Test Acc: 0.816


In [13]:
# Tune model with GridSearch
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(svm, param_grid, verbose=3)

In [14]:
grid.fit(X_train_scaled, encoded_y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.824, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.830, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.805, total=   0.1s
[CV] C=1, gamma=0.001 ................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] .................... C=1, gamma=0.001, score=0.824, total=   0.2s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.830, total=   0.1s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.805, total=   0.1s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.824, total=   0.1s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.830, total=   0.1s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.805, total=   0.2s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ................... C=5, gamma=0.0001, score=0.824, total=   0.2s
[CV] C=5, gamma=0.0001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    5.8s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [15]:
# Grid scores
print(grid.best_params_)
print(grid.best_score_)

{'C': 1, 'gamma': 0.0001}
0.8197596795727636
