In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC 
import warnings
warnings.simplefilter('ignore')
import joblib
import matplotlib.pyplot as plt


In [2]:
#read in csv file with cleaned data
df = pd.read_csv('data_clean.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [4]:
df = df.drop(columns=['Unnamed: 0'])
#the index should not have been saved when saving the csv file

In [5]:
#this is the y value
dfy_ = df['koi_disposition']
#reshape
dfy_ = dfy_.values.reshape(-1, 1)
label = LabelEncoder()
#make y value the transformed dfy_ so it can be read by the model
dfy = label.fit(dfy_).transform(dfy_)

In [6]:
#drop the column with the y value
dfx = df.drop(columns=['koi_disposition'])

In [7]:
dfx.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [8]:
print(dfx.shape, dfy.shape)

(6991, 40) (6991,)


In [9]:
#split data into training and testing data
x_train, x_test, y_train, y_test = train_test_split(dfx, dfy, random_state=55)
#scale x values
x_scaler = StandardScaler().fit(x_train)
x_train = x_scaler.transform(x_train)
x_test = x_scaler.transform(x_test)

In [10]:
#create model
model = SVC(kernel='linear')

In [11]:
model.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [12]:
#this is the best model I am comfrotable with
model.score(x_test, y_test)

0.8827231121281465

In [13]:
#check score of training data 
model.score(x_train, y_train)

0.8945260347129506

In [14]:
#find features with low importance
pd.Series(abs(model.coef_[0]), index=dfx.columns).nsmallest(10)

koi_fpflag_ss     0.016370
koi_slogg_err2    0.052535
koi_steff_err1    0.093508
koi_prad_err2     0.095478
koi_insol         0.113769
dec               0.120203
ra                0.160894
koi_model_snr     0.165949
koi_time0bk       0.176092
koi_kepmag        0.201436
dtype: float64

In [15]:
#create a copy of dfx
dfx_important_features = dfx.copy()
#drop columns of features with low importance
dfx_important_features = dfx_important_features.drop(columns=['koi_fpflag_ss'])

In [16]:
#split data into training and testing data
x_train, x_test, y_train, y_test = train_test_split(dfx_important_features, dfy, random_state=55)
#scale x values
x_scaler = StandardScaler().fit(x_train)
x_train = x_scaler.transform(x_train)
x_test = x_scaler.transform(x_test)

In [17]:
#create model
model2 = SVC(kernel='linear')

In [18]:
model2.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [19]:
#score went down by enough so that I don't want to drop important features
model2.score(x_test, y_test)

0.8575514874141876

In [20]:
model2.score(x_train, y_train)

0.8750715239366774

In [111]:
#revert back to x values without dropping insignificient values
#split data into training and testing data
x_train, x_test, y_train, y_test = train_test_split(dfx, dfy, random_state=55)
#scale x values
x_scaler = StandardScaler().fit(x_train)
x_train = x_scaler.transform(x_train)
x_test = x_scaler.transform(x_test)

In [112]:
param_grid = {
    'C':[1,10,100,1000],
    'gamma':[1,0.1,0.001,0.0001],
    'kernel':['linear', 'poly','rbf', 'sigmoid'],
    'probability':[False]} #look into probability being true on my own time

In [113]:
grid = GridSearchCV(model, param_grid, verbose=3)

In [114]:
grid.fit(x_train, y_train)


Fitting 3 folds for each of 64 candidates, totalling 192 fits
[CV] C=1, gamma=1, kernel=linear, probability=False ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=1, gamma=1, kernel=linear, probability=False, score=0.893, total=   0.4s
[CV] C=1, gamma=1, kernel=linear, probability=False ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV]  C=1, gamma=1, kernel=linear, probability=False, score=0.894, total=   0.4s
[CV] C=1, gamma=1, kernel=linear, probability=False ..................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV]  C=1, gamma=1, kernel=linear, probability=False, score=0.883, total=   0.3s
[CV] C=1, gamma=1, kernel=poly, probability=False ....................
[CV]  C=1, gamma=1, kernel=poly, probability=False, score=0.834, total=   2.7s
[CV] C=1, gamma=1, kernel=poly, probability=False ....................
[CV]  C=1, gamma=1, kernel=poly, probability=False, score=0.835, total=   2.6s
[CV] C=1, gamma=1, kernel=poly, probability=False ....................
[CV]  C=1, gamma=1, kernel=poly, probability=False, score=0.825, total=   2.4s
[CV] C=1, gamma=1, kernel=rbf, probability=False .....................
[CV]  C=1, gamma=1, kernel=rbf, probability=False, score=0.719, total=   1.3s
[CV] C=1, gamma=1, kernel=rbf, probability=False .....................
[CV]  C=1, gamma=1, kernel=rbf, probability=False, score=0.712, total=   1.3s
[CV] C=1, gamma=1, kernel=rbf, probability=False .....................
[CV]  C=1, gamma=1, kernel=rbf, probability=False, score=0.716, total=   1.3s
[CV] C=1, gamma=1, ker

[CV]  C=10, gamma=1, kernel=rbf, probability=False, score=0.727, total=   1.3s
[CV] C=10, gamma=1, kernel=sigmoid, probability=False ................
[CV]  C=10, gamma=1, kernel=sigmoid, probability=False, score=0.593, total=   0.6s
[CV] C=10, gamma=1, kernel=sigmoid, probability=False ................
[CV]  C=10, gamma=1, kernel=sigmoid, probability=False, score=0.634, total=   0.6s
[CV] C=10, gamma=1, kernel=sigmoid, probability=False ................
[CV]  C=10, gamma=1, kernel=sigmoid, probability=False, score=0.605, total=   0.5s
[CV] C=10, gamma=0.1, kernel=linear, probability=False ...............
[CV]  C=10, gamma=0.1, kernel=linear, probability=False, score=0.895, total=   0.9s
[CV] C=10, gamma=0.1, kernel=linear, probability=False ...............
[CV]  C=10, gamma=0.1, kernel=linear, probability=False, score=0.894, total=   1.4s
[CV] C=10, gamma=0.1, kernel=linear, probability=False ...............
[CV]  C=10, gamma=0.1, kernel=linear, probability=False, score=0.883, total=  

[CV]  C=100, gamma=0.1, kernel=linear, probability=False, score=0.894, total=   5.7s
[CV] C=100, gamma=0.1, kernel=linear, probability=False ..............
[CV]  C=100, gamma=0.1, kernel=linear, probability=False, score=0.880, total=   3.4s
[CV] C=100, gamma=0.1, kernel=poly, probability=False ................
[CV]  C=100, gamma=0.1, kernel=poly, probability=False, score=0.842, total=   0.8s
[CV] C=100, gamma=0.1, kernel=poly, probability=False ................
[CV]  C=100, gamma=0.1, kernel=poly, probability=False, score=0.837, total=   0.9s
[CV] C=100, gamma=0.1, kernel=poly, probability=False ................
[CV]  C=100, gamma=0.1, kernel=poly, probability=False, score=0.842, total=   0.6s
[CV] C=100, gamma=0.1, kernel=rbf, probability=False .................
[CV]  C=100, gamma=0.1, kernel=rbf, probability=False, score=0.862, total=   0.6s
[CV] C=100, gamma=0.1, kernel=rbf, probability=False .................
[CV]  C=100, gamma=0.1, kernel=rbf, probability=False, score=0.849, total

[CV]  C=1000, gamma=0.1, kernel=rbf, probability=False, score=0.850, total=   0.7s
[CV] C=1000, gamma=0.1, kernel=rbf, probability=False ................
[CV]  C=1000, gamma=0.1, kernel=rbf, probability=False, score=0.839, total=   0.8s
[CV] C=1000, gamma=0.1, kernel=rbf, probability=False ................
[CV]  C=1000, gamma=0.1, kernel=rbf, probability=False, score=0.834, total=   0.8s
[CV] C=1000, gamma=0.1, kernel=sigmoid, probability=False ............
[CV]  C=1000, gamma=0.1, kernel=sigmoid, probability=False, score=0.718, total=   0.4s
[CV] C=1000, gamma=0.1, kernel=sigmoid, probability=False ............
[CV]  C=1000, gamma=0.1, kernel=sigmoid, probability=False, score=0.733, total=   0.3s
[CV] C=1000, gamma=0.1, kernel=sigmoid, probability=False ............
[CV]  C=1000, gamma=0.1, kernel=sigmoid, probability=False, score=0.704, total=   0.3s
[CV] C=1000, gamma=0.001, kernel=linear, probability=False ...........
[CV]  C=1000, gamma=0.001, kernel=linear, probability=False, sco

[Parallel(n_jobs=1)]: Done 192 out of 192 | elapsed: 21.2min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.001, 0.0001],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                         'probability': [False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [115]:
best_parameters = grid.best_params_
best_parameters
#reviewed grid and the best score is under the best testing score of existing models

{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf', 'probability': False}

In [116]:
model3 = SVC(C=1000, gamma=0.001, kernel='rbf', probability=False)

In [117]:
model3.fit(x_train, y_train)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [118]:
#score did not improve
model3.score(x_test, y_test)

0.8872997711670481

In [119]:
model3.score(x_train, y_train)

0.906732786572573

In [120]:
#trying with one of the top scoring models of the grid search
#C=1000, gamma=0.0001, kernel=rbf, probability=False, score=0.895
model4 = SVC(C=1000, gamma=0.0001, kernel='rbf', probability=False)

In [121]:
model4.fit(x_train, y_train)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [122]:
#still not improving
model4.score(x_test, y_test)

0.8787185354691075

In [123]:
model4.score(x_train, y_train)

0.8928094602326913

In [185]:
#try to improve r^2 score with more training data 
#split data into training and testing data
x_train, x_test, y_train, y_test = train_test_split(dfx, dfy, random_state=55, train_size=0.8)
#scale x values
x_scaler = StandardScaler().fit(x_train)
x_train = x_scaler.transform(x_train)
x_test = x_scaler.transform(x_test)

In [186]:
model5 = SVC(kernel='linear')

In [187]:
model5.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [188]:
model5.score(x_test, y_test)

0.873481057898499

In [189]:
model5.score(x_train, y_train)

0.8957439198855508

In [226]:
#find features with low importance
pd.Series(abs(model5.coef_[0]), index=dfx.columns).nsmallest(10)

koi_fpflag_ss     0.002828
koi_srad          0.047029
koi_slogg_err2    0.064422
koi_insol         0.080272
koi_prad_err2     0.119443
dec               0.132364
ra                0.154280
koi_steff_err1    0.165924
koi_model_snr     0.168165
koi_steff_err2    0.168686
dtype: float64

In [227]:
#create a copy of dfx
dfx_important_features = dfx.copy()
#drop columns of features with low importance
dfx_important_features = dfx_important_features.drop(columns=['koi_fpflag_ss'])

#split data into training and testing data
x_train, x_test, y_train, y_test = train_test_split(dfx_important_features, dfy, random_state=55, train_size=0.8)
#scale x values
x_scaler = StandardScaler().fit(x_train)
x_train = x_scaler.transform(x_train)
x_test = x_scaler.transform(x_test)

#this is the same column that we dropped (and later re-added upon performance decline) above

In [228]:
model6 = SVC(kernel='linear')

In [229]:
model6.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [230]:
model6.score(x_test, y_test)

0.8463187991422445

In [231]:
model6.score(x_train, y_train)

0.8758941344778255

In [220]:
#experiment with less training data
#split data into training and testing data
x_train, x_test, y_train, y_test = train_test_split(dfx, dfy, random_state=55, train_size=0.6)
#scale x values
x_scaler = StandardScaler().fit(x_train)
x_train = x_scaler.transform(x_train)
x_test = x_scaler.transform(x_test)

In [221]:
model6 = SVC(kernel='linear')

In [222]:
model6.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [223]:
model6.score(x_test, y_test)

0.8895244905255631

In [224]:
model6.score(x_train, y_train)

0.894134477825465

In [225]:
import joblib
file = 'SVC.sav'
joblib.dump(model, file)

['SVC.sav']