In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("cumulative.csv")
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head(5)

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Create a Train Test Split

Use `koi_disposition` for the y values

In [3]:
#Assinging y as "koi_disposition" column

y = df["koi_disposition"]
#print(y.head())

#Checking unique types
print(y.unique())

['CONFIRMED' 'FALSE POSITIVE' 'CANDIDATE']


In [4]:
#Converting each string to an integer

y = y.replace(['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE'], [0, 1, 2])
print(y.unique())

[0 1 2]


In [5]:
#Converting y into a float type since MinMaxScaler prefers float
y = y.astype(float)

#Reshaping y to use for pre-processing
print(f"Current Y shape is {y.shape}.")
y = y.values.reshape(-1, 1)
print(f"New Y shape is {y.shape}.")

Current Y shape is (8744,).
New Y shape is (8744, 1).


In [6]:
#Assinging X as rest of remaining columns from df

X = df.drop("koi_disposition", axis=1)
feature_names = X.columns
X.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,-0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,-0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,-0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [7]:
#Splitting train and test data set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Pre-processing

Scale the data using the MinMaxScaler

In [8]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.utils import to_categorical

X_scaler = MinMaxScaler().fit(X_train)
y_scaler = MinMaxScaler().fit(y_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

#Need to unpack y data before fitting into SVC
y_train_scaled = y_train_scaled.ravel()
y_test_scaled = y_test_scaled.ravel()
print(y_train_scaled)
print(y_test_scaled)

[0.5 0.5 1.  ... 0.5 0.  0.5]
[0.5 1.  0.  ... 1.  0.5 0. ]


In [9]:
#Float won't run in SVC, this step is to verify how we can change data to binary

from sklearn import preprocessing
from sklearn import utils

lab_enc = preprocessing.LabelEncoder()
training_scores_encoded = lab_enc.fit_transform(y_train_scaled)
print(training_scores_encoded)
print(utils.multiclass.type_of_target(y_train_scaled))
print(utils.multiclass.type_of_target(y_train_scaled.astype('int')))
print(utils.multiclass.type_of_target(y_train_scaled))

[1 1 2 ... 1 0 1]
continuous
binary
continuous


In [10]:
#Converting back to integers
y_train_scaled = y_train_scaled.astype('int')
y_test_scaled = y_test_scaled.astype('int')

# Train the Support Vector Machine

In [11]:
from sklearn.svm import SVC
import numpy as np

model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train_scaled)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [13]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train_scaled)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test_scaled)}")

Training Data Score: 0.8475144861238182
Testing Data Score: 0.8380603842634949


# Hyperparameter Tuning

Use `GridSearchCV` to tune the `C` and `gamma` parameters

In [20]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 2, 3, 5, 10, 15, 20, 30, 50, 100],
              'gamma': [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.5]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [21]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train_scaled)

Fitting 3 folds for each of 70 candidates, totalling 210 fits
[CV] C=1, gamma=1e-05 ................................................
[CV] ....... C=1, gamma=1e-05, score=0.8436213991769548, total=   0.4s
[CV] C=1, gamma=1e-05 ................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] ....... C=1, gamma=1e-05, score=0.8371454711802379, total=   0.4s
[CV] C=1, gamma=1e-05 ................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.3s remaining:    0.0s


[CV] ....... C=1, gamma=1e-05, score=0.8462242562929062, total=   0.4s
[CV] C=1, gamma=5e-05 ................................................
[CV] ....... C=1, gamma=5e-05, score=0.8436213991769548, total=   0.4s
[CV] C=1, gamma=5e-05 ................................................
[CV] ....... C=1, gamma=5e-05, score=0.8371454711802379, total=   0.4s
[CV] C=1, gamma=5e-05 ................................................
[CV] ....... C=1, gamma=5e-05, score=0.8462242562929062, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.8436213991769548, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.8371454711802379, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.8462242562929062, total=   0.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[CV] ......... C=3, gamma=0.5, score=0.8577960676726109, total=   0.4s
[CV] C=3, gamma=0.5 ..................................................
[CV] ......... C=3, gamma=0.5, score=0.8522415370539799, total=   0.4s
[CV] C=3, gamma=0.5 ..................................................
[CV] ......... C=3, gamma=0.5, score=0.8640732265446224, total=   0.4s
[CV] C=5, gamma=1e-05 ................................................
[CV] ........ C=5, gamma=1e-05, score=0.862368541380887, total=   0.4s
[CV] C=5, gamma=1e-05 ................................................
[CV] ....... C=5, gamma=1e-05, score=0.8577310155535224, total=   0.4s
[CV] C=5, gamma=1e-05 ................................................
[CV] ....... C=5, gamma=1e-05, score=0.8718535469107551, total=   0.4s
[CV] C=5, gamma=5e-05 ................................................
[CV] ........ C=5, gamma=5e-05, score=0.862368541380887, total=   0.4s
[CV] C=5, gamma=5e-05 ................................................
[CV] .

[CV] ...... C=15, gamma=0.001, score=0.8664226898444648, total=   0.4s
[CV] C=15, gamma=0.001 ...............................................
[CV] ...... C=15, gamma=0.001, score=0.8796338672768879, total=   0.4s
[CV] C=15, gamma=0.005 ...............................................
[CV] ...... C=15, gamma=0.005, score=0.8756287151348879, total=   0.4s
[CV] C=15, gamma=0.005 ...............................................
[CV] ...... C=15, gamma=0.005, score=0.8664226898444648, total=   0.4s
[CV] C=15, gamma=0.005 ...............................................
[CV] ...... C=15, gamma=0.005, score=0.8796338672768879, total=   0.4s
[CV] C=15, gamma=0.5 .................................................
[CV] ........ C=15, gamma=0.5, score=0.8756287151348879, total=   0.4s
[CV] C=15, gamma=0.5 .................................................
[CV] ........ C=15, gamma=0.5, score=0.8664226898444648, total=   0.4s
[CV] C=15, gamma=0.5 .................................................
[CV] .

[CV] ..... C=50, gamma=0.0001, score=0.8901601830663616, total=   0.5s
[CV] C=50, gamma=0.0005 ..............................................
[CV] ..... C=50, gamma=0.0005, score=0.8797439414723366, total=   0.4s
[CV] C=50, gamma=0.0005 ..............................................
[CV] ..... C=50, gamma=0.0005, score=0.8705397987191217, total=   0.4s
[CV] C=50, gamma=0.0005 ..............................................
[CV] ..... C=50, gamma=0.0005, score=0.8901601830663616, total=   0.4s
[CV] C=50, gamma=0.001 ...............................................
[CV] ...... C=50, gamma=0.001, score=0.8797439414723366, total=   0.4s
[CV] C=50, gamma=0.001 ...............................................
[CV] ...... C=50, gamma=0.001, score=0.8705397987191217, total=   0.4s
[CV] C=50, gamma=0.001 ...............................................
[CV] ...... C=50, gamma=0.001, score=0.8901601830663616, total=   0.5s
[CV] C=50, gamma=0.005 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done 210 out of 210 | elapsed:  2.2min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1, 2, 3, 5, 10, 15, 20, 30, 50, 100], 'gamma': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [22]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 100, 'gamma': 1e-05}
0.8842634949679781


In [23]:
#Using C and gamma info for SVC

new_model = SVC(kernel='linear', C=100, gamma = 0.00005)
new_model.fit(X_train_scaled, y_train_scaled)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=5e-05, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
print(f"Training Data Score with default SVC: {model.score(X_train_scaled, y_train_scaled)}")
print(f"Testing Data Score with default SVC: {model.score(X_test_scaled, y_test_scaled)}")
print(f"Training Data Score with updated SVC: {new_model.score(X_train_scaled, y_train_scaled)}")
print(f"Testing Data Score with updated SVC: {new_model.score(X_test_scaled, y_test_scaled)}")

Training Data Score with default SVC: 0.8475144861238182
Testing Data Score with default SVC: 0.8380603842634949
Training Data Score with updated SVC: 0.8886855748703874
Testing Data Score with updated SVC: 0.8806038426349497
