In [1]:
#import dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Read the csv file into a pandas DataFrame
data = pd.read_csv('cumulative.csv')
data.head()
#data dictionary: https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html 

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [4]:
#instead of dropping columns, impute mean for missing values.
data = data.fillna(data.mean())
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9564 entries, 0 to 9563
Data columns (total 50 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rowid              9564 non-null   int64  
 1   kepid              9564 non-null   int64  
 2   kepoi_name         9564 non-null   object 
 3   kepler_name        2294 non-null   object 
 4   koi_disposition    9564 non-null   object 
 5   koi_pdisposition   9564 non-null   object 
 6   koi_score          9564 non-null   float64
 7   koi_fpflag_nt      9564 non-null   int64  
 8   koi_fpflag_ss      9564 non-null   int64  
 9   koi_fpflag_co      9564 non-null   int64  
 10  koi_fpflag_ec      9564 non-null   int64  
 11  koi_period         9564 non-null   float64
 12  koi_period_err1    9564 non-null   float64
 13  koi_period_err2    9564 non-null   float64
 14  koi_time0bk        9564 non-null   float64
 15  koi_time0bk_err1   9564 non-null   float64
 16  koi_time0bk_err2   9564 

In [5]:
# create X & y
# Features
# Remove string type columns and columns with missing values
remove_features=['rowid', 'kepid','kepoi_name','kepler_name',
                 'koi_disposition','koi_pdisposition', 'koi_tce_delivname', 
                 "koi_teq_err1", "koi_teq_err2"]
X = data.drop(remove_features, axis=1)  
# Label
y=data['koi_disposition'] 

In [6]:
# split data into train/test
from sklearn.model_selection import train_test_split
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test

In [10]:
# Scale features
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)

X_trained_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)


print(encoded_y_test)

[1 2 0 ... 0 0 2]


In [13]:
# train SVM model
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_trained_scaled, encoded_y_train)
predictions = model.predict(X_test_scaled)

In [15]:
# check model score
model.score(X_test_scaled, encoded_y_test)

0.856794425087108

In [16]:
# create classification report
from sklearn.metrics import classification_report
print(classification_report(encoded_y_test, predictions,
                           target_names =["confirmed", "false positive", "candidate"]))

                precision    recall  f1-score   support

     confirmed       0.82      0.52      0.63       667
false positive       0.66      0.92      0.77       667
     candidate       0.99      0.98      0.98      1536

      accuracy                           0.86      2870
     macro avg       0.82      0.80      0.80      2870
  weighted avg       0.87      0.86      0.85      2870



In [17]:
# Create the GridSearch estimator 
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid)

In [19]:
# Train the model with GridSearch
grid.fit(X_trained_scaled, encoded_y_train)

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]})

In [20]:
# best parameters for this dataset
grid.best_params_

{'C': 50, 'gamma': 0.0001}

In [21]:
# list best score
grid.best_score_

0.8745145910150918

In [22]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled)

In [23]:
print(classification_report(encoded_y_test, predictions,
                           target_names =["confirmed", "false positive", "candidate"]))

                precision    recall  f1-score   support

     confirmed       0.81      0.67      0.73       667
false positive       0.74      0.88      0.80       667
     candidate       0.98      0.98      0.98      1536

      accuracy                           0.88      2870
     macro avg       0.85      0.84      0.84      2870
  weighted avg       0.89      0.88      0.88      2870

