In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("cumulative.csv")
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Create a Train Test Split

Use `koi_disposition` for the y values

In [30]:
y = df["koi_disposition"]
print (y.shape)

(8744,)


In [45]:
X = df.iloc[:, 1:41]
print (X.shape)

(8744, 40)


In [46]:
# Split data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [47]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
8017,0,1,1,0,0.806277,4.947e-06,-4.947e-06,131.78567,0.00672,-0.00672,...,-184.0,4.471,0.054,-0.229,0.996,0.324,-0.108,290.81723,38.53912,13.614
1233,0,1,1,0,3.582077,4.318e-06,-4.318e-06,355.515064,0.000864,-0.000864,...,-235.0,4.422,0.09,-0.195,0.993,0.283,-0.131,296.07822,43.13694,15.193
2592,0,0,0,0,5.060923,2.616e-05,-2.616e-05,134.47316,0.00473,-0.00473,...,-112.0,4.492,0.048,-0.112,0.911,0.121,-0.06,289.91742,40.828606,13.346
4770,0,1,0,1,8.480304,3.32e-07,-3.32e-07,135.854534,3.1e-05,-3.1e-05,...,-169.0,3.946,0.195,-0.105,2.21,0.375,-0.563,298.8002,46.665539,7.631
6632,0,0,0,1,4.994716,4.495e-05,-4.495e-05,136.1833,0.0095,-0.0095,...,-194.0,3.706,0.32,-0.08,2.83,0.458,-1.068,282.58215,46.81551,13.352


# Pre-processing

Scale the data using the MinMaxScaler

In [54]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
X_minmax = MinMaxScaler().fit(X_train)

In [55]:
# Transform the training and testing data using the X_scaler

X_train_minmax = X_minmax.transform(X_train)
X_test_minmax = X_minmax.transform(X_test)

# Train the Support Vector Machine

In [56]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_minmax, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [57]:


from sklearn.metrics import classification_report
predictions = model.predict(X_test_minmax)
print(classification_report(y_test, predictions))

Test Acc: 0.839
                precision    recall  f1-score   support

     CANDIDATE       0.73      0.56      0.63       523
     CONFIRMED       0.68      0.80      0.73       594
FALSE POSITIVE       0.98      1.00      0.99      1069

      accuracy                           0.84      2186
     macro avg       0.80      0.78      0.79      2186
  weighted avg       0.84      0.84      0.83      2186



In [58]:
print(f"Training Data Score: {model.score(X_train_minmax, y_train)}")

# Model Accuracy
print(f"Testing Data Score: {model.score(X_test_minmax, y_test)}")

Training Data Score: 0.8502592253735896
Testing Data Score: 0.838975297346752


# Hyperparameter Tuning

Use `GridSearchCV` to tune the `C` and `gamma` parameters

In [59]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [60]:
# Train the model with GridSearch
grid.fit(X_train_minmax, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.847, total=   0.9s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.838, total=   0.9s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.7s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.848, total=   0.9s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.847, total=   0.9s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.838, total=   0.9s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.848, total=   0.9s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.847, total=   1.0s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.838, total=   0.9s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.848, total=   0.8s
[CV] C=1, gamma=0.005 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:   44.7s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [61]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 50, 'gamma': 0.0001}
0.8824336688014639


In [64]:
print(f"Training Data Score: {grid.score(X_train_minmax, y_train)}")

# Model Accuracy
print(f"Testing Data Score: {grid.score(X_test_minmax, y_test)}")

Training Data Score: 0.8877706617871303
Testing Data Score: 0.8810612991765783


In [65]:
predictions = grid.predict(X_test_minmax)

In [66]:
print(classification_report(y_test, predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.85      0.65      0.73       523
     CONFIRMED       0.75      0.87      0.81       594
FALSE POSITIVE       0.98      1.00      0.99      1069

      accuracy                           0.88      2186
     macro avg       0.86      0.84      0.84      2186
  weighted avg       0.88      0.88      0.88      2186



# Using Random Forest

In [83]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train_minmax, y_train)

print(rf.score(X_train_minmax, y_train))
print(rf.score(X_test_minmax, y_test))


1.0
0.8938700823421775


In [79]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

array([0.09590466, 0.06484702, 0.1106346 , 0.03781373, 0.02242329,
       0.01720449, 0.01977943, 0.013301  , 0.02155925, 0.02278895,
       0.02052148, 0.01030719, 0.01042492, 0.02442274, 0.03428435,
       0.03068141, 0.02523802, 0.01292332, 0.01285646, 0.04738213,
       0.03637444, 0.02682447, 0.01490293, 0.01289092, 0.01868957,
       0.01148021, 0.0605377 , 0.0033548 , 0.0092609 , 0.0309534 ,
       0.02740232, 0.00866643, 0.00836873, 0.01010518, 0.00919155,
       0.01217196, 0.00875184, 0.01328661, 0.01081153, 0.01067606])

In [92]:
predictions = rf.predict(X_test_minmax)

print(classification_report(y_test, predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.84      0.73      0.78       523
     CONFIRMED       0.81      0.85      0.83       594
FALSE POSITIVE       0.96      1.00      0.98      1069

      accuracy                           0.89      2186
     macro avg       0.87      0.86      0.86      2186
  weighted avg       0.89      0.89      0.89      2186

