## Data Input and Cleaning

In [1]:
import pandas as pd

# Read CSV
planet_df = pd.read_csv('exoplanet_data.csv')
planet_df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [2]:
# Drop the null columns where all values are null
clean_df = planet_df.dropna(axis='columns', how='all')

# Drop the null rows
clean_df = clean_df.dropna()
clean_df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


## Feature Selection and Pre-processing

In [3]:
# List features
feature_names = []
for n in  clean_df.columns:
    feature_names.append(n)
feature_names

['koi_disposition',
 'koi_fpflag_nt',
 'koi_fpflag_ss',
 'koi_fpflag_co',
 'koi_fpflag_ec',
 'koi_period',
 'koi_period_err1',
 'koi_period_err2',
 'koi_time0bk',
 'koi_time0bk_err1',
 'koi_time0bk_err2',
 'koi_impact',
 'koi_impact_err1',
 'koi_impact_err2',
 'koi_duration',
 'koi_duration_err1',
 'koi_duration_err2',
 'koi_depth',
 'koi_depth_err1',
 'koi_depth_err2',
 'koi_prad',
 'koi_prad_err1',
 'koi_prad_err2',
 'koi_teq',
 'koi_insol',
 'koi_insol_err1',
 'koi_insol_err2',
 'koi_model_snr',
 'koi_tce_plnt_num',
 'koi_steff',
 'koi_steff_err1',
 'koi_steff_err2',
 'koi_slogg',
 'koi_slogg_err1',
 'koi_slogg_err2',
 'koi_srad',
 'koi_srad_err1',
 'koi_srad_err2',
 'ra',
 'dec',
 'koi_kepmag']

In [4]:
# List labels
label_names = []
for n in  clean_df['koi_disposition'].unique():
    label_names.append(n)
label_names

['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE']

In [5]:
# Define X and y
labels = clean_df['koi_disposition']
features = clean_df.drop('koi_disposition', axis=1)

In [6]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Scale features
scaler.fit(features)
features = scaler.transform(features)

In [7]:
features_df = pd.DataFrame(features)
features_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.0,0.0,0.0,0.0,0.05057,0.001581,0.998419,0.031063,0.006171,0.993829,...,0.95326,0.831296,0.043478,0.904667,0.004547,0.004045,0.999412,0.552405,0.733837,0.692702
1,0.0,1.0,0.0,0.0,0.018338,9.5e-05,0.999905,0.040928,0.001006,0.998994,...,0.898442,0.845778,0.029891,0.825223,0.004219,0.008977,0.999249,0.784312,0.73336,0.700058
2,0.0,1.0,0.0,0.0,0.001379,2e-06,0.999998,0.036828,0.000187,0.999813,...,0.899596,0.849539,0.036005,0.833168,0.003791,0.007744,0.999355,0.259712,0.742947,0.713365
3,0.0,0.0,0.0,0.0,0.002116,2.4e-05,0.999976,0.037781,0.001971,0.998029,...,0.878246,0.825842,0.047554,0.79146,0.005208,0.012868,0.998719,0.406994,0.739203,0.706091
4,0.0,0.0,0.0,0.0,0.003618,6.7e-05,0.999933,0.038804,0.003324,0.996676,...,0.866128,0.834869,0.036685,0.772592,0.004797,0.012136,0.998989,0.751443,0.739105,0.723035


In [8]:
# Create variable to test feature selection on
sel_features = features.copy()

In [9]:
from sklearn.feature_selection import SelectPercentile, chi2
sel_features = SelectPercentile(chi2, percentile=90).fit_transform(sel_features, labels)

## Creating Test Train Split 

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(sel_features, labels, random_state=42)

## Training the Model 

In [11]:
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
SGD_model = SGDClassifier(random_state=42)

SGD_model.fit(X_train, y_train)

predictions = SGD_model.predict(X_test)

In [12]:
print(f"Training Data Score: {SGD_model.score(X_train, y_train)}")
print(f"Testing Data Score: {SGD_model.score(X_test, y_test)}")

Training Data Score: 0.8491321762349799
Testing Data Score: 0.8398169336384439


In [13]:
print(" - - - SGD Classifier Model - - - ")
print(classification_report(y_test, predictions, target_names=label_names))

 - - - SGD Classifier Model - - - 
                precision    recall  f1-score   support

     CONFIRMED       0.71      0.59      0.64       411
FALSE POSITIVE       0.70      0.77      0.74       484
     CANDIDATE       0.98      1.00      0.99       853

      accuracy                           0.84      1748
     macro avg       0.80      0.79      0.79      1748
  weighted avg       0.84      0.84      0.84      1748



## Saving Model

In [14]:
import joblib

filename = 'sgd.sav'
joblib.dump(SGD_model, filename)

['sgd.sav']