In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the raw table and clean it in one chain:
#   1. discard columns in which every value is missing,
#   2. discard any remaining row that has at least one missing value.
df = (
    pd.read_csv("Data/exoplanet_data.csv")
    .dropna(axis="columns", how="all")
    .dropna()
)
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [3]:
# List the column labels left after cleaning; the features are picked from these.
df.columns

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')

In [4]:
# Assign X and y values.
# The feature set leaves out the target column, every *_err1 / *_err2 column,
# and koi_tce_plnt_num.
feature_columns = [
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
    'koi_period',
    'koi_time0bk',
    'koi_impact',
    'koi_duration',
    'koi_depth',
    'koi_prad',
    'koi_teq',
    'koi_insol',
    'koi_model_snr',
    'koi_steff',
    'koi_slogg',
    'koi_srad',
    'ra',
    'dec',
    'koi_kepmag',
]
X = df[feature_columns]

# Target labels.
y = df["koi_disposition"]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [5]:
from sklearn.model_selection import train_test_split

# Stratify on y so the label proportions are preserved in both splits;
# the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
4002,0,0,1,0,99.673478,219.33483,11.6846,0.968,2496.0,576.14,262,1.11,12.6,4932,4.777,0.492,293.05801,45.248821,15.801
4246,0,1,0,0,0.592244,131.654831,0.986,1.386,3992.4,6.77,1551,1361.22,303.8,4920,4.664,0.591,290.28094,45.46426,15.653
548,0,1,1,0,9.991625,137.447816,1.256,3.8552,3450.5,38.93,907,160.14,220.3,5874,4.338,1.096,301.04239,45.022888,14.039
3953,0,1,0,0,178.41299,218.225235,0.936,2.9298,37510.0,33.24,361,4.0,535.1,6078,4.346,1.148,288.32785,38.627621,13.944
2362,0,0,0,0,45.294223,138.678725,0.025,6.83,477.1,2.26,524,17.75,134.8,5676,4.347,1.044,285.67938,50.241299,10.961


In [7]:
# Preview the first training labels.
y_train.head()

4002    FALSE POSITIVE
4246    FALSE POSITIVE
548     FALSE POSITIVE
3953    FALSE POSITIVE
2362         CONFIRMED
Name: koi_disposition, dtype: object

# Pre-processing

Scale the data using the MinMaxScaler

In [8]:
# Scale your data
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Fit the scaler on the training split only, then apply that same
# fitted scaler to both the training and the test features.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Train the Model



In [9]:
# Create the KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

# Starting configuration: 20 neighbours, everything else left at its default.
knn_settings = {"n_neighbors": 20}
classifier = KNeighborsClassifier(**knn_settings)
classifier

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                     weights='uniform')

In [10]:
# Fit the model to the data
# Training uses the scaled training features together with their labels.
classifier.fit(X_train_scaled, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                     weights='uniform')

In [11]:
# Score the fitted classifier on both splits, then report the two values.
train_score = classifier.score(X_train_scaled, y_train)
test_score = classifier.score(X_test_scaled, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.8197596795727636
Testing Data Score: 0.7906178489702517


In [12]:
# Predict a label for every row of the scaled test features, then show the
# predictions side by side with the actual labels held in y_test.
predictions = classifier.predict(X_test_scaled)
comparison = {"Prediction": predictions, "Actual": y_test}
pd.DataFrame(comparison)

Unnamed: 0,Prediction,Actual
1981,CONFIRMED,CANDIDATE
5609,FALSE POSITIVE,FALSE POSITIVE
532,FALSE POSITIVE,FALSE POSITIVE
6558,CANDIDATE,CANDIDATE
1249,FALSE POSITIVE,FALSE POSITIVE
...,...,...
2516,CONFIRMED,CONFIRMED
322,FALSE POSITIVE,FALSE POSITIVE
1154,CANDIDATE,CONFIRMED
1696,CONFIRMED,CONFIRMED


In [13]:
# Calculate classification report
from sklearn.metrics import classification_report

# Build the report text for the test labels vs. the current predictions.
report = classification_report(y_test, predictions)
print(report)

                precision    recall  f1-score   support

     CANDIDATE       0.60      0.50      0.55       422
     CONFIRMED       0.59      0.66      0.62       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.79      1748
     macro avg       0.72      0.72      0.72      1748
  weighted avg       0.79      0.79      0.79      1748



# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [14]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# Search space: five neighbour counts combined with two weighting schemes.
neighbor_counts = [1, 5, 10, 20, 50]
weight_schemes = ['uniform', 'distance']
param_grid = dict(n_neighbors=neighbor_counts, weights=weight_schemes)

# 5-fold cross-validation around the classifier created earlier.
grid = GridSearchCV(classifier, param_grid, cv=5, verbose=0)

In [15]:
# Train the model with GridSearch
# The search is run on the scaled training features and their labels.
grid.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=20, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [1, 5, 10, 20, 50],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [16]:
# Print the best parameters and score (one per line).
print(grid.best_params_, grid.best_score_, sep="\n")

{'n_neighbors': 10, 'weights': 'distance'}
0.8081251192065612


In [17]:
# Make predictions with the hypertuned model and show them next to the
# actual test labels. Note: this rebinds `predictions`.
predictions = grid.predict(X_test_scaled)
tuned_comparison = {"Prediction": predictions, "Actual": y_test}
pd.DataFrame(tuned_comparison)

Unnamed: 0,Prediction,Actual
1981,CONFIRMED,CANDIDATE
5609,FALSE POSITIVE,FALSE POSITIVE
532,FALSE POSITIVE,FALSE POSITIVE
6558,CANDIDATE,CANDIDATE
1249,FALSE POSITIVE,FALSE POSITIVE
...,...,...
2516,CONFIRMED,CONFIRMED
322,FALSE POSITIVE,FALSE POSITIVE
1154,CONFIRMED,CONFIRMED
1696,CONFIRMED,CONFIRMED


In [18]:
# Calculate classification report
from sklearn.metrics import classification_report

# Report for the test labels vs. whatever `predictions` currently holds.
tuned_report = classification_report(y_test, predictions)
print(tuned_report)

                precision    recall  f1-score   support

     CANDIDATE       0.63      0.50      0.56       422
     CONFIRMED       0.60      0.70      0.65       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.80      1748
     macro avg       0.74      0.73      0.73      1748
  weighted avg       0.80      0.80      0.80      1748



# Save the Model

In [19]:
# Save the model
# NOTE(review): saving is currently disabled -- the three lines below are
# commented out, so running this cell writes nothing to disk.
# Before enabling, confirm which estimator should be persisted: `classifier`
# is the n_neighbors=20 model created above, while the grid search result
# lives in `grid` (presumably `grid.best_estimator_` is the one to keep --
# TODO confirm).
# NOTE(review): the '.h5' suffix usually denotes HDF5, which is not what
# joblib.dump writes; a '.joblib' suffix may be clearer -- TODO confirm with
# whoever loads this file.
# import joblib
# filename = 'Exoplanet_Model.h5'
# joblib.dump(classifier, filename)