In [1]:
#Dependencies
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 
from sklearn.metrics import classification_report


EXTRACT

In [2]:
# Read the csv file into a pandas DataFrame
df = pd.read_csv('Resources/cumulative.csv')
print(f"rows: {len(df)} / columns: {len(df.columns)}")
df.head()


rows: 9564 / columns: 50


Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


TRANSFORM

In [3]:
#Clean data
#1. Remove empty columns and null values
df2 = df.dropna(axis=1,how="all")  # todas permanecen despues

#2. Remove empty values (rows with some empty values)
df3 = df2.dropna()

#3. Remove unnecessary columns | keep only features and outcome
df4 = df3.drop(["rowid","kepid","kepoi_name","kepler_name","koi_pdisposition","koi_tce_delivname"], axis=1)  

print(f"rows: {len(df4)} / columns: {len(df4.columns)}")
df4.head()

rows: 2269 / columns: 42


Unnamed: 0,koi_disposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,1.0,0,0,0,0,9.488036,2.8e-05,-2.8e-05,170.53875,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0.969,0,0,0,0,54.418383,0.000248,-0.000248,162.51384,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
4,CONFIRMED,1.0,0,0,0,0,2.525592,4e-06,-4e-06,171.59555,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
5,CONFIRMED,1.0,0,0,0,0,11.094321,2e-05,-2e-05,171.20116,...,-232.0,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714
6,CONFIRMED,1.0,0,0,0,0,4.134435,1e-05,-1e-05,172.97937,...,-232.0,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [4]:
# 4. Define features/Input variables ("X") and Outcome column with its values ("y")

X = df4.drop(["koi_disposition"], axis=1)  
feature_names = X.columns

y = df4["koi_disposition"]

print(X.shape,y.shape)


(2269, 41) (2269,)


PREPROCESSING


In [5]:
# 1. Split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 45)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(1701, 41) (1701,)
(568, 41) (568,)


In [6]:
#3. Scale the INPUT data using "MinMaxScaler" (applies only for features values), and the OUTPUT data with "LabelEncoder"(works only for target values). Normalize data and apply transformation
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

X_scaled = MinMaxScaler().fit(X_train)
y_scaled = LabelEncoder().fit(y_train)

X_train_scaled = X_scaled.transform(X_train)
X_test_scaled = X_scaled.transform(X_test)
y_train_scaled = y_scaled.transform(y_train)
y_test_scaled = y_scaled.transform(y_test)

CREATE AND TEST MODEL (SVC Model)

In [7]:
# 1. Create the SVC Model to fit it to the scaled training data
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train_scaled)
model

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [8]:
#2. Validate with the test data and print Classification Report
from sklearn.metrics import classification_report  

predictions = model.predict(X_test_scaled)
print(classification_report(y_test_scaled, predictions, target_names=y.unique()))


precision    recall  f1-score   support

     CONFIRMED       1.00      1.00      1.00       568
FALSE POSITIVE       0.00      0.00      0.00         0

      accuracy                           1.00       568
     macro avg       0.50      0.50      0.50       568
  weighted avg       1.00      1.00      1.00       568



In [9]:
# 3. Validate Model Accuracy  
print('Training Accuracy: %.3f' % model.score(X_train_scaled, y_train_scaled))
print('Testing Accuracy: %.3f' % model.score(X_test_scaled, y_test_scaled))

Training Accuracy: 0.999
Testing Accuracy: 0.998


It wouldn't be possible to apply an Hyper tunning of the model, because the model already have an accuracy of 1. And this is because all cases fell in the category of "CONFIRMED".

Hyper parameter Tunning - DOES NOT APPLY

In [10]:
# 1. Create the GridSearch estimator along with a parameter object containing the values to adjust

# from sklearn.model_selection import GridSearchCV
# param_grid = {"C": [1, 5, 10, 50], "gamma": [0.0001, 0.0005, 0.001, 0.005]}
# grid = GridSearchCV(model, param_grid, verbose=3)

#2. Fit the model using the GridSearch estimator 
# grid.fit(X_train_scaled, y_train_scaled)

# 3. List the Best Parameters and Best Score for this dataset
# print(f"Best Parameters: {grid.best_params_}")
# print(f"Best score: {grid.best_score_}")

#4. Validate (make predictions) with the hypertuned model and print Classification Report
# predictions_grid = grid.predict(X_test_scaled)
# print(classification_report(y_test_scaled, predictions_grid, target_names=y.unique()))

# 5. Validate Model Accuracy   
# print('Training Accuracy: %.3f' % grid.score(X_train_scaled, y_train_scaled))
# print('Testing Accuracy: %.3f' % grid.score(X_test_scaled, y_test_scaled))

MODEL PERFORMANCE COMPARISON - DOES NOT APPLY

In [11]:
# FINAL COMPARISON

# print('Initial vs Tuned Model Training Accuracy: %.3f vs %.3f' % (model.score(X_train_scaled, y_train_scaled), grid.score(X_train_scaled, y_train_scaled)))

# print('Initial vs Tuned Model Testing Accuracy: %.3f vs %.3f' % (model.score(X_test_scaled, y_test_scaled), grid.score(X_test_scaled, y_test_scaled)))




SAVE MODEL TO FILE

In [12]:
#Save model to file
#(from https://scikit-learn.org/stable/tutorial/basic/tutorial.html?highlight=save%20model%20file)

from joblib import dump
dump(model, 'exoplanet_model2_HeidyG.sav') 

['exoplanet_model2_HeidyG.sav']