In [1]:
#Dependencies
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


EXTRACT

In [2]:
# Read the csv file into a pandas DataFrame
df = pd.read_csv('Resources/cumulative.csv')
print(f"rows: {len(df)} / columns: {len(df.columns)}")
df.head()


rows: 9564 / columns: 50


Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


TRANSFORM

In [3]:
#Clean data
#1. Remove empty columns
df2 = df.dropna(axis=1,how="all")

#2. Remove unnecessary columns | keep only features and outcome
df3 = df2.drop(["rowid","kepid","kepoi_name","kepler_name","koi_pdisposition","koi_tce_delivname"], axis=1)  

#3. Remove empty values (rows with some empty values)
df4 = df3.dropna()

print(f"rows: {len(df4)} / columns: {len(df4.columns)}")
df4.head()

rows: 7803 / columns: 42


Unnamed: 0,koi_disposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,1.0,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0.969,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0.0,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0.0,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,1.0,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [4]:
# 4. Define features/Input variables ("X") and Outcome column with its values ("y")

X = df4.drop(["koi_disposition"], axis=1)  # nos quedamos solo con las variables
feature_names = X.columns

y = df4["koi_disposition"]

print(X.shape,y.shape)


(7803, 41) (7803,)


PREPROCESSING


In [5]:
# 1. Split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 45)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(5852, 41) (5852,)
(1951, 41) (1951,)


In [6]:
#3. Scale the INPUT data using "MinMaxScaler" (applies only for features values), and the OUTPUT data with "LabelEncoder"(works only for target values). Normalize data and apply transformation
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

X_scaled = MinMaxScaler().fit(X_train)
y_scaled = LabelEncoder().fit(y_train)

X_train_scaled = X_scaled.transform(X_train)
X_test_scaled = X_scaled.transform(X_test)
y_train_scaled = y_scaled.transform(y_train)
y_test_scaled = y_scaled.transform(y_test)

CREATE AND TEST MODEL (LOGISTIC REGRESSION Model)

In [7]:
# 1. Create the Logistic Regression Model to fit it to the scaled training data
model = LogisticRegression() 
model.fit(X_train_scaled, y_train_scaled)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
# #2. Validate with the test data and print Classification Report
from sklearn.metrics import classification_report  

predictions = model.predict(X_test_scaled)
print(classification_report(y_test_scaled, predictions, target_names=y.unique()))


precision    recall  f1-score   support

     CONFIRMED       0.77      0.51      0.61       434
FALSE POSITIVE       0.70      0.86      0.77       566
     CANDIDATE       0.99      1.00      0.99       951

      accuracy                           0.85      1951
     macro avg       0.82      0.79      0.79      1951
  weighted avg       0.86      0.85      0.85      1951



In [9]:
# 3. Validate Model Accuracy  
print('Training Accuracy: %.3f' % model.score(X_train_scaled, y_train_scaled))
print('Testing Accuracy: %.3f' % model.score(X_test_scaled, y_test_scaled))

Training Accuracy: 0.840
Testing Accuracy: 0.852


Hyper parameter Tunning

In [10]:
# 1. Create the GridSearch estimator along with a parameter object containing the values to adjust

from sklearn.model_selection import GridSearchCV

param_grid = {"C": [1, 5, 10, 50], "max_iter": [200,500,700,1000]}
grid = GridSearchCV(model, param_grid, verbose=3)


In [11]:
#2. Fit the model using the GridSearch estimator 
grid.fit(X_train_scaled, y_train_scaled)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=1, max_iter=200 ...............................................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] ................... C=1, max_iter=200, score=0.834, total=   0.2s
[CV] C=1, max_iter=200 ...............................................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[CV] ................... C=1, max_iter=200, score=0.822, total=   0.2s
[CV] C=1, max_iter=200 ...............................................
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s
[CV] ................... C=1, max_iter=200, score=0.842, total=   0.2s
[CV] C=1, max_iter=500 ...............................................
[CV] ................... C=1, max_iter=500, score=0.834, total=   0.2s
[CV] C=1, max_iter=500 ...............................................
[CV] ................... C=1, max_iter=500, score=0.822, to

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10, 50],
                         'max_iter': [200, 500, 700, 1000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [12]:
# 3. List the Best Parameters and Best Score for this dataset
print(f"Best Parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")

Best Parameters: {'C': 50, 'max_iter': 200}
Best score: 0.859876965140123


In [13]:
#4. Validate (make predictions) with the hypertuned model and print Classification Report

predictions_grid = grid.predict(X_test_scaled)
print(classification_report(y_test_scaled, predictions_grid, target_names=y.unique()))

precision    recall  f1-score   support

     CONFIRMED       0.82      0.61      0.70       434
FALSE POSITIVE       0.75      0.88      0.81       566
     CANDIDATE       0.99      1.00      0.99       951

      accuracy                           0.88      1951
     macro avg       0.85      0.83      0.83      1951
  weighted avg       0.88      0.88      0.87      1951



In [14]:
# 5. Validate Model Accuracy   
print('Training Accuracy: %.3f' % grid.score(X_train_scaled, y_train_scaled))
print('Testing Accuracy: %.3f' % grid.score(X_test_scaled, y_test_scaled))

Training Accuracy: 0.867
Testing Accuracy: 0.877


In [15]:
# FINAL COMPARISON

print('Initial vs Tuned Model Training Accuracy: %.3f vs %.3f' % (model.score(X_train_scaled, y_train_scaled), grid.score(X_train_scaled, y_train_scaled)))

print('Initial vs Tuned Model Testing Accuracy: %.3f vs %.3f' % (model.score(X_test_scaled, y_test_scaled), grid.score(X_test_scaled, y_test_scaled)))




Initial vs Tuned Model Training Accuracy: 0.840 vs 0.867
Initial vs Tuned Model Testing Accuracy: 0.852 vs 0.877


In [16]:
#Save model to file
#(from https://scikit-learn.org/stable/tutorial/basic/tutorial.html?highlight=save%20model%20file)

from joblib import dump
dump(model, 'exoplanet_model4_HeidyGuzman.sav') 

['exoplanet_model4_HeidyGuzman.sav']