# Logistic Regression Model

In [1]:
import pandas as pd

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

import warnings 
warnings.filterwarnings("ignore")

### Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()

In [3]:
# create a test train split
y = df['koi_disposition']
X = df.drop(columns=['koi_disposition'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

### Pre-Processing

Scale the data using the MinMaxScaler and perform some feature selection

In [4]:
# Scale your data
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Train the Model

In [5]:
model_lr = LogisticRegression()
model_lr.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
print(f"Training Data Score: {model_lr.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_lr.score(X_test_scaled, y_test)}")

Training Data Score: 0.8456990272744612
Testing Data Score: 0.8443935926773455


### Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [7]:
param_grid_lr = {
    'C': [750,1000,1250],
    'penalty': ['l1','l2']}
grid_lr = GridSearchCV(model_lr, param_grid_lr, verbose=3)

In [8]:
# Train the model with GridSearch
grid_lr.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=750, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=750, penalty=l1, score=0.887, total=  40.4s
[CV] C=750, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.3s remaining:    0.0s


[CV] ................... C=750, penalty=l1, score=0.875, total=  31.1s
[CV] C=750, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV] ................... C=750, penalty=l1, score=0.885, total=  19.3s
[CV] C=750, penalty=l2 ...............................................
[CV] ................... C=750, penalty=l2, score=0.882, total=   0.3s
[CV] C=750, penalty=l2 ...............................................
[CV] ................... C=750, penalty=l2, score=0.874, total=   0.3s
[CV] C=750, penalty=l2 ...............................................
[CV] ................... C=750, penalty=l2, score=0.883, total=   0.3s
[CV] C=1000, penalty=l1 ..............................................
[CV] .................. C=1000, penalty=l1, score=0.886, total=  34.9s
[CV] C=1000, penalty=l1 ..............................................
[CV] .................. C=1000, penalty=l1, score=0.875, total=  28.9s
[CV] C=1000, penalty=l1 ..............................................
[CV] .................. C=1000, penalty=l1, score=0.884, total=  22.3s
[CV] C=1000, penalty=l2 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  4.3min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [750, 1000, 1250], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [9]:
print(grid_lr.best_params_)
print(grid_lr.best_score_)

{'C': 750, 'penalty': 'l1'}
0.8821285523555217


In [10]:
# retune original model with best params
model_lr = LogisticRegression(C=1000, penalty='l1')
model_lr.fit(X_train_scaled, y_train)

print(f"Training Data Score: {model_lr.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_lr.score(X_test_scaled, y_test)}")

Training Data Score: 0.8842265878313943
Testing Data Score: 0.8958810068649885
