# Linear Model 2 - Logistic regression

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.genmod.generalized_linear_model import GLM
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from statsmodels.genmod.families.family import Binomial
from statsmodels.tools.tools import add_constant
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from auxiliars import *
import pickle

In [3]:
np.random.seed(1234)

## Data

Standarized data loading:

In [4]:
data = pd.read_csv("./data/stdHTRU_2.csv")

We split a separate test set of relative size 20%:

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data[data.columns[0:8]], 
                                                    data['class'], 
                                                    test_size = 0.2,
                                                    random_state = 1234)

I order to improve the performance of k-NN, we will analyze the performance of the method with no-correlated standarized data: 

In [6]:
noCorrData = pd.read_csv("./data/noCorrStdHTRU_2.csv")

In [7]:
X_train_NC, X_test_NC, y_train_NC, y_test_NC = train_test_split(noCorrData[noCorrData.columns[0:8]], 
                                                    noCorrData['class'], 
                                                    test_size = 0.2,
                                                    random_state = 1234)

## Model: Supervised Logistic regression

Scikit-learn library offersa method for Logistic Regression classification.

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
LR = LogisticRegression(n_jobs = -1)

LogisticRegression allow us to hypertuning the following parameters:
- Penalty: Used to specify the norm used in the penalization.
    - L1: Lasso regression.
    - L2: Ridge regression.
- C: Inverse of regularization strength
- Algorithm to use in the optimization problem:
    - liblinear: for small datasets.
    - saga: for larger datasets.

In order to hypertuning model parameters and get a better idea on how the model performs on unseen data, we will use GridSearchCV.

In [13]:
from sklearn.model_selection import GridSearchCV

Values of the 10-Fold CV Grid to test:

In [56]:
grid = {'penalty' : ['l1','l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear', 'saga']}

In [57]:
grid

{'penalty': ['l1', 'l2'],
 'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
        4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
        2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
        1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
        5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
 'solver': ['liblinear', 'saga']}

Grid Search 10-Fold CV:

In [58]:
gs10cv = GridSearchCV(LR, param_grid = grid, cv = 10, n_jobs = -1)

### Training

In [59]:
gs10cv.fit(X_train, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


GridSearchCV(cv=10, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=-1, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([1.00000000e-04, 2.6366...
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
      

In [60]:
gs10cv.best_params_

{'C': 1.623776739188721, 'penalty': 'l1', 'solver': 'liblinear'}

In [61]:
pd.DataFrame(gs10cv.cv_results_).iloc[gs10cv.best_index_]

mean_fit_time                                                 0.109415
std_fit_time                                                 0.0230148
mean_score_time                                             0.00190327
std_score_time                                             0.000296796
param_C                                                        1.62378
param_penalty                                                       l1
param_solver                                                 liblinear
params               {'C': 1.623776739188721, 'penalty': 'l1', 'sol...
split0_test_score                                              0.97905
split1_test_score                                              0.97905
split2_test_score                                             0.976257
split3_test_score                                             0.979749
split4_test_score                                             0.976257
split5_test_score                                             0.981844
split6

In [62]:
# Save model
LRFile = open('./models/LR_BestCV_STDData_pickle_file', 'wb')
pickle.dump(gs10cv, LRFile) 

#### Training with no-correlated data

Grid Search 10-Fold CV:

In [63]:
gs10cv_nc = GridSearchCV(LR, param_grid = grid, cv = 10, n_jobs = -1)

Training:

In [64]:
gs10cv_nc.fit(X_train_NC, y_train_NC)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


GridSearchCV(cv=10, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=-1, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([1.00000000e-04, 2.6366...
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
      

In [65]:
pd.DataFrame(gs10cv_nc.cv_results_).iloc[gs10cv_nc.best_index_]

mean_fit_time                                                0.0235514
std_fit_time                                                0.00421367
mean_score_time                                              0.0018029
std_score_time                                             0.000427628
param_C                                                      0.0127427
param_penalty                                                       l1
param_solver                                                 liblinear
params               {'C': 0.012742749857031334, 'penalty': 'l1', '...
split0_test_score                                                    1
split1_test_score                                                    1
split2_test_score                                                    1
split3_test_score                                                    1
split4_test_score                                                    1
split5_test_score                                                    1
split6

In [66]:
# Save model
LRFileNC = open('./models/LR_BestCV_NCorrSTDData_pickle_file', 'wb')
pickle.dump(gs10cv_nc, LRFile)

### Testing 

In [67]:
y_pred = gs10cv.predict(X_test)

print("Confusion Matrix:")
confusionMatrix(y_test, y_pred, classes = [0,1])

Confusion Matrix:


Predicted,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3229,20
1,60,271


In [68]:
print("Test Error:")
(1-accuracy_score(y_test, gs10cv.predict(X_test)))*100

Test Error:


2.2346368715083775

#### Testing with no-correlated data

In [69]:
y_pred_NC = gs10cv_nc.predict(X_test_NC)
print(y_pred_NC)
print("Confusion Matrix:")
confusionMatrix(y_test_NC, y_pred_NC, classes = [0,1])

[0 0 0 ... 0 0 0]
Confusion Matrix:


Predicted,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3249,0
1,0,331


In [70]:
print("Test Error:")
(1-accuracy_score(y_test_NC, gs10cv_nc.predict(X_test_NC)))*100

Test Error:


0.0