# Logistic Regression

In [1]:
# import packages
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# import functions
from ipynb.fs.full.Functions import load_train_test_data, model_eval

# turn of warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# load the preprocessed data set
X_train, X_test, y_train, y_test = load_train_test_data()

A snippet of our training data:


Unnamed: 0,Age,PhysicallyActive,BMI,Sleep,SoundSleep,JunkFood,Stress,BPLevel,Pregnancies,UrinationFreq,BMI_high,Family_Diabetes_yes,Pdiabetes_yes,Gender_Male,Alcohol_yes,Smoking_yes,RegularMedicine_yes,highBP_yes,BMI_high.1
16,2.0,2.0,-0.697273,0.022886,0.256854,0.0,1.0,1.0,-0.413322,0.0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0
110,2.0,1.0,0.428174,0.022886,-0.812113,0.0,2.0,2.0,-0.413322,1.0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
28,1.0,0.0,0.615748,-0.786704,0.256854,0.0,0.0,1.0,-0.413322,0.0,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1
74,2.0,0.0,0.990897,0.022886,0.256854,0.0,3.0,2.0,-0.413322,0.0,1,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1
147,0.0,1.0,0.428174,0.832476,1.325822,0.0,1.0,1.0,-0.413322,1.0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1


There are 240 entries with 19 columns in our training data.
There are 43 entries with 19 columns in our testing data.


# 1) Basic Model

In [3]:
log_model_base = LogisticRegression(random_state=0)
log_model_base.fit(X_train, y_train)

In [4]:
print(log_model_base.coef_)

[[ 1.161798    0.28920617  0.04545532 -0.08174592  0.25008789  0.29540288
  -0.03334834  0.83325372  0.27562537 -0.01164158  0.05197104  1.13635599
   0.92905291  0.23905246 -0.4100675   0.71903843  1.49277429 -0.01108394
   0.05197104]]


In [5]:
y_pred = log_model_base.predict(X_test)

In [6]:
model_eval(log_model_base,X_train, X_test, y_train, y_test)


Evaluation: accuracy_score
86.25% for the train data
72.09% for the test data

Evaluation: f1_score
78.15% for the train data
45.45% for the test data

Evaluation: recall_score
76.62% for the train data
35.71% for the test data

Evaluation: precision_score
79.73% for the train data
62.50% for the test data


In [7]:
grid_vals = {'C': [0.001, 0.01, 0.1, 1], 
            'max_iter': [100, 1000], 
            'penalty': ['l1', 'l2'], 
            'solver': ['newton-cg', 'lbfgs', 'liblinear']}

grid_search = GridSearchCV(estimator=log_model_base, param_grid=grid_vals, scoring='accuracy', 
                       cv=10, refit=True, verbose=2, return_train_score=True) 

In [8]:
grid_search.fit(X_train, y_train)
grid_pred = grid_search.best_estimator_.predict(X_test)

Fitting 10 folds for each of 48 candidates, totalling 480 fits
[CV] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.001, max_iter=100, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END ....C=0.001, max_iter=100, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ....C=0.001, max_iter=100, penalty=l1

[CV] END ...C=0.001, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ...C=0.001, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ...C=0.001, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ...C=0.001, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ...C=0.001, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END C=0.001, max_iter=1000, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END C=0.001, max_iter=1000, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END C=0.001, max_iter=1000, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END C=0.001, max_iter=1000, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END C=0.001, max_iter=1000, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END C=0.001, max_iter=1000, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END C=0.001, max_iter=1000, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END C=0.001, max

[CV] END C=0.01, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END C=0.01, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END C=0.01, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END C=0.01, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END C=0.01, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END C=0.01, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END C=0.01, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END C=0.01, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END C=0.01, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ....C=0.01, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ....C=0.01, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ....C=0.01, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ....C=0.01, max_ite

[CV] END .C=0.1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .C=0.1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .C=0.1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .C=0.1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .C=0.1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .C=0.1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .C=0.1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .C=0.1, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END .C=0.1, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END .C=0.1, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END .C=0.1, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END .C=0.1, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END .C=0.1, max_iter=10

[CV] END ...C=1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...C=1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...C=1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...C=1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...C=1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...C=1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...C=1, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...C=1, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...C=1, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...C=1, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...C=1, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...C=1, max_iter=1000, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...C=1, max_iter=10

In [10]:
print("Best hyperparameters :", grid_search.best_params_)
print("Best cross-validated accuracy :", grid_search.best_score_)

Best hyperparameters : {'C': 1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validated accuracy : 0.8208333333333332
