# Load Input Data

In [1]:
import pandas as pd 

# Loading the diabetes dataset 
data = pd.read_csv('Diabetes.csv')
# Remove rows that contain missing values
data = data.dropna()

In [2]:
X = data[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']]

y = data['Outcome']

In [3]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
3,1,89.0,66,23,94.0,28.1,0.167,21.0
4,0,137.0,40,35,168.0,43.1,2.288,33.0
6,3,78.0,50,32,88.0,31.0,0.248,26.0
8,2,197.0,70,45,543.0,30.5,0.158,53.0
13,1,189.0,60,23,846.0,30.1,0.398,59.0


In [4]:
y

3      0
4      1
6      1
8      1
13     1
      ..
753    1
755    1
760    0
763    0
765    0
Name: Outcome, Length: 392, dtype: int64

# Logistic Regression using SKLearn

doesn't give AIC value or p-values  
must do on own somehow to get these

In [6]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, max_iter = 300).fit(X, y)

# Display the coefficients
print(clf.coef_, clf.intercept_)

[[ 7.94008998e-02  3.80593725e-02 -1.67455926e-03  1.17363332e-02
  -7.99509212e-04  7.00803397e-02  9.66705601e-01  3.45888220e-02]] [-9.9182084]


# Logistic Regression using StatsModels

Gives a lot more statistical information about the model

In [9]:
# importing libraries 
import statsmodels.api as sm 

# building the model and fitting the data 
log_reg = sm.Logit(y, X).fit() 

Optimization terminated successfully.
         Current function value: 0.563677
         Iterations 6


In [13]:
# printing the summary table 

# Gives p-values and confid intervals for coefficients
# if p-value > 0.05 then not a good indicator of the dependent variable
# if p-value < 0.05 signficant - reject Ho that cofficient is 0
print(log_reg.summary()) 

                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  392
Model:                          Logit   Df Residuals:                      384
Method:                           MLE   Df Model:                            7
Date:                Sun, 18 Jul 2021   Pseudo R-squ.:                  0.1128
Time:                        16:07:14   Log-Likelihood:                -220.96
converged:                       True   LL-Null:                       -249.05
Covariance Type:            nonrobust   LLR p-value:                 8.717e-10
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Pregnancies                  0.1299      0.049      2.655      0.008       0.034       0.226
Glucose                      0.0174      0.005      3.765      0.000       0.008       0.

In [11]:
# printing the summary table 
# Gives AIC

print(log_reg.summary2())

                             Results: Logit
Model:                 Logit              Pseudo R-squared:   0.113     
Dependent Variable:    Outcome            AIC:                457.9225  
Date:                  2021-07-18 16:04   BIC:                489.6926  
No. Observations:      392                Log-Likelihood:     -220.96   
Df Model:              7                  LL-Null:            -249.05   
Df Residuals:          384                LLR p-value:        8.7167e-10
Converged:             1.0000             Scale:              1.0000    
No. Iterations:        6.0000                                           
------------------------------------------------------------------------
                          Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
------------------------------------------------------------------------
Pregnancies               0.1299   0.0489  2.6554 0.0079  0.0340  0.2258
Glucose                   0.0174   0.0046  3.7654 0.0002  0.0084  0.0265
BloodPr

### Explanation of some of the terms in the summary table:

#### Coef: 
The coefficients of the input variables in the regression equation (remember how to interpret them?).
#### Log-Likelihood: 
The natural logarithm of the Maximum Likelihood Estimation (MLE) function. We mentioned previously that MLE is the optimisation process for finding the set of parameters which result in best fit.
#### LL-Null: 
The value of log-likelihood of the model when no independent variable is included (only an intercept is included).
#### Pseudo R-squ.: 
This is a substitute for the R-squared value in Least Squares linear regression. It is the ratio of the log-likelihood of the null model to that of the full model.