# <center> Logistic regression

    

In [3]:
### import of libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm

### Load data:

In [4]:
dataUni = pd.read_csv("uni.csv", header=0)
dataUni.head() # First 5 lines
dataUni.tail() # Last 5 lines
#dataUni.info()# Form of variable coding
#dataUni.isnull().sum()# Count if there are negative values in each column

Unnamed: 0,admit,gre,gpa,rank
395,0,620,4.0,2
396,0,560,3.04,3
397,0,460,2.63,2
398,0,700,3.65,2
399,0,600,3.89,3


In [5]:
### predictive variables

x1 = dataUni[['gre']] 
x1 = sm.add_constant(x1) 
x2 = dataUni[['gpa']] 
x2 = sm.add_constant(x2)
x3 = dataUni[['rank']] 
x3 = sm.add_constant(x3)
x4 = dataUni[['gre', 'gpa', 'rank']] 

### predictable variables
y = dataUni[['admit']] 


### Model 1

$H_0:$ The result of the exam does not affect whether someone was admitted to studies

$H_1:$ The result of the exam affect whether someone was admitted to studies


In [6]:
uniReg = sm.GLM(y, x1, family=sm.families.Binomial())
uniRegRes1 = uniReg.fit()
uniRegRes1.summary()

0,1,2,3
Dep. Variable:,admit,No. Observations:,400.0
Model:,GLM,Df Residuals:,398.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-243.03
Date:,"Tue, 02 Mar 2021",Deviance:,486.06
Time:,16:23:36,Pearson chi2:,399.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.9013,0.606,-4.787,0.000,-4.089,-1.714
gre,0.0036,0.001,3.633,0.000,0.002,0.006


The result of the exam does not affect whether someone was admitted to studies.

### Model 2

$H_0:$ The number of points on the exam does not affect whether someone was admitted to studies

$H_1:$ The number of points on the exam affect whether someone was admitted to studies

In [7]:
uniReg = sm.GLM(y, x2, family=sm.families.Binomial())
uniRegRes2 = uniReg.fit()
uniRegRes2.summary()

0,1,2,3
Dep. Variable:,admit,No. Observations:,400.0
Model:,GLM,Df Residuals:,398.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-243.48
Date:,"Tue, 02 Mar 2021",Deviance:,486.97
Time:,16:23:41,Pearson chi2:,401.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.3576,1.035,-4.209,0.000,-6.387,-2.328
gpa,1.0511,0.299,3.517,0.000,0.465,1.637


The number of points on the exam affect whether someone is admitted to studies.

### Model 3

$H_0:$ The ranking position does not affect whether someone was admitted to studies

$H_1:$ The ranking position affect whether someone was admitted to studies

In [8]:
uniReg = sm.GLM(y, x3, family=sm.families.Binomial())
uniRegRes3 = uniReg.fit()
uniRegRes3.summary()

0,1,2,3
Dep. Variable:,admit,No. Observations:,400.0
Model:,GLM,Df Residuals:,398.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-237.86
Date:,"Tue, 02 Mar 2021",Deviance:,475.71
Time:,16:23:47,Pearson chi2:,402.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.6366,0.306,2.080,0.038,0.037,1.237
rank,-0.5863,0.124,-4.728,0.000,-0.829,-0.343


The ranking position affect whether someone was admitted to studies.

### Model 4

$H_0:$ Ranking position, number of points, result of the exam does not affect whether someone was admitted to studies

$H_1:$ Ranking position, number of points, result of the exam does not affect whether someone was admitted to studies

In [9]:
uniReg = sm.GLM(y, x4, family=sm.families.Binomial())
uniRegRes4 = uniReg.fit()
uniRegRes4.summary()

0,1,2,3
Dep. Variable:,admit,No. Observations:,400.0
Model:,GLM,Df Residuals:,397.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-234.55
Date:,"Tue, 02 Mar 2021",Deviance:,469.1
Time:,16:23:52,Pearson chi2:,403.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
gre,0.0015,0.001,1.420,0.155,-0.001,0.004
gpa,-0.0042,0.201,-0.021,0.983,-0.398,0.390
rank,-0.6695,0.121,-5.527,0.000,-0.907,-0.432


## Akaike and Bayesian information criteria:


In [10]:
print(f'AIC, model 1: {uniRegRes1.aic:.3f}')
print(f'AIC, model 2: {uniRegRes2.aic:.3f}')
print(f'AIC, model 3: {uniRegRes3.aic:.3f}')
print(f'AIC, model 4: {uniRegRes4.aic:.3f}')

AIC, model 1: 490.056
AIC, model 2: 490.968
AIC, model 3: 479.711
AIC, model 4: 475.097


The best model is the one for which the value of the information criterion is the lowest. The best is model 4.

## Logistic regression using  SKlearn


In [11]:
#import 

from sklearn.linear_model import LogisticRegression


# zmienne predykcyjne
x1 = dataUni[['gre']] 

x2 = dataUni[['gpa']]

x3 = dataUni[['rank']]

x4 = dataUni[['gre', 'gpa', 'rank']]

# zmienna predykowana
y = dataUni[['admit']] 


### Model 1

In [12]:
model_N = LogisticRegression(solver='lbfgs')
model_N.fit(x1, y)

  return f(**kwargs)


LogisticRegression()

In [14]:
print(f' intercept : {model_N.intercept_}')
print(f' coef : {model_N.coef_ } ')
print(f' R^2 : {model_N.score(x1, y) } ')

 intercept : [-2.9013421]
 coef : [[0.00358221]] 
 R^2 : 0.6825 


### Model 2

In [15]:
model_N2 = LogisticRegression(solver='lbfgs')
model_N2.fit(x4, y)

  return f(**kwargs)


LogisticRegression()

In [16]:
print(f' intercept : {model_N2.intercept_}')
print(f' coef : {model_N2.coef_ } ') # dla wszystkich Xsów!
print(f' R^2 : {model_N2.score(x4, y) } ')

 intercept : [-3.2666256]
 coef : [[ 0.00238418  0.70117133 -0.54988098]] 
 R^2 : 0.7075 
