In [28]:
import pandas as pd #this package allows us to "excel-ify" python
import numpy as np #most of the mathematical functions we'll need
import statsmodels.api as sm #for statistical analysis

import matplotlib.pyplot as plt #to plot
import seaborn as sns #to plot

In [29]:
#X represents the size of a tumor in centimeters.
X = np.array([3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69, 5.88]).reshape(-1,1)

#Note: X has to be reshaped into a column from a row for the LogisticRegression() function to work.

#y represents whether or not the tumor is cancerous (0 for "No", 1 for "Yes").
y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) 

In [35]:
#Step 12: Logistic Regression
# This is very similar to the code for OLS regression

X_c = sm.add_constant(X)

logit = sm.Logit(endog=y, exog=X_c).fit() # this is where we use sm. Logit instead of sm.OLS
logitSummary= logit.summary()
print (logitSummary) 

Optimization terminated successfully.
         Current function value: 0.140603
         Iterations 11
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                   12
Model:                          Logit   Df Residuals:                       10
Method:                           MLE   Df Model:                            1
Date:                Mon, 06 Feb 2023   Pseudo R-squ.:                  0.7972
Time:                        13:48:14   Log-Likelihood:                -1.6872
converged:                       True   LL-Null:                       -8.3178
Covariance Type:            nonrobust   LLR p-value:                 0.0002710
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -17.9783     19.733     -0.911      0.362     -56.654      20.698
x1             4.8465      5

Remember the interpretation is change in x1 on log odds of y!

In [31]:
# let's compare to OLS regression
ols = sm.OLS(y,X_c).fit() # this is where we use sm. Logit instead of sm.OLS
OLSsummary= ols.summary()
print (OLSsummary) 

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.701
Model:                            OLS   Adj. R-squared:                  0.671
Method:                 Least Squares   F-statistic:                     23.47
Date:                Mon, 06 Feb 2023   Prob (F-statistic):           0.000677
Time:                        13:44:24   Log-Likelihood:                -1.4618
No. Observations:                  12   AIC:                             6.924
Df Residuals:                      10   BIC:                             7.893
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.3523      0.196     -1.797      0.1



# ANOTHER Example

In [42]:
import statsmodels.api as sm

spector_data from Spector and Mazzeo (1980). https://www.tandfonline.com/doi/abs/10.1080/00220485.1980.10844952

 Variable name definitions::

        Grade - binary variable indicating whether or not a student's grade
                improved.  1 indicates an improvement.
        TUCE  - Test score on economics test
        PSI   - participation in program
        GPA   - Student's grade point average

In [67]:
spector_data = sm.datasets.spector.load_pandas()
df = spector_data.data
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GPA     32 non-null     float64
 1   TUCE    32 non-null     float64
 2   PSI     32 non-null     float64
 3   GRADE   32 non-null     float64
dtypes: float64(4)
memory usage: 1.1 KB


Unnamed: 0,GPA,TUCE,PSI,GRADE
0,2.66,20.0,0.0,0.0
1,2.89,22.0,0.0,0.0
2,3.28,24.0,0.0,0.0
3,2.92,12.0,0.0,0.0
4,4.0,21.0,0.0,1.0


In [69]:
spector_data.exog = sm.add_constant(spector_data.exog)

In [68]:
logit_mod = sm.Logit(spector_data.endog, spector_data.exog)
logit_res = logit_mod.fit()

Optimization terminated successfully.
         Current function value: 0.586580
         Iterations 5


In [70]:
ols_mod = sm.OLS(spector_data.endog, spector_data.exog)
ols_res=ols_mod.fit()

In [71]:
print(logit_res.summary())

                           Logit Regression Results                           
Dep. Variable:                  GRADE   No. Observations:                   32
Model:                          Logit   Df Residuals:                       29
Method:                           MLE   Df Model:                            2
Date:                Mon, 06 Feb 2023   Pseudo R-squ.:                 0.08844
Time:                        14:01:59   Log-Likelihood:                -18.771
converged:                       True   LL-Null:                       -20.592
Covariance Type:            nonrobust   LLR p-value:                    0.1618
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
GPA            0.2993      0.682      0.439      0.661      -1.036       1.635
TUCE          -0.1015      0.100     -1.019      0.308      -0.297       0.094
PSI            1.6364      0.813      2.014      0.0

a change in gpa by 1 changes log odds by 0.2993

In [77]:
# to calculate prob from odds
import math
math.exp(.2993)/(1+math.exp(.2993))

0.5742713870829339

a change in GPA by 1 improves probablity by 0.57!

In [72]:
print(ols_res.summary())

                            OLS Regression Results                            
Dep. Variable:                  GRADE   R-squared:                       0.416
Model:                            OLS   Adj. R-squared:                  0.353
Method:                 Least Squares   F-statistic:                     6.646
Date:                Mon, 06 Feb 2023   Prob (F-statistic):            0.00157
Time:                        14:02:01   Log-Likelihood:                -12.978
No. Observations:                  32   AIC:                             33.96
Df Residuals:                      28   BIC:                             39.82
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.4980      0.524     -2.859      0.0