# regression-models

Regression models from Chapter 13 of _The Effect_ by Nick Huntington-Klein.

## Config

In [2]:
# Execute the shared include notebook so that the names it defines become
# available to the cells below.
# NOTE(review): presumably this is where `sm`, `cdata`, `Stargazer` and `np`
# come from, since no cell visible here imports them -- confirm.
%run include-2-shared-functions.ipynb

['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']


## Simple Regression Models

In [3]:
# Load the restaurant inspection dataset from causaldata as a pandas DataFrame.
# `df_res` is the data source for every model fitted in this notebook section.
df_res = cdata.restaurant_inspections.load_pandas().data
# Preview the first rows to confirm the available columns.
df_res.head()

Unnamed: 0,business_name,inspection_score,Year,NumberofLocations,Weekend
0,MCGINLEYS PUB,94,2017,9,False
1,VILLAGE INN #1,86,2015,66,False
2,RONNIE SUSHI 2,80,2016,79,False
3,FRED MEYER - RETAIL FISH,96,2003,86,False
4,PHO GRILL,83,2017,53,False


In [4]:
# Model 1: OLS with a single regressor.
#   dependent variable:   inspection_score
#   independent variable: NumberofLocations
m1 = sm.ols(
    formula='inspection_score ~ NumberofLocations',
    data=df_res,
).fit()

In [5]:
# Display Model 1 as a Stargazer regression table.
Stargazer([m1])

0,1
,
,Dependent variable:inspection_score
,
,(1)
,
Intercept,94.866***
,(0.046)
NumberofLocations,-0.019***
,(0.000)
Observations,27178


In [6]:
# Model 2: Model 1 with Year added as a second regressor.
#   dependent variable:    inspection_score
#   independent variables: NumberofLocations, Year
m2 = sm.ols(
    formula='inspection_score ~ NumberofLocations + Year',
    data=df_res,
).fit()

In [7]:
# Display Model 2 as a Stargazer regression table.
Stargazer([m2])

0,1
,
,Dependent variable:inspection_score
,
,(1)
,
Intercept,225.333***
,(12.411)
NumberofLocations,-0.019***
,(0.000)
Year,-0.065***


In [8]:
# Check how many inspections fall in each Weekend category before using
# Weekend as a regressor in Model 3.
df_res['Weekend'].value_counts()

False    26968
True       210
Name: Weekend, dtype: int64

In [9]:
# Model 3: Model 2 with the Weekend indicator added.
#   dependent variable:    inspection_score
#   independent variables: NumberofLocations, Year, Weekend
m3 = sm.ols(
    formula='inspection_score ~ NumberofLocations + Year + Weekend',
    data=df_res,
).fit()

In [10]:
# Display Model 3 as a Stargazer regression table.
Stargazer([m3])

0,1
,
,Dependent variable:inspection_score
,
,(1)
,
Intercept,224.680***
,(12.410)
NumberofLocations,-0.019***
,(0.000)
Weekend[T.True],1.432***


In [11]:
# Show Models 1-3 side by side in one table, one column per model, so the
# coefficients can be compared as regressors are added.
Stargazer([m1,m2,m3])

0,1,2,3
,,,
,Dependent variable:inspection_score,Dependent variable:inspection_score,Dependent variable:inspection_score
,,,
,(1),(2),(3)
,,,
Intercept,94.866***,225.333***,224.680***
,(0.046),(12.411),(12.410)
NumberofLocations,-0.019***,-0.019***,-0.019***
,(0.000),(0.000),(0.000)
Weekend[T.True],,,1.432***


In [12]:
# Non-linear variant of Model 1: adds the square of NumberofLocations,
# computed inside the formula with np.power, alongside the linear term.
m1_nl = sm.ols(
    formula='inspection_score ~ NumberofLocations + np.power(NumberofLocations,2)',
    data=df_res,
).fit()

In [13]:
# Display the quadratic variant of Model 1 as a Stargazer regression table.
Stargazer([m1_nl])

0,1
,
,Dependent variable:inspection_score
,
,(1)
,
Intercept,97.518***
,(0.059)
NumberofLocations,-0.080***
,(0.001)
"np.power(NumberofLocations, 2)",0.000***


In [14]:
# Model 4: the Model 3 regressors (NumberofLocations, Weekend, Year) plus two
# product terms built with np.multiply inside the formula:
#   NumberofLocations x Year  and  NumberofLocations x Weekend
m4 = sm.ols(
    formula='inspection_score ~ NumberofLocations + Weekend + Year + np.multiply(NumberofLocations, Year) + np.multiply(NumberofLocations, Weekend)',
    data=df_res,
).fit()

In [15]:
# Display the interaction model (Model 4) as a Stargazer regression table.
Stargazer([m4])

0,1
,
,Dependent variable:inspection_score
,
,(1)
,
Intercept,311.228***
,(15.444)
NumberofLocations,-1.459***
,(0.154)
Weekend[T.True],1.600***


## Polynomials and interaction effects

In [16]:
# Model 5: quadratic in NumberofLocations, controlling for Year.
# I(...) wraps the squaring so it is evaluated as a calculation on the column;
# the resulting coefficient is labelled "I(NumberofLocations ** 2)".
m5 = sm.ols(
    formula='inspection_score ~ NumberofLocations + I(NumberofLocations**2) + Year',
    data=df_res,
).fit()

In [17]:
# Display Model 5 as a Stargazer regression table.
Stargazer([m5])

0,1
,
,Dependent variable:inspection_score
,
,(1)
,
I(NumberofLocations ** 2),0.000***
,(0.000)
Intercept,362.834***
,(11.601)
NumberofLocations,-0.084***


In [18]:
# Full statsmodels summary for Model 5; the coefficient names shown here are
# the ones referenced in the t_test hypothesis string for m5.
m5.summary()

0,1,2,3
Dep. Variable:,inspection_score,R-squared:,0.21
Model:,OLS,Adj. R-squared:,0.21
Method:,Least Squares,F-statistic:,2402.0
Date:,"Sun, 22 Jan 2023",Prob (F-statistic):,0.0
Time:,10:55:49,Log-Likelihood:,-85203.0
No. Observations:,27178,AIC:,170400.0
Df Residuals:,27174,BIC:,170400.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,362.8342,11.601,31.276,0.000,340.096,385.573
NumberofLocations,-0.0844,0.001,-82.888,0.000,-0.086,-0.082
I(NumberofLocations ** 2),0.0001,1.77e-06,69.684,0.000,0.000,0.000
Year,-0.1319,0.006,-22.870,0.000,-0.143,-0.121

0,1,2,3
Omnibus:,2526.441,Durbin-Watson:,1.943
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3411.908
Skew:,-0.775,Prob(JB):,0.0
Kurtosis:,3.782,Cond. No.,17000000.0


In [19]:
# Use t_test to test a linear combination of Model 5's coefficients.
# For the fitted quadratic, the slope with respect to NumberofLocations is
#   b_NumberofLocations + 2 * b_I(NumberofLocations ** 2) * NumberofLocations,
# so the hypothesis below tests whether that slope is zero when
# NumberofLocations = 100 (the literal 100 in the string).
m5.t_test('NumberofLocations + 2*I(NumberofLocations ** 2)*100 = 0')

<class 'statsmodels.stats.contrast.ContrastResults'>
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0            -0.0597      0.001    -84.505      0.000      -0.061      -0.058

In [20]:
# Model 6: interaction between NumberofLocations and Weekend, controlling for
# Year. The `*` operator in the formula yields both main effects plus the
# NumberofLocations:Weekend interaction term.
m6 = sm.ols(
    formula='inspection_score ~ NumberofLocations*Weekend + Year',
    data=df_res,
).fit()

In [21]:
# Full statsmodels summary for Model 6; the coefficient names shown here are
# the ones referenced in the t_test hypothesis string for m6.
m6.summary()

0,1,2,3
Dep. Variable:,inspection_score,R-squared:,0.069
Model:,OLS,Adj. R-squared:,0.069
Method:,Least Squares,F-statistic:,502.3
Date:,"Sun, 22 Jan 2023",Prob (F-statistic):,0.0
Time:,10:55:49,Log-Likelihood:,-87430.0
No. Observations:,27178,AIC:,174900.0
Df Residuals:,27173,BIC:,174900.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,225.1260,12.415,18.134,0.000,200.793,249.460
Weekend[T.True],1.7592,0.488,3.606,0.000,0.803,2.715
NumberofLocations,-0.0191,0.000,-43.759,0.000,-0.020,-0.018
NumberofLocations:Weekend[T.True],-0.0098,0.008,-1.307,0.191,-0.025,0.005
Year,-0.0648,0.006,-10.494,0.000,-0.077,-0.053

0,1,2,3
Omnibus:,2829.404,Durbin-Watson:,1.931
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3861.648
Skew:,-0.849,Prob(JB):,0.0
Kurtosis:,3.725,Cond. No.,682000.0


In [22]:
# Use t_test to test a linear combination of Model 6's coefficients.
# The sum of the NumberofLocations main effect and the
# NumberofLocations:Weekend[T.True] interaction is the slope on
# NumberofLocations when Weekend is True; the hypothesis tests whether that
# slope is zero.
m6.t_test('NumberofLocations + NumberofLocations:Weekend[T.True] = 0')

<class 'statsmodels.stats.contrast.ContrastResults'>
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0            -0.0289      0.008     -3.851      0.000      -0.044      -0.014

## Mortgage dataset

In [None]:
# Load the mortgages dataset through the same `cdata` loader interface used
# for the restaurant inspections data.
df_mort = cdata.mortgages.load_pandas().data
# Preview the first rows to see the available columns.
df_mort.head()