In [1]:

# 1. Packages

import statsmodels.api as sm
import statsmodels.formula.api as smf
import linearmodels.iv.model as lm

In [2]:
# 2. Data
# Pull the "HousePrices" dataset of the R package "AER" through the
# statsmodels Rdatasets helper, keep the returned dataset object, and work
# with its `.data` table from here on.
houseprices_object = sm.datasets.get_rdataset(
    dataname="HousePrices",
    package="AER",
    cache=True,
)
houseprices = houseprices_object.data

# Quick look at the first rows (price, lotsize, bedrooms, ... prefer).
print(houseprices.head())



   price  lotsize  bedrooms  bathrooms  stories driveway recreation fullbase  \
0  42000     5850         3          1        2      yes         no      yes   
1  38500     4000         2          1        1      yes         no       no   
2  49500     3060         3          1        1      yes         no       no   
3  60500     6650         3          1        2      yes        yes       no   
4  61000     6360         2          1        1      yes         no       no   

  gasheat aircon  garage prefer  
0      no     no       1     no  
1      no     no       0     no  
2      no     no       0     no  
3      no     no       0     no  
4      no     no       0     no  


In [3]:
# print(houseprices_object.__doc__)

In [4]:
# 3. Model
# Multiple linear regression fitted by OLS through the formula interface:
# price explained by lotsize and bedrooms (the formula adds the intercept).
mlr1 = smf.ols(
    formula="price ~ lotsize + bedrooms",
    data=houseprices,
).fit()

In [6]:
# Display the results table of the fitted OLS model `mlr1` (fit statistics,
# coefficient estimates and residual diagnostics). Kept as the bare last
# expression so the notebook renders it as the cell's output.
mlr1.summary()

"""The condition number is large, 2.6e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
"""

0,1,2,3
Dep. Variable:,price,R-squared:,0.37
Model:,OLS,Adj. R-squared:,0.368
Method:,Least Squares,F-statistic:,159.6
Date:,"Thu, 24 Oct 2024",Prob (F-statistic):,2.95e-55
Time:,14:21:37,Log-Likelihood:,-6213.1
No. Observations:,546,AIC:,12430.0
Df Residuals:,543,BIC:,12450.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5612.5997,4102.819,1.368,0.172,-2446.741,1.37e+04
lotsize,6.0530,0.424,14.265,0.000,5.219,6.887
bedrooms,1.057e+04,1247.676,8.470,0.000,8116.488,1.3e+04

0,1,2,3
Omnibus:,77.789,Durbin-Watson:,1.193
Prob(Omnibus):,0.0,Jarque-Bera (JB):,146.854
Skew:,0.833,Prob(JB):,1.29e-32
Kurtosis:,4.919,Cond. No.,26000.0


In [7]:
# 4. Exogeneity
# TSLS (two-stage least squares)
# Append a "const" column to the data (prepend=False puts it last) so the
# intercept can be passed explicitly as an exogenous regressor.
mdatac = sm.add_constant(data=houseprices, prepend=False)

# price is the dependent variable; const and bedrooms are treated as
# exogenous; lotsize is treated as endogenous and instrumented with
# driveway and garage.
mlr2 = lm.IV2SLS(
    dependent=mdatac["price"],
    exog=mdatac[["const", "bedrooms"]],
    endog=mdatac["lotsize"],
    instruments=mdatac[["driveway", "garage"]],
).fit(cov_type="homoskedastic", debiased=True)

In [8]:
# Exogeneity tests on the TSLS fit `mlr2`: the Wu-Hausman test followed by
# Wooldridge's regression-based test, printed one after the other.
print(mlr2.wu_hausman(), mlr2.wooldridge_regression, sep="\n")

Wu-Hausman test of exogeneity
H0: All endogenous variables are exogenous
Statistic: 50.9308
P-value: 0.0000
Distributed: F(1,542)
Wooldridge's regression test of exogeneity
H0: Endogenous variables are exogenous
Statistic: 50.9046
P-value: 0.0000
Distributed: chi2(1)


In [9]:
# Sargan Over-identifying Restrictions Test
# Printed from the TSLS fit `mlr2`, which uses two instruments (driveway,
# garage) for one endogenous regressor (lotsize), i.e. one over-identifying
# restriction.
print(mlr2.sargan)

Sargan's test of overidentification
H0: The model is not overidentified.
Statistic: 0.0477
P-value: 0.8271
Distributed: chi2(1)
