In [1]:
# Further example using different models and different data sets

import pandas as pd
import numpy as np

# Load the nls80 workbook and keep complete cases only: dropna() removes every
# row that has a missing value in any column, so missing entries must be
# recorded in the Excel file in a form that is read as NaN.
DATA_FILE = 'C:/IssamMalki/Python Codes/Predictive Analytics Sessions/nls80.xls'
data = pd.read_excel(DATA_FILE).dropna()

# Control variables, and the instruments (all meant to capture changes in education).
controls = ['hours', 'married', 'exper', 'age']
instruments = ['sibs', 'feduc', 'meduc']

# Describe the data: lwage is the dependent variable, educ is the impact variable.
outcome_and_regressors = ['lwage', 'educ'] + controls
print(data[outcome_and_regressors].describe(percentiles=[]))

# Same summary for the instruments.
print(data[instruments].describe(percentiles=[]))

# Explore the relationship between the endogenous regressor (educ) and the instruments.
print('Correlation matrix')
educ_with_instruments = ['educ'] + instruments
print(data[educ_with_instruments].corr())

# Running OLS regression. Note: the estimation libraries are imported at this stage,
# right before they are used, so that the role of each library is clear.

from statsmodels.api import OLS, add_constant
from linearmodels.iv import IV2SLS

# Set the model


# With this approach the constant must be added by hand: create a column of
# ones and put it at the front of the list of control variables.
data['const'] = 1
controls = ['const', *controls]

# OLS estimation through IV2SLS: with no endogenous regressor and no
# instruments supplied, educ is simply one of the exogenous regressors.
ols_regressors = ['educ'] + controls
ivolsmod = IV2SLS(
    dependent=data['lwage'],
    exog=data[ols_regressors],
    endog=None,
    instruments=None,
)
ols_output = ivolsmod.fit(cov_type='robust')
print(ols_output.summary)

# 2SLS with a single instrument (just-identified model):
# educ is the endogenous regressor and sibs is its only instrument.
iv2slsmod = IV2SLS(
    dependent=data['lwage'],
    exog=data[controls],
    endog=data['educ'],
    instruments=data['sibs'],
)
tsls_output = iv2slsmod.fit(cov_type='robust')
print(tsls_output.summary)

# 2SLS, over-identified model: educ is instrumented with the whole set of
# instruments (the `instruments` list: sibs, feduc, meduc).
iv2slsmod2 = IV2SLS(
    dependent=data['lwage'],
    exog=data[controls],
    endog=data['educ'],
    instruments=data[instruments],
)
tsls2_output = iv2slsmod2.fit(cov_type='robust')
print(tsls2_output.summary)


from linearmodels.iv import IVGMM  # the GMM estimator is a separate class, so it needs its own import

# GMM estimator: same dependent variable, controls and endogenous regressor
# as the 2SLS models, with the full set of instruments (sibs, feduc, meduc).
ivgmmmod = IVGMM(
    dependent=data['lwage'],
    exog=data[controls],
    endog=data['educ'],
    instruments=data[instruments],
)
gmm_output = ivgmmmod.fit(cov_type='robust')
print(gmm_output.summary)

# The IV (2SLS) estimator as defined above is a special case of the GMM estimator in which
# the weighting matrix is W = (Z'Z)^{-1}, where Z contains the exogenous regressors and the instruments.
# W is the matrix of weights applied to the moment conditions used to deal with endogeneity.
# When we change the weights, the estimation output will be different.
# The weighting matrix can be changed. Here I use weights clustered by exper.
# The covariance estimator should usually match the weighting matrix,
# and so clustering is also used here.

# GMM with a clustered weighting matrix. The same exper clusters are passed
# both to the weighting matrix (model) and to the covariance estimator (fit).
exper_clusters = data['exper']
ivgmmmod2 = IVGMM(
    dependent=data['lwage'],
    exog=data[controls],
    endog=data['educ'],
    instruments=data[instruments],
    weight_type='clustered',
    clusters=exper_clusters,
)

gmm2_output = ivgmmmod2.fit(cov_type='clustered', clusters=exper_clusters)
print(gmm2_output.summary)



from linearmodels.iv import IVGMMCUE  # the CUE variant is a separate class, so it needs its own import

# Continuously-updating GMM (CUE) with the full instrument set;
# display=True is passed through to fit() unchanged.
ivgmmcod = IVGMMCUE(
    dependent=data['lwage'],
    exog=data[controls],
    endog=data['educ'],
    instruments=data[instruments],
)
gmmcue_output = ivgmmcod.fit(cov_type='robust', display=True)
print(gmmcue_output.summary)


# Comparing all models side by side

from collections import OrderedDict
from linearmodels.iv.results import compare

# Label -> fitted result, in the order the models were estimated;
# the OrderedDict keeps this order for the comparison table.
res = OrderedDict([
    ('OLS', ols_output),
    ('2SLS-one Z', tsls_output),
    ('2SLS-all Z', tsls2_output),
    ('GMM', gmm_output),
    ('GMM Cluster(exper)', gmm2_output),
    ('GMM-CUE', gmmcue_output),
])
print(compare(res))

            lwage        educ       hours     married       exper         age
count  722.000000  722.000000  722.000000  722.000000  722.000000  722.000000
mean     6.799923   13.663435   44.030471    0.887812   11.333795   32.900277
std      0.419385    2.236755    7.282618    0.315817    4.249265    3.088348
min      4.744932    9.000000   20.000000    0.000000    1.000000   28.000000
50%      6.827629   13.000000   40.000000    1.000000   11.000000   32.000000
max      8.032035   18.000000   80.000000    1.000000   22.000000   38.000000
             sibs       feduc       meduc
count  722.000000  722.000000  722.000000
mean     2.858726   10.254848   10.806094
std      2.250471    3.305757    2.828636
min      0.000000    0.000000    0.000000
50%      2.000000   10.500000   12.000000
max     14.000000   18.000000   18.000000
Correlation matrix
           educ      sibs     feduc     meduc
educ   1.000000 -0.212802  0.427095  0.371762
sibs  -0.212802  1.000000 -0.226889 -0.291255
fed

  1,
