# HFUL Function

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [29]:
def HFUL_LinIV_OLS(y,X,Z,ZpZ):
    """Estimate Beta using HFUL, 2SLS, and OLS"""
    n = y.shape[0]
    k = X.shape[1]
    # Prep some matrices
    Z_pseudoinv = np.linalg.inv(ZpZ) @ Z.T
    X_bar = np.column_stack((y, X))
    # Calculate alpha hat
    P_diag = np.zeros(n) # create vector that has diagonal terms of projection matrix P, initialize with zeros
    for i in range(n):
        P_diag[i] = np.dot(Z[i, :], Z_pseudoinv[:, i])
    PXbarXbar = np.zeros((X_bar.shape[1], X_bar.shape[1]))
    for i in range(n):
        PXbarXbar = PXbarXbar + P_diag[i] * np.outer(X_bar[i, :], X_bar[i, :])
    A = np.linalg.inv(X_bar.T @ X_bar) @ (X_bar.T @ Z @ Z_pseudoinv @ X_bar - PXbarXbar)
    w, _ = np.linalg.eig(A)
    alpha_tilde = np.amin(w) #get the smallest eigenvalue
    alpha_hat = ((n+1) * alpha_tilde - 1)/(n + alpha_tilde - 1)
    # Now calculate b HFUL
    PXX = np.zeros((X.shape[1], X.shape[1]))
    PXy = np.zeros(k)
    for i in range(n):
        PXX = PXX + P_diag[i] * np.outer(X[i, :], X[i, :])
    for i in range(n):
        PXy = PXy + P_diag[i] * X[i, :] * y[i]
    b_hful_term1 = (X.T @ Z) @ (Z_pseudoinv @ X) - PXX - (alpha_hat * X.T @ X)
    b_hful_term2 = (X.T @ Z) @ (Z_pseudoinv @ y) - PXy - (alpha_hat * X.T @ y)
    b_hful = np.linalg.inv(b_hful_term1) @ b_hful_term2
    # Now calculate linear IV estimate
    b_liniv = np.linalg.inv(X.T @ Z @ Z_pseudoinv @ X) @ X.T @ Z @ Z_pseudoinv @ y
    # Finally, the good ol OLS estimates
    b_ols = np.linalg.inv(X.T @ X) @ X.T @ y
    # Put them all together and return.
    return np.array([b_hful[1], b_liniv[1], b_ols[1]])

# Data Prep

In [3]:
df = pd.read_stata('C:/Users/jacob/ARE212_Materials/angrist-krueger91.dta')

In [4]:
df.head()

Unnamed: 0,ageq,edu,logwage,married,state,qob,black,smsa,yob,region
0,47.0,12,6.245846,1,1,1,1,1,1933,0.0
1,46.25,12,5.847161,1,48,4,1,1,1933,0.0
2,50.0,12,6.645516,1,2,1,1,1,1930,0.0
3,47.0,16,6.706133,1,22,1,1,1,1933,0.0
4,42.25,14,6.357876,1,42,4,1,1,1937,0.0


1. Run regressions for (1) and (2) in Table V
We regress $logwage$ on $edu$, controlling for year of birth. For (2), we instrument $edu$ with 30 yob*qob dummies.

In [57]:
yob_dum[1930+yr]*qob_dum[c]

0         0
1         0
2         0
3         0
4         0
         ..
329504    0
329505    0
329506    0
329507    0
329508    0
Length: 329509, dtype: uint8

In [68]:
# Prepare Z matrix: yob dummies and yob*qob dummies
yob_dum = pd.get_dummies(df['yob'])
qob_dum = pd.get_dummies(df['qob'])
# Make interactions terms
qob_yob_interact = yob_dum
for yr in range(10):
    for c in range(1,5):
        qob_yob_interact[f"y3{yr}q{c}"] = yob_dum[1930+yr]*qob_dum[c]
# combine year dummies with 30 qob*yob dummies
qob_yob_interact.drop(qob_yob_interact.iloc[:, 0:10], inplace = True, axis = 1)
Z = qob_yob_interact.to_numpy()

In [65]:
qob_yob_interact.drop(qob_yob_interact.iloc[:, 0:10], inplace = True, axis = 1)
qob_yob_interact

Unnamed: 0,y30q1,y30q2,y30q3,y30q4,y31q1,y31q2,y31q3,y31q4,y32q1,y32q2,...,y37q3,y37q4,y38q1,y38q2,y38q3,y38q4,y39q1,y39q2,y39q3,y39q4
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329504,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
329505,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
329506,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
329507,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
def ZpZij(i,j,Z):
    N = Z.shape[0]
    zpi = Z[:,i]
    zj = Z[:,j]
    ZpZij = 0
    for row in range(N):
        ZpZij = ZpZij + zpi[row]*zj[row]
    return ZpZij

In [20]:
def ZpZ(Z):
    N = Z.shape[0]
    l = Z.shape[1]
    ZpZ = np.zeros((l,l))
    for i in range(l):
        for j in range(l):
            ZpZ[i,j] = ZpZij(i,j,Z)
    return ZpZ

In [33]:
#qob_yob_interact.sum(axis=0)

In [22]:
%%time
ZpZ = ZpZ(Z)

Wall time: 6min 14s


In [41]:
ZpZ.shape

(40, 40)

In [50]:
ZpZ_00 = 0
for i in range(N):
    ZpZ_00 = ZpZ_00 + Z[i,0]*Z[i,0]
ZpZ_00

IndexError: too many indices for array

In [42]:
sum(Z[:,0])

33602

In [44]:
Z.T @ Z

array([[ 66,   0,   0, ...,   0,   0,   0],
       [  0, 119,   0, ...,   0,   0,   0],
       [  0,   0, 211, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ..., 252,   0,   0],
       [  0,   0,   0, ...,   0, 135,   0],
       [  0,   0,   0, ...,   0,   0, 211]], dtype=uint8)

In [43]:
np.dot(Z[:,0],Z[:,0])

66

In [27]:
# y vector: log wages
y = df['logwage'].to_numpy()
# Prepare X matrix: education and yob dummies
X = np.column_stack((df['edu'].to_numpy(), yob_dum.to_numpy()))

In [30]:
%%time
results = HFUL_LinIV_OLS(y,X,Z,ZpZ)
results

array([ 4.93106953e+00, -7.21369732e+03,  8.45727079e+02])