In [1]:
# Set working directory
cd("C:/Users/54116/OneDrive - UBC/PhD - UBC/1. Metrics II/4. psets/9")

In [2]:
using Parameters, Optim, ForwardDiff, LinearAlgebra, Distributions, Random, PrettyTables, Plots, LaTeXStrings, QuadGK, Roots, JLD2, Statistics

In [3]:
Random.seed!(1405);

#### Question 2 - Monte Carlo experiments for ATE in a 2-step estimation model

In [4]:
F(w)=cdf(Normal(0,1),w);
G(w)=pdf(Normal(0,1),w);

In [6]:
function dgp(n,τ0,α0,β0,γ0,δ0,μ0) # τ0 must be 2x1
    dist1 = Normal(μ0, 1)
    X2 = rand(dist1,n)
    v0 = randn(n)
    v1 = randn(n)
    X1 = ones(n)
    X = [X1 X2]
    ϵ = randn(n)
    D = ((X*τ0 .>= ϵ).*1.0)[:,1] #this way D is a vector{float} instead of matrix - and CF provides float which optmin likes
    Px = F.(X*τ0)
    Y0 = α0 .+ β0*Px .+ v0
    Y1 = γ0 .+ δ0*Px .+ v1
    Y = (D .>= 0).*(D .<= 0).*Y0 + (D.>=1) .* (D.<=1) .*Y1
    return (X1=X1, X2=X2, X=X, D=D, Y=Y)
end;

In [7]:
function cf_probit(τ,X1,X2,D)
n = length(D)    
Qn = - (1/n) * D'*log.(F(τ[1]*X1 .+ τ[2]*X2)) - (1/n) * (1 .- D)' * log.(1 .- F(τ[1]*X1 .+ τ[2]*X2))
    return (Qn = Qn)
end

cf_probit (generic function with 1 method)

In [8]:
function mle_probit(X1,X2,D)
n = length(D)
result = optimize(τ->cf_probit(τ,X1,X2,D),[0.0;0.0;])
τ_n = Optim.minimizer(result)
    return (τ_n = τ_n)
end

mle_probit (generic function with 1 method)

In [97]:
function twostep(X1,X2,X,D,Y)
#first, estimate parameters
n = length(D)
τ_n = mle_probit(X1,X2,D)
Z = [ones(n) F.(X*τ_n) D (F.(X*τ_n).-Statistics.mean(F.(X*τ_n))).*D]
θ_n = inv(Z'Z)*Z'Y
β_n = θ_n[2]
δβ_n = θ_n[4]    

#now for the standard error
B_n = (Z'*Z)/n #4x4
Ω_n = (Z'*(((Y - Z*θ_n).^2).*Z))/n  #4x4 - need to broadcast residuals nx1 to the Z matrix nx4
s_n = (D.*(G.(X*τ_n)./F.(X*τ_n)) - (1 .- D).*(G.(X*τ_n)./(1 .- F.(X*τ_n)))).*X # nx2
C_n = (s_n'*s_n)/n # 2x2
meanvector = [Statistics.mean(G.(X*τ_n).*X[:,1]) Statistics.mean(G.(X*τ_n).*X[:,2])]
Δ_n = (Z'*(β_n * G.(X*τ_n) .* X + δβ_n * (G.(X*τ_n).*X .- meanvector)))/n #4x2
V_θ = inv(B_n)*(Ω_n+Δ_n*inv(C_n)*Δ_n')*inv(B_n)
se_θ = sqrt(V_θ[3,3]/n)
V_not_adjusted = inv(B_n)*(Ω_n)*inv(B_n)
se_not = sqrt(V_not_adjusted[3,3]/n)
    
#intervals
ci_lower_θ = quantile(Normal(), 0.1 / 2) .* se_θ .+ θ_n[3]
ci_upper_θ = quantile(Normal(), 1 - 0.1 / 2) .* se_θ .+ θ_n[3]
ci_lower_not = quantile(Normal(), 0.1 / 2) .* se_not .+ θ_n[3]
ci_upper_not = quantile(Normal(), 1 - 0.1 / 2) .* se_not .+ θ_n[3]    
        
    return (θ_n=θ_n[3], se_θ=se_θ, se_not=se_not, ci_upper_θ, ci_lower_θ, ci_upper_not, ci_lower_not) #
end

twostep (generic function with 1 method)

In [218]:
#X1,X2,X,D,Y=dgp(100,[-0.5 2]',1,1,1,1,1);
#cf_probit([-0.5, 2],X1,X2,D)
#n = length(D)
#result = optimize(τ->cf_probit(τ,X1,X2,D),[0.0;0.0;])
#mle_probit(X1,X2,D)
twostep(X1,X2,X,D,Y)

(θ_n = 0.16937039222064915, se_θ = 0.6422678366268927, se_not = 0.6368502244755219, ci_upper_θ = 1.2258069727706689, ci_lower_θ = -0.8870661883293711, ci_upper_not = 1.21689579377407, ci_lower_not = -0.8781550093327724)

In [473]:
function boostrap(X1,X2,D,Y,B,η)
n=length(D)
θ_boot = zeros(B)
    
for i=1:B
        
sampling = sample([1:n;],n,replace=true)
X2_b = X2[sampling]
D_b = D[sampling]
Y_b = Y[sampling]
X = [ones(n) X2_b]
        
@unpack θ_n = twostep(X1,X2_b,X,D_b,Y_b)
        
θ_boot[i] = θ_n
end
    
se_boot = sqrt((sum((θ_boot .- Statistics.mean(θ_boot)).^2))/B)
    
θ_range = sort(θ_boot)    

boot_lower_ci = θ_range[Int(η/2*(B+1))]
 
boot_upper_ci = θ_range[Int((1-η/2)*(B+1))]
    
return (θ_boot=θ_boot, se_boot=se_boot, boot_lower_ci=boot_lower_ci, boot_upper_ci=boot_upper_ci)
end

boostrap (generic function with 1 method)

In [604]:
B = 999
n=length(D)
θ_boot = zeros(B)
    
for i=1:B
    
sampling = sample([1:n;],n,replace=true)
X2_b = X2[sampling]
D_b = D[sampling]
Y_b = Y[sampling]
X = [ones(n) X2_b]
        
@unpack θ_n = twostep(X1,X2_b,X,D_b,Y_b)
        
θ_boot[i] = θ_n
end
    
se_boot = sqrt((sum((θ_boot .- Statistics.mean(θ_boot)).^2))/B)
    
θ_range = sort(θ_boot)    

boot_lower_ci = θ_range[Int(0.1/2*(B+1))]
 
boot_upper_ci = θ_range[Int((1-0.1/2)*(B+1))];

In [600]:
# esto lo corre bien - el problema está en el loop
sampling = sample([1:n;],n,replace=true)
X2_b = X2[sampling]
D_b = D[sampling]
Y_b = Y[sampling]
X = [ones(n) X2_b]
        
@unpack θ_n = twostep(X1,X2_b,X,D_b,Y_b)

(θ_n = -0.18547091578576413, se_θ = 0.8876267085483491, se_not = 0.878026105518953, ci_upper_θ = 1.274545095148985, ci_lower_θ = -1.6454869267205137, ci_upper_not = 1.2587535084351613, ci_lower_not = -1.6296953400066903)

In [602]:
#se_boot, boot_lower_ci, boot_upper_ci
θ_boot
θ_range
θ_range[Int(0.1/2*(B+1))]
θ_range[Int((1-0.1/2)*(B+1))]

2.8776727198724075

In [459]:
α = 1
γ = 1
β = 1
δ = 1
ATE = γ-α+(δ-β)*(quadgk(z->F(1*(-0.5)+2*z)*G(z),-Inf,Inf,atol=1e-8)[1])

0.0

In [460]:
# coverage probabilities

function intervals(r,b)

R = r # put 1,000
    
InsideCI_2step = 0
InsideCI_not = 0
InsideCI_boot = 0
    
for i=1:R
    X1,X2,X,D,Y = dgp(100,[-0.5,2],1,1,1,1,1)
    
    @unpack θ_n, se_θ, se_not, ci_lower_θ, ci_upper_θ, ci_lower_not, ci_upper_not = twostep(X1,X2,X,D,Y);
    InsideCI_2step += ci_upper_θ > 0 && ci_lower_θ < 0 ? 1.0 : 0.0 ### true value of theta[3], ATE, is 0
    InsideCI_not += ci_upper_not > 0 && ci_lower_not < 0 ? 1.0 : 0.0 ###
        
    @unpack boot_lower_ci, boot_upper_ci = boostrap(X1,X2,D,Y,b,0.1)  ### b plays here
    InsideCI_boot += boot_upper_ci > 0 && boot_lower_ci < 0 ? 1.0 : 0.0 ###   
        
end
    return (twostep = InsideCI_2step/R, not_adjusted = InsideCI_not/R, bootstrap = InsideCI_boot/R)
end

intervals (generic function with 1 method)

In [472]:
intervals(100,999)

(twostep = 0.92, not_adjusted = 0.9, bootstrap = 1.0)