## Week 13 notebook

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
import sklearn as skl

### Question 1

In [2]:
# Read in 13.1
df_13_1 = pd.read_csv(r'C:\Users\JT von Seggern\DS Masters Repos\2025-summer-mod-6\homework_13.1.csv')
df_13_1.drop(columns=['Unnamed: 0'], inplace=True)
df_13_1.head()

Unnamed: 0,Z,X,Y
0,1,0,0.056584
1,1,1,1.387519
2,0,1,4.289658
3,0,1,0.743003
4,0,1,2.718527


In [9]:
# Instrumental Variables approach: Effect of X on Z using Y as instrument
treatment = df_13_1['X']    # Endogenous variable
instrument = df_13_1['Y']   # Instrument
outcome = df_13_1['Z']      # Outcome variable

print("=== INSTRUMENTAL VARIABLES: EFFECT OF X ON Z ===")
print("Using Y as instrument for X")
print()

# First stage: Regress X on instrument Y
# X = α₀ + α₁Y + u
first_stage = sm.OLS(treatment, sm.add_constant(instrument)).fit()
print("=== FIRST STAGE: X ~ Y ===")
print(f"Instrument coefficient (α₁): {first_stage.params[1]:.4f}")
print(f"F-statistic: {first_stage.fvalue:.4f}")
print(f"R-squared: {first_stage.rsquared:.4f}")

# Check instrument relevance
if first_stage.fvalue > 10:
    print("✓ Instrument appears relevant (F > 10)")
else:
    print("⚠ Weak instrument concern (F < 10)")
print()

# Get predicted values of X from first stage
X_hat = first_stage.fittedvalues

# Second stage: Regress Z on predicted X
# Z = β₀ + β₁X̂ + v
X_hat_df = pd.DataFrame({'X_hat': X_hat})
X_hat_with_const = sm.add_constant(X_hat_df)

second_stage = sm.OLS(outcome, X_hat_with_const).fit()
print("=== SECOND STAGE: Z ~ X̂ ===")
print(f"IV estimate of X effect on Z: {second_stage.params['X_hat']:.4f}")
print(f"Standard error: {second_stage.bse['X_hat']:.4f}")
print(f"T-statistic: {second_stage.tvalues['X_hat']:.4f}")
print(f"P-value: {second_stage.pvalues['X_hat']:.6f}")
print()

=== INSTRUMENTAL VARIABLES: EFFECT OF X ON Z ===
Using Y as instrument for X

=== FIRST STAGE: X ~ Y ===
Instrument coefficient (α₁): 0.0093
F-statistic: 19.4423
R-squared: 0.0019
✓ Instrument appears relevant (F > 10)

=== SECOND STAGE: Z ~ X̂ ===
IV estimate of X effect on Z: 0.2220
Standard error: 0.2331
T-statistic: 0.9524
P-value: 0.340909



  print(f"Instrument coefficient (α₁): {first_stage.params[1]:.4f}")


### Question 2

In [11]:
# Read in 13.2
df_13_2 = pd.read_csv(r'C:\Users\JT von Seggern\DS Masters Repos\2025-summer-mod-6\homework_13.2.csv')
df_13_2.drop(columns=['Unnamed: 0'], inplace=True)
df_13_2.head()

Unnamed: 0,Z2,X2,Y2
0,0.215107,-2.069048,3.304559
1,0.459804,0.647129,-1.12253
2,-1.015782,-0.167446,-1.27034
3,0.84965,0.424037,-1.835687
4,0.157479,0.224863,-1.105033


In [13]:
# Use stage least squares to find the effect of X2 on Y2
treatment_2 = df_13_2['X2']    # Endogenous variable
instrument_2 = df_13_2['Z2']   # Instrument
outcome_2 = df_13_2['Y2']      # Outcome variable

# Two-Stage Least Squares (2SLS) approach:

# First stage: Regress treatment (X2) on instrument (Z2)
# X2 = α₀ + α₁Y2 + u
first_stage_2 = sm.OLS(treatment_2, sm.add_constant(instrument_2)).fit()
print("=== FIRST STAGE RESULTS ===")
print(f"Instrument coefficient: {first_stage_2.params[1]:.4f}")
print(f"F-statistic: {first_stage_2.fvalue:.4f}")
print(f"R-squared: {first_stage_2.rsquared:.4f}")
print()

# Check instrument relevance (F-stat > 10 rule of thumb)
if first_stage_2.fvalue > 10:
    print("✓ Instrument appears relevant (F > 10)")
else:
    print("⚠ Weak instrument concern (F < 10)")

# Second Stage: Regress outcome (Y2) on predicted treatment (X2_hat)
# Y2 = β₀ + β₁X2_hat + v
X2_hat = first_stage_2.fittedvalues
X2_hat_df = pd.DataFrame({'X2_hat': X2_hat})
X2_hat_with_const = sm.add_constant(X2_hat_df)

second_stage_2 = sm.OLS(outcome_2, X2_hat_with_const).fit()
print("=== SECOND STAGE RESULTS ===")
print(f"IV estimate of X2 effect on Z2: {second_stage_2.params['X2_hat']:.4f}")
print(f"Standard error: {second_stage_2.bse['X2_hat']:.4f}")
print(f"T-statistic: {second_stage_2.tvalues['X2_hat']:.4f}")
print(f"P-value: {second_stage_2.pvalues['X2_hat']:.6f}")
print()

=== FIRST STAGE RESULTS ===
Instrument coefficient: 1.0002
F-statistic: 4803.5038
R-squared: 0.3245

✓ Instrument appears relevant (F > 10)
=== SECOND STAGE RESULTS ===
IV estimate of X2 effect on Z2: -2.5026
Standard error: 0.0279
T-statistic: -89.6943
P-value: 0.000000



  print(f"Instrument coefficient: {first_stage_2.params[1]:.4f}")


### Question 3

In [14]:
df_13_3 = pd.read_csv(r'C:\Users\JT von Seggern\DS Masters Repos\2025-summer-mod-6\homework_13.3.csv')
df_13_3.drop(columns=['Unnamed: 0'], inplace=True)
df_13_3.head()

Unnamed: 0,Z3,X3,ZW_int,W3,Y3
0,0.560354,1.787329,0.690434,1.232138,2.16741
1,0.352335,0.035811,-0.147173,-0.417707,0.110531
2,-0.459113,-0.12572,-0.22268,0.485023,-0.550161
3,-1.042234,3.866518,-2.082372,1.997988,5.568673
4,-0.770318,0.402082,0.275651,-0.35784,1.360916


In [17]:
# Analyze the relationship between Z3, W3, and X3 to identify compliers
print("=== IDENTIFYING COMPLIERS IN df_13_3 ===")
print()

# First, let's examine the data structure
print("Data overview:")
print(df_13_3.describe())
print()

# Create the interaction term ZW_int = Z3 * W3
df_13_3['ZW_int'] = df_13_3['Z3'] * df_13_3['W3']

# Run regression: X3 = β0 + β1*Z3 + β2*W3 + β3*(Z3*W3) + ε
X = df_13_3[['Z3', 'W3', 'ZW_int']]
X_with_const = sm.add_constant(X)
y = df_13_3['X3']

model = sm.OLS(y, X_with_const).fit()
print("=== REGRESSION RESULTS: X3 ~ Z3 + W3 + Z3*W3 ===")
print(model.summary())
print()

# Extract coefficients
beta_0 = model.params['const']
beta_1 = model.params['Z3']  # Main effect of Z3
beta_2 = model.params['W3']  # Main effect of W3  
beta_3 = model.params['ZW_int']  # Interaction effect

print(f"Intercept (β0): {beta_0:.4f}")
print(f"Z3 coefficient (β1): {beta_1:.4f}")
print(f"W3 coefficient (β2): {beta_2:.4f}")
print(f"Z3*W3 interaction (β3): {beta_3:.4f}")
print()

# Calculate the effect of changing Z3 from 0 to 1 for each individual
print("=== IDENTIFYING COMPLIERS ===")
print("A complier is someone for whom changing Z3 from 0→1 increases X3")
print()

# For each person, calculate: E[X3|Z3=1, W3] - E[X3|Z3=0, W3]
# This equals: β1 + β3*W3

df_13_3['treatment_effect'] = beta_1 + beta_3 * df_13_3['W3']

print("Treatment effect of Z3 on X3 = β1 + β3*W3")
print(f"= {beta_1:.4f} + {beta_3:.4f}*W3")
print()

# Compliers have positive treatment effect
df_13_3['is_complier'] = df_13_3['treatment_effect'] > 0

print("Summary of treatment effects:")
print(f"Mean treatment effect: {df_13_3['treatment_effect'].mean():.4f}")
print(f"Number of compliers: {df_13_3['is_complier'].sum()}")
print(f"Number of non-compliers: {(~df_13_3['is_complier']).sum()}")
print()

# Analyze the pattern
print("=== PATTERN ANALYSIS ===")
complier_analysis = df_13_3.groupby(['is_complier']).agg({
    'W3': ['mean', 'min', 'max'],
    'Z3': ['mean', 'min', 'max'],
    'treatment_effect': ['mean', 'min', 'max']
}).round(4)

print("Complier vs Non-complier characteristics:")
print(complier_analysis)
print()

# Check the relationship with W3 sign
print("=== RELATIONSHIP WITH W3 SIGN ===")
df_13_3['W3_positive'] = df_13_3['W3'] > 0
df_13_3['W3_negative'] = df_13_3['W3'] < 0

cross_tab = pd.crosstab(df_13_3['is_complier'], 
                       [df_13_3['W3_positive'], df_13_3['W3_negative']], 
                       margins=True)
print("Cross-tabulation: Complier status vs W3 sign")
print(cross_tab)
print()

# Determine the answer
if beta_3 > 0:
    print("Since β3 > 0:")
    print("Treatment effect = β1 + β3*W3 is positive when W3 > -β1/β3")
    threshold = -beta_1/beta_3
    print(f"Threshold: W3 > {threshold:.4f}")
    if threshold < 0:
        print("Since threshold < 0, compliers are those with W3 > 0 (positive W3)")
        answer = "Option B: The compliers are those with positive numbers in the W3 column."
    else:
        print("Since threshold > 0, compliers are those with W3 > threshold")
        answer = "Need to check specific threshold"
elif beta_3 < 0:
    print("Since β3 < 0:")
    print("Treatment effect = β1 + β3*W3 is positive when W3 < -β1/β3")  
    threshold = -beta_1/beta_3
    print(f"Threshold: W3 < {threshold:.4f}")
    if threshold > 0:
        print("Since threshold > 0, compliers are those with W3 < 0 (negative W3)")
        answer = "Option C: The compliers are those with negative numbers in the W3 column."
    else:
        print("Since threshold < 0, compliers are those with W3 < threshold")
        answer = "Need to check specific threshold"
else:
    print("β3 = 0, so treatment effect depends only on β1")
    answer = "Treatment effect is constant across all individuals"

print(f"\nCONCLUSION: {answer}")

# Verify by checking actual data
print("\n=== VERIFICATION ===")
pos_w3_compliers = df_13_3[df_13_3['W3'] > 0]['is_complier'].mean()
neg_w3_compliers = df_13_3[df_13_3['W3'] < 0]['is_complier'].mean()

print(f"Proportion of compliers among those with positive W3: {pos_w3_compliers:.3f}")
print(f"Proportion of compliers among those with negative W3: {neg_w3_compliers:.3f}")

=== IDENTIFYING COMPLIERS IN df_13_3 ===

Data overview:
                 Z3            X3        ZW_int            W3            Y3
count  10000.000000  10000.000000  10000.000000  10000.000000  10000.000000
mean      -0.007059      0.018829      0.004269      0.030900      0.057918
std        0.989241      2.000498      0.996460      1.004483      2.850070
min       -3.831518    -14.297329     -8.538433     -3.546323    -15.234137
25%       -0.675826     -1.026753     -0.358618     -0.648601     -1.650148
50%       -0.011101      0.225545      0.004038      0.043331      0.325535
75%        0.651741      1.311780      0.376674      0.716972      1.966483
max        3.834986      9.249093      8.105167      3.830984     10.664392

=== REGRESSION RESULTS: X3 ~ Z3 + W3 + Z3*W3 ===
                            OLS Regression Results                            
Dep. Variable:                     X3   R-squared:                       0.749
Model:                            OLS   Adj. R-squa

### Question 4

In [18]:
num = 1000
Z4 = np.random.normal(0, 1, num)
W4 = np.random.normal(0, 1, num)
ZW_int = W4 * Z4
X4 = Z4 + W4 - 2 * ZW_int + np.random.normal(0, 1, num)
Y4 = W4 + X4 - Z4 + np.random.normal(0, 1, num)

df_13_4 = pd.DataFrame({'Z4': Z4, 'W4': W4, 'X4': X4, 'Y4': Y4})

In [19]:
# Analyze instrument validity for Z4
print("=== INSTRUMENTAL VARIABLE VALIDITY CHECK ===")
print("Checking if Z4 is a valid instrument for X4 in the Y4 equation")
print()

# Display the data generation process
print("Data Generation Process:")
print("Z4 = N(0,1)")
print("W4 = N(0,1)")
print("ZW_int = W4 * Z4")
print("X4 = Z4 + W4 - 2*ZW_int + ε₁")
print("Y4 = W4 + X4 - Z4 + ε₂")
print()

# Substitute X4 into Y4 equation
print("Substituting X4 into Y4:")
print("Y4 = W4 + (Z4 + W4 - 2*ZW_int) - Z4 + ε₂")
print("Y4 = W4 + Z4 + W4 - 2*W4*Z4 - Z4 + ε₂")
print("Y4 = 2*W4 - 2*W4*Z4 + ε₂")
print("Y4 = 2*W4*(1 - Z4) + ε₂")
print()

# Check the three conditions for instrument validity
print("=== INSTRUMENT VALIDITY CONDITIONS ===")
print()

print("1. RELEVANCE: Is Z4 correlated with X4?")
# From X4 = Z4 + W4 - 2*ZW_int + ε₁ = Z4 + W4 - 2*W4*Z4 + ε₁
first_stage = sm.OLS(df_13_4['X4'], sm.add_constant(df_13_4['Z4'])).fit()
print(f"First stage: X4 ~ Z4")
print(f"Coefficient: {first_stage.params[1]:.4f}")
print(f"F-statistic: {first_stage.fvalue:.4f}")
print(f"P-value: {first_stage.pvalues[1]:.6f}")
if first_stage.fvalue > 10:
    print("✓ RELEVANT: Z4 is significantly correlated with X4")
else:
    print("✗ WEAK: Z4 is weakly correlated with X4")
print()

print("2. EXCLUSION: Does Z4 affect Y4 only through X4?")
print("From the structural equations:")
print("Y4 = 2*W4*(1 - Z4) + ε₂")
print("This shows Z4 directly affects Y4, not just through X4!")
print("✗ EXCLUSION VIOLATED: Z4 has a direct effect on Y4")
print()

print("3. EXOGENEITY: Is Z4 uncorrelated with error term in Y4 equation?")
print("Z4 ~ N(0,1) independently, so this condition is likely satisfied")
print("✓ EXOGENOUS: Z4 appears uncorrelated with error terms")
print()

# Empirical tests
print("=== EMPIRICAL TESTS ===")
print()

# Test exclusion restriction
print("Testing exclusion restriction:")
print("If valid, Z4 should not affect Y4 when controlling for X4")

# Reduced form: Y4 ~ Z4 (should be significant if instrument is relevant)
reduced_form = sm.OLS(df_13_4['Y4'], sm.add_constant(df_13_4['Z4'])).fit()
print(f"Reduced form (Y4 ~ Z4): coef = {reduced_form.params[1]:.4f}, p = {reduced_form.pvalues[1]:.6f}")

# Test exclusion: Y4 ~ X4 + Z4 (Z4 should be insignificant if exclusion holds)
exclusion_test = sm.OLS(df_13_4['Y4'], sm.add_constant(df_13_4[['X4', 'Z4']])).fit()
print(f"Exclusion test (Y4 ~ X4 + Z4):")
print(f"  X4 coefficient: {exclusion_test.params['X4']:.4f}, p = {exclusion_test.pvalues['X4']:.6f}")
print(f"  Z4 coefficient: {exclusion_test.params['Z4']:.4f}, p = {exclusion_test.pvalues['Z4']:.6f}")

if exclusion_test.pvalues['Z4'] < 0.05:
    print("✗ EXCLUSION VIOLATED: Z4 is significant when controlling for X4")
else:
    print("✓ EXCLUSION SATISFIED: Z4 is not significant when controlling for X4")
print()

# The real test: include W4 as a control
print("Including W4 as control (since it's a confounder):")
full_test = sm.OLS(df_13_4['Y4'], sm.add_constant(df_13_4[['X4', 'Z4', 'W4']])).fit()
print(f"Full model (Y4 ~ X4 + Z4 + W4):")
print(f"  X4 coefficient: {full_test.params['X4']:.4f}, p = {full_test.pvalues['X4']:.6f}")
print(f"  Z4 coefficient: {full_test.params['Z4']:.4f}, p = {full_test.pvalues['Z4']:.6f}")
print(f"  W4 coefficient: {full_test.params['W4']:.4f}, p = {full_test.pvalues['W4']:.6f}")

# Final conclusion
print("\n=== CONCLUSION ===")
print("Z4 is INVALID as an instrument because:")
print("- It violates the EXCLUSION RESTRICTION")
print("- Z4 appears directly in the Y4 equation: Y4 = 2*W4*(1 - Z4) + ε₂")
print("- Z4 affects Y4 through channels other than X4")
print()
print("The correct answer is: Option C - It is invalid because Z4 affects Y4")

# Show the structural relationship
print("\n=== STRUCTURAL RELATIONSHIPS ===")
print("Y4 = W4 + X4 - Z4 + ε₂")
print("This directly shows Z4 affecting Y4 with coefficient -1")
print("This violates the exclusion restriction for instrumental variables")


=== INSTRUMENTAL VARIABLE VALIDITY CHECK ===
Checking if Z4 is a valid instrument for X4 in the Y4 equation

Data Generation Process:
Z4 = N(0,1)
W4 = N(0,1)
ZW_int = W4 * Z4
X4 = Z4 + W4 - 2*ZW_int + ε₁
Y4 = W4 + X4 - Z4 + ε₂

Substituting X4 into Y4:
Y4 = W4 + (Z4 + W4 - 2*ZW_int) - Z4 + ε₂
Y4 = W4 + Z4 + W4 - 2*W4*Z4 - Z4 + ε₂
Y4 = 2*W4 - 2*W4*Z4 + ε₂
Y4 = 2*W4*(1 - Z4) + ε₂

=== INSTRUMENT VALIDITY CONDITIONS ===

1. RELEVANCE: Is Z4 correlated with X4?
First stage: X4 ~ Z4
Coefficient: 0.8786
F-statistic: 136.8337
P-value: 0.000000
✓ RELEVANT: Z4 is significantly correlated with X4

2. EXCLUSION: Does Z4 affect Y4 only through X4?
From the structural equations:
Y4 = 2*W4*(1 - Z4) + ε₂
This shows Z4 directly affects Y4, not just through X4!
✗ EXCLUSION VIOLATED: Z4 has a direct effect on Y4

3. EXOGENEITY: Is Z4 uncorrelated with error term in Y4 equation?
Z4 ~ N(0,1) independently, so this condition is likely satisfied
✓ EXOGENOUS: Z4 appears uncorrelated with error terms

=== EMP

  print(f"Coefficient: {first_stage.params[1]:.4f}")
  print(f"P-value: {first_stage.pvalues[1]:.6f}")
  print(f"Reduced form (Y4 ~ Z4): coef = {reduced_form.params[1]:.4f}, p = {reduced_form.pvalues[1]:.6f}")
  print(f"Reduced form (Y4 ~ Z4): coef = {reduced_form.params[1]:.4f}, p = {reduced_form.pvalues[1]:.6f}")
