In [1]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.cluster import KMeans
import quantecon as qe
import matplotlib.pyplot as plt
import pickle
import kagglehub
from linearmodels.iv import IVGMM
from linearmodels.iv import IV2SLS

***Load in data***

In [4]:
# Read the pickled 2010 population + crime data from the working directory.
# NOTE: unpickling can execute arbitrary code embedded in the file, so this
# should only ever be pointed at a pickle from a trusted source.
with open('df_2010_controls2_final.pkl', mode='rb') as pkl_file:
    df_2010_controls2_final = pickle.load(pkl_file)

# Work on an independent (deep) copy so the loaded frame itself stays untouched,
# then preview the first five rows as the cell output.
df_2010_copy = df_2010_controls2_final.copy(deep=True)
df_2010_copy.head(5)

Unnamed: 0,county,schedule1_count,schedule2_count,total_pop_1860,log_slave,ag_output,ag_gdp_1860,gini_x,cash_value_farms,state,...,gdp_2020,gdp_2021,gdp_2022,gdp_2023,grand_total_crimes_2000,murders_2000,vehicle_thefts_2000,grand_total_crimes_1990,murders_1990,vehicle_thefts_1990
0,10.0,360232.0,3475.0,363707.0,8.15335,32662909.0,89.805557,0.54031,129368094.0,Alabama,...,1841867,1954645,2318914,2452642,2433,2,24,1396,0,5
1,30.0,292363.0,1537.0,293900.0,7.337588,27996221.0,95.257645,0.58392,152052217.0,Alabama,...,8770138,9847424,11097699,12071468,8881,9,35,4186,2,22
2,50.0,395136.0,1872.0,397008.0,7.534763,31948684.0,80.473656,0.52155,142857209.0,Alabama,...,787697,868828,931174,926685,1652,2,7,899,3,7
3,70.0,136102.0,2639.0,138741.0,7.878155,24998235.0,180.179153,0.43734,99711086.0,Alabama,...,502089,530401,571231,610429,830,1,4,0,0,0
4,90.0,375103.0,1078.0,376181.0,6.982863,21747586.0,57.811497,0.59723,88331569.0,Alabama,...,973485,1162103,1266269,1261107,1447,0,7,1168,2,11


***IV second stage, 1990: murder rate with 1, 2, and 3 controls***

In [None]:
def fit_iv_2sls(
    data: pd.DataFrame,
    dependent: str,
    controls: list,
    endog: str = 'gini_stata',
    instruments: tuple = ('log_slave',),
    cov_type: str = 'robust',
) -> tuple:
    """Fit one 2SLS instrumental-variables regression on columns of ``data``.

    The three specifications in this cell differ only in their list of
    exogenous controls, so the shared model construction lives here.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding every column named by the other arguments.
    dependent : str
        Name of the outcome column.
    controls : list of str
        Names of the exogenous control columns; a constant is added to them.
    endog : str
        Name of the endogenous regressor column.
    instruments : sequence of str
        Names of the excluded-instrument columns.
    cov_type : str
        Covariance estimator passed through to ``IV2SLS.fit``.

    Returns
    -------
    tuple
        ``(exog, model, results)``: the control matrix including the constant,
        the unfitted ``IV2SLS`` model, and its fitted results object.

    Raises
    ------
    KeyError
        If any requested column is absent from ``data``. All missing names are
        reported together instead of failing on the first lookup.
    """
    requested = [dependent, endog, *controls, *instruments]
    absent = [col for col in requested if col not in data.columns]
    if absent:
        raise KeyError(f"Columns not found in data: {absent}")

    exog = sm.add_constant(data[list(controls)])  # Exogenous controls + constant
    model = IV2SLS(
        dependent=data[dependent],
        exog=exog,
        endog=data[endog],
        instruments=data[list(instruments)],
    )
    return exog, model, model.fit(cov_type=cov_type)


# Dependent variable for the second stage.
# NOTE(review): the labels below say "Murder Rate", but the column used is
# `murders_1990`, whose previewed values (0, 2, 3, ...) look like raw counts.
# Confirm whether it should be scaled by a 1990 population before being
# interpreted as a rate.
# NOTE(review): the outcome is from 1990 while every control is a 2010
# measure -- confirm this timing is intended.
y_second_stage = df_2010_copy['murders_1990']

# 1. IV Regression with 1 Control
X_1_control, iv_model_1, iv_results_1 = fit_iv_2sls(
    df_2010_copy,
    dependent='murders_1990',
    controls=['pop_density_2010'],
)
print("\nIV Regression Results (1 Control - Murder Rate, Robust SE)")
print(iv_results_1.summary)

# 2. IV Regression with 2 Controls
X_2_controls, iv_model_2, iv_results_2 = fit_iv_2sls(
    df_2010_copy,
    dependent='murders_1990',
    controls=['pop_density_2010', 'total_males_15_29_2010'],
)
print("\nIV Regression Results (2 Controls - Murder Rate, Robust SE)")
print(iv_results_2.summary)

# 3. IV Regression with 3 Controls
X_3_controls, iv_model_3, iv_results_3 = fit_iv_2sls(
    df_2010_copy,
    dependent='murders_1990',
    controls=['pop_density_2010', 'total_males_15_29_2010', 'gdp_2010'],
)
print("\nIV Regression Results (3 Controls - Murder Rate, Robust SE)")
print(iv_results_3.summary)