In [2]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.cluster import KMeans
import quantecon as qe
import matplotlib.pyplot as plt
import pickle
import kagglehub
from linearmodels.iv import IVGMM
from linearmodels.iv import IV2SLS

***Load in Data***

In [3]:
# Load the pickled 2010 population + crime table from the working directory.
# NOTE(review): unpickling executes code embedded in the file, so this is only
# safe if the .pkl was produced by a trusted step of this project — confirm.
controls_pickle_path = 'df_2010_controls2_final.pkl'
with open(controls_pickle_path, 'rb') as pickle_file:
    df_2010_controls2_final = pickle.load(pickle_file)

# Keep the loaded frame untouched; the cells below read from this copy.
df_2010_copy = df_2010_controls2_final.copy()

# Preview the first rows.
df_2010_copy.head()

Unnamed: 0,county,schedule1_count,schedule2_count,total_pop_1860,log_slave,ag_output,ag_gdp_1860,gini_x,cash_value_farms,state,...,gdp_2014,gdp_2015,gdp_2016,gdp_2017,gdp_2018,gdp_2019,gdp_2020,gdp_2021,gdp_2022,gdp_2023


***IV, 2nd stage 1,2,3 controls, murder rate***

In [25]:
# Second-stage outcome for this cell: log murder rate.
y_second_stage = df_2010_copy['log_murder_rate']

# The three specifications are nested: each one adds the next exogenous
# control to the previous set (1 control, then 2, then 3).
iv_controls = ['pop_density_2010', 'total_males_15_29_2010', 'gdp_2010']
iv_labels = ('1 Control', '2 Controls', '3 Controls')

iv_fits = []
for n_controls, label in enumerate(iv_labels, start=1):
    # Exogenous regressors for this specification, plus an intercept column.
    exog_with_const = sm.add_constant(df_2010_copy[iv_controls[:n_controls]])

    model = IV2SLS(
        dependent=y_second_stage,
        exog=exog_with_const,
        endog=df_2010_copy['gini_stata'],         # endogenous regressor (inequality)
        instruments=df_2010_copy[['log_slave']],  # excluded instrument
    )
    result = model.fit(cov_type='robust')  # robust covariance estimator

    print(f"\nIV Regression Results ({label} - Murder Rate, Robust SE)")
    print(result.summary)  # `summary` is a property on linearmodels results

    iv_fits.append((exog_with_const, model, result))

# Re-expose the per-specification objects under the names this notebook uses.
(
    (X_1_control, iv_model_1, iv_results_1),
    (X_2_controls, iv_model_2, iv_results_2),
    (X_3_controls, iv_model_3, iv_results_3),
) = iv_fits


IV Regression Results (1 Control - Murder Rate, Robust SE)
                          IV-2SLS Estimation Summary                          
Dep. Variable:        log_murder_rate   R-squared:                     -0.0357
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0525
No. Observations:                 126   F-statistic:                    15.687
Date:                Mon, Dec 09 2024   P-value (F-stat)                0.0004
Time:                        11:56:28   Distribution:                  chi2(2)
Cov. Estimator:                robust                                         
                                                                              
                                Parameter Estimates                                 
                  Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------
const               -0.6518     0.4736    -1.3763    

***IV, 2nd stage 1,2,3 controls, motor vehicle (MTV) theft rate***

In [26]:
# Second-stage outcome for this cell: log motor vehicle theft rate.
y_second_stage = df_2010_copy['log_mtv_theft_rate']

# The three specifications are nested: each one adds the next exogenous
# control to the previous set (1 control, then 2, then 3).
iv_controls = ['pop_density_2010', 'total_males_15_29_2010', 'gdp_2010']
iv_labels = ('1 Control', '2 Controls', '3 Controls')

iv_fits = []
for n_controls, label in enumerate(iv_labels, start=1):
    # Exogenous regressors for this specification, plus an intercept column.
    exog_with_const = sm.add_constant(df_2010_copy[iv_controls[:n_controls]])

    model = IV2SLS(
        dependent=y_second_stage,
        exog=exog_with_const,
        endog=df_2010_copy['gini_stata'],         # endogenous regressor (inequality)
        instruments=df_2010_copy[['log_slave']],  # excluded instrument
    )
    result = model.fit(cov_type='robust')  # robust covariance estimator

    print(f"\nIV Regression Results ({label} - MTV Theft Rate, Robust SE)")
    print(result.summary)  # `summary` is a property on linearmodels results

    iv_fits.append((exog_with_const, model, result))

# Re-expose the per-specification objects under the names this notebook uses.
(
    (X_1_control, iv_model_1, iv_results_1),
    (X_2_controls, iv_model_2, iv_results_2),
    (X_3_controls, iv_model_3, iv_results_3),
) = iv_fits



IV Regression Results (1 Control - MTV Theft Rate, Robust SE)
                          IV-2SLS Estimation Summary                          
Dep. Variable:     log_mtv_theft_rate   R-squared:                     -0.0030
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0193
No. Observations:                 126   F-statistic:                    1.2744
Date:                Mon, Dec 09 2024   P-value (F-stat)                0.5288
Time:                        11:59:12   Distribution:                  chi2(2)
Cov. Estimator:                robust                                         
                                                                              
                                Parameter Estimates                                 
                  Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------
const                2.2407     0.6841     3.2752 

***IV, 2nd stage 1,2,3 controls, crime rate***

In [27]:
# Second-stage outcome for this cell: log crime rate.
y_second_stage = df_2010_copy['log_crime_rate']

# The three specifications are nested: each one adds the next exogenous
# control to the previous set (1 control, then 2, then 3).
iv_controls = ['pop_density_2010', 'total_males_15_29_2010', 'gdp_2010']
iv_labels = ('1 Control', '2 Controls', '3 Controls')

iv_fits = []
for n_controls, label in enumerate(iv_labels, start=1):
    # Exogenous regressors for this specification, plus an intercept column.
    exog_with_const = sm.add_constant(df_2010_copy[iv_controls[:n_controls]])

    model = IV2SLS(
        dependent=y_second_stage,
        exog=exog_with_const,
        endog=df_2010_copy['gini_stata'],         # endogenous regressor (inequality)
        instruments=df_2010_copy[['log_slave']],  # excluded instrument
    )
    result = model.fit(cov_type='robust')  # robust covariance estimator

    print(f"\nIV Regression Results ({label} - Crime Rate, Robust SE)")
    print(result.summary)  # `summary` is a property on linearmodels results

    iv_fits.append((exog_with_const, model, result))

# Re-expose the per-specification objects under the names this notebook uses.
(
    (X_1_control, iv_model_1, iv_results_1),
    (X_2_controls, iv_model_2, iv_results_2),
    (X_3_controls, iv_model_3, iv_results_3),
) = iv_fits



IV Regression Results (1 Control - Crime Rate, Robust SE)
                          IV-2SLS Estimation Summary                          
Dep. Variable:         log_crime_rate   R-squared:                     -0.0089
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0253
No. Observations:                 126   F-statistic:                    0.4531
Date:                Mon, Dec 09 2024   P-value (F-stat)                0.7973
Time:                        12:02:18   Distribution:                  chi2(2)
Cov. Estimator:                robust                                         
                                                                              
                                Parameter Estimates                                 
                  Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------
const                8.3763     0.3269     25.620     

***1st stage IV***

In [28]:

# First-stage regressions: OLS of the endogenous regressor (gini_stata) on the
# instrument (log_slave) plus a growing set of 1860 controls.
#
# NOTE(review): these fits use 1860 controls and statsmodels' default
# (nonrobust) covariance, whereas the IV2SLS cells in this notebook pass 2010
# controls as `exog` and fit with cov_type='robust'. The first stage implied by
# those IV fits would use the 2010 controls — confirm which first stage is
# meant to be reported here.
y_first_stage = df_2010_copy['gini_stata']

# Nested control sets: specification k uses the first k entries.
first_stage_controls = ['pop_density_1860', 'total_males_15_29_1860', 'ag_gdp_1860']

first_stage_designs = []
first_stage_fits = []
for n_controls in (1, 2, 3):
    # Instrument first, then the controls, then prepend an intercept column.
    regressors = ['log_slave'] + first_stage_controls[:n_controls]
    design = sm.add_constant(df_2010_copy[regressors])
    first_stage_designs.append(design)
    first_stage_fits.append(sm.OLS(y_first_stage, design).fit())

# Re-expose the per-specification objects under the names this notebook uses.
X_first_stage_1, X_first_stage_2, X_first_stage_3 = first_stage_designs
model_first_stage_1, model_first_stage_2, model_first_stage_3 = first_stage_fits

# Print the summaries only after all three models have been fitted.
first_stage_headings = (
    "\nFirst Stage IV Regression: 1 Control",
    "\nFirst Stage IV Regression: 2 Controls",
    "\nFirst Stage IV Regression: 3 Controls",
)
for heading, fitted in zip(first_stage_headings, first_stage_fits):
    print(heading)
    print(fitted.summary())


First Stage IV Regression: 1 Control
                            OLS Regression Results                            
Dep. Variable:             gini_stata   R-squared:                       0.350
Model:                            OLS   Adj. R-squared:                  0.339
Method:                 Least Squares   F-statistic:                     33.11
Date:                Mon, 09 Dec 2024   Prob (F-statistic):           3.13e-12
Time:                        12:03:27   Log-Likelihood:                 92.639
No. Observations:                 126   AIC:                            -179.3
Df Residuals:                     123   BIC:                            -170.8
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
co