In [2]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.cluster import KMeans
import quantecon as qe
import matplotlib.pyplot as plt
import pickle
import kagglehub
from linearmodels.iv import IVGMM
from linearmodels.iv import IV2SLS

In [3]:
# Load the pickled 2010 population + crime dataset.
# NOTE: unpickling can execute code embedded in the file, so this should only
# ever be pointed at a pickle from a trusted source.
with open('df_2010_controls2_final.pkl', mode='rb') as pickle_file:
    df_2010_controls2_final = pickle.load(pickle_file)

# The analysis cells below read from this copy rather than from the loaded object.
df_2010_copy = df_2010_controls2_final.copy()

# Preview the first rows.
df_2010_copy.head()

Unnamed: 0,county,schedule1_count,schedule2_count,total_pop_1860,log_slave,ag_output,ag_gdp_1860,gini_x,cash_value_farms,state,...,gdp_2014,gdp_2015,gdp_2016,gdp_2017,gdp_2018,gdp_2019,gdp_2020,gdp_2021,gdp_2022,gdp_2023
0,10.0,360232.0,3475.0,363707.0,8.15335,32662909.0,89.805557,0.54031,129368094.0,Alabama,...,1569120,1729098,1806246,1762558,1826642,1814228,1841867,1954645,2318914,2452642
1,30.0,292363.0,1537.0,293900.0,7.337588,27996221.0,95.257645,0.58392,152052217.0,Alabama,...,6034727,6492574,6983037,7382558,7935575,8558537,8770138,9847424,11097699,12071468
2,50.0,395136.0,1872.0,397008.0,7.534763,31948684.0,80.473656,0.52155,142857209.0,Alabama,...,779299,765000,757473,761761,789103,793769,787697,868828,931174,926685
3,70.0,136102.0,2639.0,138741.0,7.878155,24998235.0,180.179153,0.43734,99711086.0,Alabama,...,381354,377535,392287,406741,408791,465977,502089,530401,571231,610429
4,90.0,375103.0,1078.0,376181.0,6.982863,21747586.0,57.811497,0.59723,88331569.0,Alabama,...,928552,987510,925988,987967,1064218,1055980,973485,1162103,1266269,1261107


***IV-GMM, 2nd stage 1,2,3 controls, murder rate***

In [6]:
# IV-GMM second stage: log murder rate regressed on land inequality
# (gini_stata, treated as endogenous and instrumented by log_slave) under three
# nested sets of 2010 controls.
y_second_stage = df_2010_copy['log_murder_rate']

# Header label -> control columns. Each label is the exact text that appears in
# the printed results header for that specification.
murder_control_sets = {
    '1 Control': ['pop_density_2010'],
    '2 Controls': ['pop_density_2010', 'total_males_15_29_2010'],
    '3 Controls': ['pop_density_2010', 'total_males_15_29_2010', 'gdp_2010'],
}

murder_fits = []
for spec_label, spec_controls in murder_control_sets.items():
    # Exogenous regressors: the selected controls plus an intercept column.
    spec_exog = sm.add_constant(df_2010_copy[spec_controls])

    spec_model = IVGMM(
        dependent=y_second_stage,                # Dependent variable (murder rate)
        exog=spec_exog,                          # Exogenous controls (including constant)
        endog=df_2010_copy['gini_stata'],        # Endogenous variable (inequality)
        instruments=df_2010_copy[['log_slave']]  # Instrument(s)
    )

    # cov_type selects the parameter covariance estimator. The GMM weighting
    # matrix is a separate setting (IVGMM's weight_type argument), which is left
    # at its default here.
    # NOTE(review): linearmodels documents that default as 'robust'; confirm for
    # the installed version, since the printed header says "Robust Weighting".
    spec_results = spec_model.fit(cov_type='robust')

    print(f"\nIV-GMM Regression Results ({spec_label}, Robust Weighting)")
    print(spec_results.summary)

    murder_fits.append((spec_exog, spec_model, spec_results))

# Bind the per-specification names so code that refers to the individual
# design matrices, models, or results keeps working.
(
    (X_1_control, gmm_model_1, gmm_results_1),
    (X_2_controls, gmm_model_2, gmm_results_2),
    (X_3_controls, gmm_model_3, gmm_results_3),
) = murder_fits



IV-GMM Regression Results (1 Control, Robust Weighting)
                          IV-GMM Estimation Summary                           
Dep. Variable:        log_murder_rate   R-squared:                     -0.0357
Estimator:                     IV-GMM   Adj. R-squared:                -0.0525
No. Observations:                 126   F-statistic:                    15.687
Date:                Mon, Dec 09 2024   P-value (F-stat)                0.0004
Time:                        13:02:34   Distribution:                  chi2(2)
Cov. Estimator:                robust                                         
                                                                              
                                Parameter Estimates                                 
                  Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------
const               -0.6518     0.4736    -1.3763     0.

***IV-GMM, 2nd stage 1,2,3 controls, motor vehicle theft rate***

In [23]:
# IV-GMM second stage: log motor vehicle theft rate regressed on land
# inequality (gini_stata, treated as endogenous and instrumented by log_slave)
# under three nested sets of 2010 controls.
# `sm` and `IVGMM` come from the notebook's import cell.
y_second_stage = df_2010_copy['log_mtv_theft_rate']

# Header label -> control columns. Each label is the exact text that appears in
# the printed results header for that specification.
theft_control_sets = {
    '1 Control': ['pop_density_2010'],
    '2 Controls': ['pop_density_2010', 'total_males_15_29_2010'],
    '3 Controls': ['pop_density_2010', 'total_males_15_29_2010', 'gdp_2010'],
}

theft_fits = []
for spec_label, spec_controls in theft_control_sets.items():
    # Exogenous regressors: the selected controls plus an intercept column.
    spec_exog = sm.add_constant(df_2010_copy[spec_controls])

    spec_model = IVGMM(
        dependent=y_second_stage,                # Dependent variable (motor vehicle theft rate)
        exog=spec_exog,                          # Exogenous controls (including constant)
        endog=df_2010_copy['gini_stata'],        # Endogenous variable (inequality)
        instruments=df_2010_copy[['log_slave']]  # Instrument(s)
    )

    # fit() runs with the library defaults; the stored output of this cell
    # reports "Cov. Estimator: robust" for that default.
    spec_results = spec_model.fit()

    print(f"\nIV-GMM Regression Results ({spec_label} - MTV Theft Rate)")
    print(spec_results.summary)

    theft_fits.append((spec_exog, spec_model, spec_results))

# Bind the per-specification names so code that refers to the individual
# design matrices, models, or results keeps working.
(
    (X_1_control, gmm_model_1, gmm_results_1),
    (X_2_controls, gmm_model_2, gmm_results_2),
    (X_3_controls, gmm_model_3, gmm_results_3),
) = theft_fits



IV-GMM Regression Results (1 Control - MTV Theft Rate)
                          IV-GMM Estimation Summary                           
Dep. Variable:     log_mtv_theft_rate   R-squared:                     -0.0030
Estimator:                     IV-GMM   Adj. R-squared:                -0.0193
No. Observations:                 126   F-statistic:                    1.2744
Date:                Mon, Dec 09 2024   P-value (F-stat)                0.5288
Time:                        12:05:32   Distribution:                  chi2(2)
Cov. Estimator:                robust                                         
                                                                              
                                Parameter Estimates                                 
                  Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------
const                2.2407     0.6841     3.2752     0.0

***IV-GMM, 2nd stage 1,2,3 controls, crime rate***

In [24]:
# IV-GMM second stage: log crime rate regressed on land inequality
# (gini_stata, treated as endogenous and instrumented by log_slave) under three
# nested sets of 2010 controls.
y_second_stage = df_2010_copy['log_crime_rate']

# Header label -> control columns. Each label is the exact text that appears in
# the printed results header for that specification.
crime_control_sets = {
    '1 Control': ['pop_density_2010'],
    '2 Controls': ['pop_density_2010', 'total_males_15_29_2010'],
    '3 Controls': ['pop_density_2010', 'total_males_15_29_2010', 'gdp_2010'],
}

crime_fits = []
for spec_label, spec_controls in crime_control_sets.items():
    # Exogenous regressors: the selected controls plus an intercept column.
    spec_exog = sm.add_constant(df_2010_copy[spec_controls])

    spec_model = IVGMM(
        dependent=y_second_stage,                # Dependent variable (crime rate)
        exog=spec_exog,                          # Exogenous controls (including constant)
        endog=df_2010_copy['gini_stata'],        # Endogenous variable (inequality)
        instruments=df_2010_copy[['log_slave']]  # Instrument(s)
    )

    # fit() runs with the library defaults; the stored output of this cell
    # reports "Cov. Estimator: robust" for that default.
    spec_results = spec_model.fit()

    print(f"\nIV-GMM Regression Results ({spec_label} - Crime Rate)")
    print(spec_results.summary)

    crime_fits.append((spec_exog, spec_model, spec_results))

# Bind the per-specification names so code that refers to the individual
# design matrices, models, or results keeps working.
(
    (X_1_control, gmm_model_1, gmm_results_1),
    (X_2_controls, gmm_model_2, gmm_results_2),
    (X_3_controls, gmm_model_3, gmm_results_3),
) = crime_fits



IV-GMM Regression Results (1 Control - Crime Rate)
                          IV-GMM Estimation Summary                           
Dep. Variable:         log_crime_rate   R-squared:                     -0.0089
Estimator:                     IV-GMM   Adj. R-squared:                -0.0253
No. Observations:                 126   F-statistic:                    0.4531
Date:                Mon, Dec 09 2024   P-value (F-stat)                0.7973
Time:                        12:06:41   Distribution:                  chi2(2)
Cov. Estimator:                robust                                         
                                                                              
                                Parameter Estimates                                 
                  Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------
const                8.3763     0.3269     25.620     0.0000 

***1st stage regressions with 1,2,3 controls***

In [20]:

# First-stage outcome: the land-inequality measure (gini_stata) that the IV-GMM
# cells pass as the endogenous regressor.
y_first_stage = df_2010_copy['gini_stata']

# NOTE(review): these regressions control for 1860 covariates, whereas the
# IV-GMM cells in this notebook control for 2010 covariates, and the stored
# outputs show 135 observations here versus 126 there -- confirm that this is
# the intended first stage for those models.
# The fits below are plain OLS; the printed headers keep their original wording.

# Specification 1: instrument (log_slave) + 1860 population density.
X_1_control = sm.add_constant(df_2010_copy[['log_slave', 'pop_density_1860']])
model_first_stage_1 = sm.OLS(endog=y_first_stage, exog=X_1_control).fit()
print("\nFirst Stage IV-GMM Regression Results (1 Control)")
print(model_first_stage_1.summary())

# Specification 2: adds the 1860 count of males aged 15-29.
X_2_controls = sm.add_constant(
    df_2010_copy[['log_slave', 'pop_density_1860', 'total_males_15_29_1860']]
)
model_first_stage_2 = sm.OLS(endog=y_first_stage, exog=X_2_controls).fit()
print("\nFirst Stage IV-GMM Regression Results (2 Controls)")
print(model_first_stage_2.summary())

# Specification 3: adds 1860 agricultural GDP.
X_3_controls = sm.add_constant(
    df_2010_copy[['log_slave', 'pop_density_1860', 'total_males_15_29_1860', 'ag_gdp_1860']]
)
model_first_stage_3 = sm.OLS(endog=y_first_stage, exog=X_3_controls).fit()
print("\nFirst Stage IV-GMM Regression Results (3 Controls)")
print(model_first_stage_3.summary())



First Stage IV-GMM Regression Results (1 Control)
                            OLS Regression Results                            
Dep. Variable:             gini_stata   R-squared:                       0.326
Model:                            OLS   Adj. R-squared:                  0.316
Method:                 Least Squares   F-statistic:                     31.92
Date:                Mon, 09 Dec 2024   Prob (F-statistic):           4.91e-12
Time:                        11:29:50   Log-Likelihood:                 96.748
No. Observations:                 135   AIC:                            -187.5
Df Residuals:                     132   BIC:                            -178.8
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------