In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.cluster import KMeans
import quantecon as qe
import matplotlib.pyplot as plt
import pickle
import kagglehub
from linearmodels.iv import IVGMM
from linearmodels.iv import IV2SLS

***Load in Data***

In [2]:
# 2010 population + crime data
with open('df_2010_controls2_final.pkl', 'rb') as f:
    df_2010_controls2_final = pickle.load(f)

# Create a copy
df_2010_copy = df_2010_controls2_final.copy()
df_2010_copy.head()

Unnamed: 0,county,schedule1_count,schedule2_count,total_pop_1860,log_slave,ag_output,ag_gdp_1860,gini_x,cash_value_farms,state,...,gdp_2014,gdp_2015,gdp_2016,gdp_2017,gdp_2018,gdp_2019,gdp_2020,gdp_2021,gdp_2022,gdp_2023
0,10.0,360232.0,3475.0,363707.0,8.15335,32662909.0,89.805557,0.54031,129368094.0,Alabama,...,1569120,1729098,1806246,1762558,1826642,1814228,1841867,1954645,2318914,2452642
1,30.0,292363.0,1537.0,293900.0,7.337588,27996221.0,95.257645,0.58392,152052217.0,Alabama,...,6034727,6492574,6983037,7382558,7935575,8558537,8770138,9847424,11097699,12071468
2,50.0,395136.0,1872.0,397008.0,7.534763,31948684.0,80.473656,0.52155,142857209.0,Alabama,...,779299,765000,757473,761761,789103,793769,787697,868828,931174,926685
3,70.0,136102.0,2639.0,138741.0,7.878155,24998235.0,180.179153,0.43734,99711086.0,Alabama,...,381354,377535,392287,406741,408791,465977,502089,530401,571231,610429
4,90.0,375103.0,1078.0,376181.0,6.982863,21747586.0,57.811497,0.59723,88331569.0,Alabama,...,928552,987510,925988,987967,1064218,1055980,973485,1162103,1266269,1261107


***Murder rate 2nd stage OLS with 0,1,2,3 controls***

In [3]:
# Define the dependent variable (outcome of the first stage)
y = df_2010_copy['log_murder_rate']  # Replace 'dependent_var' with the actual name of your dependent variable

# 2nd Stage with No Controls
X0 = df_2010_copy[['gini_stata']]  # Replace with the main independent variable for the first stage
X0 = sm.add_constant(X0)  # Adds an intercept to the model
model0 = sm.OLS(y, X0).fit()
print("First Stage OLS with 0 Controls")
print(model0.summary())

# 2nd Stage with 1 Control
X1 = df_2010_copy[['gini_stata', 'pop_density_2010']]  # Replace 'control_var1' with the name of your control variable
X1 = sm.add_constant(X1)
model1 = sm.OLS(y, X1).fit()
print("\nFirst Stage OLS with 1 Control")
print(model1.summary())

# 2nd Stage with 2 Controls
X2 = df_2010_copy[['gini_stata', 'pop_density_2010', 'total_males_15_29_2010']]  # Add a second control variable
X2 = sm.add_constant(X2)
model2 = sm.OLS(y, X2).fit()
print("\nFirst Stage OLS with 2 Controls")
print(model2.summary())

# 2nd Stage with 3 Controls
X3 = df_2010_copy[['gini_stata', 'pop_density_2010', 'total_males_15_29_2010', 'gdp_2010']]  # Add a third control variable
X3 = sm.add_constant(X3)
model3 = sm.OLS(y, X3).fit()
print("\nFirst Stage OLS with 3 Controls")
print(model3.summary())

First Stage OLS with 0 Controls
                            OLS Regression Results                            
Dep. Variable:        log_murder_rate   R-squared:                       0.032
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                     4.137
Date:                Mon, 09 Dec 2024   Prob (F-statistic):             0.0441
Time:                        11:39:16   Log-Likelihood:                -174.62
No. Observations:                 126   AIC:                             353.2
Df Residuals:                     124   BIC:                             358.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.519

In [10]:
print(X3.dtypes)

const                     float64
gini_stata                float32
pop_density_2010          float64
total_males_15_29_2010      int64
gdp_2010                   object
dtype: object


***Vehicle theft rate 2nd stage OLS with 0,1,2, controls***

In [4]:
# Define the dependent variable (outcome of the first stage)
y = df_2010_copy['log_mtv_theft_rate']  # Replace 'dependent_var' with the actual name of your dependent variable

# 2nd Stage with No Controls
X0 = df_2010_copy[['gini_stata']]  # Replace with the main independent variable for the first stage
X0 = sm.add_constant(X0)  # Adds an intercept to the model
model0 = sm.OLS(y, X0).fit()
print("First Stage OLS with 0 Controls")
print(model0.summary())

# 2nd Stage with 1 Control
X1 = df_2010_copy[['gini_stata', 'pop_density_2010']]  # Replace 'control_var1' with the name of your control variable
X1 = sm.add_constant(X1)
model1 = sm.OLS(y, X1).fit()
print("\nFirst Stage OLS with 1 Control")
print(model1.summary())

# 2nd Stage with 2 Controls
X2 = df_2010_copy[['gini_stata', 'pop_density_2010', 'total_males_15_29_2010']]  # Add a second control variable
X2 = sm.add_constant(X2)
model2 = sm.OLS(y, X2).fit()
print("\nFirst Stage OLS with 2 Controls")
print(model2.summary())

# 2nd Stage with 3 Controls
X3 = df_2010_copy[['gini_stata', 'pop_density_2010', 'total_males_15_29_2010', 'gdp_2010']]  # Add a third control variable
X3 = sm.add_constant(X3)
model3 = sm.OLS(y, X3).fit()
print("\nFirst Stage OLS with 3 Controls")
print(model3.summary())

First Stage OLS with 0 Controls
                            OLS Regression Results                            
Dep. Variable:     log_mtv_theft_rate   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.008
Method:                 Least Squares   F-statistic:                   0.01557
Date:                Mon, 09 Dec 2024   Prob (F-statistic):              0.901
Time:                        11:42:25   Log-Likelihood:                -184.14
No. Observations:                 126   AIC:                             372.3
Df Residuals:                     124   BIC:                             377.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.693

***Crime rate 2nd stage OLS with 0,1,2,3 controls***

In [5]:
# Define the dependent variable (outcome of the first stage)
y = df_2010_copy['log_crime_rate']  # Replace 'dependent_var' with the actual name of your dependent variable

# 2nd Stage with No Controls
X0 = df_2010_copy[['gini_stata']]  # Replace with the main independent variable for the first stage
X0 = sm.add_constant(X0)  # Adds an intercept to the model
model0 = sm.OLS(y, X0).fit()
print("First Stage OLS with 0 Controls")
print(model0.summary())

# 2nd Stage with 1 Control
X1 = df_2010_copy[['gini_stata', 'pop_density_2010']]  # Replace 'control_var1' with the name of your control variable
X1 = sm.add_constant(X1)
model1 = sm.OLS(y, X1).fit()
print("\nFirst Stage OLS with 1 Control")
print(model1.summary())

# 2nd Stage with 2 Controls
X2 = df_2010_copy[['gini_stata', 'pop_density_2010', 'total_males_15_29_2010']]  # Add a second control variable
X2 = sm.add_constant(X2)
model2 = sm.OLS(y, X2).fit()
print("\nFirst Stage OLS with 2 Controls")
print(model2.summary())

# 2nd Stage with 3 Controls
X3 = df_2010_copy[['gini_stata', 'pop_density_2010', 'total_males_15_29_2010', 'gdp_2010']]  # Add a third control variable
X3 = sm.add_constant(X3)
model3 = sm.OLS(y, X3).fit()
print("\nFirst Stage OLS with 3 Controls")
print(model3.summary())

First Stage OLS with 0 Controls
                            OLS Regression Results                            
Dep. Variable:         log_crime_rate   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.008
Method:                 Least Squares   F-statistic:                  0.007254
Date:                Mon, 09 Dec 2024   Prob (F-statistic):              0.932
Time:                        11:47:05   Log-Likelihood:                -109.70
No. Observations:                 126   AIC:                             223.4
Df Residuals:                     124   BIC:                             229.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          8.173

***1st stage OLS with 0,1,2 controls***

In [10]:
# Define the dependent variable (outcome of the first stage)
y = df_2010_copy['gini_stata']  # Replace 'dependent_var' with the actual name of your dependent variable

# 2nd Stage with No Controls
X0 = df_2010_copy[['log_slave']]  # Replace with the main independent variable for the first stage
X0 = sm.add_constant(X0)  # Adds an intercept to the model
model0 = sm.OLS(y, X0).fit()
print("First Stage OLS with 0 Controls")
print(model0.summary())

# 2nd Stage with 1 Control
X1 = df_2010_copy[['log_slave', 'pop_density_2010']]  # Replace 'control_var1' with the name of your control variable
X1 = sm.add_constant(X1)
model1 = sm.OLS(y, X1).fit()
print("\nFirst Stage OLS with 1 Control")
print(model1.summary())

# 2nd Stage with 2 Controls
X2 = df_2010_copy[['log_slave', 'pop_density_2010', 'total_males_15_29_2010']]  # Add a second control variable
X2 = sm.add_constant(X2)
model2 = sm.OLS(y, X2).fit()
print("\nFirst Stage OLS with 2 Controls")
print(model2.summary())

First Stage OLS with 0 Controls
                            OLS Regression Results                            
Dep. Variable:             gini_stata   R-squared:                       0.317
Model:                            OLS   Adj. R-squared:                  0.312
Method:                 Least Squares   F-statistic:                     61.69
Date:                Fri, 15 Nov 2024   Prob (F-statistic):           1.19e-12
Time:                        19:36:37   Log-Likelihood:                 95.840
No. Observations:                 135   AIC:                            -187.7
Df Residuals:                     133   BIC:                            -181.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.000