# Project 3: Police use of Force

In [22]:
import numpy as np
import pandas as pd
from scipy.stats import norm, logistic
import seaborn as sns
sns.set_theme()

# User-written modules (assuming they are saved as estimation.py, LinearModel.py, probit_ante.py, and logit_ante.py)
import estimation as est
import LinearModel as lm
import probit_ante as probit
import logit_ante as logit

from scipy.stats import norm, logistic, t

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load and categorizing data

Looking through the Appendix we find, that 'sincome' has 3 categories: "Under $20,000", "$20,000-$50,000" and "Over $50,000" respectively labeled 1,2 and 3. We change the this and define them as low, medium and high. 

The same occurs for the 'spop' variable, which has 4 categories: "Under 100,000", "100,000-499,000", "500,000-999,999" and "Over 1 million". We choose to define them as small, medium, large and huge

Furthermore, the type of stop is defined as "1 = street stop" and "2 traffic stop". We change this to the corresponding names instead of a number variable


In [23]:
# Load the data
data = pd.read_csv('ppcs_cc.csv')

data['sincome_low'] = (data['sincome'] == 1).astype(int)
data['sincome_medium'] = (data['sincome'] == 2).astype(int)
data['sincome_high'] = (data['sincome'] == 3).astype(int)

data['spop_small'] = (data['spop'] == 1).astype(int)
data['spop_medium'] = (data['spop'] == 2).astype(int)
data['spop_large'] = (data['spop'] == 3).astype(int)
data['spop_huge'] = (data['spop'] == 4).astype(int)

data['inctype_street'] = (data['inctype_lin'] == 1).astype(int)
data['inctype_traffic'] = (data['inctype_lin'] == 2).astype(int)

data['sage_scaled'] = data['sage'] / 10

In [24]:
varlist = ['smale', 'sempl', 'sincome_medium', 'sincome_high', 'spop_medium', \
           'spop_large', 'spop_huge', 'inctype_traffic', 'daytime', 'sbehavior', 'sage_scaled']

y_lab = ['anyuseofforce_coded']
x_lab_c = ['sblack', 'shisp', 'sother'] + varlist + ['const']
continoues_varlist_c = [False, False, False, False, False, False, False, False, False, False, False, False, False, True, False]

x_lab_nc = ['sblack', 'shisp', 'sother', 'const']
continoues_varlist_nc = [False, False, False, False]

# Prepare data for models without controls
y = data['anyuseofforce_coded'].values
N = len(y)
x_nc = data[['sblack', 'shisp', 'sother']].values
x_nc = np.hstack([x_nc, np.ones((N, 1))])  # Add constant term

# Prepare data for models with controls
x_c = data[['sblack', 'shisp', 'sother'] + varlist].values
x_c = np.hstack([x_c, np.ones((N, 1))])  # Add constant term 

In [25]:
pd.set_option('display.precision', 2)

varlist_table = ['swhite', 'sblack', 'shisp', 'sother', 'smale', 'sempl', 'sincome_low', 'sincome_medium', 'sincome_high', 'spop_small', 'spop_medium', \
           'spop_large', 'spop_huge', 'inctype_traffic', 'inctype_street', 'daytime', 'sbehavior'] \
            + ['omajwhite', 'omajblack', 'omajhisp', 'omajother']

# Placeholder for the results
results = []

# Loop through each variable in the variable list
for var in varlist_table:
    not_used_force = data[data['anyuseofforce_coded'] == 0]

    count_not_force = int(not_used_force[var].sum())  # Sum counts for 'not force'
    mean_not_force = not_used_force[var].mean()  # Mean for 'not force'

    used_force = data[data['anyuseofforce_coded'] == 1]

    count_force = int(used_force[var].sum())  # Sum counts for 'force used'
    mean_force = used_force[var].mean()  # Mean for 'force used'

    # Append results to the list
    results.append({
        'Variable': var,
        'Count (no force) ': count_not_force,
        'Share (no force)': mean_not_force,
        'Count (force)': count_force,
        'Share (force)': mean_force
    })

# Convert the results into a DataFrame
results_df = pd.DataFrame(results)

# Display the DataFrame
results_df


Unnamed: 0,Variable,Count (no force),Share (no force),Count (force),Share (force)
0,swhite,2799,0.74,9,0.47
1,sblack,417,0.11,3,0.16
2,shisp,380,0.1,6,0.32
3,sother,184,0.05,1,0.05
4,smale,1997,0.53,15,0.79
5,sempl,2633,0.7,9,0.47
6,sincome_low,1099,0.29,6,0.32
7,sincome_medium,957,0.25,6,0.32
8,sincome_high,1724,0.46,7,0.37
9,spop_small,2913,0.77,8,0.42


In [26]:
latex_table = results_df.to_latex(
    index=False,  # Do not include the DataFrame index in the LaTeX table
    float_format="%.2f",  # Format floating-point numbers to 2 decimal places
    caption="Summary statistics for variables based on use of force",
    label="tab:use_of_force_summary"
)

# Print the LaTeX table
# print(latex_table)

In [27]:
ols = lm.estimate(y, x_nc)
ols_tab = lm.print_table((y_lab, x_lab_nc), ols, title='LPM results')
ols_tab

LPM results
Dependent variable: ['anyuseofforce_coded']

R2 = 0.003
sigma2 = 0.005


Unnamed: 0,b_hat,se,t
sblack,0.0039,0.0037,1.07
shisp,0.0123,0.0038,3.23
sother,0.0022,0.0053,0.41
const,0.0032,0.0013,2.41


In [28]:
def probit_logit_model(
    y,
    X,
    x_labels,
    continuous_varlist,
    y_label,
    options,
    method="BFGS",
    constant=1,
    average_effect=True,
    decimals=3,
):
    """
    Estimate probit and logit models and compute (average) marginal effects
    and their delta-method standard errors for each regressor.

    Parameters
    ----------
    y : array-like (n,)
        Binary dependent variable.
    X : array-like (n, K)
        Matrix of regressors (including constant if present).
    x_labels : list of str
        Variable labels for the regressors (same order as columns in X).
    continuous_varlist : list of bool
        Indicator for whether each regressor is treated as continuous (True)
        or as a discrete change (False). Length must be at least K - constant.
    y_label : str
        Label for the dependent variable (used in titles).
    options : dict
        Options passed to the `est.estimate` routine.
    method : str, default "BFGS"
        Optimization method passed to `est.estimate`.
    constant : int, default 1
        Number of constant terms at the end of X to exclude from ME computation.
    average_effect : bool, default True
        If True, compute marginal effects at the mean of X.
        If False, compute marginal effects for each observation and average.
    decimals : int, default 3
        Number of decimals in the output table.

    Returns
    -------
    pandas.DataFrame
        Table with variables, marginal effects and standard errors for
        probit and logit models (with significance stars).
    """

    # --- Helper functions -------------------------------------------------

    def significance_stars(me, se, df):
        """Return significance stars based on a t-test for ME = 0."""
        t_stat = me / se
        p_val = 2 * (1 - t.cdf(abs(t_stat), df=df))

        if p_val < 0.01:
            return "***"
        elif p_val < 0.05:
            return "**"
        elif p_val < 0.10:
            return "*"
        return ""

    def get_se(grad, cov):
        """
        Delta-method standard error for a scalar function of parameters.

        grad : array-like
            Gradient of ME wrt parameters.
            NOTE: For the continuous case here, `grad` is KxK; for the discrete
            case it is 1xK. The algebra for the continuous case should be
            checked to ensure grad has the intended shape.
        cov : (K, K) array
            Covariance matrix of coefficients.
        """
        cov_me = grad @ cov @ grad.T
        # If scalar (1x1), np.diag will still return a length-1 array
        return np.sqrt(np.diag(cov_me))



    theta0 = probit.starting_values(y, X)
    probit_results = est.estimate(probit.q, theta0, y, X, options=options, method=method)
    probit_tab = est.print_table(x_labels, probit_results, title=f"Probit, y = {y_label}")

    theta0 = logit.starting_values(y, X)
    logit_results = est.estimate(logit.q, theta0, y, X, options=options, method=method)
    logit_tab = est.print_table(x_labels, logit_results, title=f"Logit, y = {y_label}")

    # Coefficient vectors
    b_pr = probit_tab.theta.values
    b_lg = logit_tab.theta.values

    # Covariance matrices
    cov_pr = probit_results["cov"]
    cov_lg = logit_results["cov"]

    n, K = X.shape
    df = n - K  # degrees of freedom for t-test
    n_vars = K - constant  # number of regressors for which to compute ME

    # Values of X at which to evaluate ME
    if average_effect:
        X_eval = np.mean(X, axis=0, keepdims=True)  # shape (1, K)
    else:
        X_eval = X  # shape (n, K)

    I_K = np.eye(K)

    results = {}

    # --- Loop over regressors ---------------------------------------------

    for k in range(n_vars):
        me_pr_list = []
        me_lg_list = []

        grad_pr_list = []
        grad_lg_list = []

        is_continuous = continuous_varlist[k]

        for i in range(X_eval.shape[0]):
            x_me = X_eval[i].reshape(1, -1)
            x_me2 = x_me.copy()
            x_me2[:, k] = 1  # counterfactual: regressor k = 1

            # Probit linear predictors
            xb_pr = x_me @ b_pr
            xb_pr2 = x_me2 @ b_pr

            # Logit linear predictors
            xb_lg = x_me @ b_lg
            xb_lg2 = x_me2 @ b_lg

            if is_continuous:
                # Continuous effect: derivative wrt x_k, scaled by 1/10
                gx0_pr = norm.pdf(xb_pr)  # probit pdf
                me_pr = gx0_pr * b_pr[k] / 10

                gx0_lg = logistic.pdf(xb_lg)  # logit pdf
                me_lg = gx0_lg * b_lg[k] / 10

                # Gradient wrt parameters (delta method)
                # NOTE: grad_pr and grad_lg are KxK here; consider revisiting
                # the algebra if you intended a K-vector gradient.
                bb_pr = np.outer(b_pr, b_pr)
                xx = np.outer(x_me, x_me)
                grad_pr = gx0_pr * (I_K - bb_pr @ xx)

                bb_lg = np.outer(b_lg, b_lg)
                grad_lg = gx0_lg * (I_K - bb_lg @ xx)

            else:
                # Discrete effect: change in probability when x_k moves from 0 to 1
                me_pr = norm.cdf(xb_pr2) - norm.cdf(xb_pr)
                me_lg = logit.G(xb_lg2) - logit.G(xb_lg)

                gx0_pr = norm.pdf(xb_pr)
                gx2_pr = norm.pdf(xb_pr2)
                grad_pr = gx2_pr * x_me2 - gx0_pr * x_me

                gx0_lg = logistic.pdf(xb_lg)
                gx2_lg = logistic.pdf(xb_lg2)
                grad_lg = gx2_lg * x_me2 - gx0_lg * x_me

            me_pr_list.append(me_pr)
            me_lg_list.append(me_lg)

            grad_pr_list.append(grad_pr)
            grad_lg_list.append(grad_lg)

        # Average over evaluation points
        me_pr_avg = np.mean(me_pr_list)
        me_lg_avg = np.mean(me_lg_list)

        grad_pr_avg = np.mean(grad_pr_list, axis=0)
        grad_lg_avg = np.mean(grad_lg_list, axis=0)

        # Delta-method standard errors
        se_pr = get_se(grad_pr_avg, cov_pr)[0]
        se_lg = get_se(grad_lg_avg, cov_lg)[0]

        results[x_labels[k]] = {
            "me_probit": me_pr_avg,
            "se_probit": se_pr,
            "me_logit": me_lg_avg,
            "se_logit": se_lg,
        }

    # --- Format output table ----------------------------------------------

    formatted_rows = []

    for var, vals in results.items():
        me_pr = vals["me_probit"]
        se_pr = vals["se_probit"]
        me_lg = vals["me_logit"]
        se_lg = vals["se_logit"]

        stars_pr = significance_stars(me_pr, se_pr, df)
        stars_lg = significance_stars(me_lg, se_lg, df)

        formatted_rows.append(
            [
                var,
                f"{me_pr:.{decimals}f}{stars_pr}",
                f"{me_lg:.{decimals}f}{stars_lg}",
            ]
        )
        formatted_rows.append(
            [
                "",
                f"({se_pr:.{decimals}f})",
                f"({se_lg:.{decimals}f})",
            ]
        )

    formatted_df = pd.DataFrame(
        formatted_rows, columns=["Variable", "Probit Model", "Logit Model"]
    )

    return formatted_df


In [30]:
# Common estimation options
options = {
    "maxiter": 10000,
    "disp": True,
}

method = "BFGS"
constant = 1
average_effect = True
decimals = 3  # optional, just to make it explicit

# No-controls specification
df_nc = probit_logit_model(
    y=y,
    X=x_nc,
    x_labels=x_lab_nc,
    continuous_varlist=continuous_varlist_nc,
    y_label=y_label,
    options=options,
    method=method,
    constant=constant,
    average_effect=average_effect,
    decimals=decimals,
)

# Controls specification
df_c = probit_logit_model(
    y=y,
    X=x_c,
    x_labels=x_lab_c,
    continuous_varlist=continuous_varlist_c,
    y_label=y_label,
    options=options,
    method=method,
    constant=constant,
    average_effect=average_effect,
    decimals=decimals,
)

# Rename columns for final presentation
df_nc = df_nc.rename(
    columns={
        "Variable": "Variable NC",
        "Probit Model": "Probit No Controls (1)",
        "Logit Model":  "Logit No Controls (2)",
    }
)

df_c = df_c.rename(
    columns={
        "Variable":     "Variables",
        "Probit Model": "Probit Controls (3)",
        "Logit Model":  "Logit Controls (4)",
    }
)

# Combine side by side
formatted_df = pd.concat([df_nc, df_c], axis=1)

# Drop duplicate variable column from the no-controls side
formatted_df = formatted_df.drop(columns=["Variable NC"])

# Order columns explicitly
formatted_df = formatted_df[
    [
        "Variables",
        "Probit No Controls (1)",
        "Logit No Controls (2)",
        "Probit Controls (3)",
        "Logit Controls (4)",
    ]
]

# Replace any remaining NaNs with empty strings for display
formatted_df = formatted_df.fillna("")

formatted_df


NameError: name 'continuous_varlist_nc' is not defined