# Racial Disparities in Policing and Prosecution
### A Case Study in Orange County

In [2]:
# Import statements
import pandas as pd
import numpy as np

# Never truncate wide DataFrames: None removes the cap on displayed columns
pd.options.display.max_columns = None

## Import the data

In [3]:
# Cleaned 2021-2023 RIPA policing data for Orange County, read directly from GitHub.
# "RIPA Cleaning.ipynb" describes how these files were generated from the raw data.
ripa_orange_2021 = "https://raw.githubusercontent.com/laurenbchu/honors-thesis/main/data/cleaned_ripa_orange_2021.csv"
ripa_orange_2022 = "https://raw.githubusercontent.com/laurenbchu/honors-thesis/main/data/cleaned_ripa_orange_2022.csv"
ripa_orange_2023 = "https://raw.githubusercontent.com/laurenbchu/honors-thesis/main/data/cleaned_ripa_orange_2023.csv"

# One DataFrame per year, read in chronological order
ripa_21, ripa_22, ripa_23 = (
    pd.read_csv(csv_url)
    for csv_url in (ripa_orange_2021, ripa_orange_2022, ripa_orange_2023)
)

In [4]:
# Tag every row of each yearly frame with its year, for ease of use later
for ripa_frame, stop_year in ((ripa_21, 2021), (ripa_22, 2022), (ripa_23, 2023)):
    ripa_frame['YEAR'] = stop_year

In [5]:
# Stack the three yearly RIPA frames into one table to match the RJA prosecution dataset.
# ignore_index=True gives every row a fresh, unique label. Each yearly frame was read with
# its own default 0..n-1 index, so without it the same labels would repeat across years,
# which makes label-based selection and index-aligned assignment ambiguous.
policing = pd.concat([ripa_21, ripa_22, ripa_23], axis=0, ignore_index=True)

# Lookup from the numeric race codes stored in RAE_FULL to readable race labels
races = {
    1: "Asian",
    2: "Black/African American",
    3: "Hispanic/Latino",
    4: "Middle Eastern/South Asian",
    5: "Native American",
    6: "Pacific Islander",
    7: "White",
    8: "Multiracial",
}

# Replace the codes with their labels; any code missing from `races` becomes NaN
policing['RAE_FULL'] = policing['RAE_FULL'].map(races)

In [6]:
# Cleaned ACLU prosecutorial data for Orange County (2021-2023), read directly from GitHub
cleaned_orange_aclu_2021_2023 = "https://raw.githubusercontent.com/laurenbchu/honors-thesis/main/data/cleaned_orange_aclu_2021_2023.csv"

# low_memory=False makes pandas read the file in one pass rather than in chunks
prosecution = pd.read_csv(
    cleaned_orange_aclu_2021_2023,
    low_memory=False,
)

In [7]:
# Parse filed_date once, store it back as datetimes, and derive a year column
# from it for ease of use later
filed_dates = pd.to_datetime(prosecution["filed_date"])
prosecution['filed_date'] = filed_dates
prosecution['year'] = filed_dates.dt.year

## Data Summary

In [8]:
# Policing (RIPA) data
# Prints the overall shape, row counts per year and per race, and missing values per column.
# Only the first message interpolates values, so only it is an f-string; the remaining
# labels are plain strings passed to print() alongside the object they describe.

policing_rows, policing_cols = policing.shape
print(f'The RIPA Orange County policing data has {policing_rows} rows and {policing_cols} columns')
print('\nHere are the number of rows broken down by year:', policing.groupby("YEAR").size())
print('\nHere are the number of rows broken down by race:', policing["RAE_FULL"].value_counts())
print('\nThe NAs per column in descending order are as follows:\n', policing.isna().sum().sort_values(ascending=False))

The RIPA Orange County policing data has 447512 rows and 16 columns

Here are the number of rows broken down by year: YEAR
2021     90714
2022    189834
2023    166964
dtype: int64

Here are the number of rows broken down by race: RAE_FULL
Hispanic/Latino               192344
White                         156527
Asian                          43551
Black/African American         22186
Middle Eastern/South Asian     20763
Multiracial                     6040
Native American                 3690
Pacific Islander                2411
Name: count, dtype: int64

The NAs per column in descending order are as follows:
 RAE_FULL                  0
REASON_FOR_STOP           0
ADS_SEARCH_PERSON         0
ADS_SEARCH_PROPERTY       0
CED_NONE_CONTRABAND       0
CED_FIREARM               0
CED_AMMUNITION            0
CED_WEAPON                0
CED_DRUGS                 0
CED_ALCOHOL               0
CED_MONEY                 0
CED_DRUG_PARAPHERNALIA    0
CED_STOLEN_PROP           0
CED_ELECT_DEVICE 

In [9]:
# Prosecution (RJA) data
# Prints the overall shape, row counts per year and per race, and missing values per column.
# Only the first message interpolates values, so only it is an f-string; the remaining
# labels are plain strings passed to print() alongside the object they describe.

prosecution_rows, prosecution_cols = prosecution.shape
print(f'The RJA Orange County prosecution data has {prosecution_rows} rows and {prosecution_cols} columns')
print('\nHere are the number of rows broken down by year:', prosecution.groupby("year").size())
print('\nHere are the number of rows broken down by race:', prosecution["canonical_race"].value_counts())
print('\nThe NAs per column in descending order are as follows:\n', prosecution.isna().sum().sort_values(ascending=False))

The RJA Orange County prosecution data has 259118 rows and 30 columns

Here are the number of rows broken down by year: year
2021    102847
2022    102820
2023     53451
dtype: int64

Here are the number of rows broken down by race: canonical_race
Latinx    132414
White      88792
Black      15177
Asian      11808
Other      10927
Name: count, dtype: int64

The NAs per column in descending order are as follows:
 statute_level              44894
sentenced_date             35791
category                      72
disposition_description       30
was_convicted                 30
was_filed_by_da                8
referral_date                  0
filed_date                     0
county_id                      0
source_case_id                 0
source_defendant_id            0
source_incident_id             0
disposition_date               0
source_docket_id               0
statute                        0
type                           0
disposition_id                 0
race_id                

## Summary Statistics

### Policing (RIPA)

In [10]:
# SEARCHED is 1 when the person search flag, the property search flag, or both equal 1;
# otherwise it is 0
search_flags = policing[["ADS_SEARCH_PERSON", "ADS_SEARCH_PROPERTY"]].eq(1)
policing["SEARCHED"] = search_flags.any(axis=1).astype(int)

# The mean of a 0/1 flag is the share of rows flagged, i.e. the search rate per race
search_rates = policing.groupby("RAE_FULL")["SEARCHED"].mean()
print('The search rates by race are:', search_rates.sort_values(ascending=False))

The search rates by race are: RAE_FULL
Black/African American        0.217705
Hispanic/Latino               0.209245
Pacific Islander              0.170883
White                         0.165735
Multiracial                   0.132781
Asian                         0.069505
Middle Eastern/South Asian    0.058855
Native American               0.019241
Name: SEARCHED, dtype: float64


In [11]:
# Indicator columns for the kinds of contraband/evidence recorded on a row
contraband = [
    "CED_FIREARM", "CED_AMMUNITION", "CED_WEAPON",
    "CED_DRUGS", "CED_ALCOHOL", "CED_MONEY",
    "CED_DRUG_PARAPHERNALIA", "CED_STOLEN_PROP",
    "CED_ELECT_DEVICE", "CED_OTHER_CONTRABAND",
]

# HIT is True when the row-wise sum of the contraband indicators is positive
policing["HIT"] = policing[contraband].sum(axis=1).gt(0)

# Contraband hit rate by race (mean of the boolean HIT flag within each race).
# NOTE(review): the denominator here is every row for the race, not only rows where a
# search occurred — confirm this is the intended definition of "hit rate".
hit_rates = policing.groupby("RAE_FULL")["HIT"].mean()
print('The hit rates by race are:', hit_rates)

The hit rates by race are: RAE_FULL
Asian                         0.033799
Black/African American        0.100018
Hispanic/Latino               0.104287
Middle Eastern/South Asian    0.023552
Multiracial                   0.066887
Native American               0.010298
Pacific Islander              0.085442
White                         0.092310
Name: HIT, dtype: float64


### Prosecution (RJA)

In [12]:
# Distribution of the was_referred_by_lea flag (presumably "referred by a law
# enforcement agency" — confirm against the data dictionary).
# value_counts leaves out missing values by default.
prosecution.loc[:, "was_referred_by_lea"].value_counts()

was_referred_by_lea
True     214224
False     44894
Name: count, dtype: int64

In [13]:
# Distribution of the was_filed_by_da flag (missing values are left out by default).
# Because there are too few False values in this result, we cannot compute a
# disparity rate for this column.
prosecution.loc[:, "was_filed_by_da"].value_counts()

was_filed_by_da
True     259068
False        42
Name: count, dtype: int64

In [29]:
# Calculate conviction rate per race per year.
# Named aggregation labels the output columns directly, so no in-place rename is needed.
# "count" tallies only rows where was_convicted is not missing, so total_cases is the
# number of rows with a recorded conviction outcome, not every row in the group.
conviction_rates = (
    prosecution
    .groupby(["canonical_race", "year"])["was_convicted"]
    .agg(number_convicted="sum", total_cases="count")
    .reset_index()
)
conviction_rates["conviction_rate"] = conviction_rates["number_convicted"] / conviction_rates["total_cases"]

conviction_rates

Unnamed: 0,canonical_race,year,number_convicted,total_cases,conviction_rate
0,Asian,2021,3233,4578,0.706204
1,Asian,2022,3051,4228,0.721618
2,Asian,2023,2256,3002,0.751499
3,Black,2021,3829,5250,0.729333
4,Black,2022,4828,6609,0.730519
5,Black,2023,2208,3312,0.666667
6,Latinx,2021,38969,50491,0.771801
7,Latinx,2022,42074,54525,0.771646
8,Latinx,2023,20381,27383,0.744294
9,Other,2021,2871,4653,0.617021


In [30]:
# Calculate enhancement rate per race per year.
# Named aggregation labels the output columns directly, so no in-place rename is needed.
# "count" tallies only rows where is_enhancement_charge is not missing, so total_cases
# here can differ from the total_cases computed for other columns.
enhancement_rates = (
    prosecution
    .groupby(["canonical_race", "year"])["is_enhancement_charge"]
    .agg(number_enhancements="sum", total_cases="count")
    .reset_index()
)
enhancement_rates["enhancement_rate"] = enhancement_rates["number_enhancements"] / enhancement_rates["total_cases"]

enhancement_rates

Unnamed: 0,canonical_race,year,number_enhancements,total_cases,enhancement_rate
0,Asian,2021,494,4578,0.107907
1,Asian,2022,422,4228,0.099811
2,Asian,2023,246,3002,0.081945
3,Black,2021,395,5256,0.075152
4,Black,2022,426,6609,0.064458
5,Black,2023,151,3312,0.045592
6,Latinx,2021,4672,50505,0.092506
7,Latinx,2022,4556,54526,0.083556
8,Latinx,2023,2085,27383,0.076142
9,Other,2021,681,4653,0.146357


In [39]:
# Table with convictions and enhancements.
# Both inputs hold one row per (canonical_race, year); validate makes the merge fail
# loudly instead of silently duplicating rows if that ever stops being true.
# NOTE(review): total_cases comes from conviction_rates only, while enhancement_rate was
# computed against enhancement_rates' own total_cases — the two denominators can differ
# slightly (e.g. Black 2021: 5250 vs 5256).
merged = conviction_rates.merge(
    enhancement_rates[["canonical_race", "year", "number_enhancements", "enhancement_rate"]],
    on=["canonical_race", "year"],
    how="left",
    validate="one_to_one",
)

# Convert both rates to percentages for readability, rounded to two decimal places.
# pd.to_numeric(errors="coerce") forces a numeric dtype first; values that cannot be
# parsed become NaN.
rate_columns = ["conviction_rate", "enhancement_rate"]
merged = merged.assign(**{
    rate_column: pd.to_numeric(merged[rate_column], errors="coerce").mul(100).round(2)
    for rate_column in rate_columns
})

# Rename from rate to percentage now that the values are on a 0-100 scale
merged = merged.rename(columns={
    "conviction_rate": "conviction_percentage",
    "enhancement_rate": "enhancement_percentage",
})

# Only 2023
merged[merged["year"] == 2023]

Unnamed: 0,canonical_race,year,number_convicted,total_cases,conviction_percentage,number_enhancements,enhancement_percentage
2,Asian,2023,2256,3002,75.15,246,8.19
5,Black,2023,2208,3312,66.67,151,4.56
8,Latinx,2023,20381,27383,74.43,2085,7.61
11,Other,2023,1295,2003,64.65,165,8.24
14,White,2023,13405,17750,75.52,1138,6.41


In [51]:
# Calculate distribution of types of charges (statute_level) per race per year.
# groupby leaves out rows whose statute_level is missing, so the shares below are
# relative to rows with a recorded statute level.
charge_distribution = (
    prosecution
    .groupby(["canonical_race", "year", "statute_level"])
    .size()
    .reset_index(name="number_of_cases")
)

# Share of each statute level within its (race, year) group.
# Despite the column name, percent_of_cases is a proportion between 0 and 1.
charge_distribution["percent_of_cases"] = charge_distribution["number_of_cases"].div(
    charge_distribution.groupby(["canonical_race", "year"])["number_of_cases"].transform("sum")
)

# Only 2023 (first 9 rows)
charge_distribution[charge_distribution["year"] == 2023].head(9)

Unnamed: 0,canonical_race,year,statute_level,number_of_cases,percent_of_cases
6,Asian,2023,Felony,876,0.397459
7,Asian,2023,Infraction,3,0.001361
8,Asian,2023,Misdemeanor,1325,0.60118
13,Black,2023,Felony,1121,0.432986
14,Black,2023,Infraction,1,0.000386
15,Black,2023,Misdemeanor,1467,0.566628
22,Latinx,2023,Felony,7188,0.332978
23,Latinx,2023,Infraction,24,0.001112
24,Latinx,2023,Misdemeanor,14375,0.66591
