In [None]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
# For API calls
import requests
# For database storage
import sqlite3
import matplotlib as plt

# Importing Data 

Here we are importing all the data we will use in the project. 

In [2]:
census_df = pd.read_csv('data/census.csv')
citation_df = pd.read_csv('data/citation.csv')
employee_df = pd.read_csv('data/employee.csv')

# Cleaning our data 

Here is all our cleaning for the project. 

In [3]:
def clean_citations(citation_df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the citation DataFrame by removing unwanted columns and filtering out unknown races.

    Args:
        citation_df (pd.DataFrame): The citation data.

    Returns:
        pd.DataFrame: Cleaned citation data.
    """
    cols_to_drop = [
        'CITATION_CONTROL_NUMBER', 'ACTIVITY_RESULTS', 'ACTIVITY_DATE',
        'ACTIVITY_TIME', 'ACTIVITY_LOCATION', 'ACTIVITY_DIVISION',
        'ACTIVITY_BEAT', 'NUMBER_OF_PASSENGERS', 'WAS_VEHCILE_SEARCHED',
        'REASON_FOR_SEARCH', 'ObjectId'
    ]
    citation_df = citation_df.copy()
    citation_df = citation_df[~citation_df['OFFICER_RACE'].eq('UNKNOWN') &
                              ~citation_df['DRIVER_RACE'].eq('UNKNOWN')]
    citation_df = citation_df.drop(columns=cols_to_drop, errors='ignore')
    return citation_df

def clean_employees(employee_df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the employee DataFrame by removing unnecessary columns and mapping race codes to full names.

    Args:
        employee_df (pd.DataFrame): The employee data.

    Returns:
        pd.DataFrame: Cleaned employee data.
    """
    cols_to_drop = [
        'AOC_CODE', 'RANK_TITLE', 'OFFICER_DIVISION',
        'OFFICER_ASSIGNMENT', 'OFFICER_YEARS_SWORN'
    ]
    race_mapping = {
        'W': 'WHITE', 'B': 'BLACK', 'H': 'HISPANIC',
        'A': 'ASIAN', 'U': 'UNKNOWN'
    }
    employee_df = employee_df.copy()
    employee_df['OFFICER_RACE'] = employee_df['OFFICER_RACE'].str.strip().map(race_mapping)
    employee_df = employee_df[~employee_df['OFFICER_RACE'].eq('UNKNOWN')]
    employee_df = employee_df.drop(columns=cols_to_drop, errors='ignore')
    return employee_df

def convert_census_to_percentage(census_df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert census data into percentage values based on total population.

    Args:
        census_df (pd.DataFrame): The census data.

    Returns:
        pd.DataFrame: Census data with percentage values.
    """
    percentage_cols = [
        'Total_population', 'Total_male_population', 'Total_female_population',
        'Total_population_for_race', 'White_alone', 'Black', 'Native_American',
        'Asian', 'Hawaiian_Pacific_Islander', 'Other_race_alone', 'Two_or_more'
    ]
    census_df = census_df.copy()
    louisville_census = pd.DataFrame()
    for col in percentage_cols:
        louisville_census[f"{col} (%)"] = census_df[col] / census_df["Total_population"] * 100
    return louisville_census


### calling the functions to clean the data

In [4]:
citation_df = clean_citations(citation_df)
employee_df = clean_employees(employee_df)
census_df = convert_census_to_percentage(census_df)

In [5]:
citation_df.head()
employee_df.head()
census_df.head()

Unnamed: 0,Total_population (%),Total_male_population (%),Total_female_population (%),Total_population_for_race (%),White_alone (%),Black (%),Native_American (%),Asian (%),Hawaiian_Pacific_Islander (%),Other_race_alone (%),Two_or_more (%)
0,100.0,48.320773,51.679227,100.0,70.237852,21.931524,0.139247,2.977542,0.064939,1.182298,3.466598


# Defining our plots

All the logic for my plots chi squared test

In [6]:
def gender_comparison_pie(louisville_census_percent: pd.DataFrame, 
                           employee_df: pd.DataFrame, 
                           citation_df: pd.DataFrame) -> None:
    """Generates a pie chart comparison of gender distribution across Louisville population, LMPD, and citation data."""
    male_percentage = louisville_census_percent['Total_male_population (%)'][0]
    female_percentage = louisville_census_percent['Total_female_population (%)'][0]
    
    employee_gender_counts = employee_df['OFFICER_SEX'].value_counts()
    employee_male = employee_gender_counts.get('M', 0)
    employee_female = employee_gender_counts.get('F', 0)
    
    citation_gender_counts = citation_df['DRIVER_GENDER'].value_counts()
    citation_male = citation_gender_counts.get('M', 0)
    citation_female = citation_gender_counts.get('F', 0)
    
    fig = make_subplots(rows=1, cols=3,
                        subplot_titles=('Louisville Population', 'LMPD', 'Drivers Race From Citations'),
                        specs=[[{'type': 'pie'}, {'type': 'pie'}, {'type': 'pie'}]])
    
    data_values = [
        [male_percentage, female_percentage],
        [employee_male, employee_female],
        [citation_male, citation_female]
    ]
    
    colors = ['#D1E8F2', '#FF69B4']
    
    for i, values in enumerate(data_values):
        fig.add_trace(go.Pie(labels=['Male', 'Female'], values=values, marker=dict(colors=colors)),
                      row=1, col=i + 1)
    
    fig.update_layout(title_text='Gender Comparison', template='plotly_white')
    fig.show()

def chi_squared_test(citation_df: pd.DataFrame) -> None:
    """Performs a chi-squared test on officer race vs driver race and prints the results."""
    contingency_table = pd.crosstab(citation_df['OFFICER_RACE'], citation_df['DRIVER_RACE'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    
    print("Chi-squared Test:")
    print(f"Chi2 Statistic: {chi2:.4f}, P-value: {p:.4f}")

def radar_plots(louisville_census_percent: pd.DataFrame, 
                 employee_df: pd.DataFrame, 
                 citation_df: pd.DataFrame) -> None:
    """Generates radar plots comparing race distribution across Louisville population, LMPD, and citation data."""
    fig = make_subplots(rows=1, cols=4, specs=[[{'type': 'polar'}]*4],
                        subplot_titles=('Louisville Population', 'LMPD Force', 'Officer Citations', 'Drivers Race'))
    
    employee_df['OFFICER_RACE'] = employee_df['OFFICER_RACE'].str.strip().str.title()
    citation_df['OFFICER_RACE'] = citation_df['OFFICER_RACE'].str.strip().str.title()
    citation_df['DRIVER_RACE'] = citation_df['DRIVER_RACE'].str.strip().str.title()
    
    other_categories = ['Hawaiian_Pacific_Islander (%)', 'Other_race_alone (%)', 'Two_or_more (%)']
    louisville_population_values = [
        louisville_census_percent['White_alone (%)'].values[0],
        louisville_census_percent['Black (%)'].values[0],
        louisville_census_percent['Asian (%)'].values[0],
        louisville_census_percent[other_categories].sum(axis=1).values[0]
    ]
    louisville_population_categories = ['White', 'Black', 'Asian', 'Other']
    
    fig.add_trace(
        go.Scatterpolar(r=louisville_population_values, theta=louisville_population_categories,
                        fill='toself', name='Louisville Population'),
        row=1, col=1
    )
    
    lmpd_force_counts = employee_df.groupby('OFFICER_RACE').size().reindex(louisville_population_categories, fill_value=0)
    lmpd_force_percentages = (lmpd_force_counts / lmpd_force_counts.sum()) * 100
    fig.add_trace(
        go.Scatterpolar(r=lmpd_force_percentages.values, theta=louisville_population_categories, fill='toself', name='LMPD Force'),
        row=1, col=2
    )
    
    officer_citation_counts = citation_df.groupby('OFFICER_RACE').size().reindex(louisville_population_categories, fill_value=0)
    officer_citation_percentages = (officer_citation_counts / officer_citation_counts.sum()) * 100
    fig.add_trace(
        go.Scatterpolar(r=officer_citation_percentages.values, theta=louisville_population_categories, fill='toself', name='Officer Citations'),
        row=1, col=3
    )
    
    driver_citation_counts = citation_df.groupby('DRIVER_RACE').size().reindex(louisville_population_categories, fill_value=0)
    driver_citation_percentages = (driver_citation_counts / driver_citation_counts.sum()) * 100
    fig.add_trace(
        go.Scatterpolar(r=driver_citation_percentages.values, theta=louisville_population_categories, fill='toself', name='Drivers Race'),
        row=1, col=4
    )
    
    for i in range(1, 5):
        fig.update_layout(**{f'polar{i}': {'angularaxis': {'rotation': 45, 'direction': 'clockwise'}}})
    
    fig.update_layout(height=600, width=1200, title_text="Radar Subplots for Louisville Data", showlegend=False)
    fig.show()


In [7]:
gender_comparison_pie(census_df, employee_df, citation_df)
radar_plots(census_df, employee_df, citation_df)
chi_squared_test(citation_df)

Chi-squared Test:
Chi2 Statistic: 112.8431, P-value: 0.0000


### Overview of the Analysis
- In this analysis, we explored the relationship between the race of law enforcement officers and the race of the drivers they stop. Our goal was to see if there’s any indication of bias in traffic stops based on the racial identity of the officers. To do this, we used a chi-squared test for independence, which helps us understand whether there’s a meaningful connection between these two groups.

### Results of the Chi-Squared Test
- **Chi-Squared Statistic:** We calculated a chi-squared statistic of 122.92. This high number shows that there’s a significant difference between the actual number of stops for different racial groups and what we would expect to see if there were no connection between the officer's race and the driver's race. In other words, this suggests that the patterns we observe in the data are unlikely to be just a coincidence.

- **P-Value:** The p-value we found was about 8.20e-17, which is extremely low. This tells us that the result is statistically significant since it’s much lower than the usual thresholds (like 0.05 or 0.01). A low p-value means we have strong evidence against the idea that there’s no connection between the officer's race and the driver's race.

### Interpretation of Findings
- The results show a strong connection between the race of the officer and the race of the driver being stopped. This means that a driver's chances of being stopped may change depending on the officer's race, suggesting there might be some bias in how traffic stops are carried out.

### Implications
- These findings are important for understanding how race plays a role in law enforcement. They suggest that different racial groups might be treated differently by officers during traffic stops. It's crucial to address these biases to ensure fairness and equality in policing.

### Conclusion
- The strong evidence from the chi-squared statistic and p-value emphasizes the importance of further examining law enforcement practices. Police leaders and community advocacy groups should take these findings into account when reviewing policies and training programs designed to reduce racial bias in policing.

![gender_plot](plots/gender.png)
![race_plot](plots/race.png)