# Introduction

Since Jan. 1, 2015, [The Washington Post](https://www.washingtonpost.com/) has been compiling a database of every fatal shooting in the US by a police officer in the line of duty.
[Source of census data](https://factfinder.census.gov/faces/nav/jsf/pages/community_facts.xhtml).

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Render floats in DataFrame output with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Every source CSV is read with the same Windows-1252 encoding.
CSV_ENCODING = "windows-1252"

house_income_data = pd.read_csv('data/Median_Household_Income_2015.csv', encoding=CSV_ENCODING)
pct_poverty_data = pd.read_csv('data/Pct_People_Below_Poverty_Level.csv', encoding=CSV_ENCODING)
pct_completed_Hschool_data = pd.read_csv('data/Pct_Over_25_Completed_High_School.csv', encoding=CSV_ENCODING)
race_by_city_data = pd.read_csv('data/Share_of_Race_By_City.csv', encoding=CSV_ENCODING)
fatalities_data = pd.read_csv('data/Deaths_by_Police_US.csv', encoding=CSV_ENCODING)

## Preliminary Data Exploration

* What is the shape of the DataFrames?
* How many rows and columns do they have?
* What are the column names?
* Are there any NaN values or duplicates?

In [None]:
# Print the column names of each dataset, in the order they were loaded.
# The shape / NaN / duplicate checks are left disabled; uncomment them inside
# the loop to run them for every dataset.
for frame in (
    house_income_data,
    pct_poverty_data,
    pct_completed_Hschool_data,
    race_by_city_data,
    fatalities_data,
):
    # print(f'Shape: {frame.shape}')
    print(f'Columns Names {frame.columns}\n')
    # print(f'Nan values: {frame.isna().sum()}')
    # print(f'Duplicated values: {frame.duplicated().sum()}\n')

# Last expression of the cell: preview the first rows of the fatalities table.
fatalities_data.head()

In [None]:
# Discard income rows that contain any missing value.
house_income_data = house_income_data.dropna()

# Rows without a race are dropped. The explicit .copy() makes the result an
# independent frame, so the column assignments below modify it directly and
# do not raise pandas' SettingWithCopyWarning.
fatalities_data = fatalities_data.dropna(subset=['race']).copy()

# Missing ages are replaced by the median age of the remaining rows.
fatalities_data['age'] = fatalities_data['age'].fillna(fatalities_data['age'].median())

# NOTE(review): missing 'flee' / 'armed' values are recorded as
# 'Not fleeing' / 'unarmed', which counts unknown cases in those categories
# in every later chart - confirm this is the intended treatment.
fatalities_data['flee'] = fatalities_data['flee'].fillna('Not fleeing')
fatalities_data['armed'] = fatalities_data['armed'].fillna('unarmed')

# Sanity check: remaining NaN values per column and fully duplicated rows.
print(f'Nan values: {fatalities_data.isna().sum()}')
print(f'Duplicated values: {fatalities_data.duplicated().sum()}\n')

## Chart of the Poverty Rate in each US State

Bar chart that ranks the poverty rate from highest to lowest by US state. Which state has the highest poverty rate? Which state has the lowest poverty rate?  Bar Plot

In [None]:
# Coerce the poverty rate to a number; unparseable entries become NaN.
pct_poverty_data['poverty_rate'] = pd.to_numeric(
    pct_poverty_data['poverty_rate'], errors='coerce'
)

# Mean poverty rate per state, ranked from highest to lowest.
poverty_by_state = (
    pct_poverty_data
    .groupby('Geographic Area')['poverty_rate']
    .mean()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 8))
poverty_by_state.plot.bar(ax=ax, color='tomato', edgecolor='black')

ax.set_title('Poverty Rate by US State (Highest to Lowest)', fontsize=14)
ax.set_ylabel('Poverty Rate (%)')
ax.set_xlabel('US State')
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# States with the highest and the lowest mean poverty rate.
highest_poverty_state = poverty_by_state.idxmax()
lowest_poverty_state = poverty_by_state.idxmin()
highest_rate = poverty_by_state.max()
lowest_rate = poverty_by_state.min()

print(f"Estado com MAIOR taxa de pobreza: {highest_poverty_state} ({highest_rate:.2f}%)")
print(f"Estado com MENOR taxa de pobreza: {lowest_poverty_state} ({lowest_rate:.2f}%)")

## Chart of the High School Graduation Rate by US State

This shows the high school graduation rate of each US state in ascending order. Which state has the lowest high school graduation rate? Which state has the highest?

In [None]:
# Coerce the completion percentage to a number; unparseable entries become NaN.
hs_numeric = pd.to_numeric(pct_completed_Hschool_data['percent_completed_hs'], errors='coerce')
pct_completed_Hschool_data['percent_completed_hs'] = hs_numeric

# Mean completion rate per state, sorted from lowest to highest.
hs_completion_by_state = (
    pct_completed_Hschool_data
    .groupby('Geographic Area')['percent_completed_hs']
    .mean()
    .sort_values(ascending=True)
)

fig, ax = plt.subplots(figsize=(12, 8))
hs_completion_by_state.plot.bar(ax=ax, color='steelblue', edgecolor='black')

ax.set_title('High School Graduation Rate by US State (Ascending Order)', fontsize=14)
ax.set_ylabel('Graduation Rate (%)')
ax.set_xlabel('US State')
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Report the two extremes of the ranking.
lowest_state, lowest_value = hs_completion_by_state.idxmin(), hs_completion_by_state.min()
highest_state, highest_value = hs_completion_by_state.idxmax(), hs_completion_by_state.max()
print(f"Menor taxa de conclusão do ensino médio: {lowest_state} ({lowest_value:.2f}%)")
print(f"Maior taxa de conclusão do ensino médio: {highest_state} ({highest_value:.2f}%)")

## Visualise the Relationship between Poverty Rates and High School Graduation Rates

#### A line chart with two y-axes to show whether the poverty rate and the high school graduation rate move together.

In [None]:
# Make sure both rate columns are numeric; unparseable entries become NaN.
pct_poverty_data['poverty_rate'] = pd.to_numeric(pct_poverty_data['poverty_rate'], errors='coerce')
pct_completed_Hschool_data['percent_completed_hs'] = pd.to_numeric(
    pct_completed_Hschool_data['percent_completed_hs'], errors='coerce')

# Mean of each rate per state.
poverty_by_state = pct_poverty_data.groupby('Geographic Area')['poverty_rate'].mean()
hs_completion_by_state = pct_completed_Hschool_data.groupby('Geographic Area')['percent_completed_hs'].mean()

# One row per state holding both rates; states missing either rate are
# dropped and the rows are ordered alphabetically by state.
combined_data = pd.concat([poverty_by_state, hs_completion_by_state], axis=1).dropna()
combined_data = combined_data.sort_index()

fig, ax1 = plt.subplots(figsize=(14, 8))

# Left y-axis: poverty rate.
color1 = 'tomato'
ax1.set_xlabel('US State')
ax1.set_ylabel('Poverty Rate (%)', color=color1)
poverty_line, = ax1.plot(combined_data.index, combined_data['poverty_rate'],
                         color=color1, label='Poverty Rate')
ax1.tick_params(axis='y', labelcolor=color1)
# Rotate the existing tick labels rather than calling set_xticklabels(),
# which replaces label text without fixing the tick positions.
ax1.tick_params(axis='x', labelrotation=90)
# The x grid is drawn on ax1: twinx() hides the x-axis of the twin axes, so
# a grid requested on the twin (the current axes after twinx) is not shown.
ax1.grid(axis='x', linestyle='--', alpha=0.5)

# Right y-axis: high school graduation rate, sharing the same x-axis.
ax2 = ax1.twinx()
color2 = 'steelblue'
ax2.set_ylabel('High School Graduation Rate (%)', color=color2)
graduation_line, = ax2.plot(combined_data.index, combined_data['percent_completed_hs'],
                            color=color2, label='Graduation Rate')
ax2.tick_params(axis='y', labelcolor=color2)

# A single legend covering both lines; it is attached to the top-most axes
# so neither line is drawn over it.
ax2.legend(handles=[poverty_line, graduation_line], loc='best')

ax1.set_title('Poverty Rate vs High School Graduation Rate by US State')
fig.tight_layout()
plt.show()

### Using a Seaborn .jointplot() with a Kernel Density Estimate (KDE) to visualise the same relationship

In [None]:
# Both rate columns must be numeric; unparseable entries become NaN.
for frame, column in (
    (pct_poverty_data, 'poverty_rate'),
    (pct_completed_Hschool_data, 'percent_completed_hs'),
):
    frame[column] = pd.to_numeric(frame[column], errors='coerce')

# Mean of each rate per state.
poverty_by_state = pct_poverty_data.groupby('Geographic Area')['poverty_rate'].mean()
hs_completion_by_state = pct_completed_Hschool_data.groupby('Geographic Area')['percent_completed_hs'].mean()

# Side-by-side table of both rates, keeping only states that have both.
combined_data = pd.concat(
    [poverty_by_state, hs_completion_by_state],
    axis=1,
    keys=['poverty_rate', 'percent_completed_hs'],
).dropna()

# Filled KDE of the joint distribution, with the marginals on the sides.
sns.set(style='white', color_codes=True)
plot = sns.jointplot(
    x='percent_completed_hs',
    y='poverty_rate',
    data=combined_data,
    kind='kde',
    fill=True,
    cmap='Reds',
    height=8,
)
# Overlay the individual data points (one per state) on top of the density.
plot.plot_joint(sns.scatterplot, color='black', s=40)
plot.ax_joint.set_xlabel('High School Graduation Rate (%)')
plot.ax_joint.set_ylabel('Poverty Rate (%)')
plot.fig.suptitle('Poverty vs High School Graduation Rate (KDE & Scatter)', y=1.03)
plt.show()

### Seaborn's `.lmplot()` or `.regplot()` to show a linear regression between the poverty ratio and the high school graduation ratio.

In [None]:
# Scatter of the per-state rates with a fitted linear regression line.
# Relies on combined_data built in the previous cell.
fig, ax = plt.subplots(figsize=(10, 6))

point_style = {'color': 'black', 's': 50}
line_style = {'color': 'purple'}
sns.regplot(
    x='percent_completed_hs',
    y='poverty_rate',
    data=combined_data,
    scatter_kws=point_style,
    line_kws=line_style,
    ax=ax,
)

ax.set_title('Linear Regression: Poverty Rate vs High School Graduation Rate')
ax.set_xlabel('High School Graduation Rate (%)')
ax.set_ylabel('Poverty Rate (%)')
ax.grid(True, linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

## A Bar Chart with Subsections Showing the Racial Makeup of Each US State

Visualise the share of the white, black, hispanic, asian and native american population in each US State using a bar chart with sub sections.

In [None]:
# Share columns arrive as text; coerce them to numbers (bad entries -> NaN).
race_columns = ['share_white', 'share_black', 'share_native_american', 'share_asian', 'share_hispanic']
race_by_city_data[race_columns] = race_by_city_data[race_columns].apply(pd.to_numeric, errors='coerce')

# Mean share per state across its cities, ordered by the white share
# from highest to lowest.
race_by_state = (
    race_by_city_data
    .groupby('Geographic area')[race_columns]
    .mean()
    .dropna()
    .sort_values('share_white', ascending=False)
)

fig, ax = plt.subplots(figsize=(14, 8))

# Running height of each state's stack; every new segment starts on top of
# the segments already drawn.
bottom = pd.Series([0] * len(race_by_state), index=race_by_state.index)
for column in race_columns:
    segment = race_by_state[column]
    legend_label = column.replace('share_', '').capitalize()
    ax.bar(race_by_state.index, segment, bottom=bottom, label=legend_label)
    bottom += segment

ax.set_title('Racial Makeup by US State')
ax.set_xlabel('State')
ax.set_ylabel('Population Share (%)')
ax.set_xticks(range(len(race_by_state.index)))
ax.set_xticklabels(race_by_state.index, rotation=90)
ax.legend(title='Race')
plt.tight_layout()
plt.show()

## Donut Chart of People Killed by Race


In [None]:
# Number of rows per race value.
race_counts = fatalities_data['race'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))

# A wedge width below 1 leaves a hole in the middle, turning the pie into a
# donut; pctdistance places the percentage texts relative to the radius.
donut_options = {
    'labels': race_counts.index,
    'autopct': '%1.1f%%',
    'startangle': 180,
    'wedgeprops': {'width': 0.3},
    'pctdistance': 0.5,
}
wedges, texts, autotexts = ax.pie(race_counts, **donut_options)

# Uniform styling for the percentage labels.
for autotext in autotexts:
    autotext.set(fontsize=10, color='black', ha='center')

ax.set_title('Deaths by Police - Distribution by Race')
ax.legend(wedges, race_counts.index, title="Race", loc="center left", fontsize=12)

plt.show()

## Chart Comparing the Total Number of Deaths of Men and Women

Illustrate how many more men are killed compared to women.

In [None]:
# Number of rows per gender value, most frequent first.
gender_counts = fatalities_data['gender'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
bar_colors = ['blue', 'pink']
ax.bar(gender_counts.index, gender_counts, color=bar_colors)

ax.set_title('Total Number of Deaths by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Number of Deaths')
# Leave headroom above the tallest bar.
tallest_bar = gender_counts.max()
ax.set_ylim(0, tallest_bar + 500)
plt.tight_layout()
plt.show()

## Box Plot Showing the Age and Manner of Death

Is there a difference between men and women in the manner of death?

In [None]:
# Keep only rows where all three plotted fields are present.
required_fields = ['age', 'gender', 'manner_of_death']
filtered_data = fatalities_data.dropna(subset=required_fields)

# Age distribution per gender, split by manner of death.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='gender', y='age', hue='manner_of_death', data=filtered_data, ax=ax)

ax.set_title('Box Plot: Age and Manner of Death by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Age')
# Anchor the legend outside the plotting area, to the right of the axes.
ax.legend(title='Manner of Death', bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
plt.show()

## Were People Armed?

In what percentage of police killings were people armed? Below is a chart that shows what kind of weapon (if any) the deceased was carrying. How many of the people killed by police were armed with guns versus unarmed?

In [None]:
# print(fatalities_data['armed'].value_counts().to_string())

# Occurrences of every value in the 'armed' column.
armed_status = fatalities_data['armed'].value_counts()
unarmed_count = armed_status.get('unarmed', 0)
# NOTE(review): assumes unknown cases are labelled 'undetermined' and firearms
# 'gun' - confirm against the value_counts() output above. Labels that are
# absent simply count as 0. Undetermined cases are kept out of the "armed"
# total so that unknowns are not reported as armed.
undetermined_count = armed_status.get('undetermined', 0)
gun_count = armed_status.get('gun', 0)
armed_count = armed_status.sum() - unarmed_count - undetermined_count

data = {'Armed': armed_count, 'Unarmed': unarmed_count}
if undetermined_count:
    data['Undetermined'] = undetermined_count
total = sum(data.values())

fig, ax = plt.subplots(figsize=(8, 6))

bar_colors = ['blue', 'red', 'grey'][:len(data)]
ax.bar(list(data.keys()), list(data.values()), color=bar_colors)
ax.set_title('People Killed by Police - Armed vs Unarmed')
ax.set_xlabel('Armed Status')
ax.set_ylabel('Number of Deaths')

# Label each bar with its count and its share of all cases.
for i, count in enumerate(data.values()):
    share = count / total * 100 if total else 0
    ax.text(i, count + 10, f'{count} ({share:.1f}%)', ha='center', fontsize=12)

plt.tight_layout()
plt.show()

# Direct answer to the gun-versus-unarmed question.
print(f'Armed with a gun: {gun_count} | Unarmed: {unarmed_count}')

## How Old Were the People Killed?

### Work out what percentage of people killed were under 25 years old.  

In [None]:
# Count the rows on each side of the age-25 threshold.
ages = fatalities_data['age']
under_25 = int((ages < 25).sum())
over_25 = int((ages >= 25).sum())

labels = ['Under 25', '25 and Over']
values = [under_25, over_25]
# Share of each group among the rows that fall in either group.
percentages = [count / (under_25 + over_25) * 100 for count in values]

fig, ax = plt.subplots(figsize=(6, 5))
bars = ax.bar(labels, percentages, color=['skyblue', 'steelblue'])

# Write the percentage just above each bar, centred horizontally.
for bar, pct in zip(bars, percentages):
    x_center = bar.get_x() + bar.get_width()/2
    ax.text(x_center, bar.get_height() + 1, f'{pct:.1f}%', ha='center', fontsize=12)

ax.set_title('Police Killings by Age')
ax.set_ylabel('Percentage of Deaths')

# Headroom so the text above the tallest bar stays inside the axes.
ax.set_ylim(0, max(percentages) + 10)
plt.tight_layout()
plt.show()

## A histogram and KDE plot that shows the distribution of ages of the people killed by police.

In [None]:
# Ensure age is numeric (unparseable entries -> NaN) and ignore missing ages.
fatalities_data['age'] = pd.to_numeric(fatalities_data['age'], errors='coerce')
age_data = fatalities_data['age'].dropna()

# 30-bin histogram with a KDE curve drawn over it.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(age_data, bins=30, kde=True, color='red', ax=ax)

ax.set_title('Distribution of Ages of People Killed by Police')
ax.set_xlabel('Age')
ax.set_ylabel('Number of Deaths')
fig.tight_layout()
plt.show()

### Separate KDE plot for each race. Is there a difference between the distributions?

In [None]:
# Age/race pairs with both fields present and a numeric age. Rows whose age
# cannot be parsed are coerced to NaN and removed by the second dropna.
race_age_data = (
    fatalities_data[['age', 'race']]
    .dropna()
    .assign(age=lambda frame: pd.to_numeric(frame['age'], errors='coerce'))
    .dropna()
)

fig, ax = plt.subplots(figsize=(10, 6))
# common_norm=False normalises each race's density on its own, so the curves
# can be compared by shape regardless of group size.
kde = sns.kdeplot(
    x='age',
    hue='race',
    data=race_age_data,
    common_norm=False,
    fill=True,
    alpha=0.4,
    ax=ax,
)

ax.set_title('Age Distribution by Race - Police Killings')
ax.set_xlabel('Age')
ax.set_ylabel('Density')

fig.tight_layout()
plt.show()

## Race of People Killed
Chart that shows the total number of people killed by race.

In [None]:
# Number of rows per race value, most frequent first.
race_counts = fatalities_data['race'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(race_counts.index, race_counts.values, color='skyblue')

ax.set_title('Total Number of People Killed by Race', fontsize=14)
ax.set_xlabel('Race')
ax.set_ylabel('Number of Deaths')

# Print each bar's count just above it, centred on the bar.
label_style = {'ha': 'center', 'va': 'bottom', 'fontsize': 11}
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2
    ax.text(x_center, height + 2, str(int(height)), **label_style)

plt.tight_layout()
plt.show()

## Mental Illness and Police Killings

What percentage of people killed by police have been diagnosed with a mental illness?

In [None]:
# Occurrences of each value of the signs_of_mental_illness flag.
mental_illness_counts = fatalities_data['signs_of_mental_illness'].value_counts()

# Percentage of rows where the flag is True; the remainder is its complement.
total_cases = mental_illness_counts.sum()
flagged_cases = mental_illness_counts[True]
percentage_mental_illness = (flagged_cases / total_cases) * 100
percentage_no_mental_illness = 100 - percentage_mental_illness

labels = ['Diagnosed Mental Illness', 'No Mental Illness']
sizes = [percentage_mental_illness, percentage_no_mental_illness]
colors = ['#ff9999','#66b3ff']

fig, ax = plt.subplots(figsize=(7, 7))
ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
ax.set_title('Mental Illness in People Killed by Police')
plt.show()

## In Which Cities Do the Most Police Killings Take Place?

Chart ranking the top 10 cities with the most police killings. Which cities are the most dangerous?  

In [None]:
# The ten most frequent city names in the fatalities table.
city_counts = fatalities_data['city'].value_counts()
deadlier_cities = city_counts.iloc[:10]

# Horizontal bars, one colour per city taken from the reversed Reds palette.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=deadlier_cities.values,
    y=deadlier_cities.index,
    hue=deadlier_cities.index,
    palette='Reds_r',
    ax=ax,
)

ax.set_title('Top 10 Cities with the Most Police Killings')
ax.set_xlabel('Number of Deaths')
ax.set_ylabel('City')
fig.tight_layout()
plt.show()

## Rate of Death by Race in Cities

The share of each race in the top 10 deadliest cities.

In [None]:
# The ten most frequent city names in the fatalities table.
deadlier_cities = fatalities_data['city'].value_counts().head(10).index

# Rows belonging to those cities only.
top_city_rows = fatalities_data[fatalities_data['city'].isin(deadlier_cities)]

# Percentage of each race within every city (each row sums to 100).
# The table is built in one step so that every race present in any of the
# cities gets a column. Filling an initially empty DataFrame one city at a
# time aligns each new column to the index created by the first city, which
# silently drops races that do not occur in that first city.
race_distribution_by_city = (
    pd.crosstab(top_city_rows['city'], top_city_rows['race'], normalize='index')
    .mul(100)
    .reindex(deadlier_cities)   # keep the ranking order of the cities
    .fillna(0)                  # a city with no race data becomes a row of zeros
    .rename_axis(columns=None)  # no automatic legend title
)

ax = race_distribution_by_city.plot(kind='bar', stacked=True, figsize=(14, 8))
plt.title('Distribuição de Raça nas 10 Cidades Mais Mortíferas')
plt.xlabel('Cidade')
plt.ylabel('Proporção de Raça (%)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## Choropleth Map of Police Killings by US State

Which states are the most dangerous? Compare your map with your previous chart. Are these the same states with high degrees of poverty?

In [None]:
# Two-letter postal codes of the 50 states plus the District of Columbia.
# "DC" is listed so that rows recorded under it (if any are present - verify
# with fatalities_data['state'].unique()) are not discarded by the left merge
# below, which keeps only the codes in this list.
us_states = pd.DataFrame({
    "state": [
        "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC",
        "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY",
        "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO",
        "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC",
        "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD",
        "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
    ]
})
# print(fatalities_data['state'].unique())  # check that states are stored as postal codes

# Number of rows (deaths) per state code.
state_deaths = fatalities_data.groupby('state').size().reset_index(name='deaths')

# Left-merge onto the full list so states without any row still appear on the
# map with a count of 0; the counts are kept as integers.
state_deaths = pd.merge(us_states, state_deaths, on="state", how="left")
state_deaths['deaths'] = state_deaths['deaths'].fillna(0).astype(int)
# print(state_deaths.head())

# NOTE(review): the colour encodes raw counts, not counts per capita.
fig = px.choropleth(
    state_deaths,
    locations='state',
    locationmode="USA-states",
    color='deaths',
    hover_name='state',
    color_continuous_scale="YlOrRd",
    labels={'deaths': 'Police Killings'},
    title="Police Killings by State"
)
fig.update_layout(geo=dict(scope='usa'))
fig.update_geos(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="white")
fig.show()

## Number of Police Killings Over Time

Analyse whether there is a trend in the data.

In [None]:
# Parse the date column into datetimes.
fatalities_data['date'] = pd.to_datetime(fatalities_data['date'])

# Deaths per calendar year, indexed from the most recent year to the oldest.
years = fatalities_data['date'].dt.year
deaths_per_year = years.value_counts().sort_index(ascending=False)

# One horizontal bar per year; years are converted to strings so they are
# treated as categories.
fig, ax = plt.subplots(figsize=(10, 6))
year_labels = deaths_per_year.index.astype(str)
ax.barh(year_labels, deaths_per_year.values, color='indianred')

ax.set_xlabel("Número de Mortes")
ax.set_ylabel("Ano")
ax.set_title("Mortes por Ação Policial por Ano (EUA)")
ax.grid(axis='x', linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

# Epilogue
Read [The Washington Post's analysis here](https://www.washingtonpost.com/graphics/investigations/police-shootings-database/).