In [56]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [57]:
# Raw county-level table; the file name carries the suffix 20240224 (presumably the export date).
COVID_DEATHS_CSV = 'Provisional_COVID-19_Deaths_by_County__and_Race_and_Hispanic_Origin_20240224.csv'
df = pd.read_csv(COVID_DEATHS_CSV)

In [58]:
# Preview the first five raw rows to check how the file was parsed.
df.head(n=5)

Unnamed: 0,Data as of,Start Date,End Date,State,County Name,Urban Rural Code,FIPS State,FIPS County,FIPS Code,Indicator,...,COVID-19 Deaths,Non-Hispanic White,Non-Hispanic Black,Non-Hispanic American Indian or Alaska Native,Non-Hispanic Asian,Non-Hispanic Native Hawaiian or Other Pacific Islander,Hispanic,Other,Urban Rural Description,Footnote
0,09/27/2023,01/01/2020,09/23/2023,AK,Anchorage Municipality,3,2,20,2020,Distribution of all-cause deaths (%),...,787,0.568,0.044,0.216,0.058,0.03,0.033,0.05,Medium metro,
1,09/27/2023,01/01/2020,09/23/2023,AK,Anchorage Municipality,3,2,20,2020,Distribution of COVID-19 deaths (%),...,787,0.452,0.037,0.255,0.111,0.074,0.038,0.033,Medium metro,
2,09/27/2023,01/01/2020,09/23/2023,AK,Anchorage Municipality,3,2,20,2020,Distribution of population (%),...,787,0.564,0.052,0.083,0.098,0.031,0.095,0.077,Medium metro,
3,09/27/2023,01/01/2020,09/23/2023,AK,Fairbanks North Star Borough,4,2,90,2090,Distribution of all-cause deaths (%),...,214,0.71,0.024,0.173,0.02,,0.027,0.044,Small metro,One or more data cells have counts between 1-9...
4,09/27/2023,01/01/2020,09/23/2023,AK,Fairbanks North Star Borough,4,2,90,2090,Distribution of COVID-19 deaths (%),...,214,0.626,,0.257,,,,0.056,Small metro,One or more data cells have counts between 1-9...


In [59]:
# Keep only the rows whose Indicator is the COVID-19 death distribution; the
# raw file also holds all-cause-death and population distribution rows.
covid_indicator = 'Distribution of COVID-19 deaths (%)'
is_covid_row = df['Indicator'].eq(covid_indicator)

# Renumber the surviving rows from 0 so positional and label indexing agree.
df = df.loc[is_covid_row].reset_index(drop=True)
df.head(10)

Unnamed: 0,Data as of,Start Date,End Date,State,County Name,Urban Rural Code,FIPS State,FIPS County,FIPS Code,Indicator,...,COVID-19 Deaths,Non-Hispanic White,Non-Hispanic Black,Non-Hispanic American Indian or Alaska Native,Non-Hispanic Asian,Non-Hispanic Native Hawaiian or Other Pacific Islander,Hispanic,Other,Urban Rural Description,Footnote
0,09/27/2023,01/01/2020,09/23/2023,AK,Anchorage Municipality,3,2,20,2020,Distribution of COVID-19 deaths (%),...,787,0.452,0.037,0.255,0.111,0.074,0.038,0.033,Medium metro,
1,09/27/2023,01/01/2020,09/23/2023,AK,Fairbanks North Star Borough,4,2,90,2090,Distribution of COVID-19 deaths (%),...,214,0.626,,0.257,,,,0.056,Small metro,One or more data cells have counts between 1-9...
2,09/27/2023,01/01/2020,09/23/2023,AK,Matanuska-Susitna Borough,3,2,170,2170,Distribution of COVID-19 deaths (%),...,241,0.826,,0.083,,,0.046,,Medium metro,One or more data cells have counts between 1-9...
3,09/27/2023,01/01/2020,09/23/2023,AL,Autauga County,3,1,1,1001,Distribution of COVID-19 deaths (%),...,197,0.761,0.218,,,,,,Medium metro,One or more data cells have counts between 1-9...
4,09/27/2023,01/01/2020,09/23/2023,AL,Baldwin County,4,1,3,1003,Distribution of COVID-19 deaths (%),...,653,0.884,0.086,,,,0.023,,Small metro,One or more data cells have counts between 1-9...
5,09/27/2023,01/01/2020,09/23/2023,AL,Blount County,2,1,9,1009,Distribution of COVID-19 deaths (%),...,103,0.932,,,,,,,Large fringe metro,One or more data cells have counts between 1-9...
6,09/27/2023,01/01/2020,09/23/2023,AL,Calhoun County,4,1,15,1015,Distribution of COVID-19 deaths (%),...,670,0.809,0.17,,,,0.016,,Small metro,One or more data cells have counts between 1-9...
7,09/27/2023,01/01/2020,09/23/2023,AL,Coffee County,5,1,31,1031,Distribution of COVID-19 deaths (%),...,201,0.796,0.154,,,,,,Micropolitan,One or more data cells have counts between 1-9...
8,09/27/2023,01/01/2020,09/23/2023,AL,Colbert County,4,1,33,1033,Distribution of COVID-19 deaths (%),...,348,0.836,0.126,,,,0.034,,Small metro,One or more data cells have counts between 1-9...
9,09/27/2023,01/01/2020,09/23/2023,AL,Covington County,6,1,39,1039,Distribution of COVID-19 deaths (%),...,263,0.867,0.129,,,,,,Noncore,One or more data cells have counts between 1-9...


In [60]:
# Restrict the data to counties in coastal states and tag each row with its
# coast ('West' or 'East') in a new `Coast` column.
west_coast_states = ['WA', 'OR', 'CA']
east_coast_states = ['ME', 'NH', 'MA', 'RI', 'CT', 'NY', 'NJ', 'PA', 'DE', 'MD', 'VA', 'NC', 'SC', 'GA', 'FL']
# Built from the two lists above so the West/East split can never drift out of
# sync with the overall membership (same states, same order as before).
coast_states = west_coast_states + east_coast_states

# Filter first so inland states are never labelled, then derive the label with
# a vectorized membership test instead of a row-wise lambda. `.assign` returns
# a new frame, so the column is not written onto a slice of the previous `df`.
df = df[df['State'].isin(coast_states)].assign(
    Coast=lambda coastal: np.where(coastal['State'].isin(west_coast_states), 'West', 'East')
)
df.shape

(500, 22)

In [61]:
# Count the coastal counties that report a value for all four of:
# Non-Hispanic White, Non-Hispanic Black, Non-Hispanic Asian and Hispanic
# (the recorded run printed 170).
has_share = df[['Non-Hispanic White', 'Non-Hispanic Black', 'Non-Hispanic Asian', 'Hispanic']].notna()
mask_white = has_share['Non-Hispanic White']
mask_black = has_share['Non-Hispanic Black']
mask_asian = has_share['Non-Hispanic Asian']
mask_hispanic = has_share['Hispanic']

# True only for rows where every one of the four columns is populated.
combined_mask = has_share.all(axis=1)
num_rows_non_nan = combined_mask.sum()

print(num_rows_non_nan)



170


One of the biggest challenges we anticipated was that many counties report different, or missing, information across ethnicities. Some counties have information for every ethnicity, while others are missing data for one ethnicity or for several. Looking at the data, a significant number of counties (170) had information for Non-Hispanic White, Non-Hispanic Black, Non-Hispanic Asian, and Hispanic residents, which we felt was enough to analyze racial trends for these counties. We therefore kept only the counties that had information for all four of these categories and added one more category that combines all ethnicities outside of them.

In [62]:
# Keep only the counties selected by `combined_mask` (a value present in all
# four group columns) and renumber the rows from 0.
# The filter and reset are chained rather than done with `inplace=True`, so
# `df` is a fresh frame and the column assignment below does not write into a
# slice of the previous frame.
df = df[combined_mask].reset_index(drop=True)

# Residual share: whatever is not attributed to the four retained groups.
# The source shares are given to three decimals, so round the result to remove
# the binary floating-point noise left by the repeated subtraction.
df['Non-White, Non-Black, Non-Hispanic, Non-Asian'] = (
    1
    - df['Non-Hispanic White']
    - df['Non-Hispanic Black']
    - df['Hispanic']
    - df['Non-Hispanic Asian']
).round(3)
df.head()


Unnamed: 0,Data as of,Start Date,End Date,State,County Name,Urban Rural Code,FIPS State,FIPS County,FIPS Code,Indicator,...,Non-Hispanic White,Non-Hispanic Black,Non-Hispanic American Indian or Alaska Native,Non-Hispanic Asian,Non-Hispanic Native Hawaiian or Other Pacific Islander,Hispanic,Other,Urban Rural Description,Footnote,Coast
0,09/27/2023,01/01/2020,09/23/2023,CA,Alameda County,1,6,1,6001,Distribution of COVID-19 deaths (%),...,0.312,0.191,0.004,0.219,0.017,0.239,0.018,Large central metro,,West
1,09/27/2023,01/01/2020,09/23/2023,CA,Butte County,4,6,7,6007,Distribution of COVID-19 deaths (%),...,0.79,0.015,0.025,0.041,,0.119,,Small metro,One or more data cells have counts between 1-9...,West
2,09/27/2023,01/01/2020,09/23/2023,CA,Contra Costa County,2,6,13,6013,Distribution of COVID-19 deaths (%),...,0.536,0.127,,0.111,0.013,0.198,0.014,Large fringe metro,One or more data cells have counts between 1-9...,West
3,09/27/2023,01/01/2020,09/23/2023,CA,Fresno County,3,6,19,6019,Distribution of COVID-19 deaths (%),...,0.393,0.046,0.012,0.092,,0.45,0.005,Medium metro,One or more data cells have counts between 1-9...,West
4,09/27/2023,01/01/2020,09/23/2023,CA,Kern County,3,6,29,6029,Distribution of COVID-19 deaths (%),...,0.373,0.058,0.012,0.04,,0.505,0.009,Medium metro,One or more data cells have counts between 1-9...,West


In [64]:
# Remove columns that are not needed for the analysis.
# Reminder: the data covers 01/01/2020 to 09/23/2023.
unneeded_columns = [
    'Data as of',
    'Start Date',
    'End Date',
    'Urban Rural Code',
    'FIPS State',
    'FIPS Code',
    'FIPS County',
    'Indicator',
]
df = df.drop(columns=unneeded_columns)

In [65]:
# Select the columns to keep, in display order: identifiers first, then death
# counts, then the per-group shares, then the descriptive columns.
column_order = [
    "State",
    "County Name",
    "Coast",
    "COVID-19 Deaths",
    "Total deaths",
    "Non-Hispanic White",
    "Non-Hispanic Black",
    "Hispanic",
    "Non-Hispanic Asian",
    "Non-White, Non-Black, Non-Hispanic, Non-Asian",
    "Non-Hispanic American Indian or Alaska Native",
    "Non-Hispanic Native Hawaiian or Other Pacific Islander",
    "Other",
    "Urban Rural Description",
    "Footnote",
]
df = df[column_order]
df.head()

Unnamed: 0,State,County Name,Coast,COVID-19 Deaths,Total deaths,Non-Hispanic White,Non-Hispanic Black,Hispanic,Non-Hispanic Asian,"Non-White, Non-Black, Non-Hispanic, Non-Asian",Non-Hispanic American Indian or Alaska Native,Non-Hispanic Native Hawaiian or Other Pacific Islander,Other,Urban Rural Description,Footnote
0,CA,Alameda County,West,2628,41916,0.312,0.191,0.239,0.219,0.039,0.004,0.017,0.018,Large central metro,
1,CA,Butte County,West,789,9294,0.79,0.015,0.119,0.041,0.035,0.025,,,Small metro,One or more data cells have counts between 1-9...
2,CA,Contra Costa County,West,1754,31889,0.536,0.127,0.198,0.111,0.028,,0.013,0.014,Large fringe metro,One or more data cells have counts between 1-9...
3,CA,Fresno County,West,3278,33704,0.393,0.046,0.45,0.092,0.019,0.012,,0.005,Medium metro,One or more data cells have counts between 1-9...
4,CA,Kern County,West,2711,27860,0.373,0.058,0.505,0.04,0.024,0.012,,0.009,Medium metro,One or more data cells have counts between 1-9...


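Because counties vary widely in which ethnicities they report (some have information for every ethnicity, while others are missing information for all ethnicities except one), we drop the individual columns for Non-Hispanic American Indian or Alaska Native, Non-Hispanic Native Hawaiian or Other Pacific Islander, and Other, along with the Footnote column. Their combined share is already captured by the "Non-White, Non-Black, Non-Hispanic, Non-Asian" column.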

In [66]:
# Drop the remaining individual group columns and the footnote text.
sparse_columns = [
    'Non-Hispanic American Indian or Alaska Native',
    'Non-Hispanic Native Hawaiian or Other Pacific Islander',
    'Other',
    'Footnote',
]
df = df.drop(columns=sparse_columns)
df.head()

Unnamed: 0,State,County Name,Coast,COVID-19 Deaths,Total deaths,Non-Hispanic White,Non-Hispanic Black,Hispanic,Non-Hispanic Asian,"Non-White, Non-Black, Non-Hispanic, Non-Asian",Urban Rural Description
0,CA,Alameda County,West,2628,41916,0.312,0.191,0.239,0.219,0.039,Large central metro
1,CA,Butte County,West,789,9294,0.79,0.015,0.119,0.041,0.035,Small metro
2,CA,Contra Costa County,West,1754,31889,0.536,0.127,0.198,0.111,0.028,Large fringe metro
3,CA,Fresno County,West,3278,33704,0.393,0.046,0.45,0.092,0.019,Medium metro
4,CA,Kern County,West,2711,27860,0.373,0.058,0.505,0.04,0.024,Medium metro


In [68]:
# Size of the cleaned frame as (rows, columns).
n_rows, n_cols = df.shape
(n_rows, n_cols)

(170, 11)