In [1]:
import boto3
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm

# Download data from S3
Download the most recent death data from S3 into `deathdata.csv`. Skip this step if you've done it recently — it's a big file :)

In [2]:
from pprint import pprint

# Authenticate with the access-key pair held in the S3_KEY / S3_PRIVATE_KEY
# environment variables (a KeyError is raised if either one is unset), then
# create the S3 client that the download steps below use.
session = boto3.Session(
    aws_access_key_id=os.environ["S3_KEY"],
    aws_secret_access_key=os.environ["S3_PRIVATE_KEY"],
)
s3 = session.client("s3")
def hook(t):
    """Return a callback that advances the progress bar ``t``.

    Parameters
    ----------
    t : object exposing ``update(n)``
        The progress bar to advance (a ``tqdm`` instance in this notebook).

    Returns
    -------
    callable
        A one-argument function; each call forwards its byte count to
        ``t.update`` and returns ``None``.
    """
    def _advance(bytes_amount):
        # Forward the number of bytes just transferred to the progress bar.
        t.update(bytes_amount)

    return _advance

BUCKET_NAME = 'nvss-deaths'

# Collect every key in the bucket. A single list_objects call returns at most
# the first 1000 keys in ascending order, so on a larger bucket the greatest
# key would never be seen; the paginator walks all pages.
paginator = s3.get_paginator('list_objects_v2')
keys = [
    obj['Key']
    for page in paginator.paginate(Bucket=BUCKET_NAME)
    for obj in page.get('Contents', [])  # 'Contents' is absent on an empty page
]
if not keys:
    raise FileNotFoundError(f"No objects found in S3 bucket {BUCKET_NAME!r}")

# The "most recent" file is taken to be the lexicographically greatest key
# (same rule as before) -- NOTE(review): assumes keys sort chronologically.
file = max(keys)

path = "deathdata.csv"
# head_object returns only the metadata; get_object would open a streaming
# response body that is never read or closed just to learn the size.
file_object = s3.head_object(Bucket=BUCKET_NAME, Key=file)
filesize = file_object['ContentLength']

# Stream the object to disk, advancing the progress bar as bytes arrive.
with tqdm(total=filesize, unit='B') as t:
    with open(path, 'wb') as f:
        s3.download_fileobj(BUCKET_NAME, file, f, Callback=hook(t))

  0%|          | 0/765827567 [00:00<?, ?B/s]

# Preprocessing
Read in data from `deathdata.csv` and do necessary preprocessing

In [10]:
# Load the downloaded CSV with explicit column types.
# Text and date-like columns are kept as strings at this stage.
string_columns = [
    'State',
    'Indicator',
    'Start week',
    'End Week',
    'Start Date',
    'End Date',
    'Week Ending Date',
    'Data as of',
]
# Counts, percentages and the week number are read as floats.
float_columns = [
    'COVID-19 Deaths',
    'Pneumonia Deaths',
    'Pneumonia and COVID-19 Deaths',
    'Influenza Deaths',
    'Pneumonia, Influenza, or COVID-19 Deaths',
    'Total Deaths',
    'Percent of Expected Deaths',
    'MMWR Week',
]
dtype = {
    **dict.fromkeys(string_columns, 'string'),
    **dict.fromkeys(float_columns, 'float64'),
}

deaths = pd.read_csv('deathdata.csv', dtype=dtype, low_memory=False)
# Discard the unnamed leading column carried in the file.
deaths = deaths.drop(columns='Unnamed: 0')
deaths

Unnamed: 0,Unnamed: 0.1,Data as of,Start week,End Week,Group,State,Indicator,COVID-19 Deaths,Total Deaths,Percent of Expected Deaths,...,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote,Start Date,End Date,Year,Month,MMWR Week,Week Ending Date
0,0,05/22/2020,02/01/2020,02/01/2020,by week,United States,Week-ending,0.0,57584.0,0.97,...,0.0,475.0,4188.0,,,,,,,
1,1,05/22/2020,02/08/2020,02/08/2020,by week,United States,Week-ending,1.0,58245.0,0.97,...,0.0,507.0,4223.0,,,,,,,
2,2,05/22/2020,02/15/2020,02/15/2020,by week,United States,Week-ending,0.0,57585.0,0.98,...,0.0,541.0,4288.0,,,,,,,
3,3,05/22/2020,02/22/2020,02/22/2020,by week,United States,Week-ending,2.0,57640.0,0.99,...,0.0,553.0,4165.0,,,,,,,
4,4,05/22/2020,02/29/2020,02/29/2020,by week,United States,Week-ending,5.0,57956.0,1.01,...,3.0,629.0,4358.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4135944,4135944,06/02/2023,,,by week,Puerto Rico,,12.0,588.0,108.00,...,,,90.0,One or more data cells have counts between 1-9...,04/23/2023,04/29/2023,2023,,17.0,04/29/2023
4135945,4135945,06/02/2023,,,by week,Puerto Rico,,13.0,605.0,112.00,...,10.0,,91.0,One or more data cells have counts between 1-9...,04/30/2023,05/06/2023,2023,,18.0,05/06/2023
4135946,4135946,06/02/2023,,,by week,Puerto Rico,,13.0,563.0,103.00,...,,0.0,79.0,One or more data cells have counts between 1-9...,05/07/2023,05/13/2023,2023,,19.0,05/13/2023
4135947,4135947,06/02/2023,,,by week,Puerto Rico,,11.0,441.0,78.00,...,,,69.0,One or more data cells have counts between 1-9...,05/14/2023,05/20/2023,2023,,20.0,05/20/2023


In [11]:
# Parse the date columns used below; values that cannot be parsed become NaT.
for column in ('Data as of', 'End Week', 'Week Ending Date'):
    deaths[column] = pd.to_datetime(deaths[column], errors='coerce')

# The week-ending field changed name in the source data: take
# 'Week Ending Date' where present and fall back to 'End Week' otherwise.
deaths['Weekdate'] = deaths['Week Ending Date'].fillna(deaths['End Week'])

# Order by jurisdiction, week and snapshot date (all ascending), then keep
# only the columns needed for the analysis.
sort_keys = ['State', 'Weekdate', 'Data as of']
deaths = deaths.sort_values(by=sort_keys)[sort_keys + ['COVID-19 Deaths', 'Total Deaths']]
deaths['Percent COVID Deaths'] = deaths['COVID-19 Deaths'].div(deaths['Total Deaths'])

# Map categories from https://covid.cdc.gov/covid-data-tracker/#cases_percent-covid-deaths
# The position of each condition in the list is the category it assigns (0-5).
pct = deaths['Percent COVID Deaths']
conditions = [
    pct.isna(),                      # 0: share not available
    pct < 0.02,                      # 1: below 2%
    (pct >= 0.02) & (pct < 0.04),    # 2: 2% up to (not including) 4%
    (pct >= 0.04) & (pct < 0.06),    # 3: 4% up to (not including) 6%
    (pct >= 0.06) & (pct < 0.08),    # 4: 6% up to (not including) 8%
    pct >= 0.08,                     # 5: 8% and above
]
deaths['Map Category'] = np.select(conditions, list(range(len(conditions))))
deaths

Unnamed: 0,State,Weekdate,Data as of,COVID-19 Deaths,Total Deaths,Percent COVID Deaths,Map Category
274157,Alabama,2020-01-04,2021-01-06,0.0,630.0,0.0,1
277019,Alabama,2020-01-04,2021-01-07,0.0,1081.0,0.0,1
279881,Alabama,2020-01-04,2021-01-08,0.0,1081.0,0.0,1
282745,Alabama,2020-01-04,2021-01-11,0.0,1081.0,0.0,1
285661,Alabama,2020-01-04,2021-01-12,0.0,1081.0,0.0,1
...,...,...,...,...,...,...,...
4097322,Wyoming,2023-05-27,2023-05-31,0.0,,,0
4106934,Wyoming,2023-05-27,2023-06-01,0.0,14.0,0.0,1
4116546,Wyoming,2023-05-27,2023-06-02,0.0,21.0,0.0,1
4126158,Wyoming,2023-05-27,2023-06-02,0.0,21.0,0.0,1


# Analysis
Look at overall and jurisdiction-level volatility in the percentage of COVID-19-associated deaths

In [34]:
# NOTE(review): an unfinished `deaths_no_zero = ` assignment was removed here.
# It was a syntax error that stopped the cell from running, and nothing below
# referenced the name. Re-add it with a real filter if one was intended.

# For every jurisdiction and week, summarise how the map category and the
# COVID share of deaths vary across the successive "Data as of" snapshots.
deaths_agg = deaths.groupby(['State', 'Weekdate']).agg({
    'Map Category': ['mean', 'min', 'max', 'count'],
    'Percent COVID Deaths': ['mean', 'min', 'max'],
})

# Flatten the (column, statistic) MultiIndex into single names such as
# 'Map Category max' before turning the group keys back into columns.
deaths_agg.columns = [' '.join(level_names) for level_names in deaths_agg.columns]
deaths_agg = deaths_agg.reset_index()

# Range of values reported for the same jurisdiction-week across snapshots.
deaths_agg['cat_diff'] = deaths_agg['Map Category max'] - deaths_agg['Map Category min']
deaths_agg['percent_diff'] = (
    deaths_agg['Percent COVID Deaths max'] - deaths_agg['Percent COVID Deaths min']
)

# Per-jurisdiction summary of those ranges over all weeks.
deaths_agg.groupby('State').agg({
    'cat_diff': ['mean', 'min', 'max'],
    'percent_diff': ['mean', 'min', 'max'],
})

Unnamed: 0_level_0,cat_diff,cat_diff,cat_diff,percent_diff,percent_diff,percent_diff
Unnamed: 0_level_1,mean,min,max,mean,min,max
State,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Alabama,1.980769,0,5,0.029122,0.0,0.183344
Alaska,1.317308,0,5,0.020305,0.0,0.261999
Arizona,1.024038,0,5,0.030535,0.0,0.273027
Arkansas,1.846154,0,5,0.027935,0.0,0.21566
California,1.081731,0,5,0.03471,0.0,0.32709
Colorado,2.0,0,5,0.031522,0.0,0.287403
Connecticut,2.591346,0,5,0.074408,0.0,0.733513
Delaware,2.149038,0,5,0.027531,0.0,0.173261
District of Columbia,1.677885,0,5,0.030829,0.0,0.311111
Florida,0.913462,0,5,0.031099,0.0,0.249948
