In [2]:
import boto3
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm

# Download data from S3
Download the most recent death data from S3. You may want to skip this step and use the data already in `deathdata.csv`.

In [225]:
# Build an S3 client from credentials held in the S3_KEY / S3_PRIVATE_KEY
# environment variables, so no secret is written into the notebook itself.
# os.environ[...] raises KeyError if either variable is unset.
session = boto3.Session(
    aws_access_key_id=os.environ["S3_KEY"],
    aws_secret_access_key=os.environ["S3_PRIVATE_KEY"]
)
s3 = session.client('s3')
# NOTE(review): pprint is not used by any cell visible here -- confirm it is
# still needed, and if so move it up to the import cell.
from pprint import pprint
def hook(t):
    """Wrap a progress bar in a one-argument byte-count callback.

    Parameters
    ----------
    t : object exposing ``update(n)`` (a tqdm bar in this notebook)

    Returns
    -------
    callable
        A function taking ``bytes_amount``; each call forwards that amount to
        ``t.update`` and returns ``None``.
    """
    def _advance(bytes_amount):
        # Forward the size of the chunk just transferred to the bar.
        t.update(bytes_amount)

    return _advance

BUCKET_NAME = 'nvss-deaths'

# A single list_objects call returns at most 1,000 keys, so page through the
# whole bucket before deciding which key is the newest.
paginator = s3.get_paginator('list_objects_v2')
keys = [
    el['Key']
    for page in paginator.paginate(Bucket=BUCKET_NAME)
    for el in page.get('Contents', [])  # 'Contents' is absent on an empty page
]
if not keys:
    raise FileNotFoundError(f"No objects found in bucket {BUCKET_NAME!r}")

# The newest file is taken to be the lexicographically greatest key, exactly as
# the reverse sort did before. NOTE(review): this presumes the key names sort
# chronologically -- confirm against the bucket's naming scheme.
file = max(keys)

path = "deathdata.csv"
# head_object reports the size without opening a streaming download body that
# would never be read (get_object did, and left it unconsumed).
file_object = s3.head_object(Bucket=BUCKET_NAME, Key=file)
filesize = file_object['ContentLength']

# Stream the object to disk, advancing the progress bar by each chunk's size.
with tqdm(total=filesize, unit='B') as t:
    with open(path, 'wb') as f:
        s3.download_fileobj(BUCKET_NAME, file, f, Callback=hook(t))

  0%|          | 0/732447510 [00:00<?, ?B/s]

In [3]:
# Load the downloaded CSV, pinning the dtype of the date/text and count columns
# up front; columns not listed here are left to pandas' own inference.
string_cols = [
    'State',
    'Indicator',
    'Start week',
    'End Week',
    'Start Date',
    'End Date',
    'Week Ending Date',
    'Data as of',
]
float_cols = [
    'COVID-19 Deaths',
    'Pneumonia Deaths',
    'Pneumonia and COVID-19 Deaths',
    'Influenza Deaths',
    'Pneumonia, Influenza, or COVID-19 Deaths',
    'Total Deaths',
    'Percent of Expected Deaths',
    'MMWR Week',
]
dtype = {
    **dict.fromkeys(string_cols, 'string'),
    **dict.fromkeys(float_cols, 'float64'),
}

deaths = pd.read_csv('deathdata.csv', dtype=dtype, low_memory=False)
# The file carries a saved index as its first, unnamed column; discard it.
deaths = deaths.drop(columns='Unnamed: 0')
deaths

Unnamed: 0,Data as of,Start week,End Week,Group,State,Indicator,COVID-19 Deaths,Total Deaths,Percent of Expected Deaths,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote,Start Date,End Date,Year,Month,MMWR Week,Week Ending Date
0,05/22/2020,02/01/2020,02/01/2020,by week,United States,Week-ending,0.0,57584.0,0.97,3713.0,0.0,475.0,4188.0,,,,,,,
1,05/22/2020,02/08/2020,02/08/2020,by week,United States,Week-ending,1.0,58245.0,0.97,3715.0,0.0,507.0,4223.0,,,,,,,
2,05/22/2020,02/15/2020,02/15/2020,by week,United States,Week-ending,0.0,57585.0,0.98,3747.0,0.0,541.0,4288.0,,,,,,,
3,05/22/2020,02/22/2020,02/22/2020,by week,United States,Week-ending,2.0,57640.0,0.99,3610.0,0.0,553.0,4165.0,,,,,,,
4,05/22/2020,02/29/2020,02/29/2020,by week,United States,Week-ending,5.0,57956.0,1.01,3727.0,3.0,629.0,4358.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3982800,05/19/2023,,,by week,Puerto Rico,,,633.0,118.00,92.0,,,96.0,One or more data cells have counts between 1-9...,04/09/2023,04/15/2023,2023,,15.0,04/15/2023
3982801,05/19/2023,,,by week,Puerto Rico,,,607.0,115.00,79.0,,,82.0,One or more data cells have counts between 1-9...,04/16/2023,04/22/2023,2023,,16.0,04/22/2023
3982802,05/19/2023,,,by week,Puerto Rico,,12.0,545.0,100.00,77.0,,,86.0,One or more data cells have counts between 1-9...,04/23/2023,04/29/2023,2023,,17.0,04/29/2023
3982803,05/19/2023,,,by week,Puerto Rico,,11.0,447.0,83.00,75.0,,,79.0,One or more data cells have counts between 1-9...,04/30/2023,05/06/2023,2023,,18.0,05/06/2023


In [15]:
# Parse the date-like text columns in place; unparseable values become NaT.
date_cols = ['Data as of','End Week','Week Ending Date']
for date_col in date_cols:
    deaths[date_col] = pd.to_datetime(deaths[date_col], errors='coerce')

# The week-ending field was renamed at some point: prefer 'Week Ending Date'
# and fall back to 'End Week' where it is missing.
deaths['Weekdate'] = deaths['Week Ending Date'].fillna(deaths['End Week'])

# Drop rows with no week date at all, then order by state, week and file date
# and keep only the columns needed below. The missing-date check further down
# lists 2020-08-21 and 2020-08-27 as the affected 'Data as of' dates.
sort_keys = ['State','Weekdate','Data as of']
keep_cols = ['State','Weekdate','Data as of','COVID-19 Deaths','Total Deaths']
deaths = (
    deaths
    .dropna(subset=['Weekdate'])
    .sort_values(by=sort_keys)
    [keep_cols]
)

# Share of all deaths attributed to COVID-19 (NaN when either count is missing).
covid_share = deaths['COVID-19 Deaths'].div(deaths['Total Deaths'])
deaths['Percent COVID Deaths'] = covid_share

# Map categories from https://covid.cdc.gov/covid-data-tracker/#cases_percent-covid-deaths
# 0 = no value, then 2-point-wide bands: <2%, [2,4)%, [4,6)%, [6,8)%, >=8%.
category_conditions = [
    covid_share.isna(),                                   # 0
    covid_share.lt(0.02),                                 # 1
    covid_share.between(0.02, 0.04, inclusive='left'),    # 2
    covid_share.between(0.04, 0.06, inclusive='left'),    # 3
    covid_share.between(0.06, 0.08, inclusive='left'),    # 4
    covid_share.ge(0.08),                                 # 5
]
deaths['Map Category'] = np.select(category_conditions, list(range(6)))
deaths

Unnamed: 0,State,Weekdate,Data as of,COVID-19 Deaths,Total Deaths,Percent COVID Deaths,Map Category
274157,Alabama,2020-01-04,2021-01-06,0.0,630.0,0.0,1
277019,Alabama,2020-01-04,2021-01-07,0.0,1081.0,0.0,1
279881,Alabama,2020-01-04,2021-01-08,0.0,1081.0,0.0,1
282745,Alabama,2020-01-04,2021-01-11,0.0,1081.0,0.0,1
285661,Alabama,2020-01-04,2021-01-12,0.0,1081.0,0.0,1
...,...,...,...,...,...,...,...
3944612,Wyoming,2023-05-13,2023-05-15,0.0,,,0
3954116,Wyoming,2023-05-13,2023-05-16,0.0,12.0,0.0,1
3963620,Wyoming,2023-05-13,2023-05-17,0.0,18.0,0.0,1
3973124,Wyoming,2023-05-13,2023-05-18,0.0,22.0,0.0,1


In [17]:
# Re-derive 'Weekdate' for the missing-date check in the next cell. This repeats
# the first steps of the cleaning cell above. That cell keeps only 'State',
# 'Weekdate', 'Data as of' and the two death counts, so on a top-to-bottom run
# 'End Week' and 'Week Ending Date' no longer exist and indexing them raised
# KeyError. Only columns still present are touched; on a freshly loaded frame
# the behaviour is unchanged.
date_cols = [
    col for col in ['Data as of','End Week','Week Ending Date']
    if col in deaths.columns
]
for date_col in date_cols:
    deaths[date_col] = pd.to_datetime(deaths[date_col],errors='coerce')

# Merge week ending date and end week (changed field name)
if {'Week Ending Date','End Week'}.issubset(deaths.columns):
    deaths['Weekdate'] = deaths['Week Ending Date'].fillna(deaths['End Week'])


In [40]:
# Which file dates ('Data as of') contain rows with no week date at all?
deaths.loc[deaths['Weekdate'].isna(), 'Data as of'].unique()

<DatetimeArray>
['2020-08-21 00:00:00', '2020-08-27 00:00:00']
Length: 2, dtype: datetime64[ns]