In [1]:
import pandas as pd
import numpy as np
import csv

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm

from datetime import datetime

## Timestamp of this run (YYYYmmdd-HHMMSS); used to tag the exported CSV name.
d = datetime.now().strftime('%Y%m%d-%H%M%S')

## Root folder for the input files and the exported results.
data_dir = '/work/users/k/4/k4thryn/Repos/EpSampling/data/'

### <font color=blue>Get state COVIDhub-ensemble predictions.</font>

In [2]:
## State lookup table; its 'FIPS' column is renamed to the join key used below.
state_to_fips = (
    pd.read_csv('constants/state_fips.csv')
      .rename(columns={'FIPS': 'State_fips'})
)

## Load the forecast table, capitalize its column names, and attach the
## lookup columns via the shared 'State_fips' key (inner join).
df = pd.read_csv(f'{data_dir}covidhub_ensemble_1wkcum_point_20240911-114640.csv')
df.columns = df.columns.str.capitalize()
df = (
    df.rename(columns={'Location': 'State_fips'})
      .merge(state_to_fips, on='State_fips')
)

## Drop the columns that are not needed (if present) and give the rest
## the names used by the later cells.
df_states = (
    df.drop(columns=['Target', 'Forecast_date'], errors='ignore')
      .rename(columns={'Target_end_date': 'Date',
                       'Value': 'COVIDhubEns_state_deaths'})
)
df_states

Unnamed: 0,State_fips,Date,COVIDhubEns_state_deaths,State,Postal
0,1,2020-04-18,152.238,Alabama,AL
1,1,2020-05-02,287.067,Alabama,AL
2,1,2020-05-09,350.653,Alabama,AL
3,1,2020-05-16,470.125,Alabama,AL
4,1,2020-05-23,593.986,Alabama,AL
...,...,...,...,...,...
7996,78,2023-02-11,129.000,Virgin Islands,VI
7997,78,2023-02-18,129.000,Virgin Islands,VI
7998,78,2023-02-25,131.000,Virgin Islands,VI
7999,78,2023-03-04,130.000,Virgin Islands,VI


### <font color=blue>Get ground truth COVID deaths per county.</font>

In [4]:
## Load the county-level table and capitalize its column names.
df_counties = pd.read_csv(f'{data_dir}nytimes/us-counties.csv')
df_counties.columns = df_counties.columns.str.capitalize()

## 'Cases' is not used. Rows with any missing value are dropped
## (the int64 cast of Fips below would fail on NaN).
df_counties = df_counties.drop(columns=['Cases']).dropna()

forecast_dates = list(df_states.Date.unique())
all_states = list(df_counties.State.unique())

## Only need dates for counties that we have for states, so filter once up
## front instead of once per state. The explicit .copy() makes the column
## assignment below operate on an independent frame rather than on a slice.
county_rows = df_counties[df_counties.Date.isin(forecast_dates)].copy()

## Fips goes number -> int64 -> str, i.e. a plain integer string.
county_rows['Fips'] = county_rows['Fips'].astype('int64').astype('str')

## Make one df per state because everything in one df is too big.
## Every state in all_states gets an entry, even if no rows survive the
## date filter, so later lookups by state name never miss.
state_dfs = {}
for state in tqdm(all_states, total=len(all_states)):
    state_dfs[state] = county_rows[county_rows.State == state]

100%|██████████| 53/53 [00:08<00:00,  6.57it/s]


### <font color=blue>Get population ratios for each county.</font>

In [5]:
## Cached per-county population table. Fips is converted to text after
## loading so it can be joined against the string Fips of the county frames.
df_pop = (
    pd.read_csv('cached_data/pop_ratios_per_county.csv')
      .astype({'Fips': 'str'})
)
df_pop

Unnamed: 0,Postal,Fips,Pop,State,Pop_ratio
0,NE,31039,8846,Nebraska,0.004573
1,NE,31109,319090,Nebraska,0.164955
2,NE,31129,4148,Nebraska,0.002144
3,NE,31101,8034,Nebraska,0.004153
4,NE,31137,9034,Nebraska,0.004670
...,...,...,...,...,...
3137,RI,44009,125577,Rhode Island,0.118540
3138,RI,44007,638931,Rhode Island,0.603129
3139,RI,44001,48479,Rhode Island,0.045762
3140,RI,44005,82082,Rhode Island,0.077483


In [6]:
## Attach the population columns to each state's county frame (inner join on
## county Fips and state name); the dict entries are replaced in place.
## NOTE(review): running this cell twice merges the population columns a
## second time, which presumably yields suffixed duplicates -- re-create
## state_dfs before re-running.
for state in list(state_dfs):
    pop_rows = df_pop[df_pop.State == state]
    state_dfs[state] = state_dfs[state].merge(pop_rows, on=['Fips', 'State'])
# state_dfs['Alabama']

### <font color=blue>Compute naive deaths for each county.</font>

In [8]:
## Build one frame per state holding, for every county and date, the
## state-level forecast, the county's population share, the reported county
## deaths, and the naive county estimate derived from them.
merged_dfs = {}
for state in tqdm(all_states, total=len(all_states)):

    ## Forecast rows and county rows for this state. A dedicated name is used
    ## for the per-state county frame so the full `df_counties` table loaded
    ## earlier is not overwritten by this loop.
    df_state = df_states[df_states.State == state]
    df_state_counties = state_dfs[state]

    ## Inner join: keeps only (county, date) rows that have a state forecast.
    df_merged = (
        df_state_counties
        .merge(df_state, on=['Date', 'State', 'Postal'])
        .rename(columns={'Deaths': 'True_county_deaths'})
    )

    ## Naive estimate = county population share * state-level forecast.
    ## Computed as a vectorised column product: same values as a row-wise
    ## apply, but faster and still a plain column when the merge is empty.
    df_merged['Naive_county_deaths'] = (
        df_merged['Pop_ratio'] * df_merged['COVIDhubEns_state_deaths']
    )

    ## Fix the column order of the exported table.
    df_merged = df_merged[['State_fips', 'State', 'Postal', 'County', 'Fips', 'Date',
                           'COVIDhubEns_state_deaths',
                           'Pop', 'Pop_ratio', 'True_county_deaths',
                           'Naive_county_deaths']]
    merged_dfs[state] = df_merged

100%|██████████| 53/53 [00:05<00:00,  9.67it/s]


In [10]:
## Spot check: a single county on a single date.
nc_rows = merged_dfs['North Carolina']
is_alexander = nc_rows.County == 'Alexander'
is_target_date = nc_rows.Date == '2021-02-06'
dff = nc_rows[is_alexander & is_target_date]
dff

Unnamed: 0,State_fips,State,Postal,County,Fips,Date,COVIDhubEns_state_deaths,Pop,Pop_ratio,True_county_deaths,Naive_county_deaths
4090,37,North Carolina,NC,Alexander,37003,2021-02-06,9884.0,37497,0.003575,59.0,35.337279


In [11]:
## Stack the per-state frames into one table, in dict order. Each state's
## original row labels are kept, so index values repeat across states.
per_state_frames = [merged_dfs[state] for state in merged_dfs]
final_df = pd.concat(per_state_frames)
final_df

Unnamed: 0,State_fips,State,Postal,County,Fips,Date,COVIDhubEns_state_deaths,Pop,Pop_ratio,True_county_deaths,Naive_county_deaths
0,53,Washington,WA,Adams,53001,2020-04-18,644.228,19983,0.002624,0.0,1.690583
1,53,Washington,WA,Asotin,53003,2020-04-18,644.228,22582,0.002966,0.0,1.910461
2,53,Washington,WA,Benton,53005,2020-04-18,644.228,204390,0.026841,34.0,17.291610
3,53,Washington,WA,Chelan,53007,2020-04-18,644.228,77200,0.010138,6.0,6.531202
4,53,Washington,WA,Clallam,53009,2020-04-18,644.228,77331,0.010155,0.0,6.542284
...,...,...,...,...,...,...,...,...,...,...,...
530,44,Rhode Island,RI,Bristol,44001,2022-05-07,3547.000,48479,0.045762,169.0,162.319562
531,44,Rhode Island,RI,Kent,44003,2022-05-07,3547.000,164292,0.155086,513.0,550.089841
532,44,Rhode Island,RI,Newport,44005,2022-05-07,3547.000,82082,0.077483,94.0,274.830633
533,44,Rhode Island,RI,Providence,44007,2022-05-07,3547.000,638931,0.603129,2535.0,2139.297423


In [13]:
## Export the combined table; the file name carries the run timestamp `d`.
data_dir = '/work/users/k/4/k4thryn/Repos/EpSampling/data/'
out_path = f'{data_dir}naive_deaths_all_counties_{d}.csv'
final_df.to_csv(out_path, index=False)