In [1]:
%%capture
## Notebook setup. `%%capture` (which must stay on the first line of the cell)
## silences everything this cell would otherwise print.
%load_ext autoreload
## Mode 2: re-import edited modules before each cell is executed.
%autoreload 2
## NOTE(review): star import. `pd` and `tqdm` are used below without being imported
## in this notebook, so they presumably come from here -- TODO confirm.
from setup_nb_env import *

## Absolute, user-specific root of the data tree; cells below build file paths
## as f'{data_dir}...'.
data_dir = '/work/users/k/4/k4thryn/Repos/EpSampling/data/'

In [2]:

from epsampling.utils import load_latest_csv

#############################################
### Get state COVIDhub-ensemble predictions
#############################################

## State lookup table; its FIPS column is renamed so it can serve as the join key.
state_to_fips = (
    pd.read_csv('../constants/state_fips.csv')
    .rename(columns={'FIPS': 'State_fips'})
)

## Forecast table: capitalize the column names, expose `Location` as
## `State_fips`, then attach the lookup columns.
df, _ = load_latest_csv('covidhub_ensemble_1wkcum_point')
df.columns = df.columns.str.capitalize()
df = (
    df.rename(columns={'Location': 'State_fips'})
    .merge(state_to_fips, on='State_fips')
)

## Drop the columns that are not needed (if present) and give the rest the
## names used by the rest of the notebook.
df_states = (
    df.drop(columns=['Target', 'Forecast_date'], errors='ignore')
    .rename(columns={'Target_end_date': 'Date',
                     'Value': 'COVIDhubEns_state_deaths'})
)
df_states

Unnamed: 0,State_fips,Date,COVIDhubEns_state_deaths,State,Postal
0,1,2020-04-18,152.238,Alabama,AL
1,1,2020-05-02,287.067,Alabama,AL
2,1,2020-05-09,350.653,Alabama,AL
3,1,2020-05-16,470.125,Alabama,AL
4,1,2020-05-23,593.986,Alabama,AL
...,...,...,...,...,...
7996,78,2023-02-11,129.000,Virgin Islands,VI
7997,78,2023-02-18,129.000,Virgin Islands,VI
7998,78,2023-02-25,131.000,Virgin Islands,VI
7999,78,2023-03-04,130.000,Virgin Islands,VI


In [3]:
#############################################
### Get ground truth covid deaths per county.
#############################################

## County-level table; `Cases` is not used, and rows with any missing value
## are discarded.
df_counties = pd.read_csv(f'{data_dir}nytimes/us-counties.csv')
df_counties.columns = df_counties.columns.str.capitalize()
df_counties = df_counties.drop(columns=['Cases']).dropna()

## Make one df per state because everything in one df is too big.

forecast_dates = list(df_states.Date.unique())
all_states = list(df_counties.State.unique())

## Only need dates for counties that we have for states ...
## The mask is the same for every state, so it is computed once, outside the loop.
has_forecast_date = df_counties.Date.isin(forecast_dates)

state_dfs = {}

for state in tqdm(all_states, total=len(all_states)):
    ## .copy() makes the per-state frame independent of df_counties, so the
    ## column assignment below does not write into a slice
    ## (avoids pandas' SettingWithCopyWarning).
    df = df_counties[(df_counties.State == state) & has_forecast_date].copy()
    ## Fips is stored as a string (cast via int64) so it can be used as a merge key.
    df['Fips'] = df['Fips'].astype('int64').astype('str')
    state_dfs[state] = df

100%|██████████| 53/53 [00:06<00:00,  7.59it/s]


In [4]:
### Get population ratios.
df_pop, _ = load_latest_csv('pop_ratios_per_county', f'{data_dir}processed/')

## Fips is converted to a string (via int64) and only the columns used
## downstream are kept, in this order.
df_pop = df_pop.assign(
    Fips=df_pop['Fips'].astype('int64').astype('str')
)[['Postal', 'Fips', 'Pop', 'State', 'Pop_ratio']]
df_pop
# df_pop.State.unique()

Unnamed: 0,Postal,Fips,Pop,State,Pop_ratio
0,AK,2013,3409,Alaska,0.00463
1,AK,2016,5251,Alaska,0.00713
2,AK,2020,292545,Alaska,0.39751
3,AK,2050,18514,Alaska,0.02516
4,AK,2060,849,Alaska,0.00115
...,...,...,...,...,...
3137,WY,56037,42459,Wyoming,0.07363
3138,WY,56039,23319,Wyoming,0.04044
3139,WY,56041,20514,Wyoming,0.03557
3140,WY,56043,7768,Wyoming,0.01347


In [5]:
## Attach the county population columns from df_pop to each state's county frame.
for state, df in state_dfs.items():
    ## Skip frames that already carry the population columns. Merging a second
    ## time would return the shared columns with _x/_y suffixes (e.g. Postal_x),
    ## which breaks the later merge on 'Postal'. This makes the cell safe to re-run.
    if 'Pop_ratio' in df.columns:
        continue
    df_state_pop = df_pop[df_pop.State == state]
    ## Default (inner) merge: counties without a matching df_pop row are dropped.
    df = df.merge(df_state_pop, on=['Fips', 'State'])
    state_dfs[state] = df

In [6]:
## For every state: join its county rows with the state-level ensemble forecast
## and split the state forecast across counties in proportion to Pop_ratio.
merged_dfs = {}
for state in tqdm(state_dfs.keys(), total=len(state_dfs)):

    df_state = df_states[df_states.State == state]
    ## Deliberately a loop-local name: binding `df_counties` here would replace
    ## the notebook-wide county table with a single state's frame.
    df_state_counties = state_dfs[state]

    df_merged = (
        df_state_counties
        .merge(df_state, on=['Date', 'State', 'Postal'])
        .rename(columns={'Deaths': 'True_county_deaths'})
    )

    ## Column product instead of a row-wise apply: same values, no Python-level
    ## loop, and it still produces a (zero-length) column when a state has no
    ## matching forecast rows.
    df_merged['Naive_county_deaths'] = (
        df_merged['Pop_ratio'] * df_merged['COVIDhubEns_state_deaths']
    )

    ## Fix the column order of the output table.
    df_merged = df_merged[['State_fips', 'State', 'Postal', 'County', 'Fips', 'Date',
                           'COVIDhubEns_state_deaths',
                           'Pop', 'Pop_ratio', 'True_county_deaths',
                           'Naive_county_deaths']]
    merged_dfs[state] = df_merged

100%|██████████| 53/53 [00:03<00:00, 13.46it/s]


In [7]:
## Stack the per-state tables into one. Row labels are kept as they are, so each
## state's labels start again from its own first row.
final_df = pd.concat(merged_dfs.values(), axis=0, ignore_index=False)
final_df

Unnamed: 0,State_fips,State,Postal,County,Fips,Date,COVIDhubEns_state_deaths,Pop,Pop_ratio,True_county_deaths,Naive_county_deaths
0,53,Washington,WA,Adams,53001,2020-04-18,644.228,20353,0.00267,0.0,1.720089
1,53,Washington,WA,Asotin,53003,2020-04-18,644.228,22285,0.00293,0.0,1.887588
2,53,Washington,WA,Benton,53005,2020-04-18,644.228,204551,0.02685,34.0,17.297522
3,53,Washington,WA,Chelan,53007,2020-04-18,644.228,78508,0.01031,6.0,6.641991
4,53,Washington,WA,Clallam,53009,2020-04-18,644.228,76727,0.01007,0.0,6.487376
...,...,...,...,...,...,...,...,...,...,...,...
530,44,Rhode Island,RI,Bristol,44001,2022-05-07,3547.000,50672,0.04641,169.0,164.616270
531,44,Rhode Island,RI,Kent,44003,2022-05-07,3547.000,169345,0.15509,513.0,550.104230
532,44,Rhode Island,RI,Newport,44005,2022-05-07,3547.000,85525,0.07832,94.0,277.801040
533,44,Rhode Island,RI,Providence,44007,2022-05-07,3547.000,656672,0.60138,2535.0,2133.094860


In [None]:
## Show the current list of state names.
all_states


In [None]:
## Attach the county population columns from df_pop to each state's county frame.
for state, df in state_dfs.items():
    ## Skip frames that already carry the population columns: this loop also
    ## appears earlier in the notebook, and merging twice would return the
    ## shared columns with _x/_y suffixes and break the later merge on 'Postal'.
    if 'Pop_ratio' in df.columns:
        continue
    df_state_pop = df_pop[df_pop.State == state]
    ## Default (inner) merge: counties without a matching df_pop row are dropped.
    df = df.merge(df_state_pop, on=['Fips', 'State'])
    state_dfs[state] = df

In [None]:
## Show the whole state -> county-frame dict.
## NOTE(review): this renders every per-state frame at once; looking at a single
## key (e.g. state_dfs['Texas']) is usually enough.
state_dfs

In [None]:
# df_pop = pd.read_csv('cached_data/pop_ratios_per_county.csv')
# df_pop['Fips'] = df_pop['Fips'].astype('str')
# df_pop

### Get population ratios.
## Uses `data_dir`, the data root defined in the notebook's first cell, like the
## other loaders at the top of the notebook.
df_pop, _ = load_latest_csv('pop_ratios_per_county', f'{data_dir}processed/')
## Fips as a string (via int64) so it matches the county frames' merge key.
df_pop['Fips'] = df_pop['Fips'].astype('int64').astype('str')

In [None]:
## Show the county population table.
df_pop

In [None]:
## Debug variant of the population merge: prints each state name and previews the
## population rows selected for it before merging.
for state, df in state_dfs.items():
    print(state)
    df_state_pop = df_pop[df_pop.State == state]
    display(df_state_pop.head())
    ## Skip frames that already carry the population columns; merging twice would
    ## return the shared columns with _x/_y suffixes.
    if 'Pop_ratio' in df.columns:
        continue
    df = df.merge(df_state_pop, on=['Fips', 'State'])
    state_dfs[state] = df

In [None]:
## Hard-coded processing order: the 50 states plus the District of Columbia
## (51 names, no territories).
all_states = [
    'Washington', 'Illinois', 'California', 'Arizona', 'Massachusetts',
    'Wisconsin', 'Texas', 'Nebraska', 'Utah', 'Oregon',
    'Florida', 'Georgia', 'New Hampshire', 'North Carolina', 'New Jersey',
    'New York', 'Colorado', 'Maryland', 'Nevada', 'Tennessee',
    'Hawaii', 'Indiana', 'Kentucky', 'Minnesota', 'Oklahoma',
    'Pennsylvania', 'South Carolina', 'District of Columbia', 'Kansas', 'Missouri',
    'Vermont', 'Virginia', 'Connecticut', 'Iowa', 'Louisiana',
    'Ohio', 'Michigan', 'South Dakota', 'Arkansas', 'Delaware',
    'Mississippi', 'New Mexico', 'North Dakota', 'Wyoming', 'Alaska',
    'Maine', 'Alabama', 'Idaho', 'Montana', 'West Virginia',
    'Rhode Island',
]

In [None]:
## Debug variant of the naive-deaths computation: previews both inputs and the
## merged table for every state.
merged_dfs = {}

for state in tqdm(all_states, total=len(all_states)):

    df_state = df_states[df_states.State == state]
    ## Loop-local name so the notebook-wide `df_counties` table is not replaced
    ## by a single state's frame.
    df_state_counties = state_dfs[state]

    display("df_state", df_state.head())
    display("df_counties", df_state_counties.head())

    ## NOTE(review): joining on 'State_fips' requires the county frames to carry
    ## that column, i.e. a df_pop that still includes it -- TODO confirm.
    df_merged = df_state_counties.merge(df_state, on=['Date', 'State', 'Postal', 'State_fips'])

    display("df_merged", df_merged)
    df_merged = df_merged.rename(columns={'Deaths': 'True_county_deaths'})

    ## Column product instead of a row-wise apply: same values, and it still
    ## produces a (zero-length) column when the merge is empty.
    df_merged['Naive_county_deaths'] = (
        df_merged['Pop_ratio'] * df_merged['COVIDhubEns_state_deaths']
    )

    df_merged = df_merged[['State_fips', 'State', 'Postal', 'County', 'Fips', 'Date',
                           'COVIDhubEns_state_deaths',
                           'Pop', 'Pop_ratio', 'True_county_deaths',
                           'Naive_county_deaths']]
    merged_dfs[state] = df_merged

In [None]:
from epsampling.utils import load_latest_csv

### Get state COVIDhub-ensemble predictions

## Load covid hub ensemble predictions per state.
df_deaths, _ = load_latest_csv('covidhub_ensemble_1wkcum_point')
df_deaths.columns = df_deaths.columns.str.capitalize()
df_deaths = df_deaths.rename(columns={'Location': 'State_fips'})

## Add fips to forecast table.
## The lookup is merged into the freshly loaded `df_deaths`. Merging it into the
## already-merged `df_states` would suffix the lookup columns and leave
## `df_deaths` without the State/Date columns that later cells read.
state_to_fips = (
    pd.read_csv('../constants/state_fips.csv')
    .rename(columns={'FIPS': 'State_fips'})
)
df_deaths = df_deaths.merge(state_to_fips, on='State_fips')

## Remove and rename columns ...
df_deaths = (
    df_deaths
    .drop(columns=['Target', 'Forecast_date'], errors='ignore')
    .rename(columns={'Target_end_date': 'Date',
                     'Value': 'COVIDhubEns_state_deaths'})
)
## This cell has always (re)bound `df_states`; keep that name pointing at the
## same table for the cells that still read it.
df_states = df_deaths


### Get ground truth covid deaths per county.
## `data_dir` is the data root defined in the notebook's first cell.
df_counties = pd.read_csv(f'{data_dir}nytimes/us-counties.csv')
df_counties.columns = df_counties.columns.str.capitalize()
df_counties = df_counties.drop(columns=['Cases']).dropna()
## Fips as a string (via int64) so it can be used as a merge key.
df_counties['Fips'] = df_counties['Fips'].astype('int64').astype('str')
df_counties = df_counties.rename(columns={'Deaths': 'True_deaths'})


### Get population ratios.
df_pop, _ = load_latest_csv('pop_ratios_per_county', f'{data_dir}processed/')
df_pop['Fips'] = df_pop['Fips'].astype('int64').astype('str')

In [None]:
## Render the forecast, county and population tables one after another.
display(df_deaths, df_counties, df_pop)

In [None]:
### Get ground truth covid deaths per county.

## `data_dir` is the data root defined in the notebook's first cell.
df_counties = pd.read_csv(f'{data_dir}nytimes/us-counties.csv')
df_counties.columns = df_counties.columns.str.capitalize()
df_counties = df_counties.drop(columns=['Cases']).dropna()

## Make one df per state because everything in one df is too big.

## Assumes df_deaths already has a `Date` column (renamed from Target_end_date)
## -- TODO confirm the loading cell was run.
forecast_dates = list(df_deaths.Date.unique())
all_states = list(df_counties.State.unique())

## Only need dates for counties that we also have for states ...
## The mask is the same for every state, so it is computed once.
has_forecast_date = df_counties.Date.isin(forecast_dates)

state_dfs = {}

for state in tqdm(all_states, total=len(all_states), desc='Make df per state'):
    ## .copy() makes the per-state frame independent of df_counties, so the
    ## column assignment below does not write into a slice
    ## (avoids pandas' SettingWithCopyWarning).
    df = df_counties[(df_counties.State == state) & has_forecast_date].copy()
    df['Fips'] = df['Fips'].astype('int64').astype('str')
    ## Get county pops for the state
    df_state_pop = df_pop[df_pop.State == state]
    df = df.merge(df_state_pop, on=['Fips', 'State'])
    state_dfs[state] = df

In [None]:
## Spot-check one state's county frame.
state_dfs['Texas']

In [None]:
### Get ground truth covid deaths per county, and compute naive deaths per state.

## `data_dir` is the data root defined in the notebook's first cell.
df_counties = pd.read_csv(f'{data_dir}nytimes/us-counties.csv')
df_counties.columns = df_counties.columns.str.capitalize()
df_counties = df_counties.drop(columns=['Cases']).dropna()

## Make one df per state because everything in one df is too big.

## Assumes df_deaths carries Date, State, Postal and COVIDhubEns_state_deaths
## (i.e. it was merged with the state lookup and renamed) -- TODO confirm.
forecast_dates = list(df_deaths.Date.unique())
all_states = list(df_counties.State.unique())

## Only need dates for counties that we also have for states ...
has_forecast_date = df_counties.Date.isin(forecast_dates)

state_dfs = {}

for state in tqdm(all_states, total=len(all_states), desc='Make df per state'):
    ## Independent copy, so the Fips assignment does not write into a slice.
    df = df_counties[(df_counties.State == state) & has_forecast_date].copy()
    df['Fips'] = df['Fips'].astype('int64').astype('str')

    ## Get county pops for the state. This merge is what supplies Postal and
    ## Pop_ratio, which the next two steps need.
    df_state_pop = df_pop[df_pop.State == state]
    df = df.merge(df_state_pop, on=['Fips', 'State'])

    ## Get covid hub predictions for the state and attach them to the county
    ## rows. Everything is built from this iteration's `df`: state_dfs is still
    ## being filled, and `df_counties` must stay the full table for the next state.
    df_state_deaths = df_deaths[df_deaths.State == state]
    df = df.merge(df_state_deaths, on=['Date', 'Postal', 'State'])

    ## Compute naive deaths: the state forecast scaled by the county's share
    ## of the state population.
    df['Deaths_naive'] = df['Pop_ratio'] * df['COVIDhubEns_state_deaths']
    state_dfs[state] = df

In [None]:
## Spot-check one state's county frame.
state_dfs['Texas']

In [None]:
# df_pop.dtypes

In [None]:
## For each state df, merge with pop df for that state
for state, df in state_dfs.items():
    ## Skip frames that already carry the population columns; merging twice would
    ## return the shared columns with _x/_y suffixes.
    if 'Pop_ratio' in df.columns:
        continue
    df_state_pop = df_pop[df_pop.State == state]
    ## Join on State as well as Fips: both frames have a State column, and joining
    ## on Fips alone would return it as State_x/State_y, which the later merges
    ## on 'State' cannot find.
    df = df.merge(df_state_pop, on=['Fips', 'State'])
    state_dfs[state] = df
# state_dfs['Alabama']

In [None]:
## Show the state forecast table.
display(df_deaths)

In [None]:
## Spot-check one state's county frame.
state_dfs['Alabama']

In [None]:
## Compute naive deaths for each county.

merged_dfs = {}
for state in tqdm(all_states, total=len(all_states)):

    df_state = df_deaths[df_deaths.State == state]
    ## Loop-local name so the notebook-wide `df_counties` table is not replaced
    ## by a single state's frame.
    df_state_counties = state_dfs[state]
    df_merged = (
        df_state_counties
        .merge(df_state, on=['Date', 'Postal', 'State'])
        .rename(columns={'Deaths': 'True_county_deaths'})
    )

    ## Column product instead of a row-wise apply: same values, and it still
    ## produces a (zero-length) column when the merge is empty.
    df_merged['Naive_county_deaths'] = (
        df_merged['Pop_ratio'] * df_merged['COVIDhubEns_state_deaths']
    )
    ## Debug preview of every state's merged table (before column selection).
    display(df_merged)

    df_merged = df_merged[['State_fips', 'State', 'Postal', 'County', 'Fips', 'Date',
                           'COVIDhubEns_state_deaths',
                           'Pop', 'Pop_ratio', 'True_county_deaths',
                           'Naive_county_deaths']]
    merged_dfs[state] = df_merged

In [None]:
## List the state names for which a county frame exists.
state_dfs.keys()

In [None]:
## Spot-check one state's county frame.
state_dfs['Texas']

In [None]:
## Show whatever `df` currently holds. `df` is a scratch name that several cells
## and loops rebind, so the result depends on which cell ran last.
df

In [None]:
## Show the county population table.
df_pop

In [None]:
## Show the table currently bound to `df_counties`.
df_counties

In [None]:
### Get pop ratios for each county.
## `data_dir` is the data root defined in the notebook's first cell; `name` is
## the second value returned by load_latest_csv.
df_pop, name = load_latest_csv('pop_ratios_per_county', f'{data_dir}processed/')
## Fips as a string (via int64), like every other df_pop load in this notebook:
## the county frames hold Fips as strings, and the merge key dtypes must match.
df_pop['Fips'] = df_pop['Fips'].astype('int64').astype('str')
df_pop

In [None]:
## Show whatever `df` currently holds. `df` is a scratch name that several cells
## and loops rebind, so the result depends on which cell ran last.
df

In [None]:
## Show the county population table.
df_pop

In [None]:
## Attach the county population columns from df_pop to each state's county frame.
for state, df in state_dfs.items():
    ## Skip frames that already carry the population columns; merging twice would
    ## return the shared columns with _x/_y suffixes.
    if 'Pop_ratio' in df.columns:
        continue
    df_state_pop = df_pop[df_pop.State == state]
    ## Join on Fips and State, the keys both frames share. The county frames get
    ## their Postal column from this very merge, so Postal cannot be a join key.
    df = df.merge(df_state_pop, on=['Fips', 'State'])
    state_dfs[state] = df

In [None]:
# df_states, df_counties

In [None]:
## Show the county population table.
df_pop

In [None]:
 ## Show the table currently bound to `df_counties`.
 df_counties

In [None]:
## For every state: join its county rows with the state-level forecast and split
## the state forecast across counties in proportion to Pop_ratio.
merged_dfs = {}
for state in tqdm(all_states, total=len(all_states)):

    df_state = df_states[df_states.State == state]
    ## Loop-local name so the notebook-wide `df_counties` table is not replaced
    ## by a single state's frame.
    df_state_counties = state_dfs[state]
    ## Join and select on 'Postal': the forecast table built at the top of the
    ## notebook has State and Postal columns but no 'State_name'.
    df_merged = (
        df_state_counties
        .merge(df_state, on=['Date', 'State', 'Postal'])
        .rename(columns={'Deaths': 'True_county_deaths'})
    )

    ## Column product instead of a row-wise apply: same values, and it still
    ## produces a (zero-length) column when the merge is empty.
    df_merged['Naive_county_deaths'] = (
        df_merged['Pop_ratio'] * df_merged['COVIDhubEns_state_deaths']
    )

    df_merged = df_merged[['State_fips', 'State', 'Postal', 'County', 'Fips', 'Date',
                           'COVIDhubEns_state_deaths',
                           'Pop', 'Pop_ratio', 'True_county_deaths',
                           'Naive_county_deaths']]
    merged_dfs[state] = df_merged

### <font color="blue">Get state COVIDhub-ensemble predictions.</font>

In [None]:
# ## Get fips
# state_to_fips = pd.read_csv('constants/state_fips.csv')
# state_to_fips.rename({'FIPS':'State_fips'},axis=1,inplace=True)

# ## Add fips to forecast table.
# df = pd.read_csv(f'{data_dir}covidhub_ensemble_1wkcum_point_20240911-114640.csv')
# df.columns = df.columns.str.capitalize()
# df.rename({'Location':'State_fips'},axis=1,inplace=True)

# df = df.merge(state_to_fips, on='State_fips')

# ## Remove and rename columns ...
# df_states = df.drop(['Target','Forecast_date'], axis=1, errors='ignore')
# df_states.rename({'Target_end_date':'Date', 'Value':'COVIDhubEns_state_deaths'}, axis=1, inplace=True)
# df_states

### <font color="blue">Get ground truth covid deaths per county.</font>

In [None]:
# df_counties = pd.read_csv(f'{data_dir}nytimes/us-counties.csv')
# df_counties.columns = df_counties.columns.str.capitalize()
# df_counties.drop(['Cases'],axis=1,inplace=True)
# df_counties.dropna(inplace=True)

# ## Make list of dfs because everything in one df is too big.

# forecast_dates = list(df_states.Date.unique())
# all_states = list(df_counties.State.unique())

# state_dfs = {}

# for state in tqdm(all_states, total=len(all_states)):
#     df = df_counties[df_counties.State==state]
#     df['Fips'] = df['Fips'].astype('int64').astype('str')
#     ## Only need dates for counties that we have for states ... 
#     df = df[df.Date.isin(forecast_dates)]
#     state_dfs[state] = df

### <font color="blue">Get pop ratios for each county.</font>

In [None]:
# df_pop = pd.read_csv('cached_data/pop_ratios_per_county.csv')
# df_pop['Fips'] = df_pop['Fips'].astype('str')
# df_pop

In [None]:
# for state,df in state_dfs.items():
#     df_state_pop = df_pop[df_pop.State==state]    
#     df = df.merge(df_state_pop, on=['Fips','State'])
#     state_dfs[state] = df
# # state_dfs['Alabama']

### <font color="blue">Compute naive deaths for each county.</font>

In [None]:
## For every state: join its county rows with the state-level ensemble forecast
## and split the state forecast across counties in proportion to Pop_ratio.
merged_dfs = {}
for state in tqdm(all_states, total=len(all_states)):

    df_state = df_states[df_states.State == state]
    ## Loop-local name so the notebook-wide `df_counties` table is not replaced
    ## by a single state's frame.
    df_state_counties = state_dfs[state]
    df_merged = (
        df_state_counties
        .merge(df_state, on=['Date', 'State', 'Postal'])
        .rename(columns={'Deaths': 'True_county_deaths'})
    )

    ## Column product instead of a row-wise apply: same values, and it still
    ## produces a (zero-length) column when the merge is empty.
    df_merged['Naive_county_deaths'] = (
        df_merged['Pop_ratio'] * df_merged['COVIDhubEns_state_deaths']
    )

    ## Fix the column order of the output table.
    df_merged = df_merged[['State_fips', 'State', 'Postal', 'County', 'Fips', 'Date',
                           'COVIDhubEns_state_deaths',
                           'Pop', 'Pop_ratio', 'True_county_deaths',
                           'Naive_county_deaths']]
    merged_dfs[state] = df_merged

In [None]:
## Spot check: a single county on a single forecast date.
dff = (
    merged_dfs['North Carolina']
    .loc[lambda t: t.County == 'Alexander']
    .loc[lambda t: t.Date == '2021-02-06']
)
dff

In [None]:
## Stack the per-state tables into one. Row labels are kept as they are, so each
## state's labels start again from its own first row.
final_df = pd.concat(merged_dfs.values(), axis=0, ignore_index=False)
final_df

In [None]:
## Write the combined table to the data directory, without the row index.
data_dir = '/work/users/k/4/k4thryn/Repos/EpSampling/data/'
## NOTE(review): `d` is not defined anywhere in this notebook; presumably a
## date/time stamp provided by the setup star import -- TODO confirm.
out_path = f'{data_dir}naive_deaths_all_counties_{d}.csv'
final_df.to_csv(out_path, index=False)