In [1]:
%%capture
%load_ext autoreload
%autoreload 2
from setup_nb_env import *

from epsampling.utils import load_csv
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from epsampling.utils import drop_sers_with_nans
from epsampling.utils import date_str_to_int

DATA_DIR = '/work/users/k/4/k4thryn/Repos/EpSampling/data/'
DT = datetime.today().strftime('%Y%m%d-%H%M%S')

### NOTE: Need to run this all at once, cannot re-run cells due to reuse of "df" variable name.

### <font color=blue> CovidHub ensemble state predictions

In [26]:
#####################################################
## COVID HUB ENSEMBLE STATE PREDICTIONS ############
#####################################################

import glob
import os
from tqdm.notebook import tqdm 

my_dir = os.path.join(DATA_DIR,'raw','COVIDhub-ensemble')
files = glob.glob(f'{my_dir}/*.csv')

types = ['point']
targets = ['1 wk ahead inc death']

all_dfs = []
for f in tqdm(files,total=len(files)):
    df = pd.read_csv(f)
    ## Choose which types (only point for now)
    df = df[df.type=='point']
    ## Only 1 wk ahead inc
    df = df[df.target.isin(targets)]
    all_dfs.append(df)
df_all = pd.concat(all_dfs)
# df_all.sort_values(['Fips','Date'], inplace=True)
df_all.reset_index(drop=True,inplace=True)
# display(df_all)

## check for nans
df = drop_sers_with_nans(df_all, from_axis='cols')

## REFORMAT dataframe ...
## rename cols
df.rename({'location':'State_fips',
           'target_end_date':'Date',
           'value':'Proj_inc_deaths'}, axis=1, inplace=True)
## choose cols
df = df[['State_fips','Date','Proj_inc_deaths']]

## IMPORTANT: choose only state fips
df = df[df.State_fips!='US']

## convert all cols to numerical
df['State_fips'] = df.State_fips.astype(int)
df['Date'] = df.Date.apply(lambda x: date_str_to_int(x))

## reset index
df.reset_index(inplace=True,drop=True)

## SAVE CSV!
# fpath = os.path.join(DATA_DIR,'processed','scratch',f'formatted_COVIDhub-ensemble_{DT}.csv')
fpath = os.path.join(DATA_DIR,'processed',f'formatted_COVIDhub-ensemble_{DT}.csv')
df.to_csv(fpath,index=False)
print(DT)

## READ BACK AND CHECK
dff = pd.read_csv(fpath)
# display(df, dff)
# dff.dtypes

  0%|          | 0/210 [00:00<?, ?it/s]

Dropped cols with NaNs!
Num cols before: 7
Num cols after: 6
20241009-144131


### <font color=blue> NYTimes true county deaths



In [27]:
#####################################################
## NYT TRUE COUNTY DEATHS ###########################
#####################################################

fpath = os.path.join(DATA_DIR,'raw','nytimes','us-counties.csv')
df = pd.read_csv(fpath)

## check for rows with nans. 
# display(df)
df = drop_sers_with_nans(df, from_axis='rows')
# df.reset_index(drop=True, inplace=True)
# display(df)

## REFORMAT dataframe ...
## capitalize cols.
df.columns = df.columns.str.capitalize()
## rename cols
df.rename({'Deaths':'True_cum_deaths'},axis=1,inplace=True)
## choose cols
df = df[['Fips','Date', 'True_cum_deaths']]

## convert all cols to numerical
df['Fips'] = df.Fips.astype(int)
df['Date'] = df.Date.apply(lambda x: date_str_to_int(x))

## IMPORTANT: pull out samples from 'nytimes' that have matched dates to 'COVIDhub-ensemble' ...
df_hub,_ = load_csv('formatted_COVIDhub-ensemble')
my_dates = df_hub.Date.unique().tolist()
df = df[df.Date.isin(my_dates)]
df.reset_index(inplace=True,drop=True)

## reset index
df.reset_index(inplace=True,drop=True)

## SAVE CSV!
# fpath = os.path.join(DATA_DIR,'processed','scratch',f'formatted_nytimes-us-counties_{DT}.csv')
fpath = os.path.join(DATA_DIR,'processed',f'formatted_nytimes-us-counties_{DT}.csv')
df.to_csv(fpath,index=False)
print(DT)

## READ BACK AND CHECK
dff = pd.read_csv(fpath)
# display(df, dff)
# dff.dtypes

Dropped rows with NaNs!
Num rows before: 2502832
Num rows after: 2421549
20241009-144131


# <font color=blue> ACS RESULTS _(normed)_

In [28]:
# problem dfs:
# XX_blockgroup_acs5_2021_healthinsurance.csv
# XX_blockgroup_acs5_2021_income.csv

def normalize_attrs_by_pop(df, f):
    f = (f[f.rindex('/')+1:-4])
    f = f[f.rindex('_')+1:]
    
    df.set_index('GEOID',drop=True,inplace=True)

    if f=='healthinsurance':
        df['HINS_A0018'] = df['HINS_A0018'].div(df['POP_A0018'])
        df['HINS_A1934'] = df['HINS_A1934'].div(df['POP_A1934'])
        df['HINS_A3564'] = df['HINS_A3564'].div(df['POP_A3564'])
        df['HINS_A65p'] = df['HINS_A65p'].div(df['POP_A65p'])
        
        dff = df.drop(['POP_A0018','POP_A1934','POP_A1934','POP_A65p'],inplace=False,axis=1)
        
    elif f=='income':
        denom = df['HH']
        
        dff = df.apply(lambda x: x/denom, axis=0) 
        ## fix MHI since its not supposed to be normalized
        dff['MHI'] = df['MHI']
        dff['HH'] = denom

    else:
        universe = df.columns[0]
        denom = df[universe]
        
        dff = df.apply(lambda x: x/denom, axis=0)
        dff[universe] = denom
        
    dff = dff.reset_index(inplace=False, drop=False)
    return dff

def get_state_df(files):
    first_df = None
    for i,f in enumerate(files):
        this_df = pd.read_csv(f)
        this_df = normalize_attrs_by_pop(this_df, f)
        if first_df is None:
            first_df = this_df
        else:
            df = pd.merge(first_df, this_df, on='GEOID', suffixes=(f'_x{i}', f'_x{i+1}'))
            first_df = df
    return df


import glob
import os

acs_dir = '/work/users/k/4/k4thryn/Repos/EpSampling/data/raw/acs_results/'

all_st_dfs = []

state_dirs = [x for x in os.walk(acs_dir)][0][1]

for i,state in enumerate(state_dirs):
    if i==0:
        continue    
        
    files = glob.glob(f'{acs_dir}{state}/*.csv')
    df = get_state_df(files)
    ## REFORMAT dataframe ... rename cols.
    df.rename({'GEOID':'Fips'},axis=1,inplace=True)
    all_st_dfs.append(df)
    
df = pd.concat(all_st_dfs)
df.reset_index(drop=True,inplace=True)

## check for cols with nans.
df = drop_sers_with_nans(df, from_axis='cols')

## reset index
df.reset_index(inplace=True,drop=True)

## SAVE CSV!
# fpath = os.path.join(DATA_DIR,'processed','scratch',f'formatted_acs_results_{DT}.csv')
fpath = os.path.join(DATA_DIR,'processed',f'formatted_acs_results_normed_{DT}.csv')
df.to_csv(fpath,index=False)
print(DT)

## READ BACK AND CHECK
# dff = pd.read_csv(fpath)
# display(df, dff)

Dropped cols with NaNs!
Num cols before: 189
Num cols after: 188
20241009-144131


### <font color=blue> Add pop ratio to ACS and dedup identical covariates.

In [29]:
from epsampling.utils import drop_duplicate_cols

## Remove duplicate columns
df = drop_duplicate_cols(df)

## Rename cols
df.rename({'POP_x2':'Pop'},axis=1,inplace=True)
## Reorder columns
df = df[['Fips','Pop'] + [c for c in df.columns if c not in ['Fips','Pop']]]

# # # # # # # # # # # # # # # # # # # # # # # #
## Get county ratios and insert state pop, state fips, and county ratio cols.
# # # # # # # # # # # # # # # # # # # # # # # # 

df.insert(2, 'State_fips', 0)
df.insert(3, 'State_pop',0)
df.insert(4, 'Ratio', 0)

for tup in df.itertuples():
    
    state_fips = tup.Fips // 1000
    df.at[tup.Index, 'State_fips'] = state_fips    

for tup in df.itertuples():
    
    state_pop = sum(df[df.State_fips==tup.State_fips].Pop)
    ratio = tup.Pop / state_pop
    
    df.at[tup.Index, 'State_pop'] = state_pop
    df.at[tup.Index, 'Ratio'] = ratio
    
## reset cols
df.reset_index(inplace=True, drop=True)


## SAVE CSV!
# fpath = os.path.join(DATA_DIR,'processed','scratch',f'formatted_acs_pop_ratio_{DT}.csv')
fpath = os.path.join(DATA_DIR,'processed',f'formatted_acs_normed_pop_ratio_{DT}.csv')
df.to_csv(fpath,index=False)
print(DT)

## READ BACK AND CHECK
dff = pd.read_csv(fpath)
display(dff)
# dff.dtypes

20241009-144131


Unnamed: 0,Fips,Pop,State_fips,State_pop,Ratio,POP_NHPINH,POP_NHPI,POP_AIANNH,POP_AIAN,IND_AFFHM_MQE,...,HINS_A3564,HU_OCC,POP_NH,HINS_A0018,HINS_A65p,HH_x5,POP_A3564,HU_x14,POP_16p_EMP_x7,POP_A25p
0,1001.000,58239.000,1,4997675,0.012,0.000,0.000,0.002,0.002,0.002,...,0.898,0.904,0.970,0.979,0.998,21856.000,22690.000,24170.000,25871.000,39614.000
1,1003.000,227131.000,1,4997675,0.045,0.000,0.000,0.005,0.006,0.002,...,0.885,0.716,0.953,0.941,0.992,87190.000,89031.000,121763.000,104367.000,161977.000
2,1005.000,25259.000,1,4997675,0.005,0.000,0.000,0.003,0.003,0.000,...,0.868,0.779,0.953,0.972,0.999,9088.000,7823.000,11667.000,8561.000,17995.000
3,1007.000,22412.000,1,4997675,0.004,0.000,0.000,0.001,0.001,0.022,...,0.870,0.786,0.972,0.980,1.000,7083.000,8347.000,9013.000,8223.000,16057.000
4,1009.000,58884.000,1,4997675,0.012,0.001,0.001,0.001,0.003,0.004,...,0.860,0.868,0.905,0.966,0.998,21300.000,22918.000,24527.000,24244.000,40668.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3107,56037.000,42459.000,56,576641,0.074,0.000,0.001,0.011,0.012,0.150,...,0.853,0.810,0.839,0.903,1.000,15529.000,16838.000,19174.000,20884.000,27816.000
3108,56039.000,23319.000,56,576641,0.040,0.001,0.001,0.001,0.001,0.004,...,0.893,0.719,0.851,0.892,1.000,9531.000,9875.000,13255.000,14952.000,17659.000
3109,56041.000,20514.000,56,576641,0.036,0.000,0.000,0.001,0.003,0.077,...,0.853,0.870,0.903,0.932,0.994,7675.000,7709.000,8819.000,9688.000,13233.000
3110,56043.000,7768.000,56,576641,0.013,0.000,0.000,0.005,0.009,0.027,...,0.815,0.877,0.856,0.971,0.986,3370.000,2958.000,3842.000,3907.000,5423.000


# <font color=blue> TARGET PROCESSING

### <font color=blue> Compute GROUND TRUTH incident deaths for nyt county-level dataset.

In [30]:
fpath = os.path.join(DATA_DIR,'processed',f'formatted_nytimes-us-counties_{DT}.csv')
df_nyt = pd.read_csv(fpath)

## Compute inc deaths as (cum deaths) at t minus (cum deaths) at t-1 ...
## keep nans and negatives !!

df_nyt = df_nyt.sort_values(['Fips','Date'])

dfs = []
for fips in tqdm(df_nyt.Fips.unique()):
    
    df_county = df_nyt[df_nyt.Fips==fips]
    df_county.reset_index(inplace=True, drop=True)
    
    inc_deathss = []
    for i in range(len(df_county)):
        if i==0:
            inc_deaths = np.nan
        else:          
#             inc_deaths = df_county.True_cum_deaths.values[i] - df_county.True_cum_deaths.values[i-1]
            inc_deaths = df_county.at[i,'True_cum_deaths'] - df_county.at[i-1,'True_cum_deaths']   
        inc_deathss.append(inc_deaths)
            
    df_county['True_inc_deaths'] = inc_deathss
    dfs.append(df_county)
    
df_tot = pd.concat(dfs)
# df_tot

  0%|          | 0/3141 [00:00<?, ?it/s]

### <font color=blue> Add necessary columns (for naive death computation) from acs and hub to main (nyt) dataframe. Do naive death computation.

In [31]:
fpath = os.path.join(DATA_DIR,'processed',f'formatted_acs_normed_pop_ratio_{DT}.csv')
df_acs = pd.read_csv(fpath)
fpath = os.path.join(DATA_DIR,'processed',f'formatted_COVIDhub-ensemble_{DT}.csv')
df_hub = pd.read_csv(fpath)

## Join with acs[[fips,pop,state_fips,state_pop,ratio]] ...
df = df_tot.merge(df_acs[['Fips','Pop','State_fips','State_pop','Ratio']], on='Fips')

## Add on proj_inc_deaths from hub ...
df = df.merge(df_hub,on=['State_fips','Date'])

## Reorder columns
df = df[['Fips','State_fips','Pop','State_pop','Ratio','Date',
         'Proj_inc_deaths', 'True_cum_deaths', 'True_inc_deaths']]

## Compute naive inc deaths.
df['Naive_inc_deaths'] = df.apply(lambda x: x.Proj_inc_deaths * x.Ratio, axis=1)


### <font color=blue> Get cum deaths at t-1 for each sample.

In [32]:
from tqdm import tqdm

fipss = df.Fips.unique()

dfs = []
for fips in tqdm(fipss):
    df_county = df[df.Fips==fips]
    df_county.sort_values('Date',inplace=True)
    df_county.reset_index(inplace=True,drop=True)
    
    cum_deaths_tm1s = []
    
    for i in range(len(df_county)):
        if i==0:
            cum_deaths_tm1 = np.nan
        else:
            cum_deaths_tm1 = df_county.at[i-1, 'True_cum_deaths']
        cum_deaths_tm1s.append(cum_deaths_tm1)
        
    df_county['Cum_deaths_tm1'] = cum_deaths_tm1s
    dfs.append(df_county)
                
    
df_tot = pd.concat(dfs)

## Reorder columns 
df_tot = df_tot[['Fips', 'State_fips', 'Pop', 'State_pop', 'Ratio', 'Date',
                 'Proj_inc_deaths', 'True_cum_deaths',  'Cum_deaths_tm1',
                 'True_inc_deaths','Naive_inc_deaths']]

df_tot.reset_index(inplace=True,drop=True)

df = df_tot.copy()

## reset index
df.reset_index(inplace=True,drop=True)

## SAVE CSV!
# fpath = os.path.join(DATA_DIR,'processed','scratch', f'processed_naive_deaths_{DT}.csv')
fpath = os.path.join(DATA_DIR,'processed', f'processed_naive_deaths_{DT}.csv')

df.to_csv(fpath,index=False)
print(DT)

## READ BACK AND CHECK
dff = pd.read_csv(fpath)
display(dff)
# dff.dtypes

100%|██████████| 3107/3107 [00:02<00:00, 1108.87it/s]


20241009-144131


Unnamed: 0,Fips,State_fips,Pop,State_pop,Ratio,Date,Proj_inc_deaths,True_cum_deaths,Cum_deaths_tm1,True_inc_deaths,Naive_inc_deaths
0,1001,1,58239.000,4997675,0.012,20200613,69.730,6.000,,,0.813
1,1001,1,58239.000,4997675,0.012,20200620,83.302,9.000,6.000,3.000,0.971
2,1001,1,58239.000,4997675,0.012,20200627,68.179,12.000,9.000,3.000,0.795
3,1001,1,58239.000,4997675,0.012,20200704,76.239,13.000,12.000,1.000,0.888
4,1001,1,58239.000,4997675,0.012,20200711,88.363,15.000,13.000,2.000,1.030
...,...,...,...,...,...,...,...,...,...,...,...
309954,56045,56,6891.000,576641,0.012,20220409,8.000,18.000,18.000,0.000,0.096
309955,56045,56,6891.000,576641,0.012,20220416,5.000,18.000,18.000,0.000,0.060
309956,56045,56,6891.000,576641,0.012,20220423,4.000,18.000,18.000,0.000,0.048
309957,56045,56,6891.000,576641,0.012,20220430,4.000,18.000,18.000,0.000,0.048
