In [1]:
%%capture
%load_ext autoreload
%autoreload 2
from setup_nb_env import *

from epsampling.utils import load_csv
# pd.set_option('display.float_format', lambda x: '%.3f' % x)
from epsampling.utils import drop_sers_with_nans
from epsampling.utils import date_str_to_int

DATA_DIR = '/work/users/k/4/k4thryn/Repos/EpSampling/data/'
DT = datetime.today().strftime('%Y%m%d-%H%M%S')

from IPython.display import Audio

def meow():
    display(Audio(filename='../cat_meow2.wav', autoplay=True))

In [2]:
import glob
import os
from tqdm.notebook import tqdm 

In [3]:
def get_hub_df(state_fips=None):
    '''
    params:
        state_fips (str) -> which state data to subset if desired. if not then
        all states' data are returned. NC state fips is '37'.
    returns:
        df (pd.DataFrame) -> dataframe of covidhub ensemble projections.
    '''
    my_dir = os.path.join(DATA_DIR,'raw','COVIDhub-ensemble')
    files = glob.glob(f'{my_dir}/*.csv')

    types = ['point']
    targets = ['1 wk ahead inc death']

    all_dfs = []
    for f in tqdm(files,total=len(files)):
        df = pd.read_csv(f)
        df = df[df.type=='point']
        df = df[df.target.isin(targets)]
        df = df[df.location!='US']
        all_dfs.append(df)
    df_all = pd.concat(all_dfs)
    
    df = df_all[['location','target_end_date','value']]    
    df.rename({'location':'State_fips',
               'target_end_date':'Date',
               'value':'Proj_state_inc_deaths'}, axis=1, inplace=True)

    df['State_fips'] = df.State_fips.astype(int)
    df['Date'] = df.Date.apply(lambda x: date_str_to_int(x))

    if state_fips is not None:
        df = df[df.State_fips==state_fips]
        
    # set to 32-bit
    df[df.select_dtypes(np.float64).columns] = df.select_dtypes(np.float64).astype(np.float32)
    df[df.select_dtypes(np.int64).columns] = df.select_dtypes(np.int64).astype(np.int32)
        
    df.reset_index(drop=True,inplace=True)
    df.sort_values(['State_fips','Date'], inplace=True)
    return df

In [4]:
def get_death_df(state_fips=None):
    
    '''
    params:
        state_fips (str) -> which state data to subset if desired. if not then
        all states' data are returned. NC state fips is '37'.
    returns:
        df (pd.DataFrame) -> dataframe of nyt reports per county.
    '''
    
    fpath = os.path.join(DATA_DIR,'raw','nytimes','us-counties.csv')
    df = pd.read_csv(fpath)

    df = drop_sers_with_nans(df, from_axis='rows', print_out=False)

    ## REFORMAT dataframe ...
    df.columns = df.columns.str.capitalize()
    df.rename({'Deaths':'True_county_cum_deaths'},axis=1,inplace=True)
    df = df[['Fips','Date', 'True_county_cum_deaths']]
    df['Fips'] = df.Fips.astype(int)

    ## Pull out samples from 'nytimes' that have matched dates to 'COVIDhub-ensemble' ...
    df['Date'] = df.Date.apply(lambda x: date_str_to_int(x))
    df_hub,_ = load_csv('formatted_COVIDhub-ensemble')
    my_dates = df_hub.Date.unique().tolist()
    df = df[df.Date.isin(my_dates)]

    ## only nc
    if state_fips is not None:
        df = df[df.Fips.astype(str).str.startswith(str(state_fips))]

    ## get county inc deaths
    dfs = []
    for fips in tqdm(df.Fips.unique()):

        df_county = df[df.Fips==fips]
        df_county.reset_index(inplace=True, drop=True)

        inc_deathss = []
        for i in range(len(df_county)):
            if i==0:
                inc_deaths = np.nan
            else:     
                inc_deaths = df_county.at[i,'True_county_cum_deaths'] - \
                df_county.at[i-1,'True_county_cum_deaths']   
            inc_deathss.append(inc_deaths)

        df_county['True_county_inc_deaths'] = inc_deathss
        dfs.append(df_county)

    df = pd.concat(dfs)
    
    # set to 32-bit
    df[df.select_dtypes(np.float64).columns] = df.select_dtypes(np.float64).astype(np.float32)
    df[df.select_dtypes(np.int64).columns] = df.select_dtypes(np.int64).astype(np.int32)
    
    df.reset_index(inplace=True,drop=True)
    return df

In [20]:
import glob
import os

def normalize_attrs_by_pop(df, f):
    f = (f[f.rindex('/')+1:-4])
    f = f[f.rindex('_')+1:]
    
    df.set_index('GEOID',drop=True,inplace=True)

    if f=='healthinsurance':
        df['HINS_A0018'] = df['HINS_A0018'].div(df['POP_A0018'])
        df['HINS_A1934'] = df['HINS_A1934'].div(df['POP_A1934'])
        df['HINS_A3564'] = df['HINS_A3564'].div(df['POP_A3564'])
        df['HINS_A65p'] = df['HINS_A65p'].div(df['POP_A65p'])
        
        dff = df.drop(['POP_A0018','POP_A1934','POP_A1934','POP_A65p'],inplace=False,axis=1)
        
    elif f=='income':
        denom = df['HH']
        
        dff = df.apply(lambda x: x/denom, axis=0) 
        ## fix MHI since its not supposed to be normalized
        dff['MHI'] = df['MHI']
        dff['HH'] = denom

    else:
        universe = df.columns[0]
        denom = df[universe]
        
        dff = df.apply(lambda x: x/denom, axis=0)
        dff[universe] = denom
        
    dff = dff.reset_index(inplace=False, drop=False)
    return dff

def get_state_df(files):
    first_df = None
    for i,f in enumerate(files):
        this_df = pd.read_csv(f)
        this_df = normalize_attrs_by_pop(this_df, f)
        if first_df is None:
            first_df = this_df
        else:
            df = pd.merge(first_df, this_df, on='GEOID', suffixes=(f'_x{i}', f'_x{i+1}'))
            first_df = df
    return df

    
def get_acs_df(state_fips=None):

    acs_dir = '/work/users/k/4/k4thryn/Repos/EpSampling/data/raw/acs_results/'

    all_st_dfs = []

    state_dirs = [x for x in os.walk(acs_dir)][0][1]

    for i,state in enumerate(state_dirs):
        if i==0:
            continue    

        files = glob.glob(f'{acs_dir}{state}/*.csv')
        df = get_state_df(files)
        ## REFORMAT dataframe ... rename cols.
        df.rename({'GEOID':'Fips'},axis=1,inplace=True)
        all_st_dfs.append(df)

    df = pd.concat(all_st_dfs)
    df.reset_index(drop=True,inplace=True)

    ## check for cols with nans.
    df = drop_sers_with_nans(df, from_axis='cols')
    
    # # # # # #
    # Add pop ratio and dedup identical covs.
    # # # # # #
    
    ## Rename cols
    df.rename({'POP_x2':'Pop'},axis=1,inplace=True)
    ## Reorder columns
    df = df[['Fips','Pop'] + [c for c in df.columns if c not in ['Fips','Pop']]]

    # # # # # # # # # # # # # # # # # # # # # # # #
    ## Get county ratios and insert state pop, state fips, and county ratio cols.
    # # # # # # # # # # # # # # # # # # # # # # # # 

    df.insert(2, 'State_fips', 0)
    df.insert(3, 'State_pop',0)
    df.insert(4, 'Ratio', 0)

    for tup in df.itertuples():

        st_fips = tup.Fips // 1000
        df.at[tup.Index, 'State_fips'] = st_fips    

    for tup in df.itertuples():

        state_pop = sum(df[df.State_fips==tup.State_fips].Pop)
        ratio = tup.Pop / state_pop

        df.at[tup.Index, 'State_pop'] = state_pop
        df.at[tup.Index, 'Ratio'] = ratio
        
        
    if state_fips is not None:
        df = df[df.State_fips==state_fips]

    df.reset_index(inplace=True, drop=True)    
    return df



def get_state_inc_deaths_col(df_hub, df_death):
    
    df = df_hub.copy()
    df.set_index('Date',drop=False,inplace=True)
    df['True_state_inc_deaths'] = 0
    
    for date in df.Date.unique():
        subdf = df_death[df_death.Date==date]
        inc_deaths = sum(subdf.True_county_inc_deaths)
        df.at[date, 'True_state_inc_deaths'] = inc_deaths

    return df


def get_naive_deaths_col(df, df_acs):

    ## Join with acs[[fips,pop,state_fips,state_pop,ratio]] ...
    df = df.merge(df_acs[['Fips','Pop','State_pop','Ratio']], on='Fips')
    
#     display(df)

    ## Add on proj_inc_deaths from hub ...
#     df = df.merge(df_hub,on=['State_fips','Date'])


    ## Compute naive inc deaths.
    df['Naive_proj_deaths'] = df.apply(lambda x: x.Proj_state_inc_deaths * x.Ratio, axis=1)
    df['Naive_true_deaths'] = df.apply(lambda x: x.True_state_inc_deaths * x.Ratio, axis=1)
    
    

    ## Reorder columns
    df = df[['Date',  'State_fips','Fips', 'Pop', 'State_pop', 'Ratio', 
'Proj_state_inc_deaths', 'True_state_inc_deaths',
        'True_county_cum_deaths', 'True_county_inc_deaths', 
       'Naive_proj_deaths', 'Naive_true_deaths']]
    
    return df

In [6]:
df_acs = get_acs_df(37)
df_hub = get_hub_df(state_fips=37)
df_death = get_death_df(state_fips=37)

  0%|          | 0/210 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [17]:
df.columns

Index(['Date', 'State_fips', 'Proj_state_inc_deaths', 'True_state_inc_deaths',
       'Fips', 'True_county_cum_deaths', 'True_county_inc_deaths', 'Pop',
       'State_pop', 'Ratio', 'Naive_proj_deaths', 'Naive_true_deaths'],
      dtype='object')

In [21]:
def get_full_target_df(df_hub, df_death, df_acs):
    
    df = get_state_inc_deaths_col(df_hub, df_death)
    
#     display(df)
    
    df.drop('Date',axis=1,inplace=True)
#     df.dropna(inplace=True)
#     df_death.dropna(inplace=True)
    
#     display(df, df_death)
    
    df = df.merge(df_death,on='Date')
    
    display(df)
        
    df = get_naive_deaths_col(df, df_acs)
    
    df.dropna(inplace=True)
    df.reset_index(inplace=True, drop=True)

    return df

df = get_full_target_df(df_hub, df_death, df_acs)
df

Unnamed: 0,Date,State_fips,Proj_state_inc_deaths,True_state_inc_deaths,Fips,True_county_cum_deaths,True_county_inc_deaths
0,20200613,37,124.24012,,37001,32.0,
1,20200613,37,124.24012,,37003,0.0,
2,20200613,37,124.24012,,37005,0.0,
3,20200613,37,124.24012,,37007,1.0,
4,20200613,37,124.24012,,37009,1.0,
...,...,...,...,...,...,...,...
9995,20220507,37,33.00000,1172.0,37191,405.0,18.0
9996,20220507,37,33.00000,1172.0,37193,264.0,5.0
9997,20220507,37,33.00000,1172.0,37195,316.0,13.0
9998,20220507,37,33.00000,1172.0,37197,120.0,6.0


Unnamed: 0,Date,State_fips,Fips,Pop,State_pop,Ratio,Proj_state_inc_deaths,True_state_inc_deaths,True_county_cum_deaths,True_county_inc_deaths,Naive_proj_deaths,Naive_true_deaths
0,20200620,37,37001,169185,10367022,0.016320,122.952377,104.0,35.0,3.0,2.006526,1.697232
1,20200627,37,37001,169185,10367022,0.016320,100.308655,95.0,36.0,1.0,1.636991,1.550356
2,20200704,37,37001,169185,10367022,0.016320,119.399124,80.0,37.0,1.0,1.948538,1.305563
3,20200711,37,37001,169185,10367022,0.016320,100.119064,106.0,37.0,0.0,1.633897,1.729871
4,20200718,37,37001,169185,10367022,0.016320,120.176331,133.0,39.0,2.0,1.961222,2.170498
...,...,...,...,...,...,...,...,...,...,...,...,...
9895,20220409,37,37199,18357,10367022,0.001771,55.000000,43.0,51.0,0.0,0.097389,0.076141
9896,20220416,37,37199,18357,10367022,0.001771,41.000000,76.0,51.0,0.0,0.072599,0.134574
9897,20220423,37,37199,18357,10367022,0.001771,60.000000,29.0,51.0,0.0,0.106243,0.051351
9898,20220430,37,37199,18357,10367022,0.001771,33.000000,42.0,51.0,0.0,0.058433,0.074370


In [26]:
def get_only_acs_covs_df(df_acs):
    df = df_acs.drop(['State_fips','State_pop','Ratio'],axis=1)
    df.rename({'Pop':'POP'},axis=1,inplace=True)
#     df.set_index('Fips',inplace=True,drop=True)
    return df

dff_acs = get_only_acs_covs_df(df_acs)
dff_acs

Unnamed: 0,Fips,POP,POP_M,POP_F,POP_A0004,POP_A0509,POP_A1014,POP_A1517,POP_A1819,POP_A20,...,HU_x15,HU_UIS01D,HU_UIS01A,HU_UIS02,HU_UIS0304,HU_UIS0509,HU_UIS1019,HU_UIS2049,HU_UIS50P,HU_UISOTHER
0,37001,169185,0.476916,0.523084,0.057529,0.061714,0.064805,0.039111,0.039572,0.016627,...,72651,0.665153,0.03503,0.023427,0.03587,0.049194,0.034728,0.029084,0.011507,0.116007
1,37003,36491,0.508838,0.491162,0.043846,0.04634,0.073306,0.039023,0.019073,0.010331,...,15997,0.693505,0.007189,0.01519,0.010127,0.004313,0.006876,0.00075,0.0,0.262049
2,37005,10910,0.492392,0.507608,0.041613,0.054537,0.043355,0.031347,0.018057,0.008066,...,7721,0.789276,0.005699,0.012952,0.002331,0.020982,0.009584,0.001813,0.0,0.157363
3,37007,22388,0.50469,0.49531,0.050116,0.051501,0.068608,0.03618,0.024031,0.01005,...,10027,0.709085,0.00738,0.010771,0.011668,0.024135,0.004189,0.007779,0.003391,0.221602
4,37009,26598,0.49233,0.50767,0.040078,0.044439,0.056696,0.032672,0.019588,0.010565,...,17006,0.731977,0.006527,0.021169,0.023404,0.014818,0.011996,0.005939,0.003822,0.180348
5,37011,17747,0.552544,0.447456,0.034597,0.038035,0.044515,0.028343,0.019722,0.01713,...,13843,0.675937,0.01387,0.026584,0.054107,0.031352,0.014375,0.028029,0.03453,0.121216
6,37013,44898,0.477104,0.522896,0.048287,0.051138,0.062252,0.038599,0.020424,0.010802,...,24292,0.629796,0.021283,0.019636,0.021489,0.013708,0.011732,0.00918,0.000535,0.272641
7,37015,18105,0.508202,0.491798,0.042088,0.058271,0.039437,0.03198,0.019939,0.005413,...,9043,0.644698,0.004534,0.017914,0.003649,0.000553,0.0,0.00188,0.0,0.326772
8,37017,30105,0.479289,0.520711,0.050889,0.060787,0.057266,0.037735,0.023551,0.02096,...,15459,0.625137,0.002911,0.012679,0.01656,0.006275,0.003946,0.004852,0.000582,0.327059
9,37019,133789,0.482514,0.517486,0.037305,0.040145,0.046416,0.026019,0.014134,0.008745,...,87421,0.662392,0.039293,0.010306,0.013738,0.029684,0.017536,0.01693,0.007733,0.202388


In [27]:
## SAVE CSVs!
fpath = os.path.join(DATA_DIR,'processed',f'training_target_df_{DT}.csv')
df.to_csv(fpath,index=False)
print(DT)

fpath = os.path.join(DATA_DIR,'processed',f'training_acs_df_{DT}.csv')
dff_acs.to_csv(fpath,index=False)
print(DT)

20241106-112904
20241106-112904


In [None]:
# def compute_true_inc_deaths_col(df):

#     df = df.sort_values(['Fips','Date'])

#     dfs = []
#     for fips in tqdm(df.Fips.unique()):

#         df_county = df[df.Fips==fips]
#         df_county.reset_index(inplace=True, drop=True)

#         inc_deathss = []
#         for i in range(len(df_county)):
#             if i==0:
#                 inc_deaths = np.nan
#             else:          
#     #             inc_deaths = df_county.True_cum_deaths.values[i] - df_county.True_cum_deaths.values[i-1]
#                 inc_deaths = df_county.at[i,'True_cum_deaths'] - df_county.at[i-1,'True_cum_deaths']   
#             inc_deathss.append(inc_deaths)

#         df_county['True_inc_deaths'] = inc_deathss
#         dfs.append(df_county)

#     df_tot = pd.concat(dfs)
# # df_tot    
    

In [None]:
# ## get true state deaths
# dff = df_county[['Date','True_county_inc_deaths','True_county_cum_deaths']]
# sers = []
# for date in dff.Date.unique():
#     subdf = dff[dff.Date==date]
#     state_cum = sum(subdf.True_county_cum_deaths)
#     state_inc = sum(subdf.True_county_inc_deaths)
#     sers.append({'Date':date, 'True_state_inc_deaths':state_inc, 'True_state_cum_deaths':state_cum})
# df_state = pd.DataFrame(sers)
# df_state.set_index('Date',drop=True,inplace=True)

# # df = df.join(df_state_cum, on='Date')

# display(df_state)