In [44]:
%%capture
%load_ext autoreload
%autoreload 2
from setup_nb_env import *

from epsampling.utils import load_csv
# pd.set_option('display.float_format', lambda x: '%.3f' % x)
from epsampling.utils import drop_sers_with_nans
from epsampling.utils import date_str_to_int

# Absolute path to the project's data directory; loaders below join sub-paths onto it.
DATA_DIR = '/work/users/k/4/k4thryn/Repos/EpSampling/data/'
# Timestamp string (YYYYmmdd-HHMMSS) captured when this cell is executed.
DT = datetime.today().strftime('%Y%m%d-%H%M%S')

from IPython.display import Audio

def meow():
    '''
    Render an autoplaying audio widget for '../cat_meow2.wav' in the cell output.
    '''
    clip = Audio(filename='../cat_meow2.wav', autoplay=True)
    display(clip)

In [45]:
import glob
import os
from tqdm.notebook import tqdm 

In [53]:
def get_hub_df(state_fips=None):
    '''
    Load the COVIDhub-ensemble csv files and return the 1-week-ahead
    incident-death point projections per state.

    params:
        state_fips (int or str) -> which state data to subset if desired. if
        None then all states' data are returned. NC state fips is 37 ('37' is
        accepted too; the value is converted with int() before comparing).
    returns:
        df (pd.DataFrame) -> dataframe of covidhub ensemble projections with
        columns State_fips, Date, Proj_inc_deaths, sorted by State_fips then
        Date, with a fresh 0..n-1 index. 64-bit numeric columns are downcast
        to 32-bit.
    raises:
        ValueError -> if no csv files are found in the COVIDhub-ensemble dir.
    '''
    my_dir = os.path.join(DATA_DIR,'raw','COVIDhub-ensemble')
    files = glob.glob(f'{my_dir}/*.csv')
    if not files:
        # pd.concat([]) would raise a ValueError anyway; fail with a message
        # that points at the actual problem instead.
        raise ValueError(f'No COVIDhub-ensemble csv files found in {my_dir}')

    types = ['point']
    targets = ['1 wk ahead inc death']

    all_dfs = []
    for f in tqdm(files,total=len(files)):
        df = pd.read_csv(f)
        # keep state-level point forecasts for the wanted target only
        keep = df.type.isin(types) & df.target.isin(targets) & (df.location!='US')
        all_dfs.append(df[keep])
    df_all = pd.concat(all_dfs)

    # rename() returns a new frame, so the column assignments below never
    # write into a slice of df_all.
    df = df_all[['location','target_end_date','value']].rename(
        columns={'location':'State_fips',
                 'target_end_date':'Date',
                 'value':'Proj_inc_deaths'})

    df['State_fips'] = df.State_fips.astype(int)
    df['Date'] = df.Date.apply(date_str_to_int)

    if state_fips is not None:
        # State_fips is an int column; int() lets callers pass 37 or '37'.
        df = df[df.State_fips==int(state_fips)]

    # set to 32-bit
    df = df.astype({col: np.float32 for col in df.select_dtypes(np.float64).columns})
    df = df.astype({col: np.int32 for col in df.select_dtypes(np.int64).columns})

    # sort first, then reset, so the returned index follows the sorted order
    df = df.sort_values(['State_fips','Date']).reset_index(drop=True)
    return df

In [54]:
def get_death_df(state_fips=None):
    '''
    Load the nytimes county-level csv and return cumulative and incident
    deaths per county, restricted to the dates present in the formatted
    COVIDhub-ensemble table.

    params:
        state_fips (int or str) -> which state data to subset if desired. if
        None then all states' data are returned. NC state fips is 37 ('37' is
        accepted too).
    returns:
        df (pd.DataFrame) -> dataframe of nyt reports per county with columns
        Fips, Date, True_county_cum_deaths, True_county_inc_deaths, sorted by
        Fips then Date, with a fresh 0..n-1 index. The incident value is the
        difference between consecutive retained dates of the same county and
        is NaN on each county's first retained date. 64-bit numeric columns
        are downcast to 32-bit.
    '''

    fpath = os.path.join(DATA_DIR,'raw','nytimes','us-counties.csv')
    df = pd.read_csv(fpath)

    df = drop_sers_with_nans(df, from_axis='rows', print_out=False)

    ## REFORMAT dataframe ...
    df.columns = df.columns.str.capitalize()
    df = df.rename(columns={'Deaths':'True_county_cum_deaths'})
    # .copy() so the column assignments below do not write into a slice
    df = df[['Fips','Date', 'True_county_cum_deaths']].copy()
    df['Fips'] = df.Fips.astype(int)

    ## Pull out samples from 'nytimes' that have matched dates to 'COVIDhub-ensemble' ...
    df['Date'] = df.Date.apply(date_str_to_int)
    df_hub,_ = load_csv('formatted_COVIDhub-ensemble')
    my_dates = df_hub.Date.unique().tolist()
    df = df[df.Date.isin(my_dates)]

    ## only the requested state
    if state_fips is not None:
        # County fips = state code * 1000 + county code. Comparing the integer
        # state part avoids the string-prefix pitfall where e.g. state 1
        # ('1001') would also match 10001, 12001, ...
        df = df[df.Fips // 1000 == int(state_fips)]

    ## get county inc deaths
    # Sort explicitly so the per-county difference is chronological no matter
    # how the rows are ordered in the csv.
    df = df.sort_values(['Fips','Date'])
    df = df.assign(
        True_county_inc_deaths=df.groupby('Fips')['True_county_cum_deaths'].diff())

    # set to 32-bit
    df = df.astype({col: np.float32 for col in df.select_dtypes(np.float64).columns})
    df = df.astype({col: np.int32 for col in df.select_dtypes(np.int64).columns})

    df = df.reset_index(drop=True)
    return df


In [55]:
# Build the projection and county-death frames for state FIPS 37 (North Carolina)
# and render each one inline.
df_hub = get_hub_df(state_fips=37)
display(df_hub)
df_death = get_death_df(state_fips=37)
display(df_death)

  0%|          | 0/210 [00:00<?, ?it/s]

Unnamed: 0,State_fips,Date,Proj_inc_deaths
0,37,20200613,124.24012
1,37,20200620,122.952377
2,37,20200627,100.308655
3,37,20200704,119.399124
4,37,20200711,100.119064
5,37,20200718,120.176331
6,37,20200725,131.0
7,37,20200801,152.0
8,37,20200808,162.0
9,37,20200815,208.0


  0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,Fips,Date,True_county_cum_deaths,True_county_inc_deaths
0,37001,20200613,32.0,
1,37001,20200620,35.0,3.0
2,37001,20200627,36.0,1.0
3,37001,20200704,37.0,1.0
4,37001,20200711,37.0,0.0
...,...,...,...,...
9995,37199,20220409,51.0,0.0
9996,37199,20220416,51.0,0.0
9997,37199,20220423,51.0,0.0
9998,37199,20220430,51.0,0.0


In [None]:
def compute_true_inc_deaths_col(df):
    '''
    Add a per-county incident-deaths column derived from cumulative deaths.

    params:
        df (pd.DataFrame) -> must contain the columns Fips, Date and
        True_cum_deaths. It is not modified.
    returns:
        df_tot (pd.DataFrame) -> copy of df sorted by Fips then Date, with a
        fresh 0..n-1 index and a new True_inc_deaths column holding the
        difference between consecutive rows of the same county (NaN on each
        county's first row).
    '''
    # sort_values returns a new frame, so the caller's df is left untouched
    df_tot = df.sort_values(['Fips','Date']).reset_index(drop=True)

    # Row-to-row difference within each county; replaces the explicit
    # per-county loop and yields NaN for the first row of every county.
    df_tot['True_inc_deaths'] = df_tot.groupby('Fips')['True_cum_deaths'].diff()

    return df_tot
    

In [None]:
## get true state deaths
# Aggregate the county-level frame (`df_death`, built in an earlier cell) to
# one row per date. The previous version read `df_county`, which only exists
# as a local variable inside functions and therefore fails on a fresh kernel.
# min_count=1 keeps a date's total NaN only when every county value is NaN
# (e.g. the first date, where no increment exists yet); otherwise NaNs are
# skipped instead of turning the whole state total into NaN.
df_state = (
    df_death
    .groupby('Date')[['True_county_inc_deaths','True_county_cum_deaths']]
    .sum(min_count=1)
    .rename(columns={'True_county_inc_deaths':'True_state_inc_deaths',
                     'True_county_cum_deaths':'True_state_cum_deaths'})
)

display(df_state)