In [1]:
%%capture
%load_ext autoreload
%autoreload 2
from setup_nb_env import *

from epsampling.utils import load_csv
# pd.set_option('display.float_format', lambda x: '%.3f' % x)

DATA_DIR = '/work/users/k/4/k4thryn/Repos/EpSampling/data/'
DT = datetime.today().strftime('%Y%m%d-%H%M%S')

In [None]:
## df = pd.read_csv('../constants/state_fips.csv')
## df.rename({'FIPS':'Fips'},axis=1,inplace=True)
## df.to_csv('state_fips.csv',index=False)

# df = pd.read_csv('state_fips.csv',index_col='Fips')
# df.head()

### <font color=blue> CovidHub ensemble state predictions

In [None]:
#####################################################
## COVID HUB ENSEMBLE STATE PREDICTIONS ############
#####################################################

import glob
import os
from tqdm.notebook import tqdm 

my_dir = os.path.join(DATA_DIR,'raw','COVIDhub-ensemble')
files = glob.glob(f'{my_dir}/*.csv')

types = ['point']
targets = ['1 wk ahead inc death']

all_dfs = []
for f in tqdm(files,total=len(files)):
    df = pd.read_csv(f)
    ## Choose which types (only point for now)
    df = df[df.type=='point']
    ## Only 1 wk ahead inc
    df = df[df.target.isin(targets)]
    all_dfs.append(df)
df_all = pd.concat(all_dfs)

## REFORMAT dataframe ...
## choose cols
df = df_all[['location','target_end_date','value']]
## rename cols
df.rename({'location':'Fips',
           'target_end_date':'Date',
           'value':'Pred_state_deaths'}, axis=1, inplace=True)
## dtype check: good

## SAVE CSV!
fpath = os.path.join(DATA_DIR,'processed',f'formatted_COVIDhub-ensemble_{DT}.csv')
df.to_csv(fpath,index=False)

### <font color=blue> NYTimes true county deaths

In [None]:
def drop_rows_with_nans(df):
    if df.isnull().values.any()==True:
        dff = df.dropna(axis=0, inplace=False)
        print(f'Dropped rows with NaNs!')
        print(f'Num rows before: {df.shape[0]}')
        print(f'Num rows after: {dff.shape[0]}')
        return dff
    else:
        print(f'No NaNs! :)')
        return df

In [None]:
#####################################################
## NYT TRUE COUNTY DEATHS ###########################
#####################################################

fpath = os.path.join(DATA_DIR,'raw','nytimes','us-counties.csv')
df = pd.read_csv(fpath)
## check for rows with nans. 
df = drop_rows_with_nans(df)



# if df.isnull().values.any()==True:
#     print('Dropped NaNs!')
#     df.dropna(inplace=True, axis=0)
    
## capitalize cols.
df.columns = df.columns.str.capitalize()

## IMPORTANT: pull out samples from 'nytimes' that have matched dates to 'COVIDhub-ensemble' ...
df_hub,_ = load_csv('formatted_COVIDhub-ensemble')
my_dates = df_hub.Date.unique().tolist()
df = df[df.Date.isin(my_dates)]

## REFORMAT dataframe ...
## choose cols
df = df[['Date','Fips','Deaths']]
## rename cols
df.rename({'Deaths':'True_county_deaths'},axis=1,inplace=True)
## fix dtypes
df['Fips'] = df.Fips.astype('int64').astype('str')

## SAVE CSV!
fpath = os.path.join(DATA_DIR,'processed',f'formatted_nytimes-us-counties_{DT}.csv')
df.to_csv(fpath,index=False)

<font color=blue> _Check if nyt data is cumulative ..._

In [None]:
## Check if nyt data is cumulative ...
# max_deaths = df.True_county_deaths.max()
# max_fips = df[df.True_county_deaths==max_deaths].Fips.values[0]
# dff = df[df.Fips==max_fips]
# dff = dff.sort_values('Date')
# print(dff.True_county_deaths.is_monotonic_increasing)

### <font color=blue> Fips dictionary

In [None]:
#####################################################
## FIPS DICTIONARY  #################################
#####################################################

df = pd.read_csv('../constants/state_fips.csv')
## REFORMAT dataframe ...
## rename cols
df.rename({'FIPS':'Fips'},axis=1,inplace=True)
## fix dtypes
df['Fips'] = df.Fips.astype('int64').astype('str')

fpath = os.path.join(DATA_DIR,'processed',f'formatted_nytimes-us-counties_{DT}.csv')
df.to_csv(fpath,index=False)
df.to_csv('state_fips.csv',index=False)

fpath = os.path.join(DATA_DIR,'processed',f'formatted_state_fips_{DT}.csv')
df.to_csv(fpath,index=False)

# df_fips_dict = pd.read_csv('state_fips.csv',index_col='Fips')

# df_postal_dict = pd.read_csv('state_fips.csv',index_col='Postal')
# df_postal_dict.head()

### <font color=blue> ACS results

In [2]:
#####################################################
## ACS (CENSUS) RESULTS #############################
#####################################################

def get_state_acs_df(files):
    first_df = None
    for i,f in enumerate(files):
        this_df = pd.read_csv(f)
        if first_df is None:
            first_df = this_df
        else:
            df = pd.merge(first_df, this_df, on='GEOID', suffixes=(f'_x{i}', f'_x{i+1}'))
            first_df = df
    return df

def get_subdirs(path):
    return [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]

def drop_cols_with_nans(df):
    if df.isnull().values.any()==True:
        dff = df.dropna(axis=1, inplace=False)
        print(f'Dropped columns with NaNs!')
        print(f'Num cols before: {(df.shape[1])}')
        print(f'Num cols after: {(dff.shape[1])}')
        return dff
    else:
        print(f'No NaNs! :)')
        return df


In [3]:
# print(f'hello \N{Rock}')

In [6]:
import glob
import os


dir_path = os.path.join(DATA_DIR,'raw','acs_results')

all_st_dfs = []
for i,(subdir, dirs, _) in tqdm(enumerate(os.walk(dir_path))):
    
    if subdir==dir_path: 
        continue
        
    else:
        postal = subdir[-2:]
        
#         fips = df_postal_dict.loc[postal].Fips
        files = glob.glob(f'{subdir}/*.csv')
        
        df = get_state_acs_df(files)

        
        ## REFORMAT dataframe ...
        ## rename cols
        df.rename({'GEOID':'Fips'},axis=1,inplace=True)
#         df.columns = df.columns.str.capitalize()
        ## fix dtypes
        df['Fips'] = df.Fips.astype('int64').astype('str')
        
        
#         break

#         df['State_fips'] = fips
#         df['State'] = state
#         df = df[['State'] + ['State_fips'] + 
#                 [col for col in df.columns if col not in ['State_fips','State']]]

        all_st_dfs.append(df)
    
df = pd.concat(all_st_dfs)
df = drop_cols_with_nans(df)
df
#         ## check for nans.
#         if tot_df.isnull().values.any()==True:
#             print(f'Dropped NaNs! Postal: {postal}')
#             print(f'Before drop: {len(tot_df.columns)}')
#             df.dropna(axis=1, inplace=True)
#             print(f'After drop: {len(tot_df.columns)}')
    
# final_df = pd.concat(all_st_dfs)
# final_df.rename({'GEOID':'Fips'},axis=1,inplace=True)
# final_df

51it [00:03, 13.60it/s]

Dropped columns with NaNs!
Num cols before: 192
Num cols after: 191





Unnamed: 0,Fips,POP_x2,POP_M,POP_F,POP_A0004,POP_A0509,POP_A1014,POP_A1517,POP_A1819,POP_A20,...,HU_x15,HU_UIS01D,HU_UIS01A,HU_UIS02,HU_UIS0304,HU_UIS0509,HU_UIS1019,HU_UIS2049,HU_UIS50P,HU_UISOTHER
0,2013,3409,2014,1395,122,103,151,106,52,37,...,1113,835,3,57,72,41,35,0,0,70
1,2016,5251,2995,2256,162,215,148,110,154,55,...,1456,417,70,242,275,154,79,180,12,27
2,2020,292545,149648,142897,20218,20836,18642,11308,7022,3885,...,118055,56649,16185,6372,12729,6852,4719,6892,2642,5015
3,2050,18514,9724,8790,1902,1923,1696,1028,612,261,...,5992,4826,125,305,228,56,65,150,9,228
4,2060,849,480,369,67,41,28,23,8,3,...,922,711,33,23,29,51,12,7,0,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,56037,42459,21850,20609,2657,3456,3008,1824,1193,485,...,19174,11446,847,478,1023,559,514,404,73,3830
19,56039,23319,12280,11039,1068,1032,1475,575,569,64,...,13255,7788,1314,555,1098,1056,505,404,97,438
20,56041,20514,10437,10077,1413,1712,1762,986,341,129,...,8819,5230,606,99,237,230,424,125,62,1806
21,56043,7768,4004,3764,399,321,606,442,191,76,...,3842,2847,87,128,267,128,22,17,5,341


In [7]:

## SAVE CSV!
fpath = os.path.join(DATA_DIR,'processed',f'formatted_acs_results_{DT}.csv')
df.to_csv(fpath,index=False)

In [8]:

dff = pd.read_csv(fpath)
dff

Unnamed: 0,Fips,POP_x2,POP_M,POP_F,POP_A0004,POP_A0509,POP_A1014,POP_A1517,POP_A1819,POP_A20,...,HU_x15,HU_UIS01D,HU_UIS01A,HU_UIS02,HU_UIS0304,HU_UIS0509,HU_UIS1019,HU_UIS2049,HU_UIS50P,HU_UISOTHER
0,2013,3409,2014,1395,122,103,151,106,52,37,...,1113,835,3,57,72,41,35,0,0,70
1,2016,5251,2995,2256,162,215,148,110,154,55,...,1456,417,70,242,275,154,79,180,12,27
2,2020,292545,149648,142897,20218,20836,18642,11308,7022,3885,...,118055,56649,16185,6372,12729,6852,4719,6892,2642,5015
3,2050,18514,9724,8790,1902,1923,1696,1028,612,261,...,5992,4826,125,305,228,56,65,150,9,228
4,2060,849,480,369,67,41,28,23,8,3,...,922,711,33,23,29,51,12,7,0,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,42459,21850,20609,2657,3456,3008,1824,1193,485,...,19174,11446,847,478,1023,559,514,404,73,3830
3138,56039,23319,12280,11039,1068,1032,1475,575,569,64,...,13255,7788,1314,555,1098,1056,505,404,97,438
3139,56041,20514,10437,10077,1413,1712,1762,986,341,129,...,8819,5230,606,99,237,230,424,125,62,1806
3140,56043,7768,4004,3764,399,321,606,442,191,76,...,3842,2847,87,128,267,128,22,17,5,341


In [None]:
import glob
import os


dir_path = os.path.join(DATA_DIR,'raw','acs_results')

all_st_dfs = []
for i,(subdir, dirs, _) in tqdm(enumerate(os.walk(dir_path))):
    
    if subdir==dir_path: 
        continue
        
    else:
        postal = subdir[-2:]
        
#         fips = df_postal_dict.loc[postal].Fips
        files = glob.glob(f'{subdir}/*.csv')
        
        df = get_state_acs_df(files)

        
        ## REFORMAT dataframe ...
        ## rename cols
        df.rename({'GEOID':'Fips'},axis=1,inplace=True)
#         df.columns = df.columns.str.capitalize()
        ## fix dtypes
        df['Fips'] = df.Fips.astype('int64').astype('str')
        
        
#         break

#         df['State_fips'] = fips
#         df['State'] = state
#         df = df[['State'] + ['State_fips'] + 
#                 [col for col in df.columns if col not in ['State_fips','State']]]

        all_st_dfs.append(df)
    
tot_df = pd.concat(all_st_dfs)
tot_df = filter_nans(tot_df)
tot_df
#         ## check for nans.
#         if tot_df.isnull().values.any()==True:
#             print(f'Dropped NaNs! Postal: {postal}')
#             print(f'Before drop: {len(tot_df.columns)}')
#             df.dropna(axis=1, inplace=True)
#             print(f'After drop: {len(tot_df.columns)}')
    
# final_df = pd.concat(all_st_dfs)
# final_df.rename({'GEOID':'Fips'},axis=1,inplace=True)
# final_df

In [None]:
final_df = pd.concat(all_st_dfs)
final_df

In [None]:
df.dtypes