In [2]:
import pandas as pd
import numpy as np
import json
import glob
import h5py

In [4]:
# Open the prepared HDF5 file read-only for inspection.
# NOTE(review): `hdf_path` is assigned in the parameter cell further down (In [3]),
# so on a fresh kernel this cell raises NameError unless that cell is run first.
# NOTE(review): this handle is never closed in the visible cells, while later cells
# reopen the same path in append mode - confirm that is intended.
f = h5py.File(hdf_path, 'r')


In [21]:
# Print the mean of each numeric feature of the validation set, one value per
# feature, in feature order. The feature count is read from the dataset's last
# axis instead of being repeated as a literal, so it cannot drift from the file.
val_num = f['val_num']
for i in range(val_num.shape[2]):
    print(val_num[:, :, i].mean())

43.632347
12.737316
1.5866382
8.062045
2014.8015
0.035787757
0.85838157
0.2998135
0.6266145
0.109648146
1313.5308
94.202934
0.007960345
0.49021593
0.39521575
0.07820518
0.020755006
0.0077544143
0.0046618017
0.0016030621
0.0010490061
0.00048060142
2.5126654e-05
3.123887e-05
2.8184502e-06
0.0
0.0


In [3]:
# --- Pipeline parameters ---
# Columns treated as categorical features (kept apart from the numeric block)
categorical_cols = ['id', 'huc8', 'author']
n_cat_cols = len(categorical_cols)
# Number of numeric feature columns; hard coded, so update it whenever features are added
n_num_cols = 27

# --- Inputs ---
lc_frac_path = '../data/fraster_landcover_allyears_bigger.csv'
state_county_csv_pattern = '../data/state_county_csvs/counties/inun_frac*'

# --- Output ---
# NOTE(review): the file name says "earlyshift" while early_val_shift below is False - confirm the pairing
hdf_path = '../data/all_prepped_data_earlyshift.h5'

# True  -> val is 2002 to 2006, test is 2006 to 2010, and 2010 to 2018 is ignored (for reviews)
# False -> the default (later) split is used
early_val_shift = False
# Length (in months) of the sequence axis of the training datasets for the chosen split
train_length_months = 226 if early_val_shift else 322

In [None]:
def split_train_test_val(df, early_shift=False):
    """Partition a frame into train/val/test pieces using its 'year' index level.

    Default split: train is year <= 2010, val is 2011-2014, test is year > 2014.
    With early_shift=True: train is year <= 2002, val is 2003-2006, test is
    2007-2010, and rows after 2010 are left out of all three pieces.

    Returns (train_df, val_df, test_df).
    """
    # Position of the 'year' level within the index
    year_level = np.flatnonzero(np.array(df.index.names) == 'year')[0]
    years = df.index.get_level_values(year_level)

    if early_shift:
        train_end, val_end = 2002, 2006
        test_mask = (years > val_end) & (years <= 2010)
    else:
        train_end, val_end = 2010, 2014
        test_mask = years > val_end

    train_mask = years <= train_end
    val_mask = (years > train_end) & (years <= val_end)
    return df.loc[train_mask], df.loc[val_mask], df.loc[test_mask]

def split_by_playa(x, x_cat, y, seq_length):
    """Cut three row-aligned arrays into consecutive windows of seq_length rows.

    Window start offsets are 0, seq_length, 2*seq_length, ... below x.shape[0],
    and the same offsets are applied to x, x_cat and y. Each set of windows is
    stacked into a new array with the window index as the leading axis.

    Returns (x_arr, x_cat_arr, y_arr).
    """
    def _stack_windows(values):
        # Offsets always come from x's row count, whichever array is being cut
        return np.array([values[start:(start + seq_length)]
                         for start in np.arange(0, x.shape[0], seq_length)])

    return _stack_windows(x), _stack_windows(x_cat), _stack_windows(y)

In [None]:
def prep_lc_frac_df(ids=None):
    """Build a table of landcover fractions indexed by (id, year).

    LC Frac csv is hardcoded (module-level `lc_frac_path`)! Change it there if you need to.

    Each column of the CSV is taken to be one year (the first four characters of the
    column name are parsed as the year) and each cell holds a JSON-like mapping from
    landcover class to a count. Counts are divided by 5000 to get fractions
    (presumably the number of samples per playa - confirm).

    Inputs:
        ids (sequence, optional) = playa IDs to keep; None or empty keeps every row

    Returns:
        pd.DataFrame indexed by ['id', 'year'] with one 'lcf<class>' column per
        landcover class; classes that never occur are filled with 0.
    """
    lc_df = pd.read_csv(lc_frac_path).set_index('id')
    if ids is not None and len(ids) > 0:
        lc_df = lc_df.loc[ids]

    yearly_frames = []
    for col in lc_df.columns:
        year = int(col[0:4])
        # Keys in the raw text are bare integers; quote them so json.loads accepts the cell
        jsond = lc_df[col].str.replace(r'([0-9]+)(:)', r'"\1"\2', regex=True).apply(json.loads)
        temp_frac_df = (pd.json_normalize(jsond)/5000)
        temp_frac_df.columns = ['lcf{}'.format(lc) for lc in temp_frac_df.columns]
        yearly_frames.append(temp_frac_df.assign(id=lc_df.index, year=year))

    # DataFrame.append was removed in pandas 2.0 (and re-copied the frame on every
    # iteration); collect the per-year frames and concatenate them once instead.
    lc_frac = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

    # Fill in rest of columns (if there's no playas with some LC in this county)
    all_lc_cols = np.array(['lcf11', 'lcf13', 'lcf12', 'lcf9', 'lcf2', 'lcf6', 'lcf1', 'lcf14',
                   'lcf7', 'lcf15', 'lcf16', 'lcf8', 'lcf10', 'lcf3'])
    for col in all_lc_cols[~np.isin(all_lc_cols, lc_frac.columns)]:
        lc_frac[col] = 0
    lc_frac.fillna(0,inplace=True)

    return lc_frac.set_index(['id','year'])


def read_join_csv(inun_csv, drop_zeros=True, early_shift=False):
    """Read one county's inundation CSV and join it with weather and landcover data.

    Inputs:
        inun_csv (str) = path to the county inundation CSV; the weather CSV path is
            derived from it by replacing 'inun_frac_' with 'weather_'
        drop_zeros (bool) = if True, drop playas whose maximum inundation is 0
        early_shift (bool) = if True, the per-playa mean inundation only uses years
            before 2003 (the early-shift training period); otherwise years before 2011

    Returns:
        The joined pd.DataFrame (weather + inundation + mean_inun + landcover
        fractions), or None when drop_zeros removed every playa in the county.
    """
    # Rows expected per playa; only used to detect playas that appear more than once
    rows_per_playa = 418

    # Prep inundation data
    inun_df = pd.read_csv(inun_csv)

    # Calculate mean inundation fraction from training years only (pre 2011, or
    # pre 2003 with the early shift). Using a fixed 2011 cutoff in early-shift mode
    # would leak the 2003-2010 val/test years into this feature.
    first_non_train_year = 2003 if early_shift else 2011
    mean_inun = (
        inun_df.loc[inun_df['year'] < first_non_train_year, ['id', 'inundation']]
        .rename(columns={'inundation': 'mean_inun'})
        .groupby('id')
        .mean()['mean_inun']
    )
    inun_df = inun_df.set_index('id').join(mean_inun).reset_index()

    inun_df.set_index(['id','year','month'], inplace=True)

    # If playa overlapped HUCs, got duplicates. This removes them:
    if inun_df.index.get_level_values(0).unique().shape[0] != inun_df.shape[0]/rows_per_playa:
        inun_df = inun_df.groupby(level=[0,1,2]).first()

    # If we don't want zero-inundation playas, drop them here
    if drop_zeros:
        max_inun = inun_df.groupby('id').agg({'inundation':'max'})
        zero_ids = max_inun.loc[max_inun['inundation']==0].index
        inun_df.drop(zero_ids, inplace=True)
        if inun_df.shape[0]==0:
            # Nothing left for this county; callers get None
            return

    # Prep weather data
    weather_csv = inun_csv.replace('inun_frac_','weather_')
    weather_df = pd.read_csv(weather_csv)
    weather_df.set_index(['id','year','month'], inplace=True)

    # If playa overlapped HUCs, got duplicates. This removes them:
    if weather_df.index.get_level_values(0).unique().shape[0] != weather_df.shape[0]/rows_per_playa:
        weather_df = weather_df.groupby(level=[0,1,2]).first()

    joined_df = weather_df.join(inun_df, how='inner')

    # Sanity check: after de-duplication every playa should have the expected row count
    if joined_df.index.get_level_values(0).unique().shape[0] != joined_df.shape[0]/rows_per_playa:
        print('still not')

    # Finally, prep landcover fraction dataframe
    # Both prep and join are a bit slow
    # Could prep into fractions ahead of time
    # And also split up lc df by county
    lc_frac_df = prep_lc_frac_df(ids=joined_df.index.get_level_values(0).unique())
    joined_df = joined_df.join(lc_frac_df, how='inner')

    return joined_df


def read_clean_county(csv, binarize=True, early_shift=False):
    """Load one county's joined data and arrange it for the feature/target split.

    Zero-inundation playas are kept (drop_zeros=False). Missing values are set to
    0, the 'area' column is removed, the playa ID is added as an 'id' column, and
    'inundation' is moved to the last column.

    Inputs:
        csv (str) = path to the county inundation CSV
        binarize (bool) = if True, inundation becomes 1 where it is > 0, else 0
        early_shift (bool) = passed through to read_join_csv

    Returns:
        pd.DataFrame whose last column is 'inundation'.
    """
    county_df = read_join_csv(csv, drop_zeros=False, early_shift=early_shift)

    # Missing values become 0; 'area' goes away (it is acres*inundation)
    county_df = county_df.fillna(0).drop(columns=['area'])

    # Expose the ID (first index level) as a regular column as well
    county_df['id'] = county_df.index.get_level_values(0)

    # Take inundation out and put it back last so it ends up as the final column
    target = county_df.pop('inundation')
    county_df['inundation'] = (target > 0).astype(int) if binarize else target

    return county_df


def split_x_y(df, cat_cols):
    """Separate a frame into numeric features, categorical features and target.

    The last column of df is the target; every other column is a feature.

    Returns (features without cat_cols, features[cat_cols], target).
    """
    target = df.iloc[:, -1]
    features = df.iloc[:, :-1]
    numeric = features.drop(columns=cat_cols)
    categorical = features[cat_cols]
    return numeric, categorical, target


def fill_nas(train, val, test):
    """Replace sentinel-coded missing values in 'sthick2013' and 'winddevyr'.

    'sthick2013' records missing values as -9999 and 'winddevyr' as 9999. In all
    three frames each sentinel is replaced with the median of the non-missing
    values found in `train` (the county); when train has none, a median
    calculated beforehand from the whole dataset is used instead.

    The frames are modified in place and also returned as (train, val, test).
    """
    # (column, sentinel, whole-dataset median that was manually calculated previously)
    fill_specs = (
        ('sthick2013', -9999, 72.03190000000001),
        ('winddevyr', 9999, 2015),
    )

    for column, sentinel, dataset_median in fill_specs:
        # Usable values lie above the negative sentinel / below the positive one
        if sentinel < 0:
            usable = train[column] > sentinel
        else:
            usable = train[column] < sentinel

        if usable.sum() > 0:
            fill_value = np.median(train.loc[usable, column])
        else:
            fill_value = dataset_median

        for frame in (train, val, test):
            frame.loc[frame[column] == sentinel, column] = fill_value

    return train, val, test


In [None]:
# H5 stuff
def create_h5_dsets(path, n_num_features, n_cat_features):
    """Create the empty, row-extendable datasets for the train, val and test sets.

    Opens `path` in append mode and creates the '<set>_num', '<set>_cat' and
    '<set>_y' datasets for train, val and test. Every dataset starts with zero
    rows and has an unlimited first axis. The sequence axis is the module-level
    `train_length_months` for train and 48 for val and test.

    Inputs:
        path (str) = HDF5 file to create the datasets in
        n_num_features (int) = size of the last axis of the '*_num' datasets
        n_cat_features (int) = size of the last axis of the '*_cat' datasets
    """
    # (dataset name, sequence length, trailing feature dimensions)
    layout = (
        ('train_num', train_length_months, (n_num_features,)),
        ('train_cat', train_length_months, (n_cat_features,)),
        ('train_y', train_length_months, ()),
        ('val_num', 48, (n_num_features,)),
        ('val_cat', 48, (n_cat_features,)),
        ('val_y', 48, ()),
        ('test_num', 48, (n_num_features,)),
        ('test_cat', 48, (n_cat_features,)),
        ('test_y', 48, ()),
    )

    with h5py.File(path, "a") as f:
        for name, n_months, feature_dims in layout:
            per_row = (n_months,) + feature_dims
            f.create_dataset(name, (0,) + per_row, maxshape=(None,) + per_row)
        

def append_to_dset(hdf_fh, dset_key, array):
    """Append `array` along the first axis of a resizable dataset.

    Inputs:
        hdf_fh = open HDF5 file handle (anything indexable by dataset name)
        dset_key (str) = name of the dataset to extend
        array (np.ndarray) = rows to append; trailing dimensions must match the dataset

    An array with zero rows leaves the dataset untouched. Returns None.
    """
    dset = hdf_fh[dset_key]
    n_new = array.shape[0]
    if n_new == 0:
        # Nothing to write. (A negative-offset slice would degenerate to dset[0:]
        # here, i.e. address the whole dataset rather than an empty tail.)
        return

    # Write at an explicit start offset instead of counting back from the end
    n_old = dset.shape[0]
    dset.resize(n_old + n_new, axis=0)
    dset[n_old:] = array

    return


def save_set_hdf(X_num, X_cat, y, dset_prefix, path):
    """
    Append one set (train, val, or test) to its three datasets in the HDF5 file.

    Inputs:
        X_num (pd.DataFrame) = Numerical features
        X_cat (pd.DataFrame) = Categorical features for embeddings
        y (pd.Series) = Target (inundation)
        dset_prefix (str) = prefix for HDF5 dataset (train, val, or test)
        path (str) = HDF5 file to append to

    The sequence length is the number of rows belonging to the first ID in
    X_num's index; the rows are then cut into consecutive sequences of that length.
    """
    first_id = X_num.index.get_level_values(0)[0]
    seq_length = X_num.loc[first_id].shape[0]
    sequences = split_by_playa(
        X_num.values, X_cat.values, y.values, seq_length=seq_length
    )

    # Same order as the tuple returned by split_by_playa: numeric, categorical, target
    name_templates = ('{}_num', '{}_cat', '{}_y')
    with h5py.File(path, "a") as f:
        for template, stacked in zip(name_templates, sequences):
            append_to_dset(f, template.format(dset_prefix), stacked)



In [None]:
def full_read_append_to_hdf(csv, hdf_path, categorical_cols):
    """Run the whole pipeline for one county CSV and append the result to the HDF5 file.

    Steps: read and clean the county, split it into train/val/test by year, split
    each piece into numeric features, categorical features and target, fill the
    sentinel-coded missing values, then append each set to the file.

    Inputs:
        csv (str) = path to the county inundation CSV
        hdf_path (str) = HDF5 file to append to
        categorical_cols (list of str) = feature columns treated as categorical

    The split mode comes from the module-level `early_val_shift`.
    """
    county_df = read_clean_county(csv, early_shift=early_val_shift)
    train_df, val_df, test_df = split_train_test_val(county_df, early_shift=early_val_shift)

    # Split into features and target (shapes are printed for the train set)
    print(train_df.shape)
    train_num, train_cat, train_target = split_x_y(train_df, categorical_cols)
    print(train_num.shape)
    val_num, val_cat, val_target = split_x_y(val_df, categorical_cols)
    test_num, test_cat, test_target = split_x_y(test_df, categorical_cols)

    # Fill NAs
    train_num, val_num, test_num = fill_nas(train_num, val_num, test_num)

    # Save each set to the hdf5 file
    prepared_sets = (
        ('train', train_num, train_cat, train_target),
        ('val', val_num, val_cat, val_target),
        ('test', test_num, test_cat, test_target),
    )
    for prefix, numeric, categorical, target in prepared_sets:
        save_set_hdf(numeric, categorical, target, prefix, hdf_path)

    return

# Creating the hdf

In [None]:
# Create the empty train/val/test datasets in the output file.
# NOTE(review): the file is opened in append mode; re-running this cell against a file
# that already holds these datasets will likely fail on the duplicate names - confirm.
create_h5_dsets(path=hdf_path, n_num_features=n_num_cols, n_cat_features=n_cat_cols)

In [None]:
# Process every county CSV matching the pattern and append its sequences to the HDF5 file.
# glob returns paths in a file-system-dependent order, so sort them to make the
# row order of the output file reproducible from run to run and machine to machine.
inun_csv_list = sorted(glob.glob(state_county_csv_pattern))
for cur_csv in inun_csv_list:
    full_read_append_to_hdf(cur_csv, hdf_path, categorical_cols)