In [1]:
import pyarrow.parquet as pq
import pyarrow as pa
import pathlib, os, glob
import tempfile
import numpy as np
import pyarrow.dataset as ds
from dask import delayed
from fastparquet import ParquetFile
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
geodata = pd.read_csv('../COVID19_USA/data/geodata_territories_2019_statelevel.csv',
                     converters={'geoid': lambda x: str(x)})

In [3]:
run_ids = ['R10_optWan_highBoo_test5poispois_CA-FL-MD-MN-WA', 'R10_optWan_highBoo_WildOnly_CA-FL-MD-MN-WA']
def get_folder(run_id, ftype='hnpi'):
    return f's3pull/{run_id}/{ftype}/USA/inference/med/{run_id}/global/final/'


In [9]:
max_f = -1
run_files = {}
all_files = []
ftype = 'snpi'
for run_id in run_ids:
    file_list = glob.glob(get_folder(run_id, ftype) + "*.parquet") # "*.csv")# 
    file_list.sort()
    file_list = file_list[:max_f]
    print(run_id, len(file_list))
    run_files[run_id] = file_list 
    all_files += file_list 
print(len(all_files))

R10_optWan_highBoo_test5poispois_CA-FL-MD-MN-WA 299
R10_optWan_highBoo_WildOnly_CA-FL-MD-MN-WA 299
598


In [10]:
features_matrices = {}
for run_id in run_ids:
    fn = run_files[run_id][0]
    n_feature = len(pq.read_table(fn).to_pandas().sort_values(['geoid','npi_name'], ignore_index=True)['reduction'].to_numpy())
    print(f"{run_id} as {n_feature} features")
    arr = np.empty((300,n_feature))
    arr[:] = np.NaN#1000
    features_matrices[run_id] = arr
    
    for fn in tqdm(run_files[run_id]):
        features = pq.read_table(fn).to_pandas().sort_values(['geoid','npi_name'], ignore_index=True)['reduction'].to_numpy()
        slot = int(fn.split('/')[-1].split('.')[0]) -1
        run_id = fn.split('/')[-1].split('.')[1]
        features_matrices[run_id][slot] = features

 10%|█         | 31/299 [00:00<00:00, 302.68it/s]

R10_optWan_highBoo_test5poispois_CA-FL-MD-MN-WA as 10 features


100%|██████████| 299/299 [00:00<00:00, 359.63it/s]
 12%|█▏        | 36/299 [00:00<00:00, 358.22it/s]

R10_optWan_highBoo_WildOnly_CA-FL-MD-MN-WA as 10 features


100%|██████████| 299/299 [00:00<00:00, 373.24it/s]


In [11]:
dfs = []
for run_id in run_ids:
    fn = run_files[run_id][0]
    my_df = pq.read_table(fn).to_pandas().sort_values(['geoid','npi_name'], ignore_index=True).drop('reduction', axis = 1)
    feat_mat = features_matrices[run_id]

    my_df['mean'] = np.nanmean(feat_mat, axis = 0)
    my_df['q0.025'] = np.nanquantile(feat_mat, q=0.025, axis=0)
    my_df['q0.05'] = np.nanquantile(feat_mat, q=0.05, axis=0)
    my_df['median'] = np.nanmedian(feat_mat, axis = 0)
    my_df['q0.95'] = np.nanquantile(feat_mat, q=0.95, axis=0)
    my_df['q0.975'] = np.nanquantile(feat_mat, q=0.975, axis=0)
    my_df['run_id'] = run_id
    my_df['npi_type'] = ftype
    dfs.append(my_df)

In [13]:
full_df = pd.concat(dfs)

In [18]:
run_ids

['R10_optWan_highBoo_test5poispois_CA-FL-MD-MN-WA',
 'R10_optWan_highBoo_WildOnly_CA-FL-MD-MN-WA']

In [19]:
full_df.to_csv(f'hnpi-snpi{"".join(run_ids)}', index=False)

In [20]:
full_df

Unnamed: 0,geoid,npi_name,start_date,end_date,parameter,mean,q0.025,q0.05,median,q0.95,q0.975,run_id,npi_type
0,06000,CA_Dose1_apr2021_age0to17,2021-04-01,2021-04-30,nu1age0to17,0.000180,0.000180,0.000180,0.000180,0.000180,0.000180,R10_optWan_highBoo_test5poispois_CA-FL-MD-MN-WA,snpi
1,06000,CA_Dose1_apr2021_age18to64,2021-04-01,2021-04-30,nu1age18to64,0.013210,0.013210,0.013210,0.013210,0.013210,0.013210,R10_optWan_highBoo_test5poispois_CA-FL-MD-MN-WA,snpi
2,06000,CA_Dose1_apr2021_age65to100,2021-04-01,2021-04-30,nu1age65to100,0.022040,0.022040,0.022040,0.022040,0.022040,0.022040,R10_optWan_highBoo_test5poispois_CA-FL-MD-MN-WA,snpi
3,06000,CA_Dose1_apr2022_age0to17,2022-04-01,2022-04-30,nu1age0to17,0.001330,0.001330,0.001330,0.001330,0.001330,0.001330,R10_optWan_highBoo_test5poispois_CA-FL-MD-MN-WA,snpi
4,06000,CA_Dose1_apr2022_age18to64,2022-04-01,2022-04-30,nu1age18to64,0.000790,0.000790,0.000790,0.000790,0.000790,0.000790,R10_optWan_highBoo_test5poispois_CA-FL-MD-MN-WA,snpi
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,24000,MD_incidCshift2,2020-07-01,2021-03-06,inciditoc_all,0.608438,0.509566,0.521192,0.606281,0.699589,0.719820,R10_optWan_highBoo_WildOnly_CA-FL-MD-MN-WA,hnpi
6,27000,MN_incidCshift1,2020-01-01,2020-06-14,inciditoc_all,0.875154,0.758705,0.784523,0.875759,0.963212,0.975130,R10_optWan_highBoo_WildOnly_CA-FL-MD-MN-WA,hnpi
7,27000,MN_incidCshift2,2020-06-15,2021-03-06,inciditoc_all,0.526455,0.391189,0.404380,0.529187,0.629567,0.645994,R10_optWan_highBoo_WildOnly_CA-FL-MD-MN-WA,hnpi
8,53000,WA_incidCshift1,2020-01-01,2020-05-31,inciditoc_all,0.893273,0.773054,0.789689,0.892329,0.984719,0.990969,R10_optWan_highBoo_WildOnly_CA-FL-MD-MN-WA,hnpi


In [None]:
my_df 

In [None]:
df = my_df[my_df['parameter']=='r0']

In [None]:
df