In [1]:
import pyarrow.parquet as pq
import pyarrow as pa
import pathlib, os, glob
import tempfile
import numpy as np
import pyarrow.dataset as ds
from dask import delayed
from fastparquet import ParquetFile
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
geodata = pd.read_csv('../COVID19_USA/data/geodata_territories_2019_statelevel.csv',
                     converters={'geoid': lambda x: str(x)})

In [3]:
run_ids = ['SMH_R9_ChildVax_noVar']
def get_folder(run_id, ftype='hnpi'):
    return f's3pull/plotNPI/{ftype}/USA/inference/med/{run_id}/global/final/'


In [4]:
max_f = -1
run_files = {}
all_files = []
for run_id in run_ids:
    file_list = glob.glob(get_folder(run_id, 'snpi') + "*.parquet") # "*.csv")# 
    file_list.sort()
    file_list = file_list[:max_f]
    print(run_id, len(file_list))
    run_files[run_id] = file_list 
    all_files += file_list 
print(len(all_files))

SMH_R9_ChildVax_noVar 283
283


In [5]:
fn = run_files[run_id][0]
n_feature = len(pq.read_table(fn).to_pandas().sort_values(['geoid','npi_name'], ignore_index=True)['reduction'].to_numpy())
fn.split('/')[-1].split('.')

['000000001', 'SMH_R9_ChildVax_noVar', 'snpi', 'parquet']

In [6]:
features_matrices = {}
for run_id in run_ids:
    arr = np.empty((300,n_feature))
    arr[:] = np.NaN#1000
    features_matrices[run_id] = arr
    
for fn in tqdm(all_files):
    features = pq.read_table(fn).to_pandas().sort_values(['geoid','npi_name'], ignore_index=True)['reduction'].to_numpy()
    slot = int(fn.split('/')[-1].split('.')[0]) -1
    run_id = fn.split('/')[-1].split('.')[1]
    features_matrices[run_id][slot] = features

100%|██████████| 283/283 [00:01<00:00, 148.84it/s]


In [7]:
my_df = pq.read_table(fn).to_pandas().sort_values(['geoid','npi_name'], ignore_index=True).drop('reduction', axis = 1)
feat_mat = features_matrices[run_ids[0]]

In [8]:

my_df['mean'] = np.nanmean(feat_mat, axis = 0)
my_df['q0.025'] = np.nanquantile(feat_mat, q=0.025, axis=0)
my_df['q0.05'] = np.nanquantile(feat_mat, q=0.05, axis=0)
my_df['median'] = np.nanmedian(feat_mat, axis = 0)
my_df['q0.95'] = np.nanquantile(feat_mat, q=0.95, axis=0)
my_df['q0.975'] = np.nanquantile(feat_mat, q=0.975, axis=0)

In [9]:
my_df.to_csv('NPI_SMH_R9_ChildVax_noVar.csv', index=False)

In [14]:
my_df 

Unnamed: 0,geoid,npi_name,start_date,end_date,parameter,mean,q0.025,q0.05,median,q0.95,q0.975
0,01000,AL_Dose1_apr2021,2021-04-01,2021-04-30,transition_rate 0,0.005087,0.005087,0.005087,0.005087,0.005087,0.005087
1,01000,AL_Dose1_aug2021,2021-08-01,2021-08-31,transition_rate 0,0.003289,0.003289,0.003289,0.003289,0.003289,0.003289
2,01000,AL_Dose1_dec2021,2021-12-01,2021-12-31,transition_rate 0,0.002050,0.002050,0.002050,0.002050,0.002050,0.002050
3,01000,AL_Dose1_feb2021,2021-02-01,2021-02-28,transition_rate 0,0.003395,0.003395,0.003395,0.003395,0.003395,0.003395
4,01000,AL_Dose1_feb2022,2022-02-01,2022-02-28,transition_rate 0,0.001796,0.001796,0.001796,0.001796,0.001796,0.001796
...,...,...,...,...,...,...,...,...,...,...,...
3418,78000,lockdown,"2020-03-25,2020-08-17","2020-05-03,2020-09-18",r0,0.548060,0.202839,0.278516,0.560936,0.774913,0.798089
3419,78000,open_p1,2020-05-04,2020-05-31,r0,0.490614,0.107989,0.212464,0.501113,0.771482,0.803381
3420,78000,open_p2,"2020-06-01,2020-09-19,2020-10-13,2020-12-17","2020-08-16,2020-10-12,2020-11-08,2021-03-07",r0,0.447025,0.170744,0.219202,0.455639,0.634811,0.662830
3421,78000,open_p3,"2020-11-09,2021-03-08,2021-03-29,2021-04-23","2020-12-16,2021-03-28,2021-04-22,2021-05-31",r0,0.619308,0.410530,0.448339,0.624328,0.762789,0.787203


In [15]:
df = my_df[my_df['parameter']=='r0']

In [16]:
df

Unnamed: 0,geoid,npi_name,start_date,end_date,parameter,mean,q0.025,q0.05,median,q0.95,q0.975
15,01000,ALvariantR0adj_1.05,2021-01-24,2021-02-06,r0,-0.054759,-0.148650,-0.127613,-0.047876,-0.006184,-0.003950
16,01000,ALvariantR0adj_1.1,2021-02-07,2021-02-13,r0,-0.100527,-0.188782,-0.175625,-0.097194,-0.020587,-0.015023
17,01000,ALvariantR0adj_1.15,2021-02-14,2021-02-20,r0,-0.150310,-0.241874,-0.233522,-0.148743,-0.073578,-0.054558
18,01000,ALvariantR0adj_1.2,2021-02-21,2021-03-06,r0,-0.196792,-0.304170,-0.277804,-0.196051,-0.120249,-0.111474
19,01000,ALvariantR0adj_1.3,2021-03-07,2021-03-13,r0,-0.297626,-0.392921,-0.382466,-0.295692,-0.220846,-0.203280
...,...,...,...,...,...,...,...,...,...,...,...
3418,78000,lockdown,"2020-03-25,2020-08-17","2020-05-03,2020-09-18",r0,0.548060,0.202839,0.278516,0.560936,0.774913,0.798089
3419,78000,open_p1,2020-05-04,2020-05-31,r0,0.490614,0.107989,0.212464,0.501113,0.771482,0.803381
3420,78000,open_p2,"2020-06-01,2020-09-19,2020-10-13,2020-12-17","2020-08-16,2020-10-12,2020-11-08,2021-03-07",r0,0.447025,0.170744,0.219202,0.455639,0.634811,0.662830
3421,78000,open_p3,"2020-11-09,2021-03-08,2021-03-29,2021-04-23","2020-12-16,2021-03-28,2021-04-22,2021-05-31",r0,0.619308,0.410530,0.448339,0.624328,0.762789,0.787203
