In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pathlib
import logging
import shutil

import numpy as np
import pandas as pd
import torch
import h5py
import mat73
from tqdm import tqdm

import utils

In [3]:
# Configure logging through the project helper, then keep a handle on the
# root logger so later cells can change verbosity via logger.setLevel(...).
utils.logging.basic_config()
logger = logging.getLogger()

# Directory for scratch/output files (relative path, so it resolves against
# the notebook's current working directory).
data_path = pathlib.Path('data')

In [27]:
# Benchmark: write/read 1000 small arrays either into a single HDF5 file
# (utils.io.to_hdf / from_hdf) or as one .npy file per array, timing each step.
filename = data_path / 'tmp.hdf5'
tmp_path = data_path / 'tmp'
# parents=True so the cell also works when the `data` directory is missing.
tmp_path.mkdir(parents=True, exist_ok=True)

try:
    data = {str(i): np.random.normal(size=(5,100,100)) for i in range(1000)}

    # --- write timings -----------------------------------------------------
    with utils.timeit.timeit():
        utils.io.to_hdf(filename, data)
    with utils.timeit.timeit():
        for k, v in data.items():
            np.save(tmp_path / f'{k}.npy', v)

    # --- read timings ------------------------------------------------------
    # `data` is reset before each read so the loaded result does not sit next
    # to the previous copy in memory.
    data = {}
    with utils.timeit.timeit():
        data = utils.io.from_hdf(filename)
    data = {}
    with utils.timeit.timeit():
        for path in tmp_path.glob('*.npy'):
            data[path.stem] = np.load(path)
finally:
    # Always remove the scratch artifacts, even when a benchmark step raised.
    filename.unlink(missing_ok=True)
    shutil.rmtree(tmp_path, ignore_errors=True)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2764.51it/s]
INFO: utils.timeit: Time: 0.3631 seconds
INFO: utils.timeit: Time: 0.7204 seconds
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 9669.13it/s]
INFO: utils.timeit: Time: 0.1428 seconds
INFO: utils.timeit: Time: 0.1553 seconds


In [15]:
# Mixed-type fixture: `d` is the object handed to utils.io.to_hdf in a later
# cell.  It bundles a DataFrame with array-valued cells, plain Python
# containers, numpy arrays (numeric and string), pandas objects nested in a
# sub-dict, a torch tensor, and scalar str/int values.
h = pd.DataFrame({'a': [1]}, index=['i'])
h['b'] = [np.array([1, 2])]
h['c'] = [np.array(['a', 'b'])]

d = dict(
    h=h,
    a=[1, 2, 3],
    b=np.array([4, 5, 6]),
    c=np.array(['a', 'b', 'c']),
    d=dict(
        da=(1, 2, 3),
        db=pd.Series(['a', 'b', 'c'], index=[9, 8, 7]),
        dc=pd.DataFrame({'a': [1, 2], 'b': [3, 4]}, index=[5, 6]),
    ),
    e=torch.normal(mean=0, std=1, size=(3, 4)),
    f='abc',
    g=123,
)

In [20]:
# Scratch HDF5 file for the round-trip demo in this cell (removed again at the
# end of the cell).
filename = data_path / 'tmp.hdf5'
# Root logger at INFO so the logger.info(...) lines from the callbacks show up.
logger.setLevel('INFO')

def callback(k, v, logger):
    """Write-side hook passed to `utils.io.to_hdf`: log each (key, value) pair.

    Always returns None, i.e. it proposes no replacement for `v`.  A conversion
    could be plugged in here instead, e.g. returning `v.tolist()` for ndarrays
    whose `dtype.kind` is not one of 'biufcS'.
    """
    line = f'{k}, {v}'
    logger.info(line)
        
# Timed write of the fixture dict `d`, using the 3-argument `callback` defined
# just above.  With errors='log' a failing item is reported as an ERROR log
# line and the remaining items are still processed (see the captured output).
with utils.timeit.timeit():
    utils.io.to_hdf(filename, d, groupname='', callback=callback, errors='log', progress=True)

def callback(d, k, v, logger):
    """Read-side hook passed to `utils.io.from_hdf`: log each (key, value) pair.

    `d` is accepted but not used.  Always returns None; returning a pair such
    as `(f'hello_{k}', v)` was tried as an experiment and left disabled
    (presumably it would rename the key -- TODO confirm against utils.io).
    """
    line = f'{k}, {v}'
    logger.info(line)

# Timed read-back of the scratch file, logging every item through the
# 4-argument `callback`.  The try/finally guarantees the scratch file is
# removed even when reading fails, so a broken run leaves nothing behind.
try:
    with utils.timeit.timeit():
        new_d = utils.io.from_hdf(filename, groupname='', callback=callback, progress=True)

    # Optional inspection helpers (disabled):
    # utils.pprint.pprint(utils.io.desc_hdf(filename), verbose=True)
    # utils.pprint.pprint(dict(sorted(new_d.items())), verbose=True)
finally:
    filename.unlink(missing_ok=True)

INFO: utils.io: h,    a       b       c                                                                                  
i  1  [1, 2]  [a, b]
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['b', 'c'], dtype='object')]

  except Exception as err:

INFO: utils.io: a, [1, 2, 3]                                                                                             
INFO: utils.io: b, [4 5 6]                                                                                               
INFO: utils.io: c, ['a' 'b' 'c']                                                                                         
ERROR: utils.io: Error when creating dataset with key c: No conversion path for dtype: dtype('<U1')                      
INFO: utils.io: d/da, (1, 2, 3)                                                                                          
INFO: utils.io: d/db, 9    a 

In [16]:
def _rows_for_vlen(v):
    """Arrange `v` so that iterating over the result yields one entry per row.

    * flat numeric input            -> a single row (reshaped to (1, n))
    * rectangular 2-D numeric input -> one row per first-axis slice
    * ragged / object input         -> one row per element
    * scalar (0-d) input            -> a single row holding that one value
    """
    try:
        arr = np.array(v)
    except ValueError:
        # NumPy >= 1.24 refuses to build an array from ragged nested sequences
        # (older versions only warned).  Build the 1-D object array by hand so
        # each element is kept as-is and no broadcasting is attempted.
        arr = np.empty(len(v), dtype=object)
        for idx, item in enumerate(v):
            arr[idx] = item

    if arr.ndim == 0:
        # A 0-d array cannot be iterated; treat the scalar as one 1-element row.
        return arr.reshape(1, 1)
    if arr.ndim == 1 and arr.dtype.kind != 'O':
        return arr.reshape(1, -1)
    return arr


def callback(k, v, logger):
    """Normalise one item before `utils.io.to_hdf` stores it.

    Parameters
    ----------
    k : str
        Key of the item; 'exp/holoTargets' and 'exp/rois' get special handling.
    v : object
        Value associated with `k`.
    logger : logging.Logger
        Part of the hook signature; not used here.

    Returns
    -------
    object
        * ndarray input: best-effort cast to the type of its first element,
          then best-effort `np.stack`; the (possibly unchanged) array.
        * list input: an ndarray if it converts to a bool/int/uint/float/
          complex/bytes array, otherwise the list itself (this includes ragged
          lists, which NumPy cannot turn into a regular array).
        * anything else: `v` unchanged.

    Raises
    ------
    utils.io.ItemProcessed
        After 'exp/holoTargets' / 'exp/rois' have been written directly as a
        variable-length dataset (float32 and int elements respectively).

    Notes
    -----
    The special-key branch reads the notebook-level globals `hdf5_filename`
    and `exp_name`, which the conversion loop assigns before calling
    `utils.io.to_hdf`; it also expects the group `exp_name` to exist already.
    """
    if k in ['exp/holoTargets', 'exp/rois']:
        dtype = np.float32 if k == 'exp/holoTargets' else int
        rows = [np.atleast_1d(row).astype(dtype) for row in _rows_for_vlen(v)]
        with h5py.File(hdf5_filename, 'a') as f:
            dset = f[exp_name].create_dataset(k, shape=len(rows), dtype=h5py.vlen_dtype(dtype))
            dset[...] = rows
        raise utils.io.ItemProcessed

    if isinstance(v, np.ndarray):
        if v.size > 0:
            try:
                # e.g. an object array of floats becomes a float array.
                v = v.astype(type(v.reshape(-1)[0]))
            except Exception:
                # Deliberately best-effort: keep the array as it is.
                pass

        try:
            # e.g. an object array of equally shaped arrays becomes one array.
            v = np.stack(v)
        except Exception:
            # Not stackable (0-d, empty, or mismatched shapes): keep as it is.
            pass

        return v

    if isinstance(v, list):
        try:
            arr_v = np.array(v)
        except ValueError:
            # Ragged list: NumPy >= 1.24 raises instead of producing an object
            # array, which would have been rejected by the check below anyway.
            return v
        if arr_v.dtype.kind in 'biufcS':
            return arr_v

    return v
    
# NOTE: level 'INFO' does NOT hide ERROR records (ERROR ranks above INFO in
# the stdlib logging hierarchy).  Raise the level to 'CRITICAL' if the
# per-item errors reported through errors='log' should really be silenced.
logger.setLevel('INFO')

# Number of .mat files converted in this trial run.
MAX_MAT_FILES = 5

# Output files; both are deleted first so every run starts from scratch.
hdf5_filename = data_path / 'tmp.hdf5'
desc_hdf5_filename = data_path / 'tmp.json'
hdf5_filename.unlink(missing_ok=True)
desc_hdf5_filename.unlink(missing_ok=True)
# Machine-specific absolute input directory -- adjust when running elsewhere.
mat_path = pathlib.Path('/Users/hoyinchau/local_documents/research/ken/V1-perturb/data/experiment/will/mat')

mat_files = sorted(mat_path.glob('*.mat'))[:MAX_MAT_FILES]
if not mat_files:
    # Otherwise the loop silently does nothing and the cell only fails later.
    logger.warning('No .mat files found in %s', mat_path)

# `hdf5_filename` and `exp_name` are read as globals by `callback`, so their
# names must stay as they are.
for filename in tqdm(mat_files):
    exp_name = filename.stem.removesuffix('_outfile')

    data = utils.io.loadmat(filename)['out']

    utils.io.to_hdf(hdf5_filename, data, groupname=exp_name, callback=callback, errors='log', progress=False)

# Store a description of the resulting HDF5 file next to it.
utils.io.save(desc_hdf5_filename, utils.io.desc_hdf(hdf5_filename))

  v = np.array(v)

100%|██████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:08<00:00,  1.74s/it]
100%|███████████████████████████████████████████████████████████████████████████████| 415/415 [00:00<00:00, 23645.13it/s]
