In [None]:
import os
import glob

import h5py as h5
import numpy as np

In [None]:
# Example input/output pair for a one-off run of create_compressed_h5_copy
# (see the commented-out call below the function definition).
h5file = (
    '/data/cooperation_data/Preliminary_projects/'
    'AgyrisPapantonis_ChromatinTexture_AgeingCells/auto_sir_ageing-cells/'
    '20201208_IMR90_3day/rep1/3f2f7d32d280ce05293143834aa15a08.h5'
)
out_h5file = '/scratch/hoerl/compression_test.h5'


def _copy_attributes(src, dst):
    """Copy every HDF5 attribute of `src` onto `dst`."""
    # NB: an explicit AttributeManager is kept from the original implementation,
    # whose author noted that the alternative complained about data types
    # (not re-verified here).
    target_attrs = h5.AttributeManager(dst)
    for name, value in src.attrs.items():
        target_attrs[name] = value


def create_compressed_h5_copy(h5_file_in, h5_file_out, compression_level=4, force=False):
    """Write a gzip-compressed copy of an HDF5 file.

    The group hierarchy, all datasets and all attributes (file-level, group-level
    and dataset-level) of `h5_file_in` are recreated in `h5_file_out`. Datasets
    with at least one dimension are stored with the shuffle filter and gzip
    compression; scalar and empty datasets are copied without filters, because
    HDF5 cannot chunk (and therefore cannot compress) them.

    Parameters
    ----------
    h5_file_in : str
        Path of the HDF5 file to read.
    h5_file_out : str
        Path of the compressed copy to create.
    compression_level : int, optional
        gzip level handed to h5py as ``compression_opts`` (default 4).
    force : bool, optional
        If True, an existing `h5_file_out` is replaced; otherwise it is an error.

    Raises
    ------
    ValueError
        If `h5_file_out` already exists and `force` is False.

    Notes
    -----
    * Each dataset is read into memory as a whole before it is written.
    * Chunk shape, ``maxshape`` and fill value of the source datasets are not
      carried over; h5py picks the chunking of the compressed datasets.
    * Objects are addressed by path, so anything reachable through a link is
      presumably written as an independent copy -- TODO confirm if the input
      files use soft/external links.
    * Data is written to ``<h5_file_out>.partial`` and renamed on success, so a
      failed or interrupted run never leaves a truncated file under the final name.
    """

    # refuse to overwrite an existing output file unless explicitly forced
    if os.path.exists(h5_file_out) and not force:
        raise ValueError('Target file {} exists.'.format(h5_file_out))

    partial_out = '{}.partial'.format(h5_file_out)
    try:
        with h5.File(h5_file_in, 'r') as fd_in, h5.File(partial_out, 'w') as fd_out:

            # attributes attached to the file / root group
            _copy_attributes(fd_in, fd_out)

            # depth-first walk; `keys` holds object paths relative to the root.
            # A group is always created before its children are pushed, so the
            # parent of every popped path already exists in the output file.
            keys = list(fd_in.keys())
            while keys:
                k = keys.pop()
                node = fd_in[k]  # resolve once, reuse below

                if isinstance(node, h5.Dataset):
                    if node.shape is None:
                        # empty dataspace: nothing to compress, keep only the type
                        fd_out.create_dataset(k, data=h5.Empty(node.dtype))
                    elif node.shape == ():
                        # scalar dataset: chunking/filters are not supported
                        fd_out.create_dataset(k, data=node[()], dtype=node.dtype)
                    else:
                        # pass the source dtype explicitly so the stored type is
                        # taken from the input rather than inferred from the data
                        fd_out.create_dataset(
                            k,
                            data=node,
                            dtype=node.dtype,
                            shuffle=True,
                            compression='gzip',
                            compression_opts=compression_level,
                        )
                elif isinstance(node, h5.Group):
                    fd_out.create_group(k)
                    # add children to the working list
                    keys.extend('{}/{}'.format(k, child) for child in node.keys())
                else:
                    # neither dataset nor group (e.g. a committed datatype):
                    # let HDF5 copy the object verbatim, attributes included
                    fd_in.copy(node, fd_out, name=k)
                    continue

                _copy_attributes(node, fd_out[k])

        # both files are closed at this point; publish the finished copy
        os.replace(partial_out, h5_file_out)
    except BaseException:
        # do not leave a half-written file behind
        if os.path.exists(partial_out):
            os.remove(partial_out)
        raise
    
# create_compressed_h5_copy(h5file, out_h5file, compression_level=4, force=True)

In [None]:
from functools import reduce
from operator import add

# Alternative source directory, kept for reference:
# root = '/data/cooperation_data/Preliminary_projects/AgyrisPapantonis_ChromatinTexture_AgeingCells/auto_sir_ageing-cells/'
root = '/data/cooperation_data/ArgyrisPapantonis-nuclear_architecture/Simona_Nasiscionyte/STED'

# Alternative dataset selection, kept for reference:
# datasets = ['20201214_IMR90_9day', '20201208_IMR90_3day', '2020622_IMR90_untreated_old',
#             '2020625_IMR90_3d_ICM_young', '2020629_IMR90_6d_ICM_young', '2020702_IMR90_9d_ICM_young',
#            '2020705_IMR90_young_untreated', '20210326_IMR90_young_untr', '20210402_IMR90_old']

datasets = ['20220107_IMR90_young', '20220111_IMR90_old']

# Collect <root>/<dataset>/<subdir>/<name>.h5 for every selected dataset.
# A flat comprehension also works for an empty `datasets` list, and sorting each
# glob result makes the listing independent of filesystem enumeration order.
# For a layout with one more directory level use the pattern (root, d, '*', '*', '*.h5').
h5_files = [
    path
    for d in datasets
    for path in sorted(glob.glob(os.path.join(root, d, '*', '*.h5')))
]
h5_files

In [None]:
out_dir = '/scratch/hoerl/auto_sir_dna_comp'

# Mirror the directory layout below `root` under `out_dir` and write a
# compressed copy of every collected file.
# NB: the loop variable is deliberately not named `h5file`, which would
# overwrite the single-file example path defined earlier in the notebook.
for h5_path in h5_files:
    # only the leading `root` prefix is swapped for `out_dir`
    outfile = os.path.join(out_dir, os.path.relpath(h5_path, root))
    os.makedirs(os.path.dirname(outfile), exist_ok=True)

    # `force` stays at its default: an already existing target raises
    # ValueError and stops the batch instead of being overwritten.
    create_compressed_h5_copy(h5_path, outfile)