# DATA TRANSFORMATION SCRIPT
The Data we have is arranged by cells. That means the features of a cell are consecutive in memory. If we want to look at multiple features of different cells this results in really long loading times. Since memory on hard drive is no huge concern we will create transposed datasets that allow fast access of features.

In [1]:
# imports
# libs for reading and writing hdf5 files
import h5py
import hdf5plugin

# libs for env and file handling
import os
from pathlib import Path

# ceil function from numpy
from numpy import ceil as ceil

In [2]:
# read the data dir from the environment and create a list of files that should be transposed
data_path = Path(os.environ['DATA_DIR'])
f_names = [a.stem for a in data_path.glob('./*[!T].h5')]
f_names

['test_multi_inputs',
 'train_cite_targets',
 'train_multi_inputs',
 'train_cite_inputs',
 'train_multi_targets',
 'test_cite_inputs']

In [3]:
# iterate over files to be transposed and create transposed counterpart
for f_name in f_names:
    print(f"\nWorking on File: {f_name}")
    from_path = Path(data_path / (f_name + '.h5'))
    to_path = Path(data_path / (f_name + '_T.h5'))
    print(f"Input File: {from_path}")
    print(f"Output File: {to_path}")
    
    # copy index and columnnames
    with h5py.File(from_path, 'r') as from_file:
        cells_dset = from_file[f_name + '/axis1']
        from_cells = {
            'data': cells_dset[:],
            'dtype': cells_dset.dtype,
            'shape':cells_dset.shape,
        }
        features_dset = from_file[f_name + '/axis0']
        from_features = {
            'data': features_dset[:],
            'dtype': features_dset.dtype,
            'shape':features_dset.shape,
        }
        
    num_cells = from_cells['shape'][0]
    num_features = from_features['shape'][0]

    with h5py.File(to_path, 'w') as to_file:
        to_cells = to_file.create_dataset('cells', from_cells['shape'], dtype=from_cells['dtype'], compression="gzip")
        to_features = to_file.create_dataset('features', from_features['shape'], dtype=from_features['dtype'], compression="gzip")
        
        to_cells[:] = from_cells['data']
        to_features[:] = from_features['data']
    
    
    # iterate over old dataset
    # we have 55GB ram availabl for the transformations, adjust to your ram
    avail_ram_in_GB = 55
    avail_ram_bytes = avail_ram_in_GB * 2**(3*10)
    max_float_in_mem = int(avail_ram_bytes / 4)
    batchsize = int(ceil(max_float_in_mem / num_features))
    iterations = int(ceil(num_cells / batchsize))

    # create dataset
    with h5py.File(from_path, 'r') as from_file, h5py.File(to_path, 'r+') as to_file:
        to_file.create_dataset('values', 
                               from_file[f_name + '/block0_values'].shape[::-1], 
                               dtype=from_file[f_name + '/block0_values'].dtype,
                               compression="gzip"
        )
    
    print(f"Starting Data Copy. Total iterations: {iterations}")
    for i in range(iterations):
        # create dataset objects
        s_index = i * batchsize
        e_index = (i+1) * batchsize
    
        with h5py.File(from_path, 'r') as from_file, h5py.File(to_path, 'r+') as to_file:
            to_file['/values'][i*batchsize:(i+1)*batchsize, :] = from_file[f_name + '/block0_values'][:, i*batchsize: (i+1)*batchsize].T


Working on File: test_multi_inputs
Input File: /home/len/Data/Multimodal/test_multi_inputs.h5
Output File: /home/len/Data/Multimodal/test_multi_inputs_T.h5
Starting Data Copy. Total iterations: 1

Working on File: train_cite_targets
Input File: /home/len/Data/Multimodal/train_cite_targets.h5
Output File: /home/len/Data/Multimodal/train_cite_targets_T.h5
Starting Data Copy. Total iterations: 1

Working on File: train_multi_inputs
Input File: /home/len/Data/Multimodal/train_multi_inputs.h5
Output File: /home/len/Data/Multimodal/train_multi_inputs_T.h5
Starting Data Copy. Total iterations: 2

Working on File: train_cite_inputs
Input File: /home/len/Data/Multimodal/train_cite_inputs.h5
Output File: /home/len/Data/Multimodal/train_cite_inputs_T.h5
Starting Data Copy. Total iterations: 1

Working on File: train_multi_targets
Input File: /home/len/Data/Multimodal/train_multi_targets.h5
Output File: /home/len/Data/Multimodal/train_multi_targets_T.h5
Starting Data Copy. Total iterations: 1

Wo