# DATA TRANSFORMATION SCRIPT
The data we have is arranged by cell, which means that the features of a cell are stored consecutively in memory. If we want to look at multiple features across different cells, this results in really long loading times. Since hard-drive space is not a major concern, we create transposed datasets that allow fast access to features.

In [1]:
# imports
# libs for reading and writing hdf5 files
import h5py
import hdf5plugin

# libs for env and file handling
import os
from pathlib import Path

# ceil function from numpy
from numpy import ceil as ceil

In [2]:
# Resolve the data directory from the DATA_DIR environment variable, then
# collect the .h5 files in its 'source' sub-directory that should be transposed.
# The glob pattern skips files whose stem ends in "8".
data_path = Path(os.environ['DATA_DIR'])

source_dir = data_path / 'source'
matching_files = source_dir.glob('./*[!8].h5')
source_files = list(matching_files)
source_files

[PosixPath('/home/len/Data/Multimodal/source/test_multi_inputs.h5'),
 PosixPath('/home/len/Data/Multimodal/source/train_cite_targets.h5'),
 PosixPath('/home/len/Data/Multimodal/source/train_multi_inputs.h5'),
 PosixPath('/home/len/Data/Multimodal/source/train_cite_inputs.h5'),
 PosixPath('/home/len/Data/Multimodal/source/train_multi_targets.h5'),
 PosixPath('/home/len/Data/Multimodal/source/test_cite_inputs.h5')]

In [3]:
# Directory (below the data directory) that receives the transposed files.
dest_path = data_path.joinpath('transposed')
dest_path

PosixPath('/home/len/Data/Multimodal/transposed')

In [4]:
# Iterate over the files to be transposed and create a transposed counterpart.
#
# For every source file a file of the same name is written to `dest_path` with:
#   'cells'    - copy of '<stem>/axis1'
#   'features' - copy of '<stem>/axis0'
#   'values'   - '<stem>/block0_values' with its axes reversed
# The values are copied in batches of source columns so that only a bounded
# slice of the source matrix has to be loaded at once.

# RAM budget for one batch of values; adjust to the memory of your machine.
# NOTE(review): writing the transposed slice may need a temporary copy of the
# batch, so peak usage can exceed this budget - keep some headroom.
avail_ram_in_GB = 20
avail_ram_bytes = avail_ram_in_GB * 2**(3*10)

# h5py does not create missing parent directories for the output files.
dest_path.mkdir(parents=True, exist_ok=True)

for source_file in source_files:
    print(f"\n Input File: {source_file}")

    dest_file = dest_path / source_file.name
    print(f"Output File: {dest_file}")

    # Open both files once per source file instead of re-opening them for
    # every batch.
    with h5py.File(source_file, 'r') as sf, h5py.File(dest_file, 'w') as df:
        source_cells = sf[source_file.stem + '/axis1']
        source_features = sf[source_file.stem + '/axis0']
        source_data = sf[source_file.stem + '/block0_values']

        num_cells = source_cells.shape[0]
        num_features = source_features.shape[0]

        # copy index and column names
        dest_cells = df.create_dataset(
            'cells',
            shape=source_cells.shape,
            dtype=source_cells.dtype,
            compression="gzip",
        )

        dest_features = df.create_dataset(
            'features',
            shape=source_features.shape,
            dtype=source_features.dtype,
            compression="gzip",
        )

        dest_cells[:] = source_cells[:]
        dest_features[:] = source_features[:]

        # destination dataset with the axes of the source values reversed
        dest_data = df.create_dataset(
            'values',
            shape=source_data.shape[::-1],
            dtype=source_data.dtype,
            compression="gzip",
        )

        # Derive the batch size from the element size actually stored in the
        # file rather than assuming 4-byte floats.
        max_float_in_mem = avail_ram_bytes // source_data.dtype.itemsize

        # max(..., 1) avoids a division by zero for an empty cell axis.
        batchsize = int(ceil(max_float_in_mem / max(num_cells, 1)))
        iterations = int(ceil(num_features / batchsize))

        print(f"Starting Data Copy. Total iterations: {iterations}")
        for i in range(iterations):
            # range of source columns handled in this batch; the last batch
            # is clamped to the number of features
            s_index = i * batchsize
            e_index = min((i + 1) * batchsize, num_features)

            # read a block of columns and store it as rows of the destination
            dest_data[s_index:e_index, :] = source_data[:, s_index:e_index].T


 Input File: /home/len/Data/Multimodal/source/test_multi_inputs.h5
Output File: /home/len/Data/Multimodal/transposed/test_multi_inputs.h5
Starting Data Copy. Total iterations: 3

 Input File: /home/len/Data/Multimodal/source/train_cite_targets.h5
Output File: /home/len/Data/Multimodal/transposed/train_cite_targets.h5
Starting Data Copy. Total iterations: 1

 Input File: /home/len/Data/Multimodal/source/train_multi_inputs.h5
Output File: /home/len/Data/Multimodal/transposed/train_multi_inputs.h5
Starting Data Copy. Total iterations: 5

 Input File: /home/len/Data/Multimodal/source/train_cite_inputs.h5
Output File: /home/len/Data/Multimodal/transposed/train_cite_inputs.h5
Starting Data Copy. Total iterations: 1

 Input File: /home/len/Data/Multimodal/source/train_multi_targets.h5
Output File: /home/len/Data/Multimodal/transposed/train_multi_targets.h5
Starting Data Copy. Total iterations: 1

 Input File: /home/len/Data/Multimodal/source/test_cite_inputs.h5
Output File: /home/len/Data/Mu