In [1]:
import os
import h5py
import numpy as np

# Small trials with small dataset

### Open the file

In [3]:
fname = "/home/teitxe/data/fibercup/endpoint_bundling/hdf5_database/fibercup_bundles_prob_tracking.hdf5"

# Open HDF5 file
f = h5py.File(fname, "r")
data = f["fibercup"]


### Print the structure

In [4]:
def print_all_groups(name):
    obj = f[name]
    if isinstance(obj, h5py.Group):
        print(name)

In [5]:
f["fibercup/subjects/train/sub-0001/tractography_data"]

<HDF5 group "/fibercup/subjects/train/sub-0001/tractography_data" (7 members)>

In [6]:
f.visit(print_all_groups)

fibercup
fibercup/subjects
fibercup/subjects/test
fibercup/subjects/test/sub-0003
fibercup/subjects/test/sub-0003/tractography_data
fibercup/subjects/test/sub-0003/tractography_data/bundle_1
fibercup/subjects/test/sub-0003/tractography_data/bundle_2
fibercup/subjects/test/sub-0003/tractography_data/bundle_3
fibercup/subjects/test/sub-0003/tractography_data/bundle_4
fibercup/subjects/test/sub-0003/tractography_data/bundle_5
fibercup/subjects/test/sub-0003/tractography_data/bundle_6
fibercup/subjects/test/sub-0003/tractography_data/bundle_7
fibercup/subjects/train
fibercup/subjects/train/sub-0001
fibercup/subjects/train/sub-0001/tractography_data
fibercup/subjects/train/sub-0001/tractography_data/bundle_1
fibercup/subjects/train/sub-0001/tractography_data/bundle_2
fibercup/subjects/train/sub-0001/tractography_data/bundle_3
fibercup/subjects/train/sub-0001/tractography_data/bundle_4
fibercup/subjects/train/sub-0001/tractography_data/bundle_5
fibercup/subjects/train/sub-0001/tractography_d

### Get the data

In [10]:
# Get the data
data = f["fibercup/subjects/train/sub-0001/tractography_data"]
dataset = []
for bundle in data:
    dataset.append(np.array(data[bundle]["data"]))

len(dataset)

7

# Trial with big dataset (TractoInferno)

In [2]:
import os
import h5py
import numpy as np

# Open HDF5 file
fname = "/home/teitxe/data/tractolearn_data/data_tractoinferno_hcp_qbx.hdf5"
f = h5py.File(fname, "r")

In [5]:
data = np.array(f['train']['0']['streamline'])
print(data.shape)

(3220, 256, 3)


In [6]:
len(f['train'])

107695

### Try the HDF5Matrix from Keras

In [7]:
from tensorflow.keras.utils import Sequence

2024-06-17 14:12:38.619298: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-17 14:12:38.619784: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-17 14:12:38.625006: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-17 14:12:38.680940: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
class TractoInfernoDataGenerator(Sequence):
    def __init__(self,
                 hdf5_fname: str,
                 mode: str = 'train',
                 batch_size: int = 32):
        self.file = h5py.File(hdf5_fname, 'r')
        self.hdf5_file = hdf5_fname
        self.mode = mode
        self.batch_size = batch_size

        with h5py.File(self.hdf5_file, 'r') as f:
            self.num_subjs = len(f[self.mode])
            self.subj_ids = list(f[self.mode].keys())

    def __len__(self):
        return int(np.ceil(self.num_subjs / self.batch_size))

    def __getitem__(self, idx):
        subj_id = self.subj_ids[idx]
        x_batch = np.array(self.file[self.mode][subj_id]["streamline"])
        return x_batch, x_batch

In [40]:
dataset.file['train']['100']['streamline']

<HDF5 dataset "streamline": shape (45, 256, 3), type "<f4">

In [48]:
dataset = TractoInfernoDataGenerator(fname)
c = 0
for i in range(len(dataset)):
    a, _ = dataset.__getitem__(i)
    sh = a.shape[0]
    if sh > 100:
        print(sh)
        c += 1

print(c)

3220
10840
5416
529
308
189
1974
172
4187
578
2543
3146
8065
182
728
605
387
944
131
115
168
2000
196
495
131
1076
272
347
424
9112
6226
599
323
396
161
27648
667
2711
11718
1820
4290
75011
124
160
671
8495
1174
176
125
111
144
888
148
293
706
8802
2242
202
260
617
7055
115
293
2203
376
456
789
1702
161
341
136
673
1714
150
19093
5631
10248
174
334
9273
2280
4087
181
370
313
85


(10, 256, 3)