In [8]:
import os

import h5py


def _copy_rows_in_batches(source, target, target_start, batch_size):
    """Copy all rows of `source` into `target`, beginning at row `target_start`.

    At most `batch_size` rows are read from `source` per step, so the amount
    of data held in memory does not grow with the size of `source`.

    Returns the index in `target` one past the last row written.
    """
    num_rows = source.shape[0]
    for row_start in range(0, num_rows, batch_size):
        row_stop = min(row_start + batch_size, num_rows)
        target[target_start + row_start:target_start + row_stop] = source[row_start:row_stop]
    return target_start + num_rows


def aggregate_h5_files(data_dir, input_files, output_file, batch_size=1024,
                       vector_dtype='float32', label_dtype='int32'):
    """
    Aggregate multiple H5 files into a single file (memory efficient).

    Each input file must contain a 'vectors' and a 'labels' dataset whose
    first axis is the sample axis. The datasets are concatenated along that
    axis, in the order given by `input_files`, into 'vectors' and 'labels'
    datasets of the output file.

    Args:
        data_dir: Directory that holds the input files and receives the output.
        input_files: File names (relative to `data_dir`) to concatenate.
        output_file: Name of the file to create inside `data_dir`. An existing
            file with that name is overwritten.
        batch_size: Number of rows read and written per copy step.
        vector_dtype: dtype of the output 'vectors' dataset; None keeps the
            dtype of the first input file.
        label_dtype: dtype of the output 'labels' dataset; None keeps the
            dtype of the first input file.

    Raises:
        ValueError: If `input_files` is empty, `batch_size` is not positive,
            the output file is also listed as an input, the per-sample shapes
            differ between files, or a file has a different number of vectors
            and labels.
    """
    input_files = list(input_files)
    if not input_files:
        raise ValueError("input_files is empty; nothing to aggregate")
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Opening the output with mode 'w' truncates it, so it must not be one of
    # the files we are about to read from.
    output_path = os.path.join(data_dir, output_file)
    output_key = os.path.normcase(os.path.abspath(output_path))
    for input_file in input_files:
        input_key = os.path.normcase(os.path.abspath(os.path.join(data_dir, input_file)))
        if input_key == output_key:
            raise ValueError(
                f"output file {output_file!r} is also listed in input_files; "
                "writing it would destroy that input"
            )

    # First pass: determine total dimensions and check the files are compatible
    total_samples = 0
    vector_shape = None
    label_shape = None
    source_vector_dtype = None
    source_label_dtype = None

    print("Scanning files to determine dimensions...")
    for input_file in input_files:
        input_path = os.path.join(data_dir, input_file)
        with h5py.File(input_path, 'r') as f_in:
            vectors = f_in['vectors']
            labels = f_in['labels']

            if vector_shape is None:
                # Per-sample shape: every dimension except the sample axis.
                vector_shape = vectors.shape[1:]
                label_shape = labels.shape[1:]
                source_vector_dtype = vectors.dtype
                source_label_dtype = labels.dtype
            elif vectors.shape[1:] != vector_shape or labels.shape[1:] != label_shape:
                raise ValueError(
                    f"{input_file}: per-sample shapes {vectors.shape[1:]} / "
                    f"{labels.shape[1:]} do not match {vector_shape} / "
                    f"{label_shape} from {input_files[0]}"
                )

            if labels.shape[:1] != vectors.shape[:1]:
                raise ValueError(
                    f"{input_file}: 'vectors' has shape {vectors.shape} but "
                    f"'labels' has shape {labels.shape}; sample counts differ"
                )

            total_samples += vectors.shape[0]
            print(f"  {input_file}: {vectors.shape[0]} samples")

    print(f"Total samples to aggregate: {total_samples}")
    print(f"Vector shape per sample: {vector_shape}")

    if vector_dtype is None:
        vector_dtype = source_vector_dtype
    if label_dtype is None:
        label_dtype = source_label_dtype

    # Create output file with pre-allocated datasets
    with h5py.File(output_path, 'w') as f_out:
        vectors_dset = f_out.create_dataset('vectors',
                                            shape=(total_samples,) + vector_shape,
                                            dtype=vector_dtype)
        labels_dset = f_out.create_dataset('labels',
                                           shape=(total_samples,) + label_shape,
                                           dtype=label_dtype)

        # Second pass: copy data batch by batch
        current_idx = 0
        for input_file in input_files:
            input_path = os.path.join(data_dir, input_file)
            print(f"Processing {input_file}...")

            with h5py.File(input_path, 'r') as f_in:
                vectors = f_in['vectors']
                labels = f_in['labels']
                num_samples = vectors.shape[0]

                # Slicing a bounded row range keeps memory use independent of
                # the file size, unlike reading the whole dataset with [:].
                _copy_rows_in_batches(vectors, vectors_dset, current_idx, batch_size)
                current_idx = _copy_rows_in_batches(labels, labels_dset,
                                                    current_idx, batch_size)
                print(f"  Copied {num_samples} samples (total so far: {current_idx})")

    print(f"Successfully aggregated {len(input_files)} files into {output_file}")
    print(f"Total samples: {total_samples}")


# Usage: merge the five numbered part files of the fold_1 test directory
# into a single data.h5 stored next to them.
data_dir = "E:\\master\\final_project\\data\\my_data\\embedding_output_data\\normalized_hdfs_pfcgr_embedding\\100_400\\fold_1\\test"
input_files = [f'data_{part}.h5' for part in range(1, 6)]
aggregate_h5_files(data_dir, input_files, 'data.h5')

Scanning files to determine dimensions...
  data_1.h5: 22916 samples
  data_2.h5: 22916 samples
  data_3.h5: 22916 samples
  data_4.h5: 22916 samples
  data_5.h5: 22920 samples
Total samples to aggregate: 114584
Vector shape per sample: (5, 64, 64)
Processing data_1.h5...
  Copied 22916 samples (total so far: 22916)
Processing data_2.h5...
  Copied 22916 samples (total so far: 45832)
Processing data_3.h5...
  Copied 22916 samples (total so far: 68748)
Processing data_4.h5...
  Copied 22916 samples (total so far: 91664)
Processing data_5.h5...
  Copied 22920 samples (total so far: 114584)
Successfully aggregated 5 files into data.h5
Total samples: 114584


In [6]:
# Report the shape of the 'vectors' dataset in the train data.h5 file.
# A raw string keeps every backslash of the Windows path literal, avoiding
# invalid escape sequences such as "\A", "\T" and "\d".
with h5py.File(r'C:\Users\Admin\Temp\100_400\fold_1\train\data.h5', 'r') as f:
    # Dataset.shape comes from the file's metadata; slicing with [:] first
    # would read the entire dataset into memory just to ask for its shape.
    print(f['vectors'].shape)

(460632, 5, 64, 64)
