In [None]:
import h5py
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# HDF5 file read by process_h5_file, and the HDF5 file it (re)creates with the
# padded copy of every dataset.
input_file_path = "drive/MyDrive/final_dataset/features/fused/fused_features_2.h5"
output_file_path = "drive/MyDrive/final_dataset/features/fused/fused_features_padded_2.h5"

# Number of rows (first axis) each sequence is truncated or padded to.
TARGET_LENGTH = 48
# Fill value for the rows appended to sequences shorter than TARGET_LENGTH.
PAD_VALUE = 0.0
# Number of sequences handed to each thread-pool task.
BATCH_SIZE = 64

# Configure every visible GPU: enable memory growth first, then cap the device
# with a single 13000 MB logical device. Failures are reported, not raised.
physical_devices = tf.config.list_physical_devices("GPU")
if not physical_devices:
    print("No GPU detected, running on CPU.")

for device in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(device, True)
        tf.config.set_logical_device_configuration(
            device,
            [tf.config.LogicalDeviceConfiguration(memory_limit=13000)],
        )
        print("Configured GPU with a memory limit of 13 000 MB.")
    except Exception as e:
        # Best-effort setup: keep going with whatever configuration succeeded.
        print(f"Error configuring GPU: {e}")

# Switch Keras to the mixed_float16 policy; fall back to the default policy if
# the request is rejected with a ValueError.
try:
    tf.keras.mixed_precision.set_global_policy("mixed_float16")
except ValueError:
    print("Mixed precision not supported, running with default precision.")
else:
    print("Mixed precision enabled for speedup.")

def pad_sequence(sequence, target_length, pad_value):
    """Truncate or pad ``sequence`` along its first axis to ``target_length``.

    Args:
        sequence: Array-like with at least one dimension. The first axis is the
            one that gets truncated or padded; trailing axes are left as is.
        target_length: Required size of the first axis in the result.
        pad_value: Value used to fill the appended entries.

    Returns:
        An array whose first axis has exactly ``target_length`` entries. Longer
        inputs are cut to their first ``target_length`` entries; shorter inputs
        get ``pad_value`` entries appended at the end.

    Notes:
        For floating-point (and complex) inputs the padding is created in the
        input's own dtype, so a float32 sequence stays float32 instead of being
        promoted to float64 by the padding block. Other dtypes keep NumPy's
        normal promotion so a fractional ``pad_value`` is never truncated.
    """
    sequence = np.asarray(sequence)
    current_length = sequence.shape[0]
    if current_length >= target_length:
        return sequence[:target_length]

    # Build the padding with the same trailing shape as the input so inputs of
    # any rank (not only 2-D) can be padded along axis 0.
    pad_shape = (target_length - current_length,) + sequence.shape[1:]
    pad_dtype = sequence.dtype if np.issubdtype(sequence.dtype, np.inexact) else None
    padding = np.full(pad_shape, pad_value, dtype=pad_dtype)
    return np.concatenate([sequence, padding], axis=0)

def process_features_in_parallel(features, batch_size):
    """Pad every sequence in ``features`` using a thread pool.

    The input is cut into consecutive slices of at most ``batch_size`` items.
    Each slice is submitted to a ``ThreadPoolExecutor`` task that applies
    ``pad_sequence`` with the module-level ``TARGET_LENGTH`` and ``PAD_VALUE``.
    Results are collected in submission order, so the output order matches the
    input order.

    Args:
        features: Sliceable collection of sequences supporting ``len``.
        batch_size: Maximum number of sequences handled by one task.

    Returns:
        ``np.array`` built from the list of padded sequences.
    """
    print("Processing features with parallelism...")
    total = len(features)

    def _pad_slice(start, end):
        # Pad each sequence of features[start:end] independently.
        return [
            pad_sequence(item, TARGET_LENGTH, PAD_VALUE)
            for item in features[start:end]
        ]

    padded = []
    with ThreadPoolExecutor() as pool:
        pending = []
        for begin in range(0, total, batch_size):
            stop = min(begin + batch_size, total)
            pending.append(pool.submit(_pad_slice, begin, stop))

        # Iterate the futures in submission order to keep the original ordering.
        for task in tqdm(pending, desc="Feature Processing", unit="batch"):
            padded.extend(task.result())

    return np.array(padded)

def process_h5_file(input_file, output_file, batch_size):
    """Pad every top-level entry of ``input_file`` and store it in ``output_file``.

    Each key of the input HDF5 file is read fully into memory, passed through
    ``process_features_in_parallel`` and written to the output file under the
    same key.

    Args:
        input_file: Path of the HDF5 file to read (opened read-only).
        output_file: Path of the HDF5 file to write. It is opened in ``"w"``
            mode, so an existing file at that path is replaced.
        batch_size: Number of sequences per parallel task, forwarded to
            ``process_features_in_parallel``.

    Returns:
        None. The result is the file written at ``output_file``.
    """
    with h5py.File(input_file, "r") as input_h5, h5py.File(output_file, "w") as output_h5:
        keys = list(input_h5.keys())
        print(f"Found {len(keys)} features in the input file.")

        for key in tqdm(keys, desc="Processing Keys", unit="key"):
            # [:] materializes the whole dataset as an in-memory array.
            features = input_h5[key][:]
            padded_features = process_features_in_parallel(features, batch_size)
            output_h5.create_dataset(key, data=padded_features)

# Pad every dataset of the input file and write the result to the output file.
process_h5_file(input_file_path, output_file_path, BATCH_SIZE)

Configured GPU with a memory limit of 13 000 MB.
Mixed precision enabled for speedup.
Found 2 features in the input file.


Processing Keys:   0%|          | 0/2 [00:00<?, ?key/s]

Processing features with parallelism...



Feature Processing:   0%|          | 0/16 [00:00<?, ?batch/s][A
Feature Processing: 100%|██████████| 16/16 [00:00<00:00, 130.28batch/s]
Processing Keys:  50%|█████     | 1/2 [00:01<00:01,  1.95s/key]

Processing features with parallelism...



Feature Processing: 100%|██████████| 16/16 [00:00<00:00, 124506.24batch/s]
Processing Keys: 100%|██████████| 2/2 [00:03<00:00,  1.95s/key]


In [None]:
import h5py
import numpy as np
from tqdm import tqdm

# HDF5 file read by standardize_features (the padded features file), and the
# HDF5 file it (re)creates with the standardized copy of every dataset.
input_file_path = "drive/MyDrive/final_dataset/features/fused/fused_features_padded_2.h5"
output_file_path = "drive/MyDrive/final_dataset/features/fused/fused_features_standardized_2.h5"

def standardize_features(input_file, output_file):
    """Standardize every top-level entry of ``input_file`` into ``output_file``.

    For each key the data is loaded fully into memory, the mean and standard
    deviation are computed along axis 0, and ``(features - mean) / std`` is
    written to the output file under the same key. Positions whose standard
    deviation is exactly zero are divided by 1 instead, so constant positions
    become 0 rather than NaN/inf.

    Args:
        input_file: Path of the HDF5 file to read (opened read-only).
        output_file: Path of the HDF5 file to write. It is opened in ``"w"``
            mode, so an existing file at that path is replaced.

    Returns:
        None. The result is the file written at ``output_file``.

    NOTE(review): the statistics use every entry along axis 0 as stored. If the
    input contains padding rows they contribute to the mean/std; confirm that
    this is intended.
    """
    with h5py.File(input_file, "r") as input_h5, h5py.File(output_file, "w") as output_h5:
        keys = list(input_h5.keys())
        print(f"Found {len(keys)} features in the input file.")

        for key in tqdm(keys, desc="Standardizing Keys", unit="key"):
            features = input_h5[key][:]

            mean = np.mean(features, axis=0)
            std = np.std(features, axis=0)

            # Replace zero deviations without item assignment: for a 1-D
            # dataset np.std returns a scalar, which cannot be mutated in place.
            safe_std = np.where(std == 0, np.ones_like(std), std)
            standardized_features = (features - mean) / safe_std

            output_h5.create_dataset(key, data=standardized_features)

# Standardize the padded features and write them to the standardized file.
standardize_features(input_file=input_file_path, output_file=output_file_path)

Found 2 features in the input file.


Standardizing Keys: 100%|██████████| 2/2 [00:02<00:00,  1.33s/key]
