In [1]:
import h5py
import numpy as np
from collections import Counter

# Read the four parallel arrays of the raw training file into memory
with h5py.File("./data/train_raw.h5", "r") as f:
    X_test, Y_test, S_test, H_test = [
        f[name][:] for name in ('data', 'labels', 'sessions', 'hardwares')
    ]

# For every label, keep only the rows recorded with that label's most
# frequent hardware; the kept row indices are accumulated label by label.
indices_to_keep = []
unique_labels = np.unique(Y_test)

for label in unique_labels:
    # Row positions belonging to this label and their hardware entries
    label_indices = np.where(Y_test == label)[0]
    hardware_for_label = H_test[label_indices]

    # Tally the hardware entries; max() returns the first key with the
    # highest count, so ties go to the hardware encountered first
    hardware_counts = Counter(hardware_for_label)
    majority_hardware = max(hardware_counts, key=hardware_counts.get)

    is_majority = hardware_for_label == majority_hardware
    indices_to_keep.extend(label_indices[is_majority])

indices_to_keep = np.array(indices_to_keep)

# Apply the same row selection to all four arrays
X_filtered, Y_filtered, S_filtered, H_filtered = [
    values[indices_to_keep] for values in (X_test, Y_test, S_test, H_test)
]

# Write the selection to a new HDF5 file using the input's dataset names
filtered_by_name = {
    'data': X_filtered,
    'labels': Y_filtered,
    'sessions': S_filtered,
    'hardwares': H_filtered,
}
with h5py.File("./data/train_raw_unconnected.h5", "w") as f_out:
    for dataset_name, values in filtered_by_name.items():
        f_out.create_dataset(dataset_name, data=values)

print("Filtered dataset saved as train_raw_unconnected.h5.")


Filtered dataset saved as train_raw_unconnected.h5.


In [2]:
# Display the current `hardware_counts` value. NOTE(review): if this runs right
# after the majority-hardware filtering loop, it holds the tally for the last
# label processed only, not for the whole dataset — confirm that is intended.
hardware_counts

Counter({b'BioSemi': 400})

In [3]:
# Calculate unique triplets (label, session, hardware) and count them by hardware
def count_unique_triplets(data):
    """Find the distinct (label, session, hardware) triplets and tally them per hardware.

    Args:
        data: Mapping that provides equally long arrays under the keys
            'Y' (labels), 'S' (sessions) and 'H' (hardwares).

    Returns:
        A tuple ``(unique_triplets, hardware_counts)`` where
        ``unique_triplets`` is the record array of distinct triplets with
        fields ``label``, ``session`` and ``hardware`` as returned by
        ``np.unique``, and ``hardware_counts`` is a dict mapping each
        hardware value to the number of distinct triplets that contain it.
    """
    # np.rec is the public entry point for fromarrays; the np.core.records
    # path is a private namespace that is deprecated since NumPy 2.0.
    triplets = np.rec.fromarrays([data['Y'], data['S'], data['H']], names='label, session, hardware')
    unique_triplets = np.unique(triplets)

    # Keys are inserted in the order the hardware first appears among the
    # unique triplets, which is also the order callers iterate over.
    hardware_counts = {}
    for hardware in unique_triplets['hardware']:
        hardware_counts[hardware] = hardware_counts.get(hardware, 0) + 1
    return unique_triplets, hardware_counts

# NOTE(review): `data` is not assigned in the cells shown before this one;
# presumably it is a dict with 'Y', 'S' and 'H' arrays left in the kernel by
# another cell — confirm, otherwise this fails on a fresh kernel.
unique_triplets, hardware_counts = count_unique_triplets(data)

# Report the overall triplet count followed by the per-hardware breakdown
print(f"Total number of unique (label, session, hardware) triplets in the dataset: {len(unique_triplets)}")
print("Number of unique triplets per hardware:")
for hardware in hardware_counts:
    count = hardware_counts[hardware]
    print(f"Hardware {hardware.decode('utf-8') if isinstance(hardware, bytes) else hardware}: {count} times")


# Function to calculate all possible hardware pairs from unique triplets
def calculate_hardware_pairs(unique_triplets):
    """Count hardware pairs formed within each subject's unique triplets.

    For every distinct ``label`` value, each unordered pair of that
    subject's triplets contributes one count to the sorted pair of their
    ``hardware`` values, so two triplets on the same hardware (in different
    sessions) produce a same-hardware pair.

    Args:
        unique_triplets: Structured array with at least the fields
            ``label`` and ``hardware``.

    Returns:
        Dict mapping ``(hardware_a, hardware_b)`` tuples (sorted) to the
        number of times that pair occurs across all subjects.
    """
    pair_counts = {}

    for subject in np.unique(unique_triplets['label']):
        belongs_to_subject = unique_triplets['label'] == subject
        subject_hardware = unique_triplets[belongs_to_subject]['hardware']

        # Pair every entry with each entry that comes after it
        for position, first in enumerate(subject_hardware):
            for second in subject_hardware[position + 1:]:
                key = tuple(sorted((first, second)))
                pair_counts[key] = pair_counts.get(key, 0) + 1

    return pair_counts

# Tally the hardware pairs over the unique triplets and list each pair with its count
hardware_pairs = calculate_hardware_pairs(unique_triplets)

print("All possible hardware pairs used by subjects including same hardware in different sessions:")
for pair in hardware_pairs:
    count = hardware_pairs[pair]
    print(f"Pair {pair}: {count} times")

Total number of unique (label, session, hardware) triplets in the dataset: 3091
Number of unique triplets per hardware:
Hardware HydroCe: 1751 times
Hardware Geodisi: 435 times
Hardware BioSemi: 905 times
All possible hardware pairs used by subjects including same hardware in different sessions:
Pair (b'HydroCe', b'HydroCe'): 14294 times
Pair (b'Geodisi', b'Geodisi'): 2830 times
Pair (b'BioSemi', b'BioSemi'): 9224 times


In [1]:
import h5py
import numpy as np

# Load data function
def load_data(file_path):
    """Read the four datasets of an HDF5 file fully into memory.

    Args:
        file_path: Path of the HDF5 file; it is expected to contain the
            datasets 'data', 'labels', 'sessions' and 'hardwares'.

    Returns:
        Dict with the loaded arrays under the short keys 'X' (data),
        'Y' (labels), 'S' (sessions) and 'H' (hardwares).
    """
    # Short key used in memory -> dataset name inside the file
    key_to_dataset = {
        'X': 'data',
        'Y': 'labels',
        'S': 'sessions',
        'H': 'hardwares',
    }
    with h5py.File(file_path, "r") as file:
        # [:] copies each dataset out before the file is closed
        data = {key: file[dataset][:] for key, dataset in key_to_dataset.items()}
    return data

# Load the filtered training set (a single file; nothing is combined here)
data_test = load_data("./data/train_raw_unconnected.h5")

# Shuffle every array with one shared permutation so rows stay aligned.
# A fixed seed makes the shuffle, and therefore the per-hardware files
# written from it, reproducible across kernel restarts.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)
indices = shuffle_rng.permutation(len(data_test['X']))
data_test = {key: values[indices] for key, values in data_test.items()}

# Split data based on hardware
def split_and_save_data(data):
    """Write one HDF5 file per distinct hardware value found in ``data['H']``.

    Args:
        data: Dict with aligned arrays under the keys 'X', 'Y', 'S' and 'H'.

    Each output file holds the rows whose hardware matches, stored under the
    dataset names 'data', 'labels', 'sessions' and 'hardwares'. The file path
    is printed after each file has been written. Nothing is returned.
    """
    # Dataset name in the output file -> key in the in-memory dict
    dataset_to_key = {'data': 'X', 'labels': 'Y', 'sessions': 'S', 'hardwares': 'H'}

    for hardware in np.unique(data['H']):
        # Boolean row selector for the current hardware
        rows = data['H'] == hardware
        per_hardware = {
            dataset: data[key][rows] for dataset, key in dataset_to_key.items()
        }

        # Byte-string hardware values are decoded so the file name is plain text
        file_name = f"./data/train_unconnected_hardware_{hardware.decode('utf-8') if isinstance(hardware, bytes) else hardware}.h5"
        with h5py.File(file_name, 'w') as f:
            for dataset, values in per_hardware.items():
                f.create_dataset(dataset, data=values)
        print(f"Data for hardware {hardware.decode('utf-8') if isinstance(hardware, bytes) else hardware} saved to {file_name}")

split_and_save_data(data_test)


Data for hardware BioSemi saved to ./data/train_unconnected_hardware_BioSemi.h5
Data for hardware Geodisi saved to ./data/train_unconnected_hardware_Geodisi.h5
Data for hardware HydroCe saved to ./data/train_unconnected_hardware_HydroCe.h5


In [6]:
import h5py
import numpy as np

# Collect the distinct label values of the training split
with h5py.File("./data/train_raw.h5", "r") as train_file:
    X = train_file['labels'][:]
train = np.unique(X)

In [None]:
import h5py
import numpy as np

# Collect the distinct label values of the test split.
# Reads 'labels' rather than 'data': `test` is later merged with `train`
# (built from 'labels') into a subject list, so it must hold label values,
# not the unique values of the recordings themselves.
# NOTE(review): assumes test_raw.h5 has the same 'labels' dataset as
# train_raw.h5 — confirm.
with h5py.File("./data/test_raw.h5", "r") as f:
    X = f['labels'][:]

test = np.unique(X)

In [None]:
import h5py
import numpy as np

# Collect the distinct label values of the validation split.
# Reads 'labels' rather than 'data': `valid` is later merged with `train`
# (built from 'labels') into a subject list, so it must hold label values,
# not the unique values of the recordings themselves.
# NOTE(review): assumes valid_raw.h5 has the same 'labels' dataset as
# train_raw.h5 — confirm.
with h5py.File("./data/valid_raw.h5", "r") as f:
    X = f['labels'][:]

valid = np.unique(X)

In [None]:
import h5py
import numpy as np

# Collect the distinct label values of the negative split.
# Reads 'labels' rather than 'data': `neg` is later merged with `train`
# (built from 'labels') into a subject list, so it must hold label values,
# not the unique values of the recordings themselves.
# NOTE(review): assumes neg_raw.h5 has the same 'labels' dataset as
# train_raw.h5 — confirm.
with h5py.File("./data/neg_raw.h5", "r") as f:
    X = f['labels'][:]

neg = np.unique(X)

In [None]:
# Pool the per-split unique values, then deduplicate across splits
pooled_subjects = np.concatenate([train, test, valid, neg])
all_unique_subjects = np.unique(pooled_subjects)
