In [1]:
import os
import re
from collections import defaultdict
import random

# Function to extract metadata from filename
def extract_info_from_filename(filename):
    pattern = r"subject_(\d+)_session_(\d+)_task_(\w+)_date_(\d{4}-\d{2}-\d{2})_hardware_(\w+)_segments.h5"
    match = re.match(pattern, filename)
    if match:
        subject = int(match.group(1))
        session = int(match.group(2))
        task = match.group(3)
        date = match.group(4)
        hardware = match.group(5)
        return subject, session, task, date, hardware
    else:
        print("NOT Found: Pattern")
    return None, None, None, None, None

# List files and extract metadata
def list_files_and_extract_metadata(directory):
    filenames = os.listdir(directory)
    data_info = [extract_info_from_filename(f) for f in filenames]
    return filenames, data_info

# Group files by subject and session
def group_files_by_subject(data_info):
    subjects_sessions = defaultdict(list)
    for subject, session, task, date, hardware in data_info:
        if subject is not None:
            subjects_sessions[subject].append((session, task, date, hardware))
    return subjects_sessions

# Main workflow
directory = './processed_data/'

# Step 1: List files and extract metadata
filenames, data_info = list_files_and_extract_metadata(directory)

# Step 2: Group the files by subject and session
subjects_sessions = group_files_by_subject(data_info)

# Step 3: Split into multi-session and single-session subjects
multi_session_subjects = {s: sessions for s, sessions in subjects_sessions.items() if len(sessions) > 1}
single_session_subjects = {s: sessions for s, sessions in subjects_sessions.items() if len(sessions) == 1}

# Step 4: Select 50 test subjects and 10 validation subjects from multi-session subjects
random.seed(42)
test_subjects = random.sample(list(multi_session_subjects.keys()), 50)
remaining_subjects = [s for s in multi_session_subjects if s not in test_subjects]
validation_subjects = random.sample(remaining_subjects, 10)
train_subjects = [s for s in remaining_subjects if s not in validation_subjects]

# Add single-session subjects to the training set
train_subjects.extend(single_session_subjects.keys())

# Function to group files by subject lists
def group_files_by_split(subject_list, data_info, filenames):
    files = []
    for subject in subject_list:
        files.extend([filenames[i] for i, (s, _, _, _, _) in enumerate(data_info) if s == subject])
    return files

# Step 5: Group the files for each split (train, validation, and test)
train_files = group_files_by_split(train_subjects, data_info, filenames)
valid_files = group_files_by_split(validation_subjects, data_info, filenames)
test_files = group_files_by_split(test_subjects, data_info, filenames)

print(f"Train files: {len(train_files)}")
print(f"Validation files: {len(valid_files)}")
print(f"Test files: {len(test_files)}")


NOT Found: Pattern
Train files: 4906
Validation files: 272
Test files: 858


In [2]:
import re
from datetime import datetime

# Function to extract metadata from filename
def extract_info_from_filename(filename):
    pattern = r"subject_(\d+)_session_(\d+)_task_(\w+)_date_(\d{4}-\d{2}-\d{2})_hardware_(\w+)_segments.h5"
    match = re.match(pattern, filename)
    if match:
        subject = int(match.group(1))
        session = int(match.group(2))
        task = match.group(3)
        date = match.group(4)
        hardware = match.group(5)
        return subject, session, task, date, hardware
    else:
        print(f"NOT Found: Pattern for {filename}")
    return None, None, None, None, None

# Function to filter and sort filenames
def filter_and_sort_filenames(filenames, specific_subject):
    # Extract metadata and filter by subject
    filtered_files = [
        (filename, extract_info_from_filename(filename))
        for filename in filenames
    ]
    filtered_files = [
        (filename, info) for filename, info in filtered_files
        if info[0] == specific_subject
    ]

    # Sort filtered files by date
    sorted_files = sorted(
        filtered_files,
        key=lambda x: datetime.strptime(x[1][3], '%Y-%m-%d')  # Sort by date
    )

    # Print sorted filenames
    for filename, info in sorted_files:
        print(filename)


# Filter and sort for subject 1
specific_subject = 1
filter_and_sort_filenames(train_files, 70)


subject_70_session_1_task_ltpFR_date_2010-02-15_hardware_Geodisi_segments.h5
subject_70_session_2_task_ltpFR_date_2010-02-16_hardware_HydroCe_segments.h5
subject_70_session_3_task_ltpFR_date_2010-02-22_hardware_Geodisi_segments.h5
subject_70_session_4_task_ltpFR_date_2010-02-23_hardware_HydroCe_segments.h5
subject_70_session_5_task_ltpFR_date_2010-02-26_hardware_HydroCe_segments.h5
subject_70_session_6_task_ltpFR_date_2010-03-15_hardware_HydroCe_segments.h5
subject_70_session_7_task_ltpFR_date_2010-03-16_hardware_HydroCe_segments.h5
subject_70_session_8_task_ltpFR_date_2010-03-19_hardware_HydroCe_segments.h5
subject_70_session_9_task_ltpFR_date_2010-03-22_hardware_Geodisi_segments.h5
subject_70_session_10_task_ltpFR_date_2010-03-23_hardware_HydroCe_segments.h5
subject_70_session_11_task_ltpFR_date_2010-03-29_hardware_Geodisi_segments.h5
subject_70_session_12_task_ltpFR_date_2010-03-30_hardware_HydroCe_segments.h5
subject_70_session_13_task_ltpFR_date_2010-04-02_hardware_Geodisi_segment

In [2]:
import os
import re
from collections import defaultdict
import random

# Function to extract metadata from filename
def extract_info_from_filename(filename):
    pattern = r"subject_(\d+)_session_(\d+)_task_(\w+)_date_(\d{4}-\d{2}-\d{2})_hardware_(\w+)_segments.h5"
    match = re.match(pattern, filename)
    if match:
        subject = int(match.group(1))
        session = int(match.group(2))
        task = match.group(3)
        date = match.group(4)
        hardware = match.group(5)
        return subject, session, task, date, hardware
    else:
        print("NOT Found: Pattern")
    return None, None, None, None, None

# List files and extract metadata
def list_files_and_extract_metadata(directory):
    filenames = os.listdir(directory)
    data_info = [extract_info_from_filename(f) for f in filenames]
    return filenames, data_info

# Group files by subject and session
def group_files_by_subject(data_info):
    subjects_sessions = defaultdict(list)
    for subject, session, task, date, hardware in data_info:
        if subject is not None:
            subjects_sessions[subject].append((session, task, date, hardware))
    return subjects_sessions

# Main workflow
directory = './processed_data/'

# Step 1: List files and extract metadata
filenames, data_info = list_files_and_extract_metadata(directory)

# Step 2: Group the files by subject and session
subjects_sessions = group_files_by_subject(data_info)

# Step 3: Split into multi-session and single-session subjects
multi_session_subjects = {s: sessions for s, sessions in subjects_sessions.items() if len(sessions) > 1}
single_session_subjects = {s: sessions for s, sessions in subjects_sessions.items() if len(sessions) == 1}

# Step 4: Select 50 test subjects and 10 validation subjects from multi-session subjects
random.seed(42)
test_subjects = random.sample(list(multi_session_subjects.keys()), 100)
remaining_subjects = [s for s in multi_session_subjects if s not in test_subjects]
validation_subjects = random.sample(remaining_subjects, 15)
train_subjects = [s for s in remaining_subjects if s not in validation_subjects]

# Add single-session subjects to the training set
#train_subjects.extend(single_session_subjects.keys())
print("single session number", len(single_session_subjects.keys()))

# Function to group files by subject lists
def group_files_by_split(subject_list, data_info, filenames):
    files = []
    for subject in subject_list:
        files.extend([filenames[i] for i, (s, _, _, _, _) in enumerate(data_info) if s == subject])
    return files

# Step 5: Group the files for each split (train, validation, and test)
train_files = group_files_by_split(train_subjects, data_info, filenames)
valid_files = group_files_by_split(validation_subjects, data_info, filenames)
Negative_files = group_files_by_split(test_subjects[:50], data_info, filenames)
test_files = group_files_by_split(test_subjects[50:], data_info, filenames)

print(f"Train files: {len(train_files)}")
print(f"Validation files: {len(valid_files)}")
print(f"Negative files: {len(Negative_files)}")
print(f"Test files: {len(test_files)}")


NOT Found: Pattern
single session number 7
Train files: 4009
Validation files: 315
Negative files: 846
Test files: 859


In [3]:
extract_info_from_filename(train_files[0])

(99, 7, 'ltpFR', '2010-10-27', 'HydroCe')

In [4]:
import h5py
import numpy as np
import os

def create_label(subject, session, task, date, hardware):
    # You can use different strategies here to create labels.
    return subject  # This is currently returning only the subject ID

def concatenate_h5_files(file_list, directory, output_file):
    """
    Concatenates multiple h5 files from a directory into a single h5 file.
    Additionally saves session, task, date, and hardware information.
    
    file_list: List of input h5 filenames (without full path).
    directory: Directory where the h5 files are located.
    output_file: Path to the output h5 file.
    """
    all_labels = []
    all_sessions = []
    all_tasks = []
    all_dates = []
    all_hardwares = []

    # Initialize output file and datasets
    with h5py.File(output_file, 'w') as f_out:
        data_dset = None  # To hold the dataset for the concatenated data

        for file in file_list:
            # Create the full path for the file
            full_path = os.path.join(directory, file)

            # Check if the file exists
            if not os.path.exists(full_path):
                print(f"Warning: File {full_path} does not exist. Skipping.")
                continue

            with h5py.File(full_path, 'r') as f:
                # Assuming 'data_segments' or 'data' as keys for EEG data
                if 'data_segments' in f:
                    data = f['data_segments'][:]
                elif 'data' in f:
                    data = f['data'][:]
                else:
                    print(f"Warning: No valid data found in {file}. Skipping.")
                    continue

                # Check if the data has the expected number of dimensions
                if data.ndim != 3:
                    print(f"Skipping file {file} due to unexpected data shape: {data.shape}")
                    continue

                # Initialize the dataset in the output file with the correct shape only once
                if data_dset is None:
                    all_data_shape = (0, data.shape[1], data.shape[2])  # Use the shape from the first file
                    data_dset = f_out.create_dataset(
                        'data', shape=all_data_shape, maxshape=(None, data.shape[1], data.shape[2]), chunks=True
                    )
                
                # Resize the dataset to accommodate new data
                current_size = data_dset.shape[0]
                new_size = current_size + data.shape[0]
                data_dset.resize(new_size, axis=0)
                data_dset[current_size:new_size] = data  # Append the new data

                # Extract metadata from the filename and generate labels
                subject, session, task, date, hardware = extract_info_from_filename(file)
                label = create_label(subject, session, task, date, hardware)

                # Extend metadata lists
                all_labels.extend([label] * len(data))  # Extend label list
                all_sessions.extend([session] * len(data))  # Save session for each data point
                all_tasks.extend([task] * len(data))  # Save task for each data point
                all_dates.extend([date] * len(data))  # Save date for each data point
                all_hardwares.extend([hardware] * len(data))  # Save hardware for each data point

        # Now write all the metadata once
        if len(all_labels) > 0:
            f_out.create_dataset('labels', data=np.array(all_labels))
            f_out.create_dataset('sessions', data=np.array(all_sessions))
            f_out.create_dataset('tasks', data=np.array(all_tasks, dtype="S"))  # Save task as string
            f_out.create_dataset('dates', data=np.array(all_dates, dtype="S"))  # Save date as string
            f_out.create_dataset('hardwares', data=np.array(all_hardwares, dtype="S"))  # Save hardware as string

        print(f"Successfully created {output_file} with shape: {data_dset.shape}, labels: {len(all_labels)}")


In [5]:
# Directory where the files are stored
directory = './processed_data/'

concatenate_h5_files(valid_files, directory, '../Data/validation.h5')



Successfully created validation.h5 with shape: (31464, 93, 500), labels: 31464


In [6]:
concatenate_h5_files(Negative_files, directory, '../Data/Negative.h5')

Skipping file subject_103_session_19_task_ltpFR_date_2010-12-10_hardware_Geodisi_segments.h5 due to unexpected data shape: (0,)
Successfully created Negative.h5 with shape: (84408, 93, 500), labels: 84408


In [7]:
concatenate_h5_files(train_files, directory, '../Data/train.h5')


Skipping file subject_107_session_17_task_ltpFR_date_2010-12-17_hardware_Geodisi_segments.h5 due to unexpected data shape: (0,)
Skipping file subject_107_session_18_task_ltpFR_date_2010-12-20_hardware_Geodisi_segments.h5 due to unexpected data shape: (0,)
Skipping file subject_107_session_19_task_ltpFR_date_2010-12-21_hardware_HydroCe_segments.h5 due to unexpected data shape: (0,)
Skipping file subject_98_session_17_task_ltpFR_date_2010-12-10_hardware_Geodisi_segments.h5 due to unexpected data shape: (0,)
Skipping file subject_164_session_2_task_ltpFR_date_2011-10-07_hardware_HydroCe_segments.h5 due to unexpected data shape: (0,)
Successfully created train.h5 with shape: (400236, 93, 500), labels: 400236


In [8]:
concatenate_h5_files(test_files, directory, '../Data/test.h5')


Skipping file subject_322_session_16_task_ltpFR2_date_2016-04-06_hardware_HydroCe_segments.h5 due to unexpected data shape: (0,)
Skipping file subject_96_session_15_task_ltpFR_date_2010-11-08_hardware_Geodisi_segments.h5 due to unexpected data shape: (0,)
Successfully created test.h5 with shape: (85643, 93, 500), labels: 85643


In [None]:
import h5py

def inspect_h5_file(h5_file_path):
    """
    Inspects the structure of an HDF5 file and prints all datasets.
    
    Args:
    - h5_file_path: Path to the HDF5 file to inspect.
    """
    with h5py.File(h5_file_path, 'r') as f:
        print("Datasets in the file:")
        f.visititems(lambda name, obj: print(f"{name}: {obj}"))

# Inspect the file structure
h5_file_path = 'validation.h5'
inspect_h5_file(h5_file_path)


In [1]:
import h5py
import torch
import numpy as np
from torch.utils.data import Dataset

class EEGDataset(Dataset):
    def __init__(self, h5_file_path):
        """
        Initialize the dataset with data from the specified HDF5 file.
        Args:
        - h5_file_path: Path to the HDF5 file containing the data and labels.
        """
        # Open the HDF5 file
        self.h5_file = h5py.File(h5_file_path, 'r')

        # Access the data and labels from the HDF5 file
        self.x_data = self.h5_file['data']  # EEG data (use memory-mapped access)
        self.y_data = self.h5_file['labels'][:]  # Labels (load fully into memory)
        self.s_data = self.h5_file['sessions'][:]  # Sessions (load fully into memory)

        # Convert labels to torch tensors
        self.targets = torch.tensor(self.y_data, dtype=torch.long)

        # Print the number of unique subjects (labels)
        unique_subjects = len(np.unique(self.y_data))
        print(f'Number of unique subjects: {unique_subjects}')

    def __len__(self):
        """
        Return the number of data points.
        """
        return len(self.x_data)

    def __getitem__(self, idx):
        """
        Retrieve a single data point and its corresponding label and session.
        Args:
        - idx: Index of the data point to retrieve.
        Returns:
        - x_tensor: The EEG data as a torch tensor.
        - y_tensor: The label as a torch tensor.
        - session: The session identifier.
        """
        # Efficiently fetch a single data point (use memory-mapped access for large datasets)
        x_tensor = torch.tensor(self.x_data[idx], dtype=torch.float32)  # Convert to torch tensor
        y_tensor = self.targets[idx]  # Fetch the preloaded label tensor
        session = self.s_data[idx]  # Session info

        return x_tensor, y_tensor, session

    def close(self):
        """
        Close the HDF5 file when done.
        """
        self.h5_file.close()

# Usage example
h5_file_path = 'train.h5'
train_dataset = EEGDataset(h5_file_path)


Number of unique subjects: 292


In [1]:
print("matin")

matin


In [2]:
import h5py
import torch
import numpy as np
from torch.utils.data import Dataset

class EEGDataset(Dataset):
    def __init__(self, h5_file_path):
        """
        Initialize the dataset with data from the specified HDF5 file.
        Args:
        - h5_file_path: Path to the HDF5 file containing the data and labels.
        """
        # Open the HDF5 file
        self.h5_file = h5py.File(h5_file_path, 'r')

        # Access the data and labels from the HDF5 file
        self.x_data = self.h5_file['data']  # EEG data (use memory-mapped access)
        self.y_data = self.h5_file['labels'][:]  # Labels (load fully into memory)
        self.s_data = self.h5_file['sessions'][:]  # Sessions (load fully into memory)

        # Convert labels to torch tensors
        self.targets = torch.tensor(self.y_data, dtype=torch.long)

        # Print the number of samples and unique subjects (labels)
        num_samples = self.x_data.shape[0]  # Number of samples is the first dimension of the data
        unique_subjects = len(np.unique(self.y_data))  # Number of unique subjects
        print(f'Number of samples: {num_samples}')
        print(f'Number of unique subjects: {unique_subjects}')

    def __len__(self):
        """
        Return the number of data points.
        """
        return self.x_data.shape[0]  # The number of samples is the first dimension of x_data

    def __getitem__(self, idx):
        """
        Retrieve a single data point and its corresponding label and session.
        Args:
        - idx: Index of the data point to retrieve.
        Returns:
        - x_tensor: The EEG data as a torch tensor.
        - y_tensor: The label as a torch tensor.
        - session: The session identifier.
        """
        # Efficiently fetch a single data point (use memory-mapped access for large datasets)
        x_tensor = torch.tensor(self.x_data[idx], dtype=torch.float32)  # Convert to torch tensor
        y_tensor = self.targets[idx]  # Fetch the preloaded label tensor
        session = self.s_data[idx]  # Session info

        return x_tensor, y_tensor, session

    def close(self):
        """
        Close the HDF5 file when done.
        """
        self.h5_file.close()

# Usage example
h5_file_path = 'train.h5'
train_dataset = EEGDataset(h5_file_path)


Number of samples: 489943
Number of unique subjects: 292
