In [None]:
# Mount Google Drive into the Colab runtime; the project paths used by later
# cells all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install poetry

Collecting poetry
  Downloading poetry-2.1.3-py3-none-any.whl.metadata (7.1 kB)
Collecting cleo<3.0.0,>=2.1.0 (from poetry)
  Downloading cleo-2.1.0-py3-none-any.whl.metadata (12 kB)
Collecting dulwich<0.23.0,>=0.22.6 (from poetry)
  Downloading dulwich-0.22.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting findpython<0.7.0,>=0.6.2 (from poetry)
  Downloading findpython-0.6.3-py3-none-any.whl.metadata (5.3 kB)
Collecting installer<0.8.0,>=0.7.0 (from poetry)
  Downloading installer-0.7.0-py3-none-any.whl.metadata (936 bytes)
Collecting pbs-installer<2026.0.0,>=2025.1.6 (from pbs-installer[download,install]<2026.0.0,>=2025.1.6->poetry)
  Downloading pbs_installer-2025.4.9-py3-none-any.whl.metadata (990 bytes)
Collecting pkginfo<2.0,>=1.12 (from poetry)
  Downloading pkginfo-1.12.1.2-py3-none-any.whl.metadata (13 kB)
Collecting poetry-core==2.1.3 (from poetry)
  Downloading poetry_core-2.1.3-py3-none-any.whl.metadata (3.5 kB)
Collecting tomlkit<1.0

In [None]:
import os
# Work from the project's `preparation` folder on the mounted Drive.
# NOTE(review): presumably this is what lets the later `sys.path.append("..")`
# find `config` / `utils` in the parent folder -- confirm against the repo layout.
os.chdir('/content/drive/My Drive/cs598_project/preparation')

In [None]:
!pip install wfdb

Collecting wfdb
  Downloading wfdb-4.3.0-py3-none-any.whl.metadata (3.8 kB)
Collecting pandas>=2.2.3 (from wfdb)
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Downloading wfdb-4.3.0-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas, wfdb
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
[31mERROR: pip's dependency resolver does not currently take into accoun

In [None]:
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9


In [None]:
import os
import sys
from collections import Counter
from typing import List, Tuple, Type
from glob import glob

import wfdb
import numpy as np
from tqdm import tqdm
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

sys.path.append("..")
import config
import utils

class G12ECPreparator:
    """Turn the G12EC 12-lead WFDB records into train/val/test arrays.

    `prepare()` loads every `*.hea` record, builds a multi-hot label matrix from
    the diagnosis codes in the header comments, makes a stratified 80/10/10
    split, left-pads and scales the signals, and dumps the results to
    `save_dir`.
    """

    # Root directory that contains the dataset folders (Drive mount on Colab).
    DATA_ROOT = '/content/drive/My Drive/cs598_project/data'

    def __init__(self,
        sampling_frequency: int=500,
        split_number: int=1,
        min_sample_ratio: float=0.01,
    ) -> None:
        """
        Args:
            sampling_frequency (int): Sampling frequency (500).
            split_number (int): Seed value for the train/val/test split; also
                used as suffix of the dumped file names.
            min_sample_ratio (float): A diagnosis code is kept as a target label
                only if it occurs in more than `int(min_sample_ratio * num_samples)`
                records.
        """

        self.sampling_frequency = sampling_frequency
        self.split_number = split_number
        self.min_sample_ratio = min_sample_ratio

        self.load_dir = os.path.join(
            self.DATA_ROOT, config.dirname_g12ec, "WFDB")
        self.save_dir = os.path.join(
            self.DATA_ROOT, config.dirname_g12ec, "processed")
        os.makedirs(self.save_dir, exist_ok=True)

    def _open_heafile(self, hea_file: str) -> "wfdb.io.record.Record":
        """
        Read the WFDB record that belongs to a header file.

        Args:
            hea_file (str): Path to hea file.
        Returns:
            waveform_data: Record returned by `wfdb.rdrecord`.
        """
        # wfdb expects the record name without the `.hea` extension.
        basename, _ = os.path.splitext(hea_file)
        return wfdb.rdrecord(basename)

    def _load_data(self) -> Tuple[np.ndarray, List, Tuple[List, List]]:
        """
        Load every record found in `self.load_dir`.

        Args:
            None
        Returns:
            signals (np.ndarray): Object array holding one signal per record
                            (each element has shape [sequence_length, 12]).
            dxs (List): Raw diagnosis comment of each record ("Dx: XXXXX,YYYYY").
            demographics (Tuple): Tuple of (list of sex comments, list of age comments).
        """
        hea_files = sorted(glob(os.path.join(self.load_dir, "*.hea")))
        print(f"Found {len(hea_files)} files.")
        signals = []
        dxs, sexs, ages = [], [], []
        for hea_file in tqdm(hea_files):
            data = self._open_heafile(hea_file)
            assert data.n_sig == 12
            assert data.fs == 500
            assert data.sig_name == config.g12ec_lead_order
            # `nan` must be passed by keyword: the second positional
            # argument of np.nan_to_num is `copy`, not the fill value.
            signals.append(np.nan_to_num(data.p_signal, nan=0.0))

            # Header comments are read positionally: [0]=age, [1]=sex, [2]=dx.
            ages.append(data.comments[0])
            sexs.append(data.comments[1])
            dxs.append(data.comments[2])
        return np.array(signals, dtype=object), dxs, (sexs, ages)

    def _align_signal_length(self, signals: np.ndarray) -> np.ndarray:
        """
        Zero-pad every signal at its beginning up to
        `config.g12ec_default_signal_length`.

        Args:
            signals (np.ndarray): Iterable of arrays of shape (sequence_length, num_leads).
        Returns:
            aligned_signals (np.ndarray): Stacked array of the padded signals.
        Raises:
            ValueError: If a signal is longer than the default signal length.
        """
        target_length = config.g12ec_default_signal_length
        aligned_signals = []
        for signal in signals:
            signal_length = signal.shape[0]
            if signal_length > target_length:
                raise ValueError(f"Signal length {signal_length} exceeded default_signal_length.")
            if signal_length < target_length:
                # Padding goes in front, so the recorded samples stay at the end.
                pad = np.zeros([target_length - signal_length, signal.shape[1]])
                signal = np.concatenate([pad, signal], axis=0)
            aligned_signals.append(signal)
        return np.stack(aligned_signals)

    def _preprocess_signal(
        self,
        X_train: np.ndarray,
        X_val: np.ndarray,
        X_test: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Pad all signals to a common length, then scale them.

        Args:
            X_train (np.ndarray): Array of arrays of shape [(sequence_length, 12), (..), .., (..)].
            X_val (np.ndarray): Array of arrays of shape [(sequence_length, 12), (..), .., (..)].
            X_test (np.ndarray): Array of arrays of shape [(sequence_length, 12), (..), .., (..)].
        Returns:
            X_train, X_val, X_test: The padded and scaled splits, as returned
                by `utils.preprocess_signals`.
        """
        # apply padding
        X_train = self._align_signal_length(X_train)
        X_val = self._align_signal_length(X_val)
        X_test = self._align_signal_length(X_test)

        # apply scaling (the seed is forwarded so the helper can name its output per split)
        X_train, X_val, X_test = utils.preprocess_signals(
            X_train, X_val, X_test, self.save_dir, seed=self.split_number)

        return X_train, X_val, X_test

    @staticmethod
    def _parse_dx(label: str) -> List[str]:
        """Split a "Dx: XXXXX,YYYYY" comment into its list of diagnosis codes."""
        return label.replace("Dx: ", "").split(",")

    def _process_label(self, labels: List) -> Tuple[np.ndarray, np.ndarray]:
        """
        Convert raw diagnosis comments into a multi-hot matrix.

        Args:
            labels (List): Diagnosis comments, one per sample ("Dx: XXXXX,YYYYY").
        Returns:
            processed_labels (np.ndarray): Array of shape (num_samples, num_labels).
            target_labels (np.ndarray): List of dx code corresponding to `processed_labels` index.
        """
        parsed = [self._parse_dx(label) for label in labels]
        code_counts = Counter(code for codes in parsed for code in codes)

        # Select labels with more than `self.min_sample_ratio * len(labels)`
        # occurrences; column order is the order of first appearance.
        threshold = int(self.min_sample_ratio * len(labels))
        target_labels = [
            code for code, count in code_counts.items() if count > threshold]
        column_of = {code: column for column, code in enumerate(target_labels)}

        processed_labels = np.zeros([len(labels), len(target_labels)])
        for row, codes in enumerate(parsed):
            for code in codes:
                column = column_of.get(code)
                if column is not None:
                    processed_labels[row, column] = 1
        return processed_labels, np.array(target_labels)

    def _process_demographics(self, demographics: Tuple) -> np.ndarray:
        """
        Parse the sex/age header comments.

        Args:
            demographics (Tuple): Tuple of (list of sex comments, list of age comments).
        Returns:
            processed_demos (np.ndarray): One [age, sex] row per sample; sex is
                1 for male and 0 for female, age is NaN when it is not a
                plain number.
        """
        processed_demos = []
        sexs, ages = demographics
        for sex, age in zip(sexs, ages):
            sex = sex.lower().replace("sex: ", "")
            assert sex in ["male", "female"]
            sex = int(sex == "male")

            age = age.lower().replace("age: ", "")
            age = int(age) if age.isdigit() else np.nan
            processed_demos.append([age, sex])
        return np.array(processed_demos)

    def _split_data(
        self,
        data: np.ndarray,
        labels: np.ndarray,
    ) -> Tuple[Tuple, Tuple, Tuple]:
        """
        Stratified 80/10/10 split seeded with `self.split_number`.

        Args:
            data (np.ndarray): Array of shape (num_samples, ).
            labels (np.ndarray): Array of shape (num_samples, num_classes)
        Returns:
            train_data (Tuple): (X_train, y_train)
            valid_data (Tuple): (X_valid, y_valid)
            test_data (Tuple): (X_test, y_test)
        """
        # First split: 80% train, 20% hold-out.
        msss_1 = MultilabelStratifiedShuffleSplit(
            n_splits=1, test_size=0.2, random_state=self.split_number)
        train_idx, holdout_idx = next(iter(msss_1.split(data, labels)))
        holdout_idx = np.asarray(holdout_idx)

        # Second split: halve the hold-out set into validation and test.
        msss_2 = MultilabelStratifiedShuffleSplit(
            n_splits=1, test_size=0.5, random_state=self.split_number)
        valid_rel, test_rel = next(iter(
            msss_2.split(data[holdout_idx], labels[holdout_idx])))

        # The second split returns positions *within the hold-out subset*;
        # translate them back to positions in the full dataset. Indexing
        # `data` with the relative positions would pick rows that largely
        # belong to the training set.
        valid_idx = holdout_idx[valid_rel]
        test_idx = holdout_idx[test_rel]

        X_train, X_valid, X_test =\
            data[train_idx], data[valid_idx], data[test_idx]
        y_train, y_valid, y_test =\
            labels[train_idx], labels[valid_idx], labels[test_idx]

        return (X_train, y_train), (X_valid, y_valid), (X_test, y_test)

    def _dump_data(self, X: np.ndarray, y: np.ndarray, datatype: str) -> None:
        """
        Dump one split to `save_dir` as `X_<datatype>_seed<split_number>.npy`
        and `y_<datatype>_seed<split_number>.npy`.

        Args:
            X (np.ndarray): Signals of the split.
            y (np.ndarray): Labels of the split.
            datatype (str): Name of the split, used in the file names.
        Returns:
            None
        """
        print(f"Saving {datatype} data ...")
        suffix = f'{datatype}_seed{self.split_number}.npy'
        X.dump(os.path.join(self.save_dir, f'X_{suffix}'), protocol=4)
        y.dump(os.path.join(self.save_dir, f'y_{suffix}'), protocol=4)

    def prepare(self):
        """
        Run the whole pipeline: load, label, split, preprocess and dump.

        Args:
            None
        Returns:
            None
        """
        # Load G12EC data
        signals, dxs, demographics = self._load_data()
        print(f"Loaded {len(signals)} signals.")

        processed_labels, label_index = self._process_label(dxs)
        # The demographics are parsed (which validates the sex field) but are
        # not part of the dumped output.
        processed_demos = self._process_demographics(demographics)

        # Split data into train, valid, test
        (X_train, y_train), (X_val, y_val), (X_test, y_test) =\
            self._split_data(signals, processed_labels)

        X_train, X_val, X_test = self._preprocess_signal(X_train, X_val, X_test)

        self._dump_data(X_train, y_train, "train")
        self._dump_data(X_val, y_val, "val")
        self._dump_data(X_test, y_test, "test")
        label_index.dump(os.path.join(self.save_dir, "label_index.npy"))




In [None]:
if __name__ == "__main__":
    # Build the G12EC splits for split numbers 1..5.
    sampling_frequency = 500
    for split_number in range(1, 6):
        print(f"Working on split_number: {split_number} ...")
        preparator = G12ECPreparator(sampling_frequency, split_number)
        preparator.prepare()

In [None]:
"""
Based on code from `https://github.com/helme/ecg_ptbxl_benchmarking`
`master/code/experiments/scp_experiment.py`
"""
import os
import sys
import pickle

import numpy as np

sys.path.append("..")
import config
import utils

class DataPreparator():
    """Prepare one PTB-XL task: load, aggregate labels, split by fold, scale, dump."""

    folds_type='strat'

    def __init__(
        self,
        task: str,
        min_samples: int,
        sampling_frequency: int,
        split_number: int=1
    ) -> None:
        """
        Args:
            task (str): Task name; must be one of `config.TASKS`.
            min_samples (int): Forwarded to `utils.select_data`.
            sampling_frequency (int): Sampling frequency (100 or 500).
            split_number (int): Key into `config.split_settings`, which supplies
                the fold index used for validation ("val_index") and for
                test ("test_index"). All remaining folds are used for training.
        """
        assert task in config.TASKS

        self.task = task
        self.min_samples = min_samples
        self.sampling_frequency = sampling_frequency

        fold_setting = config.split_settings[split_number]
        self.val_fold_index = fold_setting["val_index"]
        self.test_fold_index = fold_setting["test_index"]

        # The trailing slash matters: `_dump_data` appends file names directly.
        setting = f"{task}/val-{self.val_fold_index}_test-{self.test_fold_index}/"
        dataset_root = (config.data_root, config.dirname_ptbxl)
        self.load_dir = os.path.join(*dataset_root, "raw")
        self.save_dir = os.path.join(*dataset_root, setting)
        os.makedirs(self.save_dir, exist_ok=True)

    def _split_data(self, data: np.ndarray, labels: np.ndarray, y_data: np.ndarray):
        """
        Partition samples by their `strat_fold` value.

        Args:
            data (np.ndarray): Signals, one entry per sample.
            labels: Object exposing a per-sample `strat_fold` column.
            y_data (np.ndarray): Targets, one row per sample.
        Returns:
            ((X_train, y_train), (X_val, y_val), (X_test, y_test))
        """
        fold_of_sample = labels.strat_fold
        in_val = fold_of_sample == self.val_fold_index
        in_test = fold_of_sample == self.test_fold_index
        # Everything that is neither validation nor test is training data.
        in_train = ~(in_val | in_test)

        train_split = (data[in_train], y_data[in_train])
        val_split = (data[in_val], y_data[in_val])
        test_split = (data[in_test], y_data[in_test])
        return train_split, val_split, test_split

    def _dump_data(self, X: np.ndarray, y: np.ndarray, datatype: str) -> None:
        """
        Dump one split as `X_<datatype>.npy` / `y_<datatype>.npy` under `save_dir`.

        Args:
            X (np.ndarray): Signals of the split.
            y (np.ndarray): Targets of the split.
            datatype (str): Split name used in the file names.
        Returns:
            None
        """
        print(f"Saving {datatype} data ...")
        for prefix, array in (("X", X), ("y", y)):
            array.dump(self.save_dir + f'{prefix}_{datatype}.npy', protocol=4)

    def prepare(self):
        """
        Run the full PTB-XL preparation for this task and split.

        Args:
        Returns:
        """
        # Load PTB-XL data.
        # NOTE(review): `load_dataset` is called unqualified, so it must already
        # be defined in the notebook namespace when this runs -- confirm cell order.
        print(self.load_dir)
        data, raw_labels = load_dataset(
            self.load_dir, self.sampling_frequency)

        # Preprocess label data
        labels = utils.compute_label_aggregations(
            raw_labels, self.load_dir, self.task)

        # Select relevant data and convert to one-hot
        data, labels, Y, _ = utils.select_data(
            data, labels, self.task, self.min_samples, self.save_dir)

        # Split data into train, valid, test
        train_split, val_split, test_split = self._split_data(data, labels, Y)
        X_train, y_train = train_split
        X_val, y_val = val_split
        X_test, y_test = test_split

        X_train, X_val, X_test = utils.preprocess_signals(
            X_train, X_val, X_test, self.save_dir)

        self._dump_data(X_train, y_train, "train")
        self._dump_data(X_val, y_val, "val")
        self._dump_data(X_test, y_test, "test")

if __name__ == "__main__":

    # Prepare every task listed in `config.TASKS` for a single split setting.
    min_samples, sampling_frequency, split_number = 0, 500, 1
    for task in config.TASKS:
        print(f"Working on {task} data (split_number: {split_number})...")
        preparator = DataPreparator(
            task=task,
            min_samples=min_samples,
            sampling_frequency=sampling_frequency,
            split_number=split_number,
        )
        preparator.prepare()


In [None]:
import os
import sys
import ast
import pickle

import wfdb
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer

sys.path.append("..")
import config

def form_datasplit_string(split_number: int) -> str:
    """
    Describe a split setting as a string such as `val-9_test-10`.

    Args:
        split_number (int): Key into `config.split_settings`.
    Returns:
        data_split_string (str): "val-<val_index>_test-<test_index>".
    """
    setting = config.split_settings[split_number]
    return "val-{}_test-{}".format(setting["val_index"], setting["test_index"])

def load_dataset(path, sampling_rate, release=False):
    """
    Load the annotation table and the raw signals of a dataset.

    The dataset is identified by the second-to-last component of `path`
    when split on '/' ('PTBXL' or 'ICBEB').

    Args:
        path (str): '/'-separated directory that holds the database csv.
        sampling_rate (int): Forwarded to the raw-signal loader.
        release (bool): Unused; kept for interface compatibility.
    Returns:
        X (np.ndarray): Raw signals as returned by the dataset's loader.
        Y (pd.DataFrame): Annotations indexed by `ecg_id`, with `scp_codes`
            parsed from its string form into Python objects.
    Raises:
        ValueError: If `path` does not point into a supported dataset.
    """
    parts = path.split('/')
    dataset = parts[-2] if len(parts) >= 2 else ''

    # Each loader name is only looked up inside its own branch, so one
    # dataset's loader being absent does not affect the other.
    if dataset == 'PTBXL':
        annotation_file = '/ptbxl_database.csv'
    elif dataset == 'ICBEB':
        annotation_file = '/icbeb_database.csv'
    else:
        raise ValueError(
            f"Cannot determine dataset from path {path!r}: expected the parent "
            "directory to be 'PTBXL' or 'ICBEB'.")

    # load and convert annotation data
    Y = pd.read_csv(path + annotation_file, index_col='ecg_id')
    Y['scp_codes'] = Y.scp_codes.apply(ast.literal_eval)

    # Load raw signal data
    if dataset == 'PTBXL':
        X = load_raw_data_ptbxl(Y, sampling_rate, path)
    else:
        X = load_raw_data_icbeb(Y, sampling_rate, path)

    return X, Y

def load_raw_data_ptbxl(df, sampling_rate, path):
    """
    Load the PTB-XL signals at 100 Hz or 500 Hz, using a cache file if present.

    On the first call the records named in `df.filename_lr` (100 Hz) or
    `df.filename_hr` (500 Hz) are read with `wfdb.rdsamp` and the stacked
    array is pickled to `<path>/raw100.npy` or `<path>/raw500.npy`; later
    calls load that file instead.

    Args:
        df (pd.DataFrame): Annotation table providing the record file names.
            Not accessed when the cache file already exists.
        sampling_rate (int): 100 or 500.
        path (str): Directory containing the records and the cache file.
    Returns:
        data (np.ndarray): Signals of all records.
    Raises:
        ValueError: If `sampling_rate` is neither 100 nor 500.
    """
    if sampling_rate == 100:
        cache_name, filename_column = '/raw100.npy', 'filename_lr'
    elif sampling_rate == 500:
        cache_name, filename_column = '/raw500.npy', 'filename_hr'
    else:
        raise ValueError(
            f"Unsupported sampling_rate {sampling_rate!r}; expected 100 or 500.")

    cache_path = path + cache_name
    if os.path.exists(cache_path):
        # NOTE: the cache is a pickle (despite the .npy name), hence
        # allow_pickle=True. Only load cache files you created yourself.
        return np.load(cache_path, allow_pickle=True)

    records = [wfdb.rdsamp(path + '/' + f) for f in tqdm(df[filename_column])]
    data = np.array([signal for signal, _meta in records])
    # Context manager so the cache file is flushed and closed deterministically.
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(data, cache_file, protocol=4)
    return data

def compute_label_aggregations(df, folder, ctype):
    """
    Add the label column for task `ctype` (plus its length column) to `df`.

    `df` is modified in place and also returned. `scp_codes_len` is always
    added. Depending on `ctype` the codes of each row are filtered or mapped
    through `<folder>/scp_statements.csv`:

    * 'diagnostic': codes flagged `diagnostic == 1.0`.
    * 'subdiagnostic' / 'superdiagnostic': `diagnostic_subclass` /
      `diagnostic_class` of those codes, skipping missing values.
    * 'form' / 'rhythm': codes flagged `form == 1.0` / `rhythm == 1.0`.
    * 'all': every code (stored in column `all_scp`, without a length column).

    Any other `ctype` leaves `df` without a task column.
    """
    df['scp_codes_len'] = df.scp_codes.apply(len)

    statements = pd.read_csv(folder+'/scp_statements.csv', index_col=0)

    def keep_known_codes(table, drop_nan_like):
        # Aggregator that keeps the codes present in `table`'s index.
        def aggregate(y_dic):
            kept = []
            for code in y_dic.keys():
                if code not in table.index:
                    continue
                if drop_nan_like and str(code) == 'nan':
                    continue
                kept.append(code)
            return list(set(kept))
        return aggregate

    def map_known_codes(table, column):
        # Aggregator that replaces each known code by its `column` value.
        def aggregate(y_dic):
            mapped = []
            for code in y_dic.keys():
                if code in table.index:
                    value = getattr(table.loc[code], column)
                    if str(value) != 'nan':
                        mapped.append(value)
            return list(set(mapped))
        return aggregate

    if ctype in ('diagnostic', 'subdiagnostic', 'superdiagnostic'):
        diagnostic_rows = statements[statements.diagnostic == 1.0]
        if ctype == 'diagnostic':
            aggregate = keep_known_codes(diagnostic_rows, drop_nan_like=False)
        elif ctype == 'subdiagnostic':
            aggregate = map_known_codes(diagnostic_rows, 'diagnostic_subclass')
        else:
            aggregate = map_known_codes(diagnostic_rows, 'diagnostic_class')
        df[ctype] = df.scp_codes.apply(aggregate)
        df[ctype + '_len'] = df[ctype].apply(len)
    elif ctype == 'form':
        form_rows = statements[statements.form == 1.0]
        df['form'] = df.scp_codes.apply(
            keep_known_codes(form_rows, drop_nan_like=True))
        df['form_len'] = df['form'].apply(len)
    elif ctype == 'rhythm':
        rhythm_rows = statements[statements.rhythm == 1.0]
        df['rhythm'] = df.scp_codes.apply(
            keep_known_codes(rhythm_rows, drop_nan_like=True))
        df['rhythm_len'] = df['rhythm'].apply(len)
    elif ctype == 'all':
        df['all_scp'] = df.scp_codes.apply(lambda codes: list(set(codes.keys())))

    return df

def select_data(XX, YY, ctype, min_samples, outputfolder):
    """
    Keep the samples that carry at least one label for task `ctype` and
    encode their labels as a multi-hot matrix.

    For every task except 'diagnostic', labels occurring `min_samples` times
    or fewer are first removed from the label column of `YY` (this updates
    `YY` in place, including its `<column>_len` column). The fitted
    `MultiLabelBinarizer` is pickled to `outputfolder + 'mlb.pkl'`, so
    `outputfolder` is expected to end with a path separator.

    Args:
        XX (np.ndarray): Signals, one entry per row of `YY`.
        YY (pd.DataFrame): Annotations holding the task's label column
            (and, for 'diagnostic', its `diagnostic_len` column).
        ctype (str): One of 'diagnostic', 'subdiagnostic', 'superdiagnostic',
            'form', 'rhythm', 'all'.
        min_samples (int): Occurrence threshold for keeping a label.
        outputfolder (str): Prefix for the `mlb.pkl` output file.
    Returns:
        X: Selected signals.
        Y (pd.DataFrame): Selected annotation rows.
        y (np.ndarray): Multi-hot labels of the selected rows.
        mlb (MultiLabelBinarizer): The fitted binarizer.
    Raises:
        ValueError: If `ctype` is not a known task. Raised before anything
            is written to disk.
    """
    label_columns = {
        'diagnostic': 'diagnostic',
        'subdiagnostic': 'subdiagnostic',
        'superdiagnostic': 'superdiagnostic',
        'form': 'form',
        'rhythm': 'rhythm',
        'all': 'all_scp',
    }
    if ctype not in label_columns:
        raise ValueError(
            f"Unknown ctype {ctype!r}; expected one of {sorted(label_columns)}.")
    column = label_columns[ctype]
    len_column = column + '_len'

    # convert multilabel to multi-hot
    mlb = MultiLabelBinarizer()

    # 'diagnostic' is the one task that is not filtered by `min_samples`.
    if ctype != 'diagnostic':
        counts = pd.Series(np.concatenate(YY[column].values)).value_counts()
        counts = counts[counts > min_samples]
        frequent_labels = set(counts.index.values)
        YY[column] = YY[column].apply(
            lambda x: list(set(x).intersection(frequent_labels)))
        YY[len_column] = YY[column].apply(len)

    # select
    has_label = YY[len_column] > 0
    X = XX[has_label]
    Y = YY[has_label]
    mlb.fit(Y[column].values)
    y = mlb.transform(Y[column].values)

    # save LabelBinarizer
    with open(outputfolder+'mlb.pkl', 'wb') as tokenizer:
        pickle.dump(mlb, tokenizer)

    return X, Y, y, mlb

def preprocess_signals(X_train, X_validation, X_test, outputfolder, seed=None):
    """
    Standardize all three splits with a scaler fitted on the training split.

    A single `StandardScaler` is fitted on every training value flattened
    into one column, pickled to `outputfolder` (as `/standard_scaler.pkl`
    when `seed` is None, else `/standard_scaler_seed<seed>.pkl`), and then
    applied to each split.

    Returns:
        (X_train, X_valid, X_test): The scaled splits.
    """
    # Standardize data such that mean 0 and variance 1
    scaler = StandardScaler()
    training_column = np.vstack(X_train).flatten()[:,np.newaxis].astype(float)
    scaler.fit(training_column)

    # Save Standardizer data; the un-seeded name is the one used for PTB-XL.
    filename = '/standard_scaler.pkl' if seed is None else f'/standard_scaler_seed{seed}.pkl'
    with open(outputfolder+filename, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)

    scaled_train = apply_standardizer(X_train, scaler)
    scaled_valid = apply_standardizer(X_validation, scaler)
    scaled_test = apply_standardizer(X_test, scaler)
    return scaled_train, scaled_valid, scaled_test

def apply_standardizer(X, ss):
    """
    Apply a fitted scaler to every sample of `X`.

    Each sample is flattened into a single column, passed to `ss.transform`,
    and reshaped back to its original shape.

    Args:
        X: Iterable of arrays.
        ss: Object with a `transform` method accepting an (n, 1) array.
    Returns:
        np.ndarray with dtype object holding the transformed samples.
    """
    scaled_samples = [
        ss.transform(sample.flatten()[:,np.newaxis]).reshape(sample.shape)
        for sample in X
    ]
    return np.array(scaled_samples, dtype='object')
