In [1]:
import tsdb
import pickle
import logging
import numpy as np
import pandas as pd


In [2]:
# NOTE(review): `force` is not referenced by any cell visible here — presumably a
# switch to force regeneration of outputs; confirm its use or remove it.
force = False

In [3]:
def normalize(s: np.ndarray) -> list[float]:
    """Z-score normalize a series and return it as a list of Python floats.

    Each value has the series mean subtracted and is divided by the series
    standard deviation (population std, as computed by ``np.std``).

    A diagnostic line is printed when the result contains NaN or when the
    standard deviation is NaN or (almost) zero; the possibly degenerate
    result is still returned so the caller decides what to do with it.

    Args:
        s: numeric array holding one series.

    Returns:
        The normalized values, in the original order.
    """
    # Compute the standard deviation once; it is needed for the division,
    # the sanity check and the diagnostic message.
    std = np.std(s)
    z = (s - s.mean()) / std
    # NaN has to be detected numerically: searching the list for the string
    # 'Nan' can never match a float.
    if np.isnan(z).any() or np.isnan(std) or std < 0.00001:
        print('NaN or std: ', std)
    return z.tolist()


def save(data: list[list[float]],
         to: str):
    """Pickle a list of float series to the file ``to``.

    Only the first series and its first value are type-checked, as a cheap
    guard against passing arrays or DataFrames by mistake.

    Args:
        data: non-empty list of series, each a non-empty list of floats.
        to: path of the pickle file to write (overwritten if it exists).

    Raises:
        AssertionError: if ``data`` is empty or does not have the expected
            list-of-lists-of-floats structure.
    """
    assert isinstance(data, list), "data must be a list of series"
    # Check emptiness explicitly so the failure is a clear message instead of
    # an IndexError raised by the structural checks below.
    assert data, "data must contain at least one series"
    assert isinstance(data[0], list), "each series must be a list"
    assert data[0], "series must not be empty"
    assert isinstance(data[0][0], float), "series values must be floats"
    with open(to, "wb") as f:
        pickle.dump(data, f)
        

def split(data: list[np.ndarray],
          train: float = .9) -> tuple[list[np.ndarray],
                                      list[np.ndarray]]:
    """Cut ``data`` into a leading train part and a trailing test part.

    The cut point is ``int(len(data) * train)``: the first ``train`` fraction
    of the items (rounded down) goes to the first part, the rest to the
    second. Order is preserved and nothing is shuffled.
    """
    boundary = int(len(data) * train)
    head = data[:boundary]
    tail = data[boundary:]
    return head, tail

# ECG Datasets

In [6]:
# Base names (without the ".csv" extension) of the raw ECG files.
# The order matters: a later cell picks a file by index (datasets[1]).
datasets = ['mitbih_test', 'mitbih_train', 'ptbdb_abnormal', 'ptbdb_normal']

# Directory, as a relative path, in which the raw ECG CSV files are looked up.
path = "../data/raw/ECG"

In [12]:
def load_ECG(s: str, path: str) -> pd.DataFrame:
    """Read the header-less CSV file ``<path>/<s>.csv`` into a DataFrame.

    Args:
        s: file name without the ``.csv`` extension.
        path: directory containing the file.

    Returns:
        The file contents; since no header row is read, the columns are
        labelled with integers starting at 0.
    """
    return pd.read_csv(f"{path}/{s}.csv", header=None)

In [17]:
# Load the second entry of `datasets` and discard column 187
# (presumably the class label rather than a signal sample — TODO confirm).
train = load_ECG(datasets[1], path).drop(columns=187)


In [19]:
# Display the frame cast to float; the result is not assigned, so `train` itself is unchanged.
train.astype(float)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,177,178,179,180,181,182,183,184,185,186
0,0.977941,0.926471,0.681373,0.245098,0.154412,0.191176,0.151961,0.085784,0.058824,0.049020,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.960114,0.863248,0.461538,0.196581,0.094017,0.125356,0.099715,0.088319,0.074074,0.082621,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.000000,0.659459,0.186486,0.070270,0.070270,0.059459,0.056757,0.043243,0.054054,0.045946,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.925414,0.665746,0.541436,0.276243,0.196133,0.077348,0.071823,0.060773,0.066298,0.058011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.967136,1.000000,0.830986,0.586854,0.356808,0.248826,0.145540,0.089202,0.117371,0.150235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87549,0.807018,0.494737,0.536842,0.529825,0.491228,0.484211,0.456140,0.396491,0.284211,0.136842,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87550,0.718333,0.605000,0.486667,0.361667,0.231667,0.120000,0.051667,0.001667,0.000000,0.013333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87551,0.906122,0.624490,0.595918,0.575510,0.530612,0.481633,0.444898,0.387755,0.322449,0.191837,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87552,0.858228,0.645570,0.845570,0.248101,0.167089,0.131646,0.121519,0.121519,0.118987,0.103797,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Build the MIT-BIH series collection from both raw files.
# Column 187 is dropped from each frame (presumably the class label — TODO confirm).
train = load_ECG('mitbih_train', path).astype(float).drop(columns=187)
test = load_ECG('mitbih_test', path).astype(float).drop(columns=187)

# NOTE(review): one series is taken per *column* (one sample position across
# all rows), not per row — confirm this is the intended orientation.
ds = list()
for frame in (train, test):
    for col in frame.columns:
        # Keep only columns whose values sum to more than 1. Each frame is
        # filtered and read on its own data, so the test file contributes its
        # own columns instead of repeating the train columns.
        if frame[col].sum() > 1:
            ds.append(normalize(frame[col].to_numpy()))

# Re-split the combined collection (default 90/10) and persist both parts.
dataset_train, dataset_test = split(ds)
save(dataset_train,
     "../data/processed/mitbih_TRAIN.pickle")
save(dataset_test,
     "../data/processed/mitbih_VAL.pickle")

In [23]:
# Build the PTBDB series collection from both raw files.
# The names `train`/`test` only mirror the previous cell: here they hold the
# abnormal and the normal recordings, which are combined and re-split below.
# Column 187 is dropped from each frame (presumably the class label — TODO confirm).
train = load_ECG('ptbdb_abnormal', path).astype(float).drop(columns=187)
test = load_ECG('ptbdb_normal', path).astype(float).drop(columns=187)

# NOTE(review): one series is taken per *column* (one sample position across
# all rows), not per row — confirm this is the intended orientation.
ds = list()
for frame in (train, test):
    for col in frame.columns:
        # Keep only columns whose values sum to more than 1. Each frame is
        # filtered and read on its own data, so the normal recordings are
        # actually included instead of repeating the abnormal columns.
        if frame[col].sum() > 1:
            ds.append(normalize(frame[col].to_numpy()))

# Re-split the combined collection (default 90/10) and persist both parts.
dataset_train, dataset_test = split(ds)
save(dataset_train,
     "../data/processed/ptbdb_TRAIN.pickle")
save(dataset_test,
     "../data/processed/ptbdb_VAL.pickle")