In [13]:
import numpy as np
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt
import neurokit2 as nk
import wfdb

## Leitura do dataframe

In [2]:
# Load the semicolon-separated subject metadata and derive an integer age column.
# The raw 'Age' column may contain the string '>89'; that value is mapped to 90,
# every other value is converted with int().
info = (
    pd.read_csv('~/Downloads/subject-info.csv', sep=';')
    .assign(age_int=lambda df: df['Age'].apply(lambda age: 90 if age == '>89' else int(age)))
)
info.head(3)

Unnamed: 0,Patient ID,Follow-up period from enrollment (days),days_4years,Exit of the study,Cause of death,SCD_4years SinusRhythm,HF_4years SinusRhythm,Age,Gender (male=1),Weight (kg),...,Unnamed: 173,Unnamed: 174,Unnamed: 175,Unnamed: 176,Unnamed: 177,Unnamed: 178,Unnamed: 179,Unnamed: 180,Unnamed: 181,age_int
0,P0001,2065,1460,,0,0,0,58,1,83,...,,,,,,,,,,58
1,P0002,2045,1460,,0,0,0,58,1,74,...,,,,,,,,,,58
2,P0003,2044,1460,,0,0,0,69,1,83,...,,,,,,,,,,69


In [3]:
# Keep only the patients for which a Holter recording is available.
info = info.loc[info['Holter available'] == 1].copy()

# Positive label: cause of death code 3 (SCD) with a follow-up shorter than 5 years.
followup_under_5y = info['Follow-up period from enrollment (days)'] < 365*5
cause_is_scd = info['Cause of death'] == 3
info['death_label'] = (followup_under_5y & cause_is_scd).astype(int)

# Fraction of patients in each class.
info['death_label'].value_counts().div(len(info))

death_label
0    0.905983
1    0.094017
Name: count, dtype: float64

In [4]:
# Split the cohort by label.
not_survivors = info[info['death_label'] == 1]
survivors = info[info['death_label'] == 0]

# Undersample the survivors: draw exactly as many of them as there are non-survivors.
sampled_survivors = survivors.sample(n=not_survivors.shape[0], random_state=42)

# Stack both groups, shuffle the rows and renumber the index from 0.
balanced_df = (
    pd.concat([not_survivors, sampled_survivors])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

## WFDB -> PKL

In [None]:
# Directory holding the WFDB Holter records (looked up by patient ID).
base_path = '/home/kelvin/Downloads/music'

# Sampling rates in Hz: records are read at `original_fs` and resampled to `desired_fs`.
original_fs, desired_fs = 200, 128

# Extraction window. Despite the names, these are sample counts at `original_fs`,
# not seconds: skip the first 30 s, then take 4 h of signal.
start_time = 30 * original_fs
duration = 4 * 60 * 60 * original_fs
end_time = duration + start_time

In [6]:
# Convert the selected Holter recordings into a single array and pickle it.
# For every row of `balanced_df`: read channel 0 of the patient's WFDB record between
# `start_time` and `end_time` (sample indices at `original_fs`), resample it to
# `desired_fs`, and collect the signal together with its `death_label`.
output_dir = 'data'
output_path = os.path.join(output_dir, f'music_{desired_fs}hz_{len(balanced_df)}.pkl')

# Create the output directory up front: otherwise a missing folder only surfaces as a
# FileNotFoundError at the final write, after the whole (slow) loop has already run.
os.makedirs(output_dir, exist_ok=True)

data = []
labels = []
n_records = len(balanced_df)

# Count rows positionally so the progress message does not depend on the index labels
# of `balanced_df` being 0..n-1.
for position, (_row_label, row) in enumerate(balanced_df.iterrows(), start=1):
    if position % 25 == 0:
        print(f"Processing holter {position}/{n_records}")

    pid = row['Patient ID']
    label = row['death_label']

    record = wfdb.rdrecord(os.path.join(base_path, pid), channels=[0], sampfrom=start_time, sampto=end_time)

    signal = np.array(record.p_signal, dtype=np.float32).flatten()
    resampled = nk.signal_resample(
        signal, sampling_rate=original_fs, desired_sampling_rate=desired_fs, method='interpolation'
    )
    del signal

    # Store each signal as float32 (the dtype of the final array) so the list never
    # holds a wider copy than what is eventually saved.
    data.append(np.asarray(resampled, dtype=np.float32))
    labels.append(label)


final_data = {
    'data': np.array(data, dtype=np.float32),
    'death_label': np.array(labels, dtype=np.int16),
}

with open(output_path, 'wb') as pkl_file:
    pickle.dump(final_data, pkl_file)

Processing holter 25/176
Processing holter 50/176
Processing holter 75/176
Processing holter 100/176
Processing holter 125/176
Processing holter 150/176
Processing holter 175/176


## Leitura do PKL gerado

In [7]:
# Reload the pickle written by the conversion cell and list its top-level keys.
pkl_name = f'music_{desired_fs}hz_{len(balanced_df)}.pkl'
with open(os.path.join('data', pkl_name), 'rb') as fin:
    res = pickle.load(fin)
res.keys()

dict_keys(['data', 'death_label'])

In [12]:
# Expected number of samples per recording: 4 hours at 128 Hz.
4 * 60 * 60 * 128

1843200

In [9]:
# Container type and shape of the stored signals.
(type(res['data']), res['data'].shape)

(numpy.ndarray, (176, 1843200))

## Leitura dos PKL não processados

In [10]:
# Print the shape of the object stored in each unprocessed split file.
# Yields: x_train, y_train, x_val, y_val, x_test, y_test (same order as before).
names = [
    f"{kind}_{split}"
    for split in ("train", "val", "test")
    for kind in ("x", "y")
]
for name in names:
    # pickle.load can execute arbitrary code: only use it on files you produced yourself.
    with open(f"data/music_unprocessed/{name}.pkl", "rb") as f:
        data = pickle.load(f)
    print(f"{name}: {data.shape}")

x_train: (123, 2764800)
y_train: (123,)
x_val: (26, 2764800)
y_val: (26,)
x_test: (27, 2764800)
y_test: (27,)


## Leitura dos arquivos pré-processados

In [14]:
# Print the shape of the object stored in each preprocessed split file.
# Yields: x_train, y_train, pid_train, x_val, ... pid_test (same order as before).
names = [
    f"{kind}_{split}"
    for split in ("train", "val", "test")
    for kind in ("x", "y", "pid")
]
for name in names:
    # pickle.load can execute arbitrary code: only use it on files you produced yourself.
    with open(f"data/music_preprocessed_10s_standard/{name}.pkl", "rb") as f:
        data = pickle.load(f)
    print(f"{name}: {data.shape}")

x_train: (177113, 1280, 1)
y_train: (177113, 2)
pid_train: (177113,)
x_val: (37440, 1280, 1)
y_val: (37440, 2)
pid_val: (37440,)
x_test: (38880, 1280, 1)
y_test: (38880, 2)
pid_test: (38880,)
