In [2]:
import h5py
import pickle
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [4]:
def extract_data():
	dis = "dataset/raw_data_ep2"

	os.makedirs(dis, exist_ok=True)

	for k in range(1,5):
		f = h5py.File(os.path.join('dataset/raw_data','Part_{}.mat'.format(k)), 'r')
		
		ky = 'Part_' + str(k)
		
		for i in tqdm(range(len(f[ky])), desc="Extract to hdf5"):
			
			savHd5 = h5py.File(os.path.join(dis,'Part_{}_{}.hdf5'.format(k, i)),'w')
			savHd5.create_dataset('signal', data=f[f[ky][i][0]])

extract_data()

Extract to hdf5: 100%|██████████| 3000/3000 [01:47<00:00, 27.92it/s] 
Extract to hdf5: 100%|██████████| 3000/3000 [01:53<00:00, 26.51it/s] 
Extract to hdf5: 100%|██████████| 3000/3000 [01:52<00:00, 26.67it/s] 
Extract to hdf5: 100%|██████████| 3000/3000 [01:53<00:00, 26.33it/s] 


In [15]:
sda = h5py.File(os.path.join('dataset/raw_data','Part_3.mat'),'r')
print(sda['Part_3'])
print(sda['Part_3'][174][0])
print(sda[sda['Part_3'][174][0]][1][0])
print(len(sda['Part_3']))

<HDF5 dataset "Part_3": shape (3000, 1), type "|O">
<HDF5 object reference>
0.5747800586510264
3000


In [6]:
sda = h5py.File('dataset/data/data_tml.hdf5', 'r')
sda = sda['data']
print(sda[2][0][0:5])
print(sda[2][1][0:2])

[-0.32396481 -0.34449267 -0.37088563 -0.40118866 -0.42758162]
[108.31679145 105.58151894]


In [42]:
a = list(map(str, range(500)))
# df = pd.DataFrame(np.array([sda[0][0]]), columns=a)

print(np.array([sda[0][0]]))

[[ 74.2918826   74.09650618  73.99881797  74.04766208  74.43841491
   75.31760879  76.88062012  79.46935765  83.13266547  87.82169948
   93.19455095  99.00699936 104.81944777 110.48536386 115.66283892
  120.25418472 124.11286896 127.28773573 129.72994095 131.53717281
  132.6605872  133.24671646 133.34440466 133.10018414 132.562899
  131.73254922 130.80451124 129.72994095 128.55768244 127.28773573
  125.87125671 124.25940127 122.3544812  120.15649651 117.61660309
  114.73480094 111.60877826 108.43391148 105.40557702 102.67030718
  100.42347838  98.61624653  97.1997675   96.12519721  95.34369154
   94.8552505   94.56218587  94.31796535  94.12258893  93.87836841
   93.63414789  93.24339506  92.80379812  92.31535707  91.77807193
   91.24078678  90.65465753  90.06852828  89.43355492  88.74973746
   88.06592     87.43094665  86.79597329  86.11215583  85.47718248
   84.84220912  84.25607987  83.66995062  83.03497726  82.49769212
   81.91156286  81.42312182  80.93468078  80.44623974  80.006642

In [9]:
sda = h5py.File(os.path.join('dataset/raw_data_ep2/','Part_1_0.hdf5'),'r')
# print(sda['signal'])
print(sda['signal'][0][0])
print(sda['signal'][0][1])
# print(sda['signal'][0][2])

1.7595307917888563
67.0629551659967


In [2]:
def process_data():
	pro_dir = 'dataset/processed_data'
	raw_dir = 'dataset/raw_data_ep2'

	fs = 100	# sampling frequency
	t = 10		# length of ppg episodes
	dt = 5		# step size of taking the next episode

	samples_in_episode = round(fs * t)		# number of samples in an episode
	d_samples = round(fs * dt)				# number of samples in a step

	os.makedirs(pro_dir, exist_ok=True)

	raw_data = os.listdir(raw_dir)

	for i in tqdm(range(len(raw_data)),desc='Reading Records'):
		file = h5py.File(os.path.join(raw_dir, raw_data[i]), 'r')
		file = file['signal']

		ppg = []		# ppg signal
		abp = []		# abp signal

		df = {
			'10s': [],
			'SBP': [],
			'DBP': []
		}

		for j in range(len(file)):
			ppg.append(file[j][0])
			abp.append(file[j][1])

		for j in range(0, len(file) - samples_in_episode, d_samples):
			df['10s'].append(j)

			sbp = max(abp[j:j+samples_in_episode])		# sbp value
			df['SBP'].append(sbp)

			dbp = min(abp[j:j+samples_in_episode])    	# dbp value
			df['DBP'].append(dbp)
		
		pd.DataFrame(df).to_csv(
			os.path.join(pro_dir, f'{raw_data[i].split(".")[0]}.csv'),
			index = False
		)

process_data()

Reading Records: 100%|██████████| 12000/12000 [54:12<00:00,  3.69it/s] 


In [3]:
def downsample_data(minThresh=2500, ratio=0.25):
    files = next(os.walk('dataset/processed_data'))[2]
    sbps_dict = {}      # dictionary to store sbp and dbp values
    dbps_dict = {}
    
    sbps_cnt = {}       # dictionary containing count of specific sbp and dbp values
    dbps_cnt = {}
    
    dbps_taken = {}     # dictionary containing count of specific sbp and dbp taken
    sbps_taken = {}
    
    candidates = []		# list of candidate episodes
    
    lut = {}            # look up table
	
    for fl in files:
        lines = pd.read_csv(os.path.join('dataset/processed_data', fl))
        lines = lines.values

        for line in lines:
            file_no = int(fl.split('_')[1])
            record_no = int(fl.split('.')[0].split('_')[2])
            episode_st = int(line[0])
            sbp = int(float(line[1]))
            dbp = int(float(line[2]))

            if sbp not in sbps_dict:
                sbps_dict[sbp] = []
                sbps_cnt[sbp] = 0
            
            sbps_dict[sbp].append((file_no, record_no, episode_st))
            sbps_cnt[sbp] += 1

            if dbp not in dbps_dict:
                dbps_dict[dbp] = []
                dbps_cnt[dbp] = 0
            
            dbps_dict[dbp].append((file_no, record_no, episode_st, sbp))
            dbps_cnt[dbp] += 1
    
    sbp_keys = list(sbps_dict)				# all the different sbp values
    dbp_keys = list(dbps_dict)				# all the different dbp values
    
    sbp_keys.sort()					# sorting the sbp values
    dbp_keys.sort()					# sorting the dbp values

    for dbp in tqdm(dbp_keys, desc='DBP Binning'):
        cnt = min(int(dbps_cnt[dbp] * ratio), minThresh)

        for i in range(cnt):
            indix = np.random.randint(len(dbps_dict[dbp]))
            candidates.append([dbps_dict[dbp][indix][0], dbps_dict[dbp][indix][1], dbps_dict[dbp][indix][2]])

            if(dbp not in dbps_taken):
                dbps_taken[dbp] = 0

            dbps_taken[dbp] += 1
            
            if(dbps_dict[dbp][indix][3] not in sbps_taken):
                sbps_taken[dbps_dict[dbp][indix][3]] = 0
            
            sbps_taken[dbps_dict[dbp][indix][3]] += 1

            if(dbps_dict[dbp][indix][0] not in lut):
                lut[dbps_dict[dbp][indix][0]] = {}
            
            if(dbps_dict[dbp][indix][1] not in lut[dbps_dict[dbp][indix][0]]):
                lut[dbps_dict[dbp][indix][0]][dbps_dict[dbp][indix][1]] = {}
            
            if(dbps_dict[dbp][indix][2] not in lut[dbps_dict[dbp][indix][0]][dbps_dict[dbp][indix][1]]):
                lut[dbps_dict[dbp][indix][0]][dbps_dict[dbp][indix][1]][dbps_dict[dbp][indix][2]] = 1
            
            dbps_dict[dbp].pop(indix)

    for sbp in tqdm(sbp_keys, desc='SBP Binning'):
        if sbp not in sbps_taken:
            sbps_taken[sbp] = 0
        
        cnt = min(int(sbps_cnt[sbp]*ratio), minThresh) - sbps_taken[sbp]
        
        for i in range(cnt):
            while len(sbps_dict[sbp]) > 0:
                try:
                    indix = np.random.randint(len(sbps_dict[sbp]))
                except:
                    pass

                try:
                    dumi = lut[sbps_dict[sbp][indix][0]][sbps_dict[sbp][indix][1]][sbps_dict[sbp][indix][2]]
                except:
                    sbps_dict[sbp].pop(indix)
                    continue

                candidates.append([sbps_dict[sbp][indix][0], sbps_dict[sbp][indix][1], sbps_dict[sbp][indix][2]])
                sbps_taken[sbp] += 1
                sbps_dict[sbp].pop(indix)
                break
            
    pickle.dump(candidates, open('dataset/candidates.p', 'wb'))

downsample_data()

DBP Binning: 100%|██████████| 120/120 [00:00<00:00, 167.32it/s]
SBP Binning: 100%|██████████| 131/131 [00:03<00:00, 34.82it/s]


In [4]:
def extract_episodes(candidates):
    os.makedirs('dataset/ppgs', exist_ok=True)
    os.makedirs('dataset/abps', exist_ok=True)

    fs = 100	# sampling frequency
    t = 10      # length of ppg episodes			
    samples_in_episode = round(fs * t)

    for k in range(1, 5):
        for indix in tqdm(range(len(candidates)), desc='Extracts the episodes'):
            if candidates[indix][0] != k:
                continue

            record_no = int(candidates[indix][1])
            episode_st = int(candidates[indix][2])

            file = h5py.File(os.path.join('dataset/raw_data_ep2', f'Part_{k}_{record_no}.hdf5'), 'r')
            file = file['signal']

            p_ppg = []
            ppg = []
            abp = []

            for j in range(episode_st, episode_st+samples_in_episode):
                p_ppg.append(file[j][0])
                abp.append(file[j][1])
            
            mean = sum(p_ppg)/samples_in_episode
            ppg = [x - mean for x in p_ppg]

            pickle.dump(np.array(ppg), open(os.path.join('dataset/ppgs', '{}.p'.format(indix)), 'wb'))
            pickle.dump(np.array(abp), open(os.path.join('dataset/abps', '{}.p'.format(indix)), 'wb'))

candidates = pickle.load(open('dataset/candidates.p', 'rb'))
extract_episodes(candidates)

Extracts the episodes: 100%|██████████| 149047/149047 [07:45<00:00, 319.84it/s]
Extracts the episodes: 100%|██████████| 149047/149047 [09:22<00:00, 264.75it/s]
Extracts the episodes: 100%|██████████| 149047/149047 [05:36<00:00, 442.60it/s] 
Extracts the episodes: 100%|██████████| 149047/149047 [09:02<00:00, 274.75it/s]


In [5]:
def merge_episodes():
    os.makedirs('dataset/data', exist_ok=True)

    files = next(os.walk('dataset/abps'))[2]
    
    np.random.shuffle(files)
    data = []

    for fl in tqdm(files):
        ppg = pickle.load(open(os.path.join('dataset/ppgs', fl), 'rb'))
        abp = pickle.load(open(os.path.join('dataset/abps', fl), 'rb'))
        
        data.append([ppg, abp])
    
    f = h5py.File(os.path.join('dataset/data','data.hdf5'), 'w')
    f.create_dataset('data', data=data)

merge_episodes()

100%|██████████| 149047/149047 [02:26<00:00, 1019.11it/s]


In [4]:
def split_data():
    sst = '_1'
    length = 500

    fl = h5py.File(os.path.join('dataset/data', 'data.hdf5'), 'r')      # load the episode data

    X_train = []                        # intialize train data
    Y_train = []

    X_val = []                          # intialize validation data
    Y_val = []

    for i in tqdm(range(0, 100000), desc=f'Training Data '):    # training samples before validation samples

        X_train.append(np.array(fl['data'][i][0][:length]).reshape(-1, 1))  # ppg signal

        sbp = max(fl['data'][i][1][:length])       # update min-max of abp
        dbp = min(fl['data'][i][1][:length])

        Y_train.append(np.array([sbp, dbp]).reshape(-1, 1))  # abp signal

    for i in tqdm(range(100000, 140000), desc=f'Validation Data '):

        X_val.append(np.array(fl['data'][i][0][:length]).reshape(-1, 1))  # ppg signal

        sbp = max(fl['data'][i][1][:length])       # update min-max of abp
        dbp = min(fl['data'][i][1][:length])

        Y_val.append(np.array([sbp, dbp]).reshape(-1, 1))  # abp signal

    X_train = np.array(X_train)             # converting to numpy array

    Y_train = np.array(Y_train)             # converting to numpy array
    
    # saving the training data split
    pickle.dump({'X_train': X_train, 'Y_train': Y_train}, open(os.path.join('dataset/data', f'train{sst}.p'), 'wb'))

    X_val = np.array(X_val)                 # converting to numpy array        

    Y_val = np.array(Y_val)                 # converting to numpy array
    
    # saving the validation data split
    pickle.dump({'X_val': X_val, 'Y_val': Y_val}, open(os.path.join('dataset/data', f'val{sst}.p'), 'wb'))

    X_test = []                 # intialize test data
    Y_test = []

    for i in tqdm(range(140000, len(fl['data'])), desc='Test Data'):

        X_test.append(np.array(fl['data'][i][0][:length]).reshape(-1, 1))       # ppg signal

        sbp = max(fl['data'][i][1][:length])
        dbp = min(fl['data'][i][1][:length])
        Y_test.append(np.array([sbp, dbp]).reshape(-1, 1))       # abp signal

    X_test = np.array(X_test)           # converting to numpy array
    
    Y_test = np.array(Y_test)           # converting to numpy array

                                                                # saving the test data split
    pickle.dump({'X_test': X_test,'Y_test': Y_test}, open(os.path.join('dataset/data', f'test{sst}.p'), 'wb'))

split_data()

Training Data : 100%|██████████| 100000/100000 [01:15<00:00, 1324.39it/s]
Validation Data : 100%|██████████| 40000/40000 [00:29<00:00, 1358.12it/s]
Test Data: 100%|██████████| 9047/9047 [00:06<00:00, 1311.81it/s]


In [None]:
# rm -rf dataset/raw_data_ep2/*
# rm -rf dataset/processed_data/*

In [None]:
ls dataset/raw_data_ep2