# Downloading and formatting the data

The purpose of this notebook is to download the raw data and organize it into the appropriate folders as .csv files.

In [1]:
import requests
from os import listdir, mkdir
from os.path import isdir, isfile, join
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
from tqdm import tqdm

<h3>Download the data</h3>

In [2]:
# Remote directory holding the data set, and the suffix of each per-subject file.
base = 'https://physionet.org/physiobank/database/bidmc/bidmc_csv/'
endings = ['_Breaths.csv','_Numerics.csv','_Signals.csv','_Fix.txt']

# Zero-padded, two-character subject ids: '01', '02', ..., '53'.
nums = [str(subject).zfill(2) for subject in range(1, 54)]

# Map every subject id to the names of its four remote files.
files = {
    subject: ['bidmc_' + subject + suffix for suffix in endings]
    for subject in nums
}

In [3]:
# csv: subject id -> list of full download URLs for that subject's csv files.
# txt: subject id -> bare name of that subject's non-csv file (no URL prefix).
csv = {}
txt = {}
for number in tqdm(files.keys()):
    names = files[number]
    csv[number] = [base + name for name in names if 'csv' in name]
    # If several non-csv names existed, the last one would win.
    for name in names:
        if 'csv' not in name:
            txt[number] = name

100%|██████████| 53/53 [00:00<00:00, 190486.81it/s]


In [4]:
# Create the output folder once, up front, instead of re-checking it for
# every single file inside the download loop.
if not isdir("csv"):
    mkdir("csv")

# Fetch every remote csv file and store a local copy under csv/.
for f in tqdm(files.keys()):
    for n in files[f]:
        # The '_Fix.txt' files are not tabular and are skipped here.
        if '.txt' not in n:
            df = pd.read_csv(base+n)
            # to_csv writes the DataFrame index as an extra first column by
            # default; make_dataframe reads these copies with index_col=0.
            df.to_csv('csv/'+n)

100%|██████████| 53/53 [00:53<00:00,  1.01it/s]


In [5]:
# Create the output folder once, before the loop, rather than per file.
if not isdir("txt"):
    mkdir("txt")

# Download each subject's text file and store it under txt/ with the same name.
for n in tqdm(txt.keys()):
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(base+txt[n], timeout=60)
    # Fail loudly on HTTP errors instead of silently saving an error page
    # as if it were the data file.
    response.raise_for_status()
    text = response.text
    with open('txt/'+txt[n], "w") as text_file:
        text_file.write(text)

100%|██████████| 53/53 [00:03<00:00, 13.60it/s]


<h3>Combine data into individual persons</h3>

In [6]:
def make_dataframe(num):
    """Build and save the combined table for one subject.

    Reads ``csv/bidmc_<num>_Signals.csv`` and ``csv/bidmc_<num>_Numerics.csv``,
    joins them on the whole second each row belongs to, appends per-second
    summary statistics (min, max, mean, kurtosis, skew) of the five signal
    columns, and writes the result to ``person_csvs/person_<num>.csv``
    (the folder is created if it does not exist).

    Parameters
    ----------
    num : str or int
        Subject identifier exactly as it appears in the file names
        (it is passed through ``str`` unchanged, so no zero-padding is added).

    Returns
    -------
    None
        The result is only written to disk.
    """
    subject = str(num)
    wave_cols = [' RESP', ' PLETH', ' V', ' AVR', ' II']

    # Signals: tag every sample with the integer second it falls in.
    signals = pd.read_csv('csv/bidmc_' + subject + '_Signals.csv', index_col=0)
    signals['sec'] = signals['Time [s]'].apply(lambda t: int(np.floor(t)))
    signals = signals[wave_cols + ['sec', 'Time [s]']]

    # Numerics: fill gaps with the column mean and key the rows by second.
    # Its ' RESP' column is dropped -- presumably so it does not collide with
    # the signals' ' RESP' column in the merge below (confirm).
    numerics = pd.read_csv('csv/bidmc_' + subject + '_Numerics.csv', index_col=0)
    numerics = numerics.fillna(numerics.mean())
    numerics = numerics.rename(columns={'Time [s]': 'sec'})
    numerics = numerics.drop(' RESP', axis=1)
    numerics['sec'] = numerics['sec'].apply(lambda s: int(s))

    person = signals.merge(numerics, on='sec', how='outer')

    def per_second(suffix, reduce):
        # Apply one reduction to the signal columns grouped by second and
        # tag the resulting columns with the statistic's suffix.
        stats = reduce(person[wave_cols + ['sec']].groupby('sec'))
        stats.columns = [col + suffix for col in stats.columns]
        return stats

    summary_frames = [
        per_second('_Min', lambda grouped: grouped.min()),
        per_second('_Max', lambda grouped: grouped.max()),
        per_second('_Mean', lambda grouped: grouped.mean()),
        per_second('_Kurt', lambda grouped: grouped.agg(lambda x: kurtosis(x))),
        per_second('_Skw', lambda grouped: grouped.agg(lambda x: skew(x))),
    ]
    one_sec_summary = pd.concat(summary_frames, axis=1).reset_index()

    # Attach each second's summary to every row belonging to that second.
    person = person.merge(one_sec_summary, on='sec', how='outer')

    if not isdir("person_csvs"):
        mkdir("person_csvs")
    person.to_csv('person_csvs/person_' + subject + '.csv')

In [7]:
# Zero-padded, two-character subject ids: '01', '02', ..., '53'.
nums = ['{:02d}'.format(subject) for subject in range(1, 54)]

In [None]:
# Build the per-person csv for every subject. A subject whose files cannot be
# processed is skipped so one bad record does not stop the whole run.
for number in nums:
    try:
        make_dataframe(number)
    # Catch Exception rather than using a bare except, so that
    # KeyboardInterrupt / SystemExit still stop the loop, and report the
    # cause so skipped subjects can be diagnosed.
    except Exception as error:
        print("Ignoring person", number, "due to error:", repr(error))

Ignoring person 09 due to error
Ignoring person 15 due to error
Ignoring person 30 due to error


<h3>Combine individual persons' data together</h3>

In [None]:
# List the regular files in person_csvs/. os.listdir returns entries in
# arbitrary order, so sort them to make the row order of the combined table
# reproducible across runs and machines.
onlyfiles = sorted(f for f in listdir('person_csvs') if isfile(join('person_csvs', f)))
# Keep only the per-person files (names containing 'person').
person_files = ['person_csvs/'+i for i in onlyfiles if 'person' in i]

# Load every per-person table; the first csv column is the saved index.
files = []
for person in tqdm(person_files):
    df = pd.read_csv(person,index_col=0)
    files.append(df)

# Stack all persons into one table with a fresh 0..n-1 index, then drop every
# row that has a missing value in any column.
df = pd.concat(files, axis=0, ignore_index=True)
df.dropna(inplace=True)

print(df.shape)

In [None]:
# Save the combined table to disk; by default to_csv also writes the
# DataFrame index as the first column.
df.to_csv('person_csvs/all_people.csv')