In [1]:
# instalacia kniznice VitalDB
!pip install vitaldb

Collecting vitaldb
  Downloading vitaldb-1.5.2-py3-none-any.whl.metadata (314 bytes)
Collecting wfdb (from vitaldb)
  Downloading wfdb-4.3.0-py3-none-any.whl.metadata (3.8 kB)
Collecting soundfile>=0.10.0 (from wfdb->vitaldb)
  Downloading soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (16 kB)
Downloading vitaldb-1.5.2-py3-none-any.whl (58 kB)
Downloading wfdb-4.3.0-py3-none-any.whl (163 kB)
Downloading soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: soundfile, wfdb, vitaldb
Successfully installed soundfile-0.13.1 vitaldb-1.5.2 wfdb-4.3.0


In [2]:
import vitaldb
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

# Identify the cases for which both arterial pressure tracks are available
required_tracks = ["ART_SBP", "ART_MBP"]
case_ids = vitaldb.find_cases(required_tracks)

# Function that downloads the pressure tracks of a single case_id
def download_case_data(case_id):
    """Load the ART_SBP and ART_MBP tracks of one case via ``vitaldb.load_case``.

    Args:
        case_id: Case identifier handed to ``vitaldb.load_case``.

    Returns:
        A DataFrame with the columns ``ART_SBP``, ``ART_MBP`` and ``caseid``
        (``caseid`` holds ``case_id`` on every row), or ``None`` when the case
        yields no data or anything goes wrong while loading it.

    Failures are reported with a ``RuntimeWarning`` instead of being dropped
    silently, so cases missing from the final dataset can be traced.
    """
    import warnings  # function-scope import keeps this cell self-contained

    try:
        data = vitaldb.load_case(case_id, ["ART_SBP", "ART_MBP"])
        if data is None or len(data) == 0:
            return None
        df = pd.DataFrame(data, columns=["ART_SBP", "ART_MBP"])
        df["caseid"] = case_id
        return df
    except Exception as exc:
        # Best effort: one failing case must not abort the bulk download,
        # but the failure has to stay visible to whoever runs the notebook.
        warnings.warn(
            f"case {case_id}: download failed ({exc!r})",
            RuntimeWarning,
            stacklevel=2,
        )
        return None
    
# Parallel download: one task per case, collected in completion order
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {}
    for case_id in case_ids:
        futures[executor.submit(download_case_data, case_id)] = case_id

    # Cases for which download_case_data returned None are skipped
    finished_frames = (done.result() for done in as_completed(futures))
    bp_data = [frame for frame in finished_frames if frame is not None]

# Stack all per-case frames and store them in a CSV file
bp_df = pd.concat(bp_data, ignore_index=True)
bp_df.to_csv("blood_pressure_data.csv", index=False)



In [1]:
import pandas as pd
# Load the downloaded measurements and drop every row with a missing value
df = pd.read_csv("blood_pressure_data.csv").dropna()

# Basic descriptive statistics of both pressure columns
pressure_summary = df[["ART_SBP", "ART_MBP"]].describe()
print(pressure_summary)

            ART_SBP       ART_MBP
count  2.049423e+07  2.049423e+07
mean   1.107022e+02  7.816586e+01
std    3.559405e+01  2.699228e+01
min   -9.800000e+01 -9.800000e+01
25%    1.010000e+02  7.100000e+01
50%    1.140000e+02  8.000000e+01
75%    1.280000e+02  9.100000e+01
max    3.500000e+02  3.500000e+02


In [2]:
# Keep only rows whose values lie inside the accepted ranges (bounds inclusive):
# ART_SBP in [60, 250] and ART_MBP in [45, 150]
sbp_in_range = df["ART_SBP"].between(60, 250)
mbp_in_range = df["ART_MBP"].between(45, 150)
df_cleaned = df[sbp_in_range & mbp_in_range]

# Persist the filtered measurements for the following steps
df_cleaned.to_csv("filtered_blood_pressure_data.csv", index=False)


In [3]:

# Start from the filtered measurements saved by the previous step
df = pd.read_csv("filtered_blood_pressure_data.csv")

# 0-based position of every row inside its case; per the original note this is
# meant to mark the first 2 minutes of each record
df["row_number"] = df.groupby("caseid").cumcount()

# Baseline SBP: mean of the first 60 ART_SBP values of every case
leading_rows = df.loc[df["row_number"].lt(60)]
baseline_sbp = leading_rows.groupby("caseid")["ART_SBP"].mean()

# Attach the per-case baseline to every row of that case
baseline_column = baseline_sbp.rename("Baseline_SBP")
df = df.merge(baseline_column, on="caseid")


In [4]:
# Definition of a hypotensive sample: any one of the three criteria is enough
mbp_below_65 = df["ART_MBP"].lt(65)
sbp_below_90 = df["ART_SBP"].lt(90)
sbp_below_baseline = df["ART_SBP"].lt(df["Baseline_SBP"] * 0.75)  # under 75 % of the case baseline

df["hypotenzia_raw"] = mbp_below_65 | sbp_below_90 | sbp_below_baseline

In [5]:
# Rolling window per case: a row is flagged only when it and the 29 rows before
# it (30 consecutive rows) are all hypotensive. The original note treats this as
# "hypotension lasted at least 1 minute".
def _sustained_hypotension(flags):
    """Return True where the trailing 30 flags of one case are all set."""
    hits_in_window = flags.rolling(window=30, min_periods=30).sum()
    return hits_in_window.ge(30)


df["hypotenzia_rolling"] = (
    df.groupby("caseid")["hypotenzia_raw"].transform(_sustained_hypotension)
)

In [6]:
# any() per case tells whether a sustained hypotension occurred at least once
episode_found = df.groupby("caseid")["hypotenzia_rolling"].any()

# One row per case: caseid plus the boolean "hypotenzia" label
hypotenzia_per_case = episode_found.rename("hypotenzia").reset_index()
hypotenzia_per_case.to_csv("hypotenzia_cases_filtered.csv", index=False)

In [7]:
# Download the cases table from the VitalDB API and keep a local copy
cases_url = "https://api.vitaldb.net/cases"
cases_df = pd.read_csv(cases_url)

cases_df.to_csv("cases.csv", index=False)

In [8]:

# Base file with the clinical information of every case
cases_df = pd.read_csv("cases.csv")

# Per-case hypotension label produced earlier
hypotenzia_df = pd.read_csv("hypotenzia_cases_filtered.csv")

# Inner join on caseid: keep only the cases present in both tables
merged_df = pd.merge(cases_df, hypotenzia_df, on="caseid", how="inner")

merged_df.to_csv("cases_filtered.csv", index=False)

In [9]:
# Load the clinical table already restricted to cases with a hypotension label
df = pd.read_csv("cases_filtered.csv")

# Derived attributes: durations in hours (/3600) and hospital stay in days (/86400)
df["case_duration"] = (df["caseend"] - df["casestart"]) / 3600
# Plain end - start, like the other durations. The previous
# abs(aneend - abs(anestart)) returned aneend + anestart whenever anestart was
# negative, i.e. it shortened the interval instead of measuring it.
# NOTE(review): VitalDB reports these times relative to the case start, so a
# negative anestart is expected data - confirm against the dataset description.
df["ane_duration"] = (df["aneend"] - df["anestart"]) / 3600
df["op_duration"] = (df["opend"] - df["opstart"]) / 3600
df["hospital_stay"] = (df["dis"] - df["adm"]) / 86400

In [10]:
# Cast the hypotension label to integers (0 = False, 1 = True)
df = df.astype({"hypotenzia": int})

# Save the combined table
df.to_csv("spojeny.csv", index=False)


In [11]:

# Count missing values per column and keep only the columns that have any
null_counts = df.isna().sum()
missing_values = null_counts[null_counts.gt(0)]
missing_percent = missing_values.div(len(df)).mul(100)

# Build the overview table, sorted in descending order by the percentage of missing values
missing_data = pd.DataFrame(
    {
        "Počet chýbajúcich hodnôt": missing_values,
        "Percento chýbajúcich hodnôt": missing_percent,
    }
).sort_values(by="Percento chýbajúcich hodnôt", ascending=False)

# Show the table
print(missing_data)

                     Počet chýbajúcich hodnôt  Percento chýbajúcich hodnôt
lmasize                                  3513                    99.886267
cline2                                   3460                    98.379301
aline2                                   3419                    97.213534
preop_be                                 3192                    90.759170
preop_sao2                               3191                    90.730736
preop_hco3                               3191                    90.730736
preop_pao2                               3186                    90.588570
preop_paco2                              3186                    90.588570
preop_ph                                 3180                    90.417970
dltubesize                               2593                    73.727609
iv2                                      2135                    60.705146
cline1                                   2008                    57.094114
tubesize                 