# Προγραμματιστική άσκηση:
### Ταξινόμηση καρδιοτοκογραφικών σημάτων με βάση την μεταβλητότητα του καρδιακού ρυθμού

## Section 2
* ##### Load and store data to dataframe

In [8]:
import pandas as pd
import numpy as np  
from tqdm import tqdm
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm 
import glob
import re
import pyhrv.time_domain as td
import pathlib

In [None]:
"""
* Convert .dat files to .csv
* Create ann_db file for easier processing of additional annotations

@ Decomment the below command in order to execute in your own 

"""
# %run create_csv_database.py

In [23]:
"""
* Convert .dat files to .csv
* Create ann_db dataframe for easier processing of additional annotations
"""
ann_db = pd.read_csv("database/ann_db.csv")
ann_db.rename(columns={'Unnamed: 0':'Labels'}, inplace=True)
ann_db = ann_db.set_index('Labels')
ann_db.head()

"""
* Initialize dataframe which we will apply 
    - methods for Heart Rate Variability (HRV) computation 
    based on Fatal Heart Rate signals (FHR)
    - correlation and other types of analysis
"""

def init_dataframe(path = "database/signals"):
    files = glob.glob(path + "/*.csv")
    data_frame = pd.DataFrame()

    for filename in files:
        name = re.sub("[^0-9]", "", filename)
        df = pd.read_csv(filename, index_col=None)
        data_frame = data_frame.append(
            {
                'tag': name,
                'Fhr': list(df['FHR']), 
                'pH': ann_db[name]['pH'],
                'Apgar1': ann_db[name]['Apgar1'],
                'Apgar5': ann_db[name]['Apgar5'],
                'labels': ann_db[name]
            },
            ignore_index=True
        )
    return data_frame

df = init_dataframe()
df.head()

Unnamed: 0,tag,Fhr,pH,Apgar1,Apgar5,labels
0,1001,"[150.5, 150.5, 151.0, 151.25, 151.25, 150.25, ...",7.14,6.0,8.0,Labels pH 7.14 BDecf ...
1,1002,"[146.25, 146.25, 150.25, 148.5, 148.5, 143.5, ...",7.0,8.0,8.0,Labels pH 7.00 BDecf ...
2,1003,"[137.5, 137.5, 137.5, 139.5, 139.5, 140.75, 14...",7.2,7.0,9.0,Labels pH 7.20 BDecf ...
3,1004,"[153.75, 153.75, 153.75, 154.75, 154.75, 153.7...",7.3,8.0,9.0,Labels pH 7.30 BDecf ...
4,1005,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7.3,9.0,10.0,Labels pH 7.30 BDecf ...


### Data Description:
##### A short (non medically exhaustive) description of the feature meaning

**analysis of umbilical artery blood sample**
- **pH** : Is the ph of the blood of the baby (n.v. 7.35-7.45). A slight increase causes blood alkalosis while a slight decrease causes acidosis. Both conditions are extremely life threatening. This value is inverse proportional to **pCO2**.
- **BDecf** : Base excess of extracellular fluid is a quantity that reflects only the non-respiratory (metabolic) component of acid-base disturbances. (Reference : [Here](https://acutecaretesting.org/en/articles/all-about-base-excess--to-be-or-not-to-be))
- **pCO2** : is the relative pressure of CO2 in the blood (n.v. 4.5-6.0Kpa). In Respiratory Acidosis, **pH** and **pCO2** are inverse proportional while in Metabolic Acidosis they are directly proportional.
- **BE** : is the excess or deficit (if negative) of bases in the blood (e.g. HCO3). The normal range is between -2 and +2 mEq/L or mmol/L. A value above the range is indicative of Metabolic Alkalosis while below the range is indicative of Metabolic Acidosis.

- **Apgar1** : It's the Apgar score at 1min from birth. It indicates the overall health status of the baby at birth. It can have values between 0 and 10 where a score above 7 is considered good. (Reference and table : [Here](https://www.birthinjurysafety.org/birth-injuries/apgar-scoring-system.html))
- **Apgar5** : It's the Apgar score at 5min from birth. See **Apgar1**.
- **Gest. weeks** : It's the number of weeks of gestation where 39-40 weeks represent a normal term delivery.
- **Weight(g)** : Is the weigth of the baby in grams at birth. This is various but in general the normal range would be between 2500g and 4500g.
- **Sex** : This is a boolean for 1 and 2 where is not clear which is male/female but shouldn't matter.
- **Age** : The Age in years of the mom.
- **Gravidity** : Is the number of times the woman has been pregnant in her life.
- **Parity** : Is the number of times the woman has been pregnant for more than 24 weeks (in a single pregnancy).
- **Diabetes** : Boolean value for Diabetes of 0 (False) and 1 (True).
- **Hypertension** : Boolean value for Hypertension of 0 (False) and 1 (True).
- **Preeclampsia** : Boolean value for Preeclampsia of 0 (False) and 1 (True). where Preeclampsia indicate a disorder of pregnancy that could lead to complications.
- **Liq.** : It referes to Liquor which in this case is the Amniotic Fluid. I'm not sure why is a boolean in the data.
- **Pyrexia** : Is a boolean value that indicate the presence of pyrexia (high temperature) in the mom or not.
- **Meconium** : Is a boolean value that indicate the presence or not of Meconium which is the earliest stool of the baby that could happen before partum.
- **Presentation** : This indicate the presentation of the baby during delivery which can be head first, legs first etc. The problem here is that this value is a number which most probably refers to an internal classification or a scale I don't know how to interpret. **See also Deliv. type**
- **Induced** : Is a boolean value that indicate if the delivery has been medically induced.
- **I.stage** : It should indicate the time in between contractions in the stage of the delivery but this is only my observational opinon based on the data.
- **NoProgress** : Is a boolean value that indicate if there has been an abort of the pregnancy or not.
- **CK/KP** : It should refer to the level of Creatine Kinase?
- **II.stage** : see **I.stage**.
- **Deliv. type** : (1: vaginal; 2: operative vaginal; 3: CS)
- **dbID** : is just an ID of the record
- **Rec. type** : No idea
- **Pos. II.st.** : No idea

## Section 2
##### Functions for
- **noisy records removal**
- **large zero segments removal**
- **records that do not agree with restrictions** 
(such as casarean records, underage mothers and so on) **removal**
- **noise retrieval** (where noisy values predefined to 0) 

##### are defined

In [27]:
SEGMENT_SIZE = 25 # megalo SEGMENT_SIZE -> ligotero strict
NOISY_THRESH = 0.20 # mikro NOISY_THRESH -> perissotero  strict
NOISE_RETRV_THRESH = 0.25 # mikro NOISE_RETRV_THRESH -> perissotero smoothing
N_PREV_VALS = 10 # for calculating the median of N_PREV_VALS values for retrieve a zero measurment

In [28]:
def drop_restictions(df, name):
    records_to_drop = list()
    cc = ca = cd = cw = 0
    
    for i in range(df.shape[0]):
        if (df.iloc[i]['labels']['Deliv. type'] != 1.0):
            records_to_drop.append(i); cc+=1

        if (df.iloc[i]['labels']['Age'] < 18):
            records_to_drop.append(i); ca+=1

        if (df.iloc[i]['labels']['Hypertension'] == 1 or
            df.iloc[i]['labels']['Preeclampsia'] == 1 or 
            df.iloc[i]['labels']['Diabetes'] == 1):
            records_to_drop.append(i); cd+=1

        if (df.iloc[i]['labels']['Gest. weeks'] < 37.5):
            records_to_drop.append(i); cw+=1
    
    print('--',name,'--')
    print(cw,' underweeked records dropped')
    print(cd,' diseased records dropped')
    print(ca,' underage records dropped')
    print(cc,' casarean records dropped')
    
    print(len(records_to_drop), 'total records dropped')
    return df.drop(records_to_drop).reset_index().drop('index', axis=1)

In [29]:
def drop_noisy_records(df, thresh=0.2):
    noisy_records = list()
    
    for i in range(df.shape[0]):
        record = df.iloc[i]
        perc_zeros = record['Fhr'].count(0) / len(record['Fhr'])
        if( perc_zeros > thresh ):
            noisy_records.append(i)    
    return  df.drop(noisy_records).reset_index().drop('index', axis=1)

In [30]:
def drop_zero_segments(df_input, segment_size = 20):
    df = df_input.copy()
    
    for row_i , fhr in df_input['Fhr'].iteritems():
        fhr_list = list(fhr) # get fhr of record row_i as list
        fhr_list_out = list() # set an empty list
        count = 0
        for j, value in enumerate(fhr_list):     
            if(value == 0):
                count += 1                
            else:
                if(count >= segment_size):
                    fhr_list_out = fhr_list_out[:-count]
                count = 0    
            fhr_list_out.append(value)
        df.at[row_i, 'Fhr'] = fhr_list_out
        
    return df

In [31]:
from statistics import median 

def get_median(lst, index, n_prev_vals):
    medians = list()
    for i in range(index, 0, -1):
        if(lst[i] != 0):
            medians.append(lst[i])
        if(len(medians)==n_prev_vals):
            return median(medians)

    return median(lst)
            
def noise_retrieval(df_input, thres = 0.25, n_prev_vals=3):
    df = df_input.copy()
    for row_i , fhr in df['Fhr'].iteritems():
        fhr_new = list()
        for i in range(len(fhr)):
            prev_values_med = get_median(fhr, i, n_prev_vals)
            if(fhr[i] > prev_values_med + prev_values_med * thres or 
            fhr[i] < prev_values_med - prev_values_med * thres ):
                fhr_new.append(prev_values_med)
            else: 
                fhr_new.append(fhr[i])
        df.at[row_i,'Fhr'] = fhr_new
        
    return df 

In [32]:
df_r = drop_restictions(df,'df')
df_r_n = drop_noisy_records(df_r,thresh=NOISY_THRESH)
df_r_n_zs = drop_zero_segments(df_r_n, segment_size=SEGMENT_SIZE)
df_r_n_zs = df_r_n_zs.set_index('tag')
df_r_n_zs_sm = noise_retrieval(df_r_n_zs, thres=NOISE_RETRV_THRESH, n_prev_vals=N_PREV_VALS)

-- df --
18  underweeked records dropped
86  diseased records dropped
0  underage records dropped
46  casarean records dropped
150 total records dropped


In [36]:
df_r_n_zs_sm

Unnamed: 0_level_0,Fhr,pH,Apgar1,Apgar5,labels
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1002,"[146.25, 146.25, 150.25, 148.5, 148.5, 143.5, ...",7.00,8.0,8.0,Labels pH 7.00 BDecf ...
1004,"[153.75, 153.75, 153.75, 154.75, 154.75, 153.7...",7.30,8.0,9.0,Labels pH 7.30 BDecf ...
1008,"[124.0, 124.0, 124.0, 124.0, 124.0, 124.0, 124...",7.36,8.0,9.0,Labels pH 7.36 BDecf ...
1011,"[149.5, 149.75, 149.75, 150.0, 150.0, 150.25, ...",7.37,8.0,9.0,Labels pH 7.37 BDecf ...
1014,"[145.0, 145.0, 146.0, 145.5, 145.5, 146.5, 146...",7.14,9.0,9.0,Labels pH 7.14 BDecf ...
...,...,...,...,...,...
1499,"[140.25, 136.25, 136.25, 139.5, 140.25, 140.25...",7.24,9.0,10.0,Labels pH 7.24 BDecf ...
1502,"[114.5, 114.5, 114.5, 114.5, 114.5, 114.5, 114...",7.26,9.0,10.0,Labels pH 7.26 BDecf ...
1503,"[127.75, 127.75, 127.75, 131.5, 131.5, 132.75,...",7.31,9.0,10.0,Labels pH 7.31 BDecf ...
1505,"[149.25, 148.5, 148.5, 149.25, 148.5, 148.5, 1...",7.24,9.0,10.0,Labels pH 7.24 BDecf ...


## Section 3
### Experiments

* ##### Methods applied for Heart Rate Variability
- **sdnn**
- **rmssd**
- **sdann** 
- **nn20**

* ##### Types for correlation calculation
- **Pearson**
- **Spearman**
- **Kendall rank** 

In [42]:
def compute_hrv(df_input):
    df = df_input.copy()
    
    sdnn = list(); rmssd = list(); sdann = list(); nn20 = list()
    for row_i, fhr in tqdm(df['Fhr'].iteritems(), total=df.shape[0]):
        sdnn.append(td.sdnn(np.array(df.loc[row_i]['Fhr'])))
        rmssd.append(td.rmssd(np.array(df.loc[row_i]['Fhr'])))
        sdann.append(td.sdann(np.array(df.loc[row_i]['Fhr'])))
        nn20.append(td.nn20(np.array(df.loc[row_i]['Fhr'])))

    df['sdnn'] = sdnn
    df['rmssd'] = rmssd
    df['sdann'] = sdann
    df['nn20'] = nn20
    
    return df 

df_fin = compute_hrv(df_r_n_zs_sm)

  0%|          | 0/234 [00:00<?, ?it/s]

In [43]:
df_fin.head()

Unnamed: 0_level_0,Fhr,pH,Apgar1,Apgar5,labels,sdnn,rmssd,sdann,nn20
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1002,"[146.25, 146.25, 150.25, 148.5, 148.5, 143.5, ...",7.0,8.0,8.0,Labels pH 7.00 BDecf ...,"(24.151511535297974,)","(2.7888464424413266,)","(17.799354547870383,)","(67, 0.3960513093338063)"
1004,"[153.75, 153.75, 153.75, 154.75, 154.75, 153.7...",7.3,8.0,9.0,Labels pH 7.30 BDecf ...,"(22.21615385507634,)","(1.894141207204708,)","(20.63260434269694,)","(23, 0.13713331743381826)"
1008,"[124.0, 124.0, 124.0, 124.0, 124.0, 124.0, 124...",7.36,8.0,9.0,Labels pH 7.36 BDecf ...,"(14.888796067991317,)","(2.232269480346261,)","(8.65294719363377,)","(47, 0.29107574162383104)"
1011,"[149.5, 149.75, 149.75, 150.0, 150.0, 150.25, ...",7.37,8.0,9.0,Labels pH 7.37 BDecf ...,"(22.151290463333712,)","(2.8055033063409454,)","(18.777693193456248,)","(40, 0.26999662504218697)"
1014,"[145.0, 145.0, 146.0, 145.5, 145.5, 146.5, 146...",7.14,9.0,9.0,Labels pH 7.14 BDecf ...,"(23.719983988141312,)","(2.866754432123968,)","(15.362750193511564,)","(93, 0.5473808122424956)"
