In [None]:
import os
import re
from pathlib import Path
from statistics import StatisticsError, mode

import numpy as np
import pandas as pd
pd.options.display.max_columns = 100

### Prepare Labels

In [None]:
def try_mode(l):
    """Return the most common value of ``l``, falling back to its median.

    Parameters
    ----------
    l : sequence or 1-D array of hashable numeric values.

    Returns
    -------
    The value returned by ``statistics.mode``; if that raises
    ``StatisticsError`` the median of ``l`` truncated to ``int`` is
    returned instead.

    Notes
    -----
    Only ``StatisticsError`` triggers the fallback. Any other exception
    (for example a ``TypeError`` from unhashable items) propagates instead
    of being silently turned into a label.
    NOTE(review): on Python >= 3.8 ``statistics.mode`` resolves ties by
    returning the first mode encountered, so the median fallback is only
    reached on older interpreters (or for empty input) - confirm which
    behaviour is intended for tied votes.
    """
    try:
        return mode(l)
    except StatisticsError:
        # No single mode could be determined: use the truncated median.
        return int(np.median(l))

In [None]:
df = pd.read_csv('../data/label/labels.csv')

In [None]:
df.replace(-1,0,inplace=True)

In [None]:
v = 1003
df[df.rec_id==v]

In [None]:
t = np.sort(df[df.rec_id==v].eval_step1.values)
t

In [None]:
(np.mean(t)),(np.median(t)),try_mode(t)

In [None]:
records = sorted(np.unique(df.rec_id.values))

In [None]:
data_list = []

In [None]:
# Consensus label per recording: one (rec_id, step1, step2, step3, step4)
# tuple each, using try_mode over all evaluations of that recording.
# The list is rebuilt from scratch in this cell so that running the cell a
# second time cannot append a duplicate copy of every row.
step_columns = ['eval_step1', 'eval_step2', 'eval_step3', 'eval_step4']
data_list = []
for rec in records:
    temp_df = df[df.rec_id == rec]
    data_list.append((rec, *(try_mode(temp_df[col].values) for col in step_columns)))

In [None]:
labels = pd.DataFrame(data_list,columns=['rec_id','step1','step2','step3','step4'])
labels.set_index('rec_id',inplace=True)

In [None]:
labels.head()

In [None]:
labels.to_csv('../data/label/label_df.csv')

### Read pH

In [None]:
# Reload the aggregated labels from disk; rec_id comes back as an ordinary column.
labels = pd.read_csv(filepath_or_buffer='../data/label/label_df.csv')

In [None]:
def read_info(location, skip_lines=7):
    """Parse the ``#``-prefixed numeric fields of a ``.hea`` header file.

    Parameters
    ----------
    location : str or path-like
        Path of the record *without* extension; ``'.hea'`` is appended.
    skip_lines : int, default 7
        Number of leading lines to ignore before parsing starts.

    Returns
    -------
    dict
        Maps the first word (``\\w+``) of each parsed line to the first number
        that follows it, kept as a string (e.g. ``{'pH': '7.14'}``). Lines
        without ``#`` or without a number after the key are skipped.

    Notes
    -----
    The number is searched only *after* the key, so digits that are part of
    the key (``pCO2``, ``Apgar1``) are not mistaken for the value, and the
    optional sign applies to integers as well as decimals.
    NOTE(review): assumes each field line is laid out as "#<name> <value>" -
    confirm against the actual header files.
    """
    number = re.compile(r"[-+]?(?:\d*\.\d+|\d+)")
    info = dict()
    # ``with`` closes the handle even if reading raises.
    with open(str(location)+'.hea') as f:
        lines = f.readlines()
    for line in lines[skip_lines:]:
        if '#' not in line:
            continue
        key = re.search(r'\w+', line)
        if key is None:
            continue
        value = number.search(line, key.end())
        if value is not None:
            info[key.group()] = value.group()
    return info

In [None]:
data_path = Path('../data')
# Extension-less path (as str) of every *.txt file in data_path, in sorted
# name order. The test is on the real suffix: a substring check would also
# accept names such as "x.txt.bak" and then strip the wrong four characters.
file_names = [
    str(data_path / name[:-len('.txt')])
    for name in sorted(os.listdir(data_path))
    if name.endswith('.txt')
]

In [None]:
# Pair each recording id with the pH value parsed from its .hea header.
pH_list = []
for file in file_names:
    info = read_info(file)
    # Path(...).name drops the directory using the platform's own separator;
    # splitting on '/' fails where str(Path) is built with backslashes.
    rec = int(Path(file).name)
    pH = float(info['pH'])
    pH_list.append([rec, pH])

In [None]:
# pH_list

In [None]:
# rec_id-indexed frame holding the pH value read for each recording.
pH_df = pd.DataFrame(pH_list, columns=['rec_id', 'pH_orig']).set_index('rec_id')
pH_df.head(5)

In [None]:
labels.set_index('rec_id',inplace=True)

In [None]:
labels_with_pH = pd.concat([labels, pH_df], axis=1)

In [None]:
labels_with_pH.head()

#### pH Classification

From step 1 to 3, the clinicians evaluated CTG recordings as
normal, suspicious, pathological or uninterpretable according to
their daily practice. In step 4, clinicians predicted a labour outcome
(umbilical artery pH after delivery), divided into four categories
as no hypoxia (normal), mild hypoxia (suspicious), severe
hypoxia (pathological) or uninterpretable. During the introduction of
CTGAnnotator these classes were described in terms of pH value:


`(pH > 7.15 for no hypoxia; 7.15 ≥ pH > 7.05 for mild hypoxia and pH ≤ 7.05 for severe hypoxia)`.

For the step 4, occurrences of risk
factors, as written in the patient record, were provided in addition to
the basic clinical information described earlier. It included presence
of diabetes, fever, hypertension, preeclampsia, meconium stained
amniotic fluid or induction of labour.`Source: pdf_file`


1. `eval_step1-3` - annotation of steps 1 to 3:
       values (normal=1, suspicious=2, pathological=3, uninterpretable=-1)
2. `eval_step4` - annotation of step 4:
       values (no hypoxia=1, mild hypoxia=2, severe hypoxia=3, uninterpretable=-1)

Note: in this notebook every `-1` is replaced by `0` when the labels are loaded.

In [None]:
def classify_pH(pH):
    """Map a pH value to its class code.

    Returns 1 for ``pH > 7.15``, 2 for ``7.05 < pH <= 7.15`` and 3 for
    ``pH <= 7.05``. A value that satisfies none of these comparisons
    (e.g. NaN) yields ``None``.
    """
    if pH > 7.15:
        return 1
    if pH > 7.05:
        # Not above 7.15 at this point, so this is the middle band.
        return 2
    if pH <= 7.05:
        return 3
    return None

In [None]:
labels_with_pH['pH'] = labels_with_pH.pH_orig.apply(classify_pH)

In [None]:
labels_with_pH.head()

#### Total Cases

##### Normal , no hypoxia , pH > 7.15 , class = 1

In [None]:
# Number of recordings whose pH class is 1.
int((labels_with_pH['pH'] == 1).sum())

##### Suspicious, mild hypoxia, 7.15 ≥ pH > 7.05, class = 2

In [None]:
# Number of recordings whose pH class is 2.
int((labels_with_pH['pH'] == 2).sum())

##### Pathological,severe hypoxia, pH ≤ 7.05, class = 3

In [None]:
len(labels_with_pH[labels_with_pH.pH==3])

In [None]:
labels_with_pH.to_csv('../data/label/labels_with_pH.csv')

### Inspect Labels

In [None]:
labels = pd.read_csv('../data/label/labels_with_pH.csv')

In [None]:
labels.head()

In [None]:
len(labels[labels.step1==labels.step2])

In [None]:
len(labels[labels.step2==labels.step3])

In [None]:
label_1_eq_2 = labels[labels.step1==labels.step2]
label_2_eq_3 = labels[labels.step2==labels.step3]
label_1_eq_2_eq_3 = label_1_eq_2[label_1_eq_2.step2==label_1_eq_2.step3]
label_1_eq_2_eq_3_eq_pH = label_1_eq_2_eq_3[label_1_eq_2_eq_3.step3==label_1_eq_2_eq_3.pH]

In [None]:
lenghts = len(label_1_eq_2),len(label_2_eq_3),len(label_1_eq_2_eq_3),len(label_1_eq_2_eq_3_eq_pH)
lenghts

In [None]:
s = 0
for l in lenghts: s+= l #sum of lengths
s

### Accuracy of Medical Experts

This is computed by comparing step 4 (the experts' prediction) with the class derived from the actual pH.

In [None]:
# Fraction of recordings whose step-4 label equals the pH class.
n_match = int((labels['step4'] == labels['pH']).sum())
n_match / len(labels)