In [None]:
import datetime
import shutil
from datetime import timedelta
from pathlib import Path
from typing import List

import pandas as pd
import plotly.express as px
import wfdb
from pandarallel import pandarallel
from wfdb.io import Record

In [None]:
# Initialise pandarallel with progress bars enabled; its `parallel_apply`
# is used further down in the notebook.
pandarallel.initialize(progress_bar=True)

# Output folder for the converted dataset. Only the leaf directory is
# created here, so "../ctg" has to exist already.
dataset_path = Path("..") / "ctg" / "dataset"
dataset_path.mkdir(exist_ok=True)

In [None]:
# Load RECORDS.csv; its `record` column drives the per-record header
# parsing performed below.
dataset_records = pd.read_csv(filepath_or_buffer="../ctg/ctu-chb/RECORDS.csv")
dataset_records.head()

In [None]:
# Create a single outcome file
# def convert_to_int(comment_line: str):
#     value = comment_line.split(' ')[-1]
#     if value != 'NaN':
#         return int(value)
#     else:
#         return float('nan')


def read_and_process_outcome(row):
    """Extract outcome and clinical metadata for one record from its WFDB header.

    The header ``../ctg/ctu-chb/<record>`` is read with ``wfdb.rdheader`` and,
    for a fixed set of comment lines, the last space-separated token is
    converted to a number. The comment-line positions are hard-coded, so every
    header is assumed to share the same layout (TODO confirm against the
    database documentation).

    Args:
        row: Mapping-like row (e.g. a row of ``RECORDS.csv``) whose
            ``"record"`` entry holds the record number.

    Returns:
        dict: ``record_no`` followed by one entry per parsed field, in the
        order given by ``field_layout``.

    Raises:
        ValueError: If a comment line is missing or its value cannot be
            converted. The message names the record and the field, and the
            original error is chained as ``__cause__``.
    """
    record_no: int = int(row["record"])
    record_comments: List[str] = wfdb.rdheader(f"../ctg/ctu-chb/{record_no}").comments

    # (output key, index into the header comments, converter).
    # Fields converted with ``float`` accept the literal "NaN"; ``int`` fields
    # do not -- presumably only the float fields can be missing (confirm).
    field_layout = (
        ("ph", 2, float),
        ("bcef", 3, float),
        ("pco2", 4, float),
        ("be", 5, float),
        ("apgar1", 6, int),
        ("apgar5", 7, int),
        ("fetus_age_weeks", 16, int),
        ("fetus_weight_grams", 17, float),
        ("fetus_sex", 18, int),
        ("mother_age_years", 20, int),
        ("mother_gravidity", 21, float),
        ("mother_parity", 22, int),
        ("mother_diabetes", 23, int),
        ("mother_hypertension", 24, int),
        ("mother_preeclampsia", 25, int),
        ("mother_praecox", 26, int),
        ("mother_pyrexia", 27, int),
        ("mother_meconim", 28, int),
    )

    outcome = {"record_no": record_no}
    for key, comment_idx, convert in field_layout:
        try:
            outcome[key] = convert(record_comments[comment_idx].split(" ")[-1])
        except Exception as e:
            # Catch ``Exception`` rather than ``BaseException`` so that
            # KeyboardInterrupt / SystemExit are not turned into ValueError,
            # and keep the failing record, field and cause in the error.
            raise ValueError(
                f"Record {record_no}: cannot parse '{key}' from header comment "
                f"line {comment_idx}: {e!r}"
            ) from e

    return outcome


# Parse every record header, stack the resulting dicts into one table and
# store it in the dataset folder.
outcome_df = pd.DataFrame.from_records(
    dataset_records.apply(read_and_process_outcome, axis=1).to_list()
)
outcome_df.to_parquet("../ctg/dataset/outcome.parquet")
outcome_df.head()

In [None]:
# Convert each record to parquet
def convert_record_to_parquet(row):
    """Write the signals of one WFDB record to a parquet file.

    Reads ``../ctg/ctu-chb/<record_no>`` with ``wfdb.rdrecord`` and stores the
    record's DataFrame form as ``../ctg/dataset/<record_no>.parquet``.

    Args:
        row: Row with a ``"record_no"`` entry (e.g. a row of ``outcome_df``).

    Returns:
        None
    """
    number = int(row["record_no"])
    source = f"../ctg/ctu-chb/{number}"
    target = f"../ctg/dataset/{number}.parquet"
    wfdb.rdrecord(source).to_dataframe().to_parquet(target)


# Convert every record listed in the outcome table, one row at a time
outcome_df.apply(func=convert_record_to_parquet, axis=1)

In [None]:
# Delete wfdb db
def delete_unused_items(dir: Path):
    """Remove every entry inside ``dir`` and then ``dir`` itself.

    Real sub-directories are removed recursively. Everything else -- regular
    files, symbolic links (dangling ones and links to directories included)
    and other special entries -- is unlinked, so the target of a link is never
    followed or deleted.

    Args:
        dir: Directory to delete. The name shadows the ``dir`` builtin; it is
            kept so that keyword callers keep working.

    Raises:
        FileNotFoundError: If ``dir`` does not exist.
        NotADirectoryError: If ``dir`` is not a directory.
        OSError: If an entry cannot be removed.
    """
    for entry in dir.iterdir():
        # is_dir() follows symlinks, and shutil.rmtree refuses to operate on a
        # symlink, so links must be removed with unlink() instead.
        if entry.is_dir() and not entry.is_symlink():
            shutil.rmtree(entry)
        else:
            # Also covers dangling links and special files, which would
            # otherwise be left behind and make the final rmdir() fail.
            entry.unlink()
    dir.rmdir()


# NOTE: destructive and irreversible -- once the raw WFDB database is removed,
# the cells above that read from "../ctg/ctu-chb" can no longer be re-run.
wfdb_path = Path("../ctg/ctu-chb/")
# Guard so that re-running this cell after a successful clean-up is a no-op
# instead of a FileNotFoundError raised while listing the missing directory.
if wfdb_path.exists():
    delete_unused_items(wfdb_path)

In [None]:
# Add CTG length to outcomes: start from the outcome table stored on disk
outcome_df = pd.read_parquet(path="../ctg/dataset/outcome.parquet")
# Not the last expression of this cell, so the preview is not rendered
outcome_df.head()


# Add length of CTG
def convert_record_to_parquet(row):
    """Return ``row`` extended with the number of rows of its CTG parquet file.

    NOTE(review): this definition reuses the name of the WFDB -> parquet
    converter defined earlier in the notebook and shadows it from here on. A
    distinct name (e.g. ``add_no_of_points``) would be clearer; the name is
    kept so that existing calls keep working.

    Args:
        row: Outcome row with a ``"record_no"`` entry.

    Returns:
        ``row`` itself when it already has a ``"no_of_points"`` entry;
        otherwise a copy of ``row`` with ``"no_of_points"`` set to the row
        count of ``../ctg/dataset/<record_no>.parquet``.
    """
    if "no_of_points" in row:
        return row

    record_no: int = int(row["record_no"])
    record_df = pd.read_parquet(f"../ctg/dataset/{record_no}.parquet")

    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    row = row.copy()
    row["no_of_points"] = record_df.shape[0]
    return row


# Attach the per-record row count and overwrite the stored outcome table
outcome_df = outcome_df.apply(func=convert_record_to_parquet, axis=1)
outcome_df.to_parquet(path="../ctg/dataset/outcome.parquet")

In [None]:
# Distribution of CTG record lengths; `px` is imported with the other
# dependencies at the top of the notebook.
fig = px.histogram(
    outcome_df,
    x="no_of_points",
    title="Number of samples per CTG record",
)
fig.show()

In [None]:
# Expand outcome
outcome_df = pd.read_parquet(path="../ctg/dataset/outcome.parquet")
# Not the last expression of this cell, so the preview is not rendered
outcome_df.head()

# Window geometry read by expand_outcome_normal. Stride equals length, so
# each window starts where the previous one ends.
WINDOW_LENGTH = timedelta(minutes=10)
WINDOW_STRIDE = timedelta(minutes=10)


def expand_outcome_normal(data_dir: Path = Path("../ctg-data/final-ctg-dataset")):
    """Expand the preprocessed outcome table into one row per CTG window.

    For every row of ``<data_dir>/outcomes_preprocessed.parquet`` and every
    segment listed under ``segments_information["segments"]``, windows of
    ``WINDOW_LENGTH`` are laid out from the segment start in steps of
    ``WINDOW_STRIDE``. Only windows that end at or before the segment end are
    kept. The expanded table is written to
    ``<data_dir>/outcome_expanded.parquet``.

    Args:
        data_dir: Dataset root holding ``outcomes_preprocessed.parquet`` and
            ``ctgs/filtered_segments/<ID>_<segment index>.parquet``. Defaults
            to the location that used to be hard-coded.

    Returns:
        None. The shapes of the input table and of the expanded table are
        printed.
    """
    # Output columns, in the order they appear in the expanded table.
    columns = (
        "identifier",
        "start",
        "end",
        "datetime_of_birth",
        "ns_art_ph",
        "no_of_points",
        "segment_number",
        "window_number",
    )

    outcome = pd.read_parquet(data_dir / "outcomes_preprocessed.parquet")
    print(outcome.shape)

    def expand_outcome_row(row):
        """Return the window columns (as parallel lists) for one outcome row."""
        segments = row["segments_information"]["segments"]
        identifier: str = row["ID"]
        datetime_of_birth: datetime.datetime = row["datetime_of_birth"]
        pH = row["ns_art_ph"]

        windows = {column: [] for column in columns}

        for idx, segment in enumerate(segments):
            segment_start = segment["start"]
            segment_end = segment["end"]
            ctg_segment = pd.read_parquet(
                data_dir / "ctgs" / "filtered_segments" / f"{identifier}_{idx}.parquet"
            )

            window_roll = 0
            window_start = segment_start
            window_end = window_start + WINDOW_LENGTH

            while window_end <= segment_end:
                windows["identifier"].append(identifier)
                windows["start"].append(window_start)
                windows["end"].append(window_end)
                windows["datetime_of_birth"].append(datetime_of_birth)
                windows["ns_art_ph"].append(pH)
                # NOTE(review): if the segment frame is indexed by timestamp,
                # this label slice includes both endpoints, so a sample lying
                # exactly on a window boundary is counted in two adjacent
                # windows -- confirm this is intended.
                windows["no_of_points"].append(
                    ctg_segment[window_start:window_end].shape[0]
                )
                windows["segment_number"].append(idx)
                # Window numbers are 1-based within their segment.
                windows["window_number"].append(window_roll + 1)

                # Move window
                window_roll = window_roll + 1
                window_start = segment_start + (window_roll * WINDOW_STRIDE)
                window_end = window_start + WINDOW_LENGTH

        return windows

    outcome_expanded = outcome.parallel_apply(expand_outcome_row, axis=1)

    # Concatenate the per-row column lists into one list per column.
    expanded_dict = {column: [] for column in columns}
    for result in outcome_expanded:
        for column in columns:
            expanded_dict[column].extend(result[column])

    expanded_df = pd.DataFrame.from_dict(expanded_dict)
    print(expanded_df.shape)
    expanded_df.to_parquet(data_dir / "outcome_expanded.parquet")


# Run the window expansion; the result is written to outcome_expanded.parquet
expand_outcome_normal()