In [15]:
import os, sys, random, datetime, json

from pyts.image import GramianAngularField

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [27]:
# Root folder that holds the patient info file and the per-cot trend folders.
# Set the NICU_DATA_DIR environment variable to point somewhere else; when it
# is unset the previous hard-coded location is used, so existing runs behave
# exactly as before.
BASEFILEPATH = os.environ.get("NICU_DATA_DIR", "/Users/phillip/Sync/NICU Datasets")

In [28]:
# patientInfo.json is a json file that contains information of patients staying
# in hospital and their records of being diagnosed as infected.
# NOTE(review): PATIENTINFO is not assigned in any cell shown in this notebook;
# presumably it holds the name of that json file - confirm it is defined before
# this cell runs, otherwise a fresh "Restart & Run All" fails with NameError.
patientInfoDict = {}
# load patient info from json file; the encoding is given explicitly so that the
# result does not depend on the platform's default text encoding
with open("{}/{}".format(BASEFILEPATH, PATIENTINFO), encoding="utf-8") as f:
    patientInfoDict = json.load(f)

In [29]:
def isClean(data: "ndarray") -> bool:
    """
    Tell whether the given values are free of the known noise strings.

    :param data: a 2-d ndarray retrieved by calling df.values
    :return: True when none of the noise strings occurs in ``data``, False as
        soon as one of them is found
    """
    noiseTokens = ("***", "+++", "APN", "ASY", " VF", "   ", "LRN", "---")
    return not any(token in data for token in noiseTokens)


def whatIsTheNoise(data: "ndarray") -> str:
    """
    Report which of the known noise strings occur in the input data.

    :param data: a 2-d ndarray retrieved by calling df.values
    :return: every noise string that was found, each followed by ", ", in a
        fixed order; "All Clear!" when none was found
    """
    noiseTokens = ("***", "+++", "APN", "ASY", " VF", "   ", "LRN", "---")
    found = [token for token in noiseTokens if token in data]

    if not found:
        return "All Clear!"

    return "".join(token + ", " for token in found)


def calcMissingLine(df: "DataFrame") -> int:
    """
    calculate # of lines with missing values.

    A cell counts as missing when it is NaN or equals one of the noise strings
    listed in ``noiseTokens``. The input frame is only read, never modified.

    :param df: the DataFrame to be analysed
    :return: # lines with missing values (a plain Python int)
    """
    noiseTokens = ["***", "+++", "APN", "ASY", " VF", "   ", "LRN", "---"]

    # True wherever a cell is missing, either as NaN or as a noise string
    missingMask = df.isna() | df.isin(noiseTokens)

    return int(missingMask.any(axis=1).sum())


def calcMissingValue(df: "DataFrame") -> int:
    """
    calculate the number of missing values.

    A cell counts as missing when it is NaN or equals one of the noise strings
    listed in ``noiseTokens``. The result is a count, not a ratio: divide it by
    ``df.size`` to get the share of missing values. The input frame is only
    read, never modified.

    :param df: the DataFrame to be analysed
    :return: # missing values (a plain Python int)
    """
    noiseTokens = ["***", "+++", "APN", "ASY", " VF", "   ", "LRN", "---"]

    # True wherever a cell is missing, either as NaN or as a noise string
    missingMask = df.isna() | df.isin(noiseTokens)

    return int(missingMask.sum().sum())


def calcMissingRatio(dataframe):
    """
    calculate the missing ratio by line.

    Only the literal noise strings listed in ``noiseTokens`` are looked for;
    cells that already hold NaN are not counted by this function.

    :param dataframe: the DataFrame to be analysed
    :return: # lines with missing values / # total lines; 0.0 for a frame
        without any line (instead of failing with a division by zero)
    """
    totalLines = len(dataframe)
    if totalLines == 0:
        return 0.0

    noiseTokens = ("***", "+++", "APN", "ASY", " VF", "   ", "LRN", "---")
    noisyLines = sum(
        1
        for row in dataframe.values
        if any(token in row for token in noiseTokens)
    )
    return noisyLines / totalLines


def fillMissing(dataframe):
    """
    fill in missing values and return the result as a new float64 DataFrame.

    The noise strings listed in ``noiseTokens`` are turned into NaN, every
    column is cast to float64, and the gaps are filled
    first with linear interpolation (in both directions);
    if it does not work they are filled with the nearest value.

    The input frame is left unmodified; use the returned frame.

    :param dataframe: the DataFrame to be filled
    :return: a new DataFrame of dtype float64
    :raises ValueError: from ``astype`` when a cell holds text that is neither
        numeric nor one of the noise strings
    """
    noiseTokens = ["***", "+++", "APN", "ASY", " VF", "   ", "LRN", "---"]

    # mask() puts NaN wherever a noise string sits, without touching the input
    filled = dataframe.mask(dataframe.isin(noiseTokens)).astype("float64")

    filled = filled.interpolate(limit_direction="both")
    # NOTE(review): the linear pass above appears to fill every gap of a column
    # that has at least one valid value, so this pass looks like a safety net
    # only - confirm before removing it.
    filled = filled.interpolate(method="nearest")

    return filled

In [50]:
# Image size; it appears in the names of the saved .npy files.
# NOTE(review): presumably the side length of the images generated from the
# blocks later on - confirm.
IMAGESIZE = 227
# Number of consecutive rows taken for every extracted data block; kept equal
# to IMAGESIZE.
WINDOWSIZE = IMAGESIZE

In [51]:
normalData_arr = []
infectedData_arr = []

# columns kept from every trend file
TREND_COLUMNS = ["HR", "SpO2", "25255", "RESP", "PLS"]
# offset (in rows) between the starts of two consecutive data blocks
WINDOWSTEP = 30


def _expandDates(periods):
    """Return the set of YYYYMMDD strings of every day covered by ``periods`` (both ends included)."""
    oneDay = datetime.timedelta(days=1)
    days = set()  # a set gets rid of the duplicates of overlapping periods
    for period in periods:
        current = datetime.datetime.strptime(period["start_date"], "%Y%m%d")
        last = datetime.datetime.strptime(period["end_date"], "%Y%m%d")
        while current <= last:
            days.add(current.strftime("%Y%m%d"))
            current += oneDay
    return days


def _loadTrend(cot, date):
    """Read trend.txt (followed by trend1.txt when it exists) of one cot and day, keeping TREND_COLUMNS only."""
    dayDir = "%s/%s/%s" % (BASEFILEPATH, "Cot" + cot, date)
    parts = [pd.read_csv("%s/trend.txt" % dayDir)[TREND_COLUMNS]]
    if os.path.exists("%s/trend1.txt" % dayDir):
        parts.append(pd.read_csv("%s/trend1.txt" % dayDir)[TREND_COLUMNS])
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
    return pd.concat(parts, ignore_index=True)


# iterate items in patientInfo.json
for name in patientInfoDict:
    if name == "unknown":
        continue
    for entry in patientInfoDict[name]:
        # the dates that the patient is in hospital; sorted so that the order
        # of the saved blocks is the same on every run
        durationList = sorted(_expandDates(entry["duration"]))
        # the dates that the patient is infected
        sepsisList = _expandDates(entry["sepsis"])

        for date in durationList:
            temp = _loadTrend(entry["cot"], date)

            # check if the length of data is larger than window size
            if len(temp) < WINDOWSIZE:
                continue

            # we only use data with missing rate less than 10%
            if calcMissingRatio(temp) >= 0.1:
                continue

            # zeros are treated as missing as well
            # NOTE(review): this runs before the cast to float, so a textual
            # "0" in a column that was read as strings is not matched - confirm
            # whether that is intended.
            temp.replace(0, np.nan, inplace=True)
            temp = fillMissing(temp)

            # skip the whole day when gaps are left after filling; the check
            # does not depend on the window, so it is done once per day
            if temp.isna().any().any():
                continue

            # days with a record of sepsis go to the infected set
            target = infectedData_arr if date in sepsisList else normalData_arr

            # extract data block of WINDOWSIZE size; the "+ 1" keeps the last
            # full window, so a day of exactly WINDOWSIZE rows yields one block
            for i in range(0, len(temp) - WINDOWSIZE + 1, WINDOWSTEP):
                target.append(temp.iloc[i : i + WINDOWSIZE, :].values)

# save the processed data to hard drive
np.save(
    "normal_gramian_nicu_{}_{}.npy".format(IMAGESIZE, len(TREND_COLUMNS)),
    np.array(normalData_arr),
)
np.save(
    "sepsis_gramian_nicu_{}_{}.npy".format(IMAGESIZE, len(TREND_COLUMNS)),
    np.array(infectedData_arr),
)

  "***" in i
  or "+++" in i
  or "APN" in i
  or "ASY" in i
  or " VF" in i
  or "   " in i
  or "LRN" in i
  or "---" in i


In [47]:
# sanity check: show every normal block that still holds at least one NaN
for normalBlock in normalData_arr:
    hasNaN = np.isnan(normalBlock).any()
    if hasNaN:
        print(normalBlock)