### Importing Packages

In [11]:
import pandas as pd
from datetime import datetime
import numpy as np
import pydicom
from PIL import Image
import matplotlib.pyplot as plt

### Defining Data Preprocessing Functions

In [12]:
def date_format(date):
    """Insert dashes into a compact date value so it reads ``YYYY-MM-DD``.

    The argument is passed through ``str`` first, so an integer such as
    ``21800506`` is accepted as well as the equivalent string.

    Returns:
        str: the first four characters, the next two, and everything after
        them, joined with ``-``. No validation of the pieces is performed.
    """
    digits = str(date)
    year, month, day = digits[:4], digits[4:6], digits[6:]
    return "-".join((year, month, day))

In [13]:
def time_format(time):
    """Convert a compact ``HHMMSS[.fraction]`` clock value into ``HH:MM:SS``.

    The argument is passed through ``str`` and anything after the first ``.``
    is discarded. The remaining part is left-padded with zeros to six
    characters, so a value whose leading zero was lost (e.g. ``80556.875``)
    still formats correctly.

    Args:
        time: clock value as a string or number.

    Returns:
        str: the padded value split as ``HH:MM:SS``.

    Raises:
        ValueError: if the part before the ``.`` is longer than six
            characters. Padding can never bring such a value to length six,
            so it is rejected up front instead of looping forever.
    """
    whole_seconds = str(time).split(".")[0]
    if len(whole_seconds) > 6:
        raise ValueError(
            f"time value {time!r} has more than 6 characters before the decimal point"
        )
    # rjust prepends zeros exactly like the manual padding loop did.
    padded = whole_seconds.rjust(6, "0")
    return f"{padded[:2]}:{padded[2:4]}:{padded[4:6]}"

In [14]:
def convert_datetime(input_date):
    """Parse an ISO-formatted date/time string into a ``datetime`` object.

    Thin wrapper around ``datetime.fromisoformat``; any exception it raises
    propagates to the caller unchanged.
    """
    parsed = datetime.fromisoformat(input_date)
    return parsed

### Loading Data

In [15]:
# Read the three external CSV tables; the order of the paths matches the
# order of the names on the left-hand side.
subset, segmented, xray = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/external/subset.csv",
        "../data/external/CXLSeg-segmented.csv",
        "../data/external/mimic-cxr-2.0.0-metadata.csv",
    )
)

### Preprocessing Data

In [16]:
# Parse the admission and discharge timestamps with the vectorized parser.
# pd.to_datetime leaves an already-parsed column unchanged, so this cell can be
# re-run safely; datetime.fromisoformat only accepts strings and would raise
# TypeError on a second run once the columns hold timestamps.
subset["admittime"] = pd.to_datetime(subset["admittime"])
subset["dischtime"] = pd.to_datetime(subset["dischtime"])

In [17]:
# Build a dashed date string and a colon-separated time string for every study,
# then parse the two joined by a space into a single `studytime` value.
# The callable is needed because `studytime` reads the two columns created
# earlier in the same assign call.
xray = xray.assign(
    formatted_date = xray["StudyDate"].apply(date_format),
    formatted_time = xray["StudyTime"].apply(time_format),
    studytime = lambda frame: (frame["formatted_date"] + " " + frame["formatted_time"]).apply(convert_datetime),
)

### Merging

In [None]:
# Subsetting xray dataset to make merge more efficient
xray_merge = xray[["subject_id", "study_id", "ViewPosition", "studytime"]]
# First merge: pairs every row of `subset` with every x-ray row sharing its subject_id
merging = subset.merge(xray_merge, on = "subject_id")
# Matching each xray to hospital admission: keep pairs whose study time falls
# between admission and discharge (both bounds inclusive)
matched_dates = merging[(merging["studytime"] >= merging["admittime"]) & (merging["studytime"] <= merging["dischtime"])].reset_index(drop = True)
# Preprocessing segmented for merging.
# Written as one chain so the derived column is added to a new frame instead of
# being assigned onto a column slice of `segmented` (which triggers
# SettingWithCopyWarning). Missing "No Finding" values are filled with -1 and
# the sign is then flipped, so a missing value yields Abnormal == 1.
segmented_merged = (
    segmented[["subject_id", "study_id", "dicom_id", "DicomPath", "No Finding"]]
    .assign(Abnormal = lambda frame: frame["No Finding"].fillna(-1) * -1)
    .drop(columns = ["No Finding"])
)
# Final merge
complete_merged = matched_dates.merge(segmented_merged, on = ["subject_id", "study_id"])[[
    "subject_id", "hadm_id", "stay_id", "study_id",
    "admittime", "dischtime", "studytime", "ViewPosition",
    "dicom_id", "DicomPath", "Abnormal", "los",
    "chronic_pulmonary_disease", "sepsis3",
]]

complete_merged

### Readmission

In [19]:
# Keep only the rows whose chronic_pulmonary_disease flag equals 1
pulmonary = subset.loc[subset["chronic_pulmonary_disease"].eq(1)]

In [None]:
# Finding readmitted patients
# The self-merge pairs every row (_x) with every row (_y) sharing its subject_id;
# a pair is kept when the _y admission starts after the _x discharge.
# Written as one chain so the gap column is added through assign rather than
# by assigning onto a filtered frame (which triggers SettingWithCopyWarning).
readmission = (
    pulmonary.merge(pulmonary, on = "subject_id")
    # Coerce the four timestamp columns to datetimes (no change if they already are)
    .assign(
        admittime_x = lambda pairs: pd.to_datetime(pairs["admittime_x"]),
        dischtime_x = lambda pairs: pd.to_datetime(pairs["dischtime_x"]),
        admittime_y = lambda pairs: pd.to_datetime(pairs["admittime_y"]),
        dischtime_y = lambda pairs: pd.to_datetime(pairs["dischtime_y"]),
    )
    .loc[lambda pairs: pairs["admittime_y"] > pairs["dischtime_x"]]
    .assign(time_between_readmission = lambda pairs: pairs["admittime_y"] - pairs["dischtime_x"])
)

readmission

In [None]:
# Pairs where the earlier admission (_x) has sepsis3 == False and the later
# admission (_y) has sepsis3 == True, ordered by the gap between them; the gap
# is also exposed as a whole number of days.
sepsis_readmission = (
    readmission.loc[readmission["sepsis3_x"].eq(False) & readmission["sepsis3_y"].eq(True)]
    .sort_values("time_between_readmission")
    .assign(days_between_readmission = lambda frame: frame["time_between_readmission"].dt.days)
)

sepsis_readmission