# Extraction of Admission and Discharge Dates

Each year the EHR are build a little different, therefore I will go through each of the years one by one and extracting the features that interest me. To start, I will be working on only extracting the dates.

In [7]:
import functions
import os
import pandas as pd
from xml.dom import minidom
import datetime

## 2006 - COMPLICATED

#### Get DF

In [8]:
root_06 = "Data/2006"
list_of_txt_06 = functions.search_for_files(root_06, ".xml")

EHR_06 = {}
for item in list_of_txt_06:
    file = minidom.parse(item)
    text = file.getElementsByTagName("TEXT")
    ID = file.getElementsByTagName("RECORD")
    for txt, idx in zip(text, ID):
        ls = []
        for element in txt.childNodes:
            if type(element).attributes == None:
                ls.append(element.data)
            else:
                ls.append(element.firstChild.data)
        EHR_06[f'06_{int(idx.attributes["ID"].value)}'] = "".join(ls)
        
df_06 = pd.DataFrame.from_dict(EHR_06, orient="index").sort_index()
#df_06

#### Get features

In [9]:
# Get a dataframe with multiple different features

EHR_imp_06 = {}


for k, txt in EHR_06.items():
    EHR_imp_06[k] = {
        "text": txt,
        "Admission Date": None,
        "Discharge Date": None,
    }
    if 'Admission Date' in txt:
        DD = txt.split('Admission Date')[1].split('\n')
        #print(k, DD[1])
        EHR_imp_06[k]["Admission Date"] = DD[1].strip()
    if 'ADMISSION DATE' in txt:
        DD = txt.split('ADMISSION DATE')[1].split('\n')
        #print(k, DD[1])
        EHR_imp_06[k]["Admission Date"] = DD[1].strip()
    if 'Discharge Date' in txt:
        DD = txt.split('Discharge Date')[1].split('\n')
        EHR_imp_06[k]["Discharge Date"] = DD[1].strip()
    if 'DISCHARGE DATE' in txt:
        DD = txt.split('DISCHARGE DATE')[1].split('\n')
        EHR_imp_06[k]["Discharge Date"] = DD[1].strip()
        #EHR_imp_08[k]["Admission Date"] = important_info[9].split(" ")[3]
        #EHR_imp_08[k]["Discharge Date"] = DD if DD != "" else None
    
df_06_imp = pd.DataFrame.from_dict(EHR_imp_06, orient="index")
#df_06_imp

#### Additional statistical info

In [10]:
# Extra safety use regular expression check
Admin_dt = []
Disc_dt = []
for date in df_06_imp["Admission Date"]:
    try:
        a = datetime.datetime.strptime(date, "%m/%d/%Y")
    except:
        try:
            a = datetime.datetime.strptime(date, "%m-%d-%y")
        except:
            try:
                a = datetime.datetime.strptime(date, "%m/%d/%Y")
            except:
                Admin_dt.append(None)
            else:
                Admin_dt.append(datetime.date(a.year, a.month, a.day))
        else:
            Admin_dt.append(datetime.date(a.year, a.month, a.day))
    else:
        Admin_dt.append(datetime.date(a.year, a.month, a.day))

for date in df_06_imp["Discharge Date"]:
    try:
        a = datetime.datetime.strptime(date, "%m/%d/%Y")
    except:
        try:
            a = datetime.datetime.strptime(date, "%m-%d-%y")
        except:
            try:
                a = datetime.datetime.strptime(date, "%m/%d/%Y")
            except:
                Disc_dt.append(None)
            else:
                Disc_dt.append(datetime.date(a.year, a.month, a.day))
        else:
            Disc_dt.append(datetime.date(a.year, a.month, a.day))
    else:
        Disc_dt.append(datetime.date(a.year, a.month, a.day))

df_06_imp["Admission Date"] = Admin_dt
df_06_imp["Discharge Date"] = Disc_dt
df_06_imp["Time in Hosp"] = [
    dt - at if dt != None and at != None else None for dt, at in zip(Disc_dt, Admin_dt)
]

In [11]:
date_in_06 = (
    (df_06_imp.shape[0] - df_06_imp["Admission Date"].isna().sum())
    * 100
    / df_06_imp.shape[0]
)
date_out_06 = (
    (df_06_imp.shape[0] - df_06_imp["Discharge Date"].isna().sum())
    * 100
    / df_06_imp.shape[0]
)
print(
    f"The percentage of Admission dates is: {date_in_06} and the percentage of Discharge dates is: {date_out_06}"
)

The percentage of Admission dates is: 59.14567360350493 and the percentage of Discharge dates is: 57.28368017524644


## 2008

The files are built the same with written: "Admission Date: %%/%%/%%%%" for each date

#### Get DF

In [12]:
# Obtain DF for that year (from previous notebook)
root_08 = "Data/2008"
list_of_txt_08 = functions.search_for_files(root_08, ".xml")

EHR_08 = {}

for item in list_of_txt_08:
    file = minidom.parse(item)
    text = file.getElementsByTagName("text")
    ID = file.getElementsByTagName("doc")
    for txt, idx in zip(text, ID):
        # print(idx.attributes['id'].value,)
        EHR_08[f"08_{idx.attributes['id'].value}"] = txt.firstChild.data

df_08 = pd.DataFrame.from_dict(EHR_08, orient="index").sort_index()
# df_08

#### Obtain additional info

In [13]:
# Get a dataframe with multiple different features

EHR_imp_08 = {}

for k, txt in EHR_08.items():
    EHR_imp_08[k] = {
        "text": None,
        "ID_1": None,
        "Hospital": None,
        "ID_2": None,
        "blank": None,
        "ID_3": None,
        "Date in/out?": None,
        "Discharge Summary": None,
        "S/U": None,
        "DIS": None,
        "Admission Date": None,
        "Report Status": None,
        "Discharge Date": None,
    }
    important_info = txt.split("\n")[1].split("|")
    DD = txt.split("Discharge Date:")[1].split("\n")[0]
    if len(important_info) == 10:
        EHR_imp_08[k]["text"] = txt
        EHR_imp_08[k]["ID_1"] = important_info[0]
        EHR_imp_08[k]["Hospital"] = important_info[1]
        EHR_imp_08[k]["ID_2"] = important_info[2]
        EHR_imp_08[k]["blank"] = important_info[3]
        EHR_imp_08[k]["ID_3"] = important_info[4]
        EHR_imp_08[k]["Date in/out?"] = important_info[5]
        EHR_imp_08[k]["Discharge Summary"] = important_info[6]
        EHR_imp_08[k]["S/U"] = important_info[7]
        EHR_imp_08[k]["DIS"] = important_info[8]
        EHR_imp_08[k]["Admission Date"] = important_info[9].split(" ")[3]
        # dmy = important_info[9].split(' ')[3].split('/')
        # EHR_imp_08[k]['Admission DateTime'] = datetime(year = int(dmy[2]), month = int(dmy[0]), day = int(dmy[1])) if len(dmy) == 3 and (dmy[0] != '2' and dmy[1] != '29') else None
        EHR_imp_08[k]["Report Status"] = (
            important_info[9].split(" ")[6]
            if len(important_info[9].split(" ")) == 7
            else None
        )
        EHR_imp_08[k]["Discharge Date"] = DD if DD != "" else None

df_08_imp = pd.DataFrame.from_dict(EHR_imp_08, orient="index")
# df_08_imp

#### Stats about this dataframe

In [14]:
# Extra safety use regular expression check
Admin_dt = []
Disc_dt = []
for date in df_08_imp["Admission Date"]:
    try:
        a = datetime.datetime.strptime(date, "%m/%d/%Y")
    except:
        Admin_dt.append(None)
    else:
        Admin_dt.append(datetime.date(a.year, a.month, a.day))

for date in df_08_imp["Discharge Date"]:
    try:
        a = datetime.datetime.strptime(date, " %m/%d/%Y")
    except:
        Disc_dt.append(None)
    else:
        Disc_dt.append(datetime.date(a.year, a.month, a.day))

df_08_imp["Admission Date"] = Admin_dt
df_08_imp["Discharge Date"] = Disc_dt
# df_08_imp["Time in Hosp"] = [
#    dt - at if dt != None and at != None else None for dt, at in zip(Disc_dt, Admin_dt)
# ]
# df_08_imp["Discharge Summary"] = df_08_imp["Discharge Summary"].str.lower()

In [15]:
# TRY TO GET A DICTIONNARY WHERE WE ASSOCIATE A STATUS TO AN AMOUNT OF TIME IN THE HOSPITAL
# DS = {}
# for _, row in df_08_imp.iterrows():
#     if row['Discharge Summary'] not in DS:
#         DS[row['Discharge Summary']] = [row['Time in Hosp']]
#     else:
#         DS[row['Discharge Summary']].append(row['Time in Hosp'])

In [16]:
date_in_08 = (
    (df_08_imp.shape[0] - df_08_imp["Admission Date"].isna().sum())
    * 100
    / df_08_imp.shape[0]
)
date_out_08 = (
    (df_08_imp.shape[0] - df_08_imp["Discharge Date"].isna().sum())
    * 100
    / df_08_imp.shape[0]
)
print(
    f"The percentage of Admission dates is: {date_in_08} and the percentage of Discharge dates is: {date_out_08}"
)

The percentage of Admission dates is: 99.02991107518189 and the percentage of Discharge dates is: 93.85610347615199


## 2009

#### Get DF

In [17]:
# Get all the files
root_09_bis = "Data/2009_Bis"
list_of_txt_09_bis = functions.search_for_files(root_09_bis, ".txt")

# Get their record ID and put the in a dictionnary
EHR_09_bis = {}

for file in list_of_txt_09_bis:
    with open(file, encoding="utf-8-sig") as f:
        lines = f.readlines()
    ID = lines[0].split(" ")[1][1:-1]
    EHR_09_bis[f"09_{ID}"] = "".join(lines)

df_09_bis = pd.DataFrame.from_dict(EHR_09_bis, orient="index").sort_index()
# df_09_bis

#### Get Features

In [18]:
EHR_imp_09 = {}

for k, txt in EHR_09_bis.items():
    EHR_imp_09[k] = {
        "text": None,
        "ID_1": None,
        "Hospital": None,
        "ID_2": None,
        "blank": None,
        "ID_3": None,
        "Date in/out?": None,
        "Discharge Summary": None,
        "S/U": None,
        "DIS": None,
        "Admission Date": None,
        "Report Status": None,
        "Discharge Date": None,
    }
    important_info = txt.split("\n")
    clean_info = [x for x in important_info if x != ""]
    important_info = clean_info[1].split("|")
    DD = (
        clean_info[2].split("Discharge Date:")[1]
        if len(clean_info[2].split("Discharge Date:")) == 2
        else None
    )
    if len(important_info) == 10:
        EHR_imp_09[k]["text"] = txt
        EHR_imp_09[k]["ID_1"] = important_info[0]
        EHR_imp_09[k]["Hospital"] = important_info[1]
        EHR_imp_09[k]["ID_2"] = important_info[2]
        EHR_imp_09[k]["blank"] = important_info[3]
        EHR_imp_09[k]["ID_3"] = important_info[4]
        EHR_imp_09[k]["Date in/out?"] = important_info[5]
        EHR_imp_09[k]["Discharge Summary"] = important_info[6]
        EHR_imp_09[k]["S/U"] = important_info[7]
        EHR_imp_09[k]["DIS"] = important_info[8]
        EHR_imp_09[k]["Admission Date"] = important_info[9].split(" ")[3]
        # dmy = important_info[9].split(' ')[3].split('/')
        # EHR_imp_09[k]['Admission DateTime'] = datetime(year = int(dmy[2]), month = int(dmy[0]), day = int(dmy[1])) if len(dmy) == 3 and (dmy[0] != '2' and dmy[1] != '29') else None
        EHR_imp_09[k]["Report Status"] = (
            important_info[9].split(" ")[6]
            if len(important_info[9].split(" ")) == 7
            else None
        )
        EHR_imp_09[k]["Discharge Date"] = DD if DD != "" else None

df_09_imp = pd.DataFrame.from_dict(EHR_imp_09, orient="index")
# df_09_imp

#### Additional Info

In [19]:
# Extra safety use regular expression check
Admin_dt = []
Disc_dt = []
for date in df_09_imp["Admission Date"]:
    try:
        a = datetime.datetime.strptime(date, "%m/%d/%Y")
    except:
        Admin_dt.append(None)
    else:
        Admin_dt.append(datetime.date(a.year, a.month, a.day))

for date in df_09_imp["Discharge Date"]:
    try:
        a = datetime.datetime.strptime(date, " %m/%d/%Y")
    except:
        Disc_dt.append(None)
    else:
        Disc_dt.append(datetime.date(a.year, a.month, a.day))

df_09_imp["Admission Date"] = Admin_dt
df_09_imp["Discharge Date"] = Disc_dt
df_09_imp["Time in Hosp"] = [
    dt - at if dt != None and at != None else None for dt, at in zip(Disc_dt, Admin_dt)
]
df_09_imp["Discharge Summary"] = df_09_imp["Discharge Summary"].str.lower()

In [20]:
date_in_09 = (
    (df_09_imp.shape[0] - df_09_imp["Admission Date"].isna().sum())
    * 100
    / df_09_imp.shape[0]
)
date_out_09 = (
    (df_09_imp.shape[0] - df_09_imp["Discharge Date"].isna().sum())
    * 100
    / df_09_imp.shape[0]
)
print(
    f"The percentage of Admission dates is: {date_in_09} and the percentage of Discharge dates is: {date_out_09}"
)

The percentage of Admission dates is: 89.03122497998399 and the percentage of Discharge dates is: 84.14731785428343


## 2010

#### Get DF

In [21]:
# Get all the files
root_10 = "Data/2010"
list_of_txt_10 = functions.search_for_files(root_10, ".txt")

# Put it into a dictionnary and get the ID as the name of the file
EHR_10 = {}

for file in list_of_txt_10:
    with open(file, encoding="utf-8-sig") as f:
        lines = f.readlines()
    ID = file.split("/")[-1][:-4]
    EHR_10[f"10_{ID}"] = "".join(lines)

df_10 = pd.DataFrame.from_dict(EHR_10, orient="index")
# df_10

#### Get features

In [22]:
# Get a dataframe with multiple different features

EHR_imp_10 = {}


for k, txt in EHR_10.items():
    EHR_imp_10[k] = {
        "text": txt,
        "Admission Date": None,
        "Discharge Date": None,
    }
    if 'Admission Date' in txt:
        DD = txt.split('Admission Date')[1].split('\n')
        EHR_imp_10[k]["Admission Date"] = DD[1].strip()
    if 'ADMISSION DATE' in txt:
        DD = txt.split('ADMISSION DATE')[1].split('\n')
        EHR_imp_10[k]["Admission Date"] = DD[1].strip()
    if 'Discharge Date' in txt:
        DD = txt.split('Discharge Date')[1].split('\n')
        #print(DD[1])
        EHR_imp_10[k]["Discharge Date"] = DD[1].strip()
    if 'DISCHARGE DATE' in txt:
        DD = txt.split('DISCHARGE DATE')[1].split('\n')
        EHR_imp_10[k]["Discharge Date"] = DD[1].strip()
        #EHR_imp_08[k]["Admission Date"] = important_info[9].split(" ")[3]
        #EHR_imp_08[k]["Discharge Date"] = DD if DD != "" else None
    
df_10_imp = pd.DataFrame.from_dict(EHR_imp_10, orient="index")
#df_06_imp

#### Get additional data

In [23]:
# Extra safety use regular expression check
Admin_dt = []
Disc_dt = []
for date in df_10_imp["Admission Date"]:
    try:
        a = datetime.datetime.strptime(date, "%m/%d/%Y")
    except:
        try:
            a = datetime.datetime.strptime(date, "%m-%d-%y")
        except:
            try:
                a = datetime.datetime.strptime(date, "%m/%d/%Y")
            except:
                Admin_dt.append(None)
            else:
                Admin_dt.append(datetime.date(a.year, a.month, a.day))
        else:
            Admin_dt.append(datetime.date(a.year, a.month, a.day))
    else:
        Admin_dt.append(datetime.date(a.year, a.month, a.day))

for date in df_10_imp["Discharge Date"]:
    try:
        a = datetime.datetime.strptime(date, "%m/%d/%Y")
    except:
        try:
            a = datetime.datetime.strptime(date, "%m-%d-%y")
        except:
            try:
                a = datetime.datetime.strptime(date, "%m/%d/%Y")
            except:
                Disc_dt.append(None)
            else:
                Disc_dt.append(datetime.date(a.year, a.month, a.day))
        else:
            Disc_dt.append(datetime.date(a.year, a.month, a.day))
    else:
        Disc_dt.append(datetime.date(a.year, a.month, a.day))

df_10_imp["Admission Date"] = Admin_dt
df_10_imp["Discharge Date"] = Disc_dt
df_10_imp["Time in Hosp"] = [
    dt - at if dt != None and at != None else None for dt, at in zip(Disc_dt, Admin_dt)
]

In [24]:
date_in_10 = (
    (df_10_imp.shape[0] - df_10_imp["Admission Date"].isna().sum())
    * 100
    / df_10_imp.shape[0]
)
date_out_10 = (
    (df_10_imp.shape[0] - df_10_imp["Discharge Date"].isna().sum())
    * 100
    / df_10_imp.shape[0]
)
print(
    f"The percentage of Admission dates is: {date_in_10} and the percentage of Discharge dates is: {date_out_10}"
)

The percentage of Admission dates is: 46.753246753246756 and the percentage of Discharge dates is: 44.58874458874459


## 2011

Extraction of dates

#### Get DF

In [25]:
# Get all the files
root_11 = "Data/2011"
list_of_txt_11 = functions.search_for_files(root_11, ".txt")

EHR_11 = {}

for file in list_of_txt_11:
    with open(file, encoding="utf-8-sig") as f:
        lines = f.readlines()
    ID = file.split("/")[-1].split("-")[-1][:-4]
    EHR_11[f"11_{ID}"] = "".join(lines)

df_11 = pd.DataFrame.from_dict(EHR_11, orient="index").sort_index()
# df_11

#### Get features

In [26]:
# EHR_11_imp = {}

# for k, txt in EHR_11.items():
#     EHR_11_imp[k] = {'text':None, 'Admission Date': None, 'Discharge Date': None, 'Date of Birth': None, 'Sex': None, 'Service': None}
#     important_info = txt.split('\n')
#     EHR_11_imp[k]['text'] = txt
#     EHR_11_imp[k]['Admission Date'] = important_info[1]
#     EHR_11_imp[k]['Discharge Date'] = important_info[3]
#     EHR_11_imp[k]['Date of Birth'] = important_info[5]
#     EHR_11_imp[k]['Sex'] = important_info[7]
#     EHR_11_imp[k]['Service'] = important_info[9]

# df_11_imp = pd.DataFrame.from_dict(EHR_11_imp, orient="index")
# df_11_imp

In [27]:
EHR_11_imp = {}
for k, txt in EHR_11.items():
    EHR_11_imp[k] = {"text": txt, "Admission Date": None, "Discharge Date": None}
    if "Admission Date :" in txt:
        AD = txt.split("Admission Date :")[1].split("\n")[1]
        EHR_11_imp[k]["Admission Date"] = AD
    elif "ADMISSION DATE :" in txt:
        AD_1 = txt.split("ADMISSION DATE :")[1].split("\n")[1]
        EHR_11_imp[k]["Admission Date"] = AD_1
    if "Discharge Date :" in txt:
        AD = txt.split("Discharge Date :")[1].split("\n")[1]
        EHR_11_imp[k]["Discharge Date"] = AD
    elif "DISCHARGE DATE :" in txt:
        AD_1 = txt.split("DISCHARGE DATE :")[1].split("\n")[1]
        EHR_11_imp[k]["Discharge Date"] = AD_1

df_11_imp = pd.DataFrame.from_dict(EHR_11_imp, orient="index")
# df_11_imp['Discharge Date']

#### Get additional Data

In [28]:
# Extra safety use regular expression check
Admin_dt = []
Disc_dt = []
for date in df_11_imp["Admission Date"]:
    try:
        a = datetime.datetime.strptime(date, "%m/%d/%Y")
    except:
        try:
            a = datetime.datetime.strptime(date, "%m-%d-%y")
        except:
            try:
                a = datetime.datetime.strptime(date, "%Y-%m-%d")
            except:
                Admin_dt.append(None)
            else:
                Admin_dt.append(datetime.date(a.year, a.month, a.day))
        else:
            Admin_dt.append(datetime.date(a.year, a.month, a.day))
    else:
        Admin_dt.append(datetime.date(a.year, a.month, a.day))

for date in df_11_imp["Discharge Date"]:
    try:
        a = datetime.datetime.strptime(date, "%m/%d/%Y")
    except:
        try:
            a = datetime.datetime.strptime(date, "%m-%d-%y")
        except:
            try:
                a = datetime.datetime.strptime(date, "%Y-%m-%d")
            except:
                Disc_dt.append(None)
            else:
                Disc_dt.append(datetime.date(a.year, a.month, a.day))
        else:
            Disc_dt.append(datetime.date(a.year, a.month, a.day))
    else:
        Disc_dt.append(datetime.date(a.year, a.month, a.day))

df_11_imp["Admission Date"] = Admin_dt
df_11_imp["Discharge Date"] = Disc_dt
df_11_imp["Time in Hosp"] = [
    dt - at if dt != None and at != None else None for dt, at in zip(Disc_dt, Admin_dt)
]
# df_11_imp['Discharge Summary'] = df_11_imp['Discharge Summary'].str.lower()

In [29]:
date_in_11 = (
    (df_11_imp.shape[0] - df_11_imp["Admission Date"].isna().sum())
    * 100
    / df_11_imp.shape[0]
)
date_out_11 = (
    (df_11_imp.shape[0] - df_11_imp["Discharge Date"].isna().sum())
    * 100
    / df_11_imp.shape[0]
)
print(
    f"The percentage of Admission dates is: {date_in_11} and the percentage of Discharge dates is: {date_out_11}"
)

The percentage of Admission dates is: 76.88679245283019 and the percentage of Discharge dates is: 74.76415094339623


## 2012

#### Get DF

In [30]:
# Get all the files
root_12 = "Data/2012"
list_of_txt_12 = functions.search_for_files(root_12, ".txt")

EHR_12 = {}

for file in list_of_txt_12:
    with open(file, encoding="utf-8-sig") as f:
        lines = f.readlines()
    ID = file.split("/")[-1].split(".")[0]
    EHR_12[f"12_{ID}"] = "".join(lines)

df_12 = pd.DataFrame.from_dict(EHR_12, orient="index").sort_index()
# df_12

#### Get Features

In [31]:
EHR_12_imp = {}
for k, txt in EHR_12.items():
    EHR_12_imp[k] = {"text": txt, "Admission Date": None, "Discharge Date": None}
    if "Admission Date :" in txt:
        AD = txt.split("Admission Date :")[1].split("\n")[1]
        EHR_12_imp[k]["Admission Date"] = AD
    elif "ADMISSION DATE :" in txt:
        AD_1 = txt.split("ADMISSION DATE :")[1].split("\n")[1]
        EHR_12_imp[k]["Admission Date"] = AD_1
    if "Discharge Date :" in txt:
        AD = txt.split("Discharge Date :")[1].split("\n")[1]
        EHR_12_imp[k]["Discharge Date"] = AD
    elif "DISCHARGE DATE :" in txt:
        AD_1 = txt.split("DISCHARGE DATE :")[1].split("\n")[1]
        EHR_12_imp[k]["Discharge Date"] = AD_1

df_12_imp = pd.DataFrame.from_dict(EHR_12_imp, orient="index")
# df_12_imp['Admission Date']

#### Get additional data

In [32]:
# Extra safety use regular expression check
Admin_dt = []
Disc_dt = []
for date in df_12_imp["Admission Date"]:
    try:
        a = datetime.datetime.strptime(date, "%m/%d/%Y")
    except:
        try:
            a = datetime.datetime.strptime(date, "%m-%d-%y")
        except:
            try:
                a = datetime.datetime.strptime(date, "%Y-%m-%d")
            except:
                Admin_dt.append(None)
            else:
                Admin_dt.append(datetime.date(a.year, a.month, a.day))
        else:
            Admin_dt.append(datetime.date(a.year, a.month, a.day))
    else:
        Admin_dt.append(datetime.date(a.year, a.month, a.day))

for date in df_12_imp["Discharge Date"]:
    try:
        a = datetime.datetime.strptime(date, "%m/%d/%Y")
    except:
        try:
            a = datetime.datetime.strptime(date, "%m-%d-%y")
        except:
            try:
                a = datetime.datetime.strptime(date, "%Y-%m-%d")
            except:
                Disc_dt.append(None)
            else:
                Disc_dt.append(datetime.date(a.year, a.month, a.day))
        else:
            Disc_dt.append(datetime.date(a.year, a.month, a.day))
    else:
        Disc_dt.append(datetime.date(a.year, a.month, a.day))

df_12_imp["Admission Date"] = Admin_dt
df_12_imp["Discharge Date"] = Disc_dt
df_12_imp["Time in Hosp"] = [
    dt - at if dt != None and at != None else None for dt, at in zip(Disc_dt, Admin_dt)
]
# df_11_imp['Discharge Summary'] = df_11_imp['Discharge Summary'].str.lower()

In [33]:
date_in_12 = (
    (df_12_imp.shape[0] - df_12_imp["Admission Date"].isna().sum())
    * 100
    / df_12_imp.shape[0]
)
date_out_12 = (
    (df_12_imp.shape[0] - df_12_imp["Discharge Date"].isna().sum())
    * 100
    / df_12_imp.shape[0]
)
print(
    f"The percentage of Admission dates is: {date_in_12} and the percentage of Discharge dates is: {date_out_12}"
)

The percentage of Admission dates is: 86.12903225806451 and the percentage of Discharge dates is: 83.87096774193549


## 2014

#### Get DF

In [34]:
# Get all the files
root_14 = "Data/2014"
list_of_txt_14 = functions.search_for_files(root_14, ".xml")

EHR_14 = {}

for item in list_of_txt_14:
    file = minidom.parse(item)
    text = file.getElementsByTagName("TEXT")
    for txt in text:
        ID = item.split("/")[-1][:-4]
        EHR_14[f"14_{ID}"] = txt.firstChild.data

df_14 = pd.DataFrame.from_dict(EHR_14, orient="index").sort_index()
# df_14

#### Get features

In [35]:
EHR_14_imp = {}
for k, txt in EHR_14.items():
    EHR_14_imp[k] = {"text": txt, "Admission Date": None, "Discharge Date": None}
    if "Record date:" in txt:
        AD = txt.split("Record date: ")[1].split("\n")[0]
        EHR_14_imp[k]["Admission Date"] = AD

df_14_imp = pd.DataFrame.from_dict(EHR_14_imp, orient="index")
# df_14_imp

#### Get additional stats

In [36]:
Admin_dt = []
for date in df_14_imp["Admission Date"]:
    try:
        a = datetime.datetime.strptime(date, "%Y-%m-%d")
    except:
        Admin_dt.append(None)
    else:
        Admin_dt.append(datetime.date(a.year, a.month, a.day))

df_14_imp["Admission Date"] = Admin_dt

In [37]:
date_in_14 = (
    (df_14_imp.shape[0] - df_14_imp["Admission Date"].isna().sum())
    * 100
    / df_14_imp.shape[0]
)
# date_out_14 = (
#     (df_14_imp.shape[0] - df_14_imp["Discharge Date"].isna().sum())
#     * 100
#     / df_14_imp.shape[0]
# )
print(f"The percentage of Admission dates is: {date_in_14}")

The percentage of Admission dates is: 99.76993865030674


## 2018_1

#### Get DF

In [38]:
# Get all the files
root_18_1 = "Data/2018_1"
list_of_txt_18_1 = functions.search_for_files(root_18_1, ".xml")

EHR_18_1 = {}

for files in list_of_txt_18_1:
    i=0
    fls = []
    file = minidom.parse(files)
    text = file.getElementsByTagName("TEXT")
    for txt in text:
        lg_text = txt.firstChild.data
        lg_text = lg_text.split("Record date")
        for item in lg_text:
            ID = files.split("/")[-1][:-4]
            EHR_18_1[f"18_1_{ID}_{i}"] = "Record date" + item

df_18_1 = pd.DataFrame.from_dict(EHR_18_1, orient="index").sort_index()
# df_18_1

#### Get features

In [39]:
EHR_18_1_imp = {}
for k, txt in EHR_18_1.items():
    EHR_18_1_imp[k] = {"text": txt, "Admission Date": None, "Discharge Date": None}
    if "Record date:" in txt:
        AD = txt.split("Record date: ")[1].split("\n")[0]
        EHR_18_1_imp[k]["Admission Date"] = AD

df_18_1_imp = pd.DataFrame.from_dict(EHR_18_1_imp, orient="index")
# df_18_1_imp

#### Get additional stats

In [40]:
Admin_dt = []
for date in df_18_1_imp["Admission Date"]:
    try:
        a = datetime.datetime.strptime(date, "%Y-%m-%d")
    except:
        Admin_dt.append(None)
    else:
        Admin_dt.append(datetime.date(a.year, a.month, a.day))

df_18_1_imp["Admission Date"] = Admin_dt

In [41]:
date_in_18_1 = (
    (df_18_1_imp.shape[0] - df_18_1_imp["Admission Date"].isna().sum())
    * 100
    / df_18_1_imp.shape[0]
)
# date_out_14 = (
#     (df_14_imp.shape[0] - df_14_imp["Discharge Date"].isna().sum())
#     * 100
#     / df_14_imp.shape[0]
# )
print(f"The percentage of Admission dates is: {date_in_18_1}")

The percentage of Admission dates is: 100.0


## 2018_2

#### Get DF

In [42]:
# Get all the files
root_18_2 = "Data/2018_2"
list_of_txt_18_2 = functions.search_for_files(root_18_2, ".txt")

EHR_18_2 = {}

for file in list_of_txt_18_2:
    with open(file, encoding="utf-8-sig") as f:
        lines = f.readlines()
    ID = file.split("/")[-1][:-4]
    EHR_18_2[f"18_2_{ID}"] = "".join(lines)

df_18_2 = pd.DataFrame.from_dict(EHR_18_2, orient="index").sort_index()
# df_18_2

#### Get features

In [43]:
EHR_18_2_imp = {}
i = 0
for k, txt in EHR_18_2.items():
    EHR_18_2_imp[k] = {
        "text": txt,
        "Admission Date": None,
        "Discharge Date": None,
        "Date of Birth": None,
        "Sex": None,
    }
    important_info = txt.split("Service")[0]
    AD = important_info.split("**")[1]
    DD = important_info.split("**")[3] if len(important_info.split("**")) >= 4 else None
    DOB = (
        important_info.split("**")[5] if len(important_info.split("**")) >= 6 else None
    )
    Sex = (
        important_info.split("Sex:")[1].split("\n")[0].strip()
        if "Sex" in important_info
        else None
    )
    EHR_18_2_imp[k]["Admission Date"] = AD
    EHR_18_2_imp[k]["Discharge Date"] = DD
    EHR_18_2_imp[k]["Date of Birth"] = DOB
    EHR_18_2_imp[k]["Sex"] = Sex

df_18_2_imp = pd.DataFrame.from_dict(EHR_18_2_imp, orient="index")
# df_18_2_imp

#### Get additional stats

In [44]:
# Extra safety use regular expression check
Admin_dt = []
Disc_dt = []
for date in df_18_2_imp["Admission Date"]:
    try:
        a = datetime.datetime.strptime(date, "%Y-%m-%d")
    except:
        Admin_dt.append(None)
    else:
        Admin_dt.append(datetime.date(a.year, a.month, a.day))

for date in df_18_2_imp["Discharge Date"]:
    try:
        a = datetime.datetime.strptime(date, "%Y-%m-%d")
    except:
        Disc_dt.append(None)
    else:
        Disc_dt.append(datetime.date(a.year, a.month, a.day))

df_18_2_imp["Admission Date"] = Admin_dt
df_18_2_imp["Discharge Date"] = Disc_dt
df_18_2_imp["Time in Hosp"] = [
    dt - at if dt != None and at != None else None for dt, at in zip(Disc_dt, Admin_dt)
]

In [45]:
date_in_18_2 = (
    (df_18_2_imp.shape[0] - df_18_2_imp["Admission Date"].isna().sum())
    * 100
    / df_18_2_imp.shape[0]
)
date_out_18_2 = (
    (df_18_2_imp.shape[0] - df_18_2_imp["Discharge Date"].isna().sum())
    * 100
    / df_18_2_imp.shape[0]
)
print(
    f"The percentage of Admission dates is: {date_in_18_2} and the percentage of Discharge dates is: {date_out_18_2}"
)

The percentage of Admission dates is: 95.04950495049505 and the percentage of Discharge dates is: 95.24752475247524


## Complete

In [46]:
df_ls = [
    df_06_imp,
    df_08_imp,
    df_09_imp,
    df_10_imp,
    df_11_imp,
    df_12_imp,
    df_14_imp,
    df_18_1_imp,
    df_18_2_imp,
]
df = pd.concat(df_ls)
# df

In [47]:
tot_date_in = 0
tot_date_out = 0

for _, row in df.iterrows():
    if isinstance(row["Admission Date"], datetime.date):
        tot_date_in += 1
    if isinstance(row["Discharge Date"], datetime.date):
        tot_date_out += 1

pct_in = tot_date_in * 100 / df.shape[0]
pct_out = tot_date_out * 100 / df.shape[0]

print(
    f'In the corpus there are {tot_date_in} of occurences of "Admission Dates" and {tot_date_out} of occurences of "Discharge Date" for a total of {df.shape[0]} EHR, thus there are {pct_in} and {pct_out}'
)

In the corpus there are 5869 of occurences of "Admission Dates" and 4108 of occurences of "Discharge Date" for a total of 6923 EHR, thus there are 84.77538639318215 and 59.33843709374548
