This notebook creates the event log for the experiment in the paper "Data Enhanced Process Models in Process Mining"
It uses the MIMIC-IV (https://physionet.org/content/mimiciv/1.0/) database, stored in a Postgres database. First, all cases related to acute Heart Failure are retrieved. Then, the hospital data for their respective hospital admissions is fetched. Additionally, X-ray data is retrieved from an external database called MIMIC-CXR (https://physionet.org/content/mimic-cxr/2.0.0/). After that, relevant lab values and physician order entries, according to the clinical guideline, are used for the event log. The databases can be accessed via PhysioNet, which requires CITI training for access (https://mimic.mit.edu/iv/).

!!!Do not forget to execute the defined functions at the bottom of the script first!!!

In [None]:
import numpy as np
from psycopg2 import connect
import pandas as pd
import pm4py
import numpy as np
import pandasql as ps
from pm4py.objects.conversion.log import converter as log_converter
# Connect to the Postgres instance that holds the MIMIC schemas.
# The connection settings can be overridden through the standard libpq
# environment variables (PGDATABASE, PGHOST, PGUSER, PGPASSWORD) so that
# credentials do not have to be committed with the notebook. The fallbacks
# are the values that were previously hard-coded here.
import os

con = connect(
    dbname=os.environ.get("PGDATABASE", "postgres"),
    host=os.environ.get("PGHOST", "127.0.0.1"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "1234"),
)
con.set_client_encoding('utf8')
# Module-level cursor used by the query cells and by the helper functions.
cursor = con.cursor()

In [None]:
pd.set_option("display.max_rows", 500)

In [None]:
cursor.execute('SELECT * FROM mimic_hosp.diagnoses_icd')
icds = cursor.fetchall()
cols = list(map(lambda x: x[0], cursor.description))
icds = pd.DataFrame(icds, columns=cols)

In [None]:
# Code fragments (ICD-9 428xx, ICD-10 I50xx/I58xx) that select the cohort.
# NOTE(review): "I5811"/"I5813" do not look like heart-failure codes
# (possibly I50811/I50813 were meant) - confirm before changing the cohort.
hf_code_fragments = [
    "42821", "42823", "42831", "42833", "42841", "42843",
    "I5021", "I5023", "I5031", "I5033",
    "I5041", "I5042", "I5043",
    "I5811", "I5813",
]
# One regex alternation performs the same substring test as OR-ing one
# str.contains() call per fragment; na=False keeps rows without a code out
# of the selection, as the OR chain did.
hf = icds.loc[icds["icd_code"].str.contains("|".join(hf_code_fragments), na=False)]


In [None]:
hf = hf.reset_index()

In [None]:
hf = hf.drop("index", axis=1)

In [None]:
hadms = list(hf["hadm_id"])

In [None]:
cursor.execute("SELECT * FROM mimic_hosp.d_icd_diagnoses")
desc_icd = cursor.fetchall()
cols = list(map(lambda x: x[0], cursor.description))
desc_icd = pd.DataFrame(desc_icd, columns=cols)
desc_icd = desc_icd[["icd_code", "long_title"]]

In [None]:
hf = hf.merge(desc_icd, on="icd_code", how="inner")

In [None]:
cursor.execute("SELECT * from mimic_hosp.drgcodes")
drgs = cursor.fetchall()
cols = list(map(lambda x: x[0], cursor.description))
drgs = pd.DataFrame(drgs, columns=cols)

In [None]:
APR = drgs.loc[drgs["drg_type"] == "APR"]

In [None]:
hf_drg = drgs.loc[drgs["hadm_id"].isin(list(hf["hadm_id"]))]

In [None]:
hf_drg = hf_drg.loc[hf_drg["drg_type"] == "APR"]

In [None]:
# Only heart-failure related DRG descriptions are considered.
# The titles are upper-cased before being compared with the "description"
# column of the DRG table.
l = [
    title.upper()
    for title in (
        "Heart Failure",
        "Cardiac Catheterization w/ Circ Disord Exc Ischemic Heart Disease",
        "Percutaneous Cardiovascular Procedures w/o AMI",
        "Cardiac Arrhythmia & Conduction Disorders",
        "Acute Myocardial Infarction",
        "Percutaneous Cardiovascular Procedures w/ AMI",
        "Cardiac Catheterization for Ischemic Heart Disease",
        "Cardiac Defibrillator & Heart Assist Anomaly",
        "Cardiac Valve Procedures w/ Cardiac Catheterization",
        "Coronary Bypass w/ Cardiac Cath Or Percutaneous Cardiac Procedure",
        "Other Circulatory System Diagnoses",
    )
]

In [None]:
hf_filter = hf_drg.loc[hf_drg["description"].isin(l)]

In [None]:
hf_filter = hf_filter.sort_values(["hadm_id", "drg_code"])

In [None]:
hf_filter = hf_filter.reset_index()

In [None]:
hf_filter.drop("index", axis=1,inplace=True)

In [None]:
hadms = list(hf_filter["hadm_id"])

In [None]:
hf_diag = get_diagnoses(hf_filter, hadms, 3)

In [None]:
hf_adm = get_adms(hf_diag, hadms)

In [None]:
hf_adm = hf_adm.reset_index()

In [None]:
hf_adm.drop("index", axis=1, inplace=True)

In [None]:
#Filter cases, where the first icd code does not contain a heart failure related icd code

In [None]:
hf_adm = hf_adm.loc[hf_adm["1_desc_icd"].str.contains("heart failure", na=False)]

In [None]:
hadms = list(hf_adm["hadm_id"])

In [None]:
hf_t = get_transfers(hf_adm, hadms)

In [None]:
hf_p = get_patients(hf_t)

In [None]:
hf_p.reset_index(inplace=True)

In [None]:
hf_p.drop("index", axis=1, inplace=True)

In [None]:
hadms = list(hf_p["hadm_id"].unique())

In [None]:
hadms = list(map(int, hadms))

In [None]:
len(hadms)

In [None]:
####Retrieve Lab values and entries from POE (provider order entry) table #########

In [None]:
cursor.execute('SELECT * FROM mimic_hosp.d_labitems')
lab_d = cursor.fetchall()
cols = list(map(lambda x: x[0], cursor.description))
lab_d = pd.DataFrame(lab_d, columns=cols)   

In [None]:
cursor.execute('SELECT * FROM mimic_hosp.labevents where hadm_id = any(%s)', [hadms])
labs = cursor.fetchall()
cols = list(map(lambda x: x[0], cursor.description))
labs = pd.DataFrame(labs, columns=cols)

In [None]:
lab_w_detail = labs.merge(lab_d, on="itemid", how="inner")

In [None]:
lab_w_detail = lab_w_detail.sort_values(["hadm_id", "charttime"])

In [None]:
lab_w_detail.loc[(lab_w_detail["valuenum"] < lab_w_detail["ref_range_lower"]) & (lab_w_detail["valuenum"] != 0), "abnormal_low"] = "abnormal"

In [None]:
lab_w_detail.loc[(lab_w_detail["valuenum"] > lab_w_detail["ref_range_upper"]) & (lab_w_detail["valuenum"] != 0), "abnormal_high"] = "abnormal"

In [None]:
cursor.execute('SELECT * FROM mimic_hosp.poe where hadm_id = any(%s)', [hadms])
poe = cursor.fetchall()
cols = list(map(lambda x: x[0], cursor.description))
poe = pd.DataFrame(poe, columns=cols)
poe = poe.sort_values(["hadm_id", "ordertime"])

In [None]:
cursor.execute('SELECT * FROM mimic_hosp.poe_detail')
poe_d = cursor.fetchall()
cols = list(map(lambda x: x[0], cursor.description))
poe_d = pd.DataFrame(poe_d, columns=cols)

In [None]:
poe = poe.merge(poe_d, on="poe_id", how="left")

In [None]:
subs = list(hf_p["subject_id"].unique())

In [None]:
#load x ray metadata from the mimic-cxr https://physionet.org/content/mimic-cxr/2.0.0/
findings = pd.read_csv("cxr_findings_metadata.csv")

In [None]:
HF_findings = findings.loc[findings["subject_id"].isin(subs)]

In [None]:
labels = {0: 'Negative', 1: 'Positive', -1: 'Uncertain', -9: 'Disagreement'}

In [None]:
HF_findings.reset_index(inplace=True)

In [None]:
HF_findings.drop("Unnamed: 0", axis=1, inplace=True)

In [None]:
# Build one timestamp per X-ray study from StudyDate and StudyTime; the parse
# format below expects "YYYYMMDD:HHMMSS".
#
# Casting StudyTime to int drops leading zeros (e.g. 003014 becomes "3014"),
# and such an unpadded string cannot be parsed correctly with "%H%M%S" - the
# digits end up in the wrong fields. The time is therefore zero-padded to six
# digits before it is combined with the date.
unpadded_time = HF_findings["StudyTime"].astype(int).astype(str)

# Same row filter as before (drops studies whose unpadded time has fewer than
# three digits), so the set of selected studies is unchanged.
# NOTE(review): with zero-padding these rows would parse as well - consider
# dropping this filter.
keep = unpadded_time.str.len() >= 3
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `findings`.
HF_findings = HF_findings.loc[keep].copy()

HF_findings["StudyTime"] = unpadded_time[keep].str.zfill(6).to_numpy()
HF_findings["StudyDate"] = HF_findings["StudyDate"].astype(str)
HF_findings["ImageDate"] = HF_findings["StudyDate"] + ":" + HF_findings["StudyTime"]
HF_findings["ImageDate_with_Stamp"] = pd.to_datetime(HF_findings["ImageDate"], format="%Y%m%d:%H%M%S")

In [None]:
x = hf_adm[["subject_id", "hadm_id","admittime", "dischtime"]]

In [None]:
#map hospital admission to x-ray images
sqlcode = '''
select *
from x
left join HF_findings on x.subject_id=HF_findings.subject_id
where HF_findings.ImageDate_with_Stamp >= x.admittime and HF_findings.ImageDate_with_Stamp < x.dischtime 

'''

newdf = ps.sqldf(sqlcode,locals())
newdf = newdf.loc[:,~newdf.columns.duplicated()]
#newdf = newdf.drop_duplicates(["hadm_id", "careunit", "intime", "outtime", "curr_service"])

In [None]:
newdf["ImageDate_with_Stamp"] = newdf["ImageDate_with_Stamp"].str.slice(stop=-7)

In [None]:
newdf["ImageDate_with_Stamp"] = pd.to_datetime(newdf["ImageDate_with_Stamp"])

In [None]:
to_merge = newdf[['subject_id', 'hadm_id', 'dicom_id', 'Atelectasis',
       'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum',
       'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding',
       'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax',
       'Support Devices', 'has_negbio_finding',
       'ImageDate_with_Stamp']]

In [None]:
x = poe.loc[(poe["order_subtype"] == "General Xray") & (poe["transaction_type"] == "New")]

In [None]:
y = x.loc[x["hadm_id"].isin(to_merge["hadm_id"])]

In [None]:
z = x.loc[x["hadm_id"].isin(to_merge["hadm_id"])]

In [None]:
y = y.reindex(columns = y.columns.tolist() + ['dicom_id','Atelectasis', 'Cardiomegaly',
       'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture',
       'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion',
       'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices',
       'has_negbio_finding', 'ImageDate_with_Stamp'])

In [None]:
z = z.reindex(columns = z.columns.tolist() + ['dicom_id','Atelectasis', 'Cardiomegaly',
       'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture',
       'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion',
       'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices',
       'has_negbio_finding', 'ImageDate_with_Stamp'])

In [None]:
z[['dicom_id', 'Atelectasis', 'Cardiomegaly',
       'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture',
       'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion',
       'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices',
       'has_negbio_finding', 'ImageDate_with_Stamp']] = np.nan

In [None]:
# Match X-ray images to the X-ray orders of the same admission.
# An image is attached to an order when both timestamps are at most 8 hours
# apart. When several images qualify, the one visited last overwrites the
# earlier ones.
xray_columns = [
    "dicom_id", "Atelectasis", 'Cardiomegaly',
    'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture',
    'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion',
    'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices',
    'has_negbio_finding', 'ImageDate_with_Stamp',
]
for order_idx, order in y.iterrows():
    same_admission = to_merge.loc[to_merge["hadm_id"] == order["hadm_id"]]
    # Iterating an empty selection is a no-op, so no length check is needed.
    for _, image in same_admission.iterrows():
        time_gap = abs(order["ordertime"] - image["ImageDate_with_Stamp"])
        hours_apart = time_gap.total_seconds() / 3600
        if hours_apart <= 8:
            y.loc[order_idx, xray_columns] = image[xray_columns]

In [None]:
dicoms_in_y = y.loc[~y["ImageDate_with_Stamp"].isna()]["dicom_id"]

In [None]:
filtered_y = y.drop_duplicates(["hadm_id", "dicom_id"], keep="first")

In [None]:
to_concat = z.loc[~z["poe_id"].isin(filtered_y["poe_id"])]

In [None]:
xray_no_dups = pd.concat([filtered_y, to_concat])

In [None]:
x = poe.loc[(poe["order_subtype"] == "General Xray") & (poe["transaction_type"] == "New")]

In [None]:
y = x.loc[x["hadm_id"].isin(to_merge["hadm_id"])]

In [None]:
poe_temp = poe.loc[~poe["poe_id"].isin(xray_no_dups["poe_id"])]

In [None]:
poe_new = pd.concat([xray_no_dups, poe_temp])

In [None]:
poe_new

In [None]:
#poe_new.to_csv("poe_table_with_xray.csv")

In [None]:
poe = pd.read_csv("poe_table_with_xray.csv")

In [None]:
poe = poe_new

In [None]:
poe = poe.rename(columns={"poe_seq_x": "poe_seq"})

In [None]:
len(poe["hadm_id"].unique())

In [None]:
poe

In [None]:
# Start time of a case: `edregtime` when it is present, otherwise `admittime`.
# The former add-column / set-to-0 / drop-column steps cancelled each other
# out and are gone. fillna() replaces the two masked assignments, and assign()
# returns a new frame instead of writing into a filtered slice.
hf_adm = hf_adm.assign(start_time=hf_adm["edregtime"].fillna(hf_adm["admittime"]))

In [None]:
#add admission time
to_merge = hf_adm[["hadm_id", "start_time"]]

In [None]:
poe = poe.merge(to_merge, on="hadm_id", how="inner")

In [None]:
poe["ordertime"] = pd.to_datetime(poe["ordertime"])

In [None]:
hadms_w_xray = poe.loc[~poe["dicom_id"].isna()]["hadm_id"].unique()

In [None]:
poe = poe.loc[poe["hadm_id"].isin(hadms_w_xray)]

In [None]:
lab_w_detail = lab_w_detail.loc[lab_w_detail["hadm_id"].isin(hadms_w_xray)]

In [None]:
#create data frame for guideline checking with poe and lab values
# Collect the first order of each guideline-relevant type per admission.
#
# DataFrame.append was removed in pandas 2.0, so the per-activity frames are
# concatenated once with pd.concat. The first row per admission is taken with
# drop_duplicates(keep="first") on the time-sorted frame; unlike
# groupby().nth(0).reset_index() this gives the same rows on pandas 1.x and 2.x.
# Rows without hadm_id are dropped first, as groupby would skip them.

telemetry_poe = poe.loc[poe["order_subtype"] == "Telemetry"].sort_values(["hadm_id", "ordertime"])

oxygen_poe = poe.loc[poe["order_subtype"] == "Oxygen Therapy"].sort_values(["hadm_id", "ordertime"])

radio_poe = poe.loc[(poe["order_type"] == "Radiology")].sort_values(["hadm_id", "ordertime"])
radio_poe = radio_poe.loc[radio_poe["order_subtype"] == "General Xray"]

# Cardiology orders only count when placed less than two days after the start
# of the case.
cardio_poe = poe.loc[(poe["order_type"] == "Cardiology")].sort_values(["hadm_id", "ordertime"])
cardio_poe["diff"] = cardio_poe["ordertime"] - cardio_poe["start_time"]
cardio_poe = cardio_poe.loc[cardio_poe["diff"] < pd.Timedelta(days=2)]
cardio_poe = cardio_poe.drop_duplicates(["hadm_id", "order_subtype"], keep="first").reset_index().drop("index", axis=1)

diag_df = pd.concat(
    [
        events.dropna(subset=["hadm_id"]).drop_duplicates("hadm_id", keep="first")
        for events in (telemetry_poe, oxygen_poe, radio_poe, cardio_poe)
    ]
)

In [None]:
electro = ["Sodium", "Potassium"]

In [None]:
liver =["Asparate Aminotransferase (AST)", "Alanine Aminotransferase (ALT)", "Alkaline Phosphatase", "Albumin", "Bilirubin, Total"]

In [None]:
lab_labels = ["Troponin T", "Urea Nitrogen", "Creatinine", "Glucose", "Platelet Count", "Thyroid Stimulating Hormone"]

In [None]:
#keep the name of the lab labels for unabstracted values
lab_w_detail.loc[lab_w_detail["label"].isin(lab_labels), "abstract_lab"] = lab_w_detail["label"]

In [None]:
#keep the original label for liver test related lab values (no renaming is applied)
lab_w_detail.loc[lab_w_detail["label"].isin(liver), "abstract_lab"] = lab_w_detail["label"]

In [None]:
#keep the original label for electrolyte related lab values (no renaming is applied)
lab_w_detail.loc[lab_w_detail["label"].isin(electro), "abstract_lab"] = lab_w_detail["label"]

In [None]:
#get first occurrences of lab values
# Add the first measurement of every selected lab test per admission, in the
# order electrolytes, liver tests, remaining labs.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# drop_duplicates(keep="first") on the time-sorted frame replaces
# groupby().nth(0).reset_index(), whose result differs between pandas 1.x and
# 2.x. Rows lacking a grouping key are dropped first, as groupby would skip them.
first_lab_events = [
    lab_w_detail.loc[lab_w_detail["label"].isin(lab_group)]
    .sort_values(["hadm_id", "charttime"])
    .dropna(subset=["hadm_id", "abstract_lab"])
    .drop_duplicates(["hadm_id", "abstract_lab"], keep="first")
    for lab_group in (electro, liver, lab_labels)
]
diag_df = pd.concat([diag_df, *first_lab_events])

In [None]:
diag_df = diag_df.reset_index()

In [None]:
diag_df = diag_df.drop("index", axis=1)

In [None]:
#create one activity column "concept:name" for process mining
#for radiology/cardiology orders
diag_df.loc[~diag_df["poe_id"].isna(), "concept:name"] = diag_df["order_subtype"]

In [None]:
#for lab values
diag_df.loc[~diag_df["itemid"].isna(), "concept:name"] = diag_df["abstract_lab"]

In [None]:
diag_df = diag_df.rename(columns={"hadm_id":"case:concept:name"})

In [None]:
diag_df.loc[diag_df["ordertime"].isna(), "ordertime"] = diag_df["charttime"]

In [None]:
diag_df = diag_df.sort_values(["case:concept:name", "ordertime"])

In [None]:
diag_df.loc[~diag_df["poe_id"].isna(), "concept:name"] = "Perform " + diag_df["concept:name"]

In [None]:
diag_df.loc[diag_df["poe_id"].isna(), "concept:name"] = "Analyse " + diag_df["label"] + " Value"

In [None]:
diag_df.loc[diag_df["concept:name"].str.contains("Thyroid"), "concept:name"] = diag_df["concept:name"].str.replace("Thyroid Stimulating Hormone", "TSH")

In [None]:
diag_df.loc[diag_df["concept:name"].str.contains("Xray"), "concept:name"] = diag_df["concept:name"].str.replace("Xray", "X-Ray")

In [None]:
diag_df = pd.read_csv("Logs/HF_DIAG_GUIDE_WITH_XRAY.csv")

In [None]:
diag_df

In [None]:
#create the event log from diag_df. The easiest way to create an XES file for ProM is to load the CSV into Disco and export it as XES. Use ordertime as the timestamp.
event_log = log_converter.apply(diag_df, variant=log_converter.Variants.TO_EVENT_LOG)
pm4py.write_csv(event_log, "Logs/" + "BPM_PAPER_EVENT_LOG.csv")

In [None]:
def get_diagnoses(df, hadm_ids, n):
    """Attach ICD diagnosis information to ``df`` for the given admissions.

    Adds a ``count_icd`` column (number of ``seq_num`` entries per admission)
    and, for every sequence number ``i`` in ``1..n``, the columns ``<i>_icd``
    (code) and ``<i>_desc_icd`` (long title).

    Args:
        df: DataFrame with a ``hadm_id`` column.
        hadm_ids: list of admission ids, passed to the SQL ``any(%s)`` filter.
        n: highest diagnosis sequence number to add.

    Returns:
        The merged DataFrame. Admissions without any diagnosis row are removed
        by the inner merge on the diagnosis count.

    Uses the module-level ``cursor``.
    """
    cursor.execute('SELECT * FROM mimic_hosp.diagnoses_icd where hadm_id = any(%s)', [hadm_ids])
    diagnosis_rows = cursor.fetchall()
    diagnosis_cols = [column[0] for column in cursor.description]
    diagnoses = pd.DataFrame(diagnosis_rows, columns=diagnosis_cols)

    cursor.execute("SELECT * FROM mimic_hosp.d_icd_diagnoses")
    title_rows = cursor.fetchall()
    title_cols = [column[0] for column in cursor.description]
    # NOTE(review): descriptions are joined on icd_code only; if the dictionary
    # is keyed by (icd_code, icd_version) a code may match more than one title
    # - confirm against the table definition.
    titles = pd.DataFrame(title_rows, columns=title_cols)[["icd_code", "long_title"]]

    selected = diagnoses.loc[diagnoses["hadm_id"].isin(hadm_ids)]

    # Number of diagnoses per admission.
    per_admission = selected.groupby("hadm_id")["seq_num"].count().reset_index()
    df = df.merge(per_admission, on="hadm_id", how="inner")
    df = df.rename(columns={"seq_num": "count_icd"})

    for rank in range(1, n + 1):
        code_col = str(rank) + "_icd"
        desc_col = str(rank) + "_desc_icd"
        ranked_codes = selected.loc[selected["seq_num"] == rank, ["hadm_id", "icd_code"]]
        df = df.merge(ranked_codes, on="hadm_id", how="left")
        df = df.rename(columns={"icd_code": code_col})
        # The title lookup re-introduces an "icd_code" column, removed below.
        df = df.merge(titles, how="left", left_on=code_col, right_on="icd_code")
        df = df.rename(columns={"long_title": desc_col})
        df = df.drop("icd_code", axis=1)
    return df

In [None]:
def get_adms(df, hadm_ids):
    """Join ``df`` with the admission records of the given admissions.

    Args:
        df: DataFrame with a ``hadm_id`` column.
        hadm_ids: list of admission ids, passed to the SQL ``any(%s)`` filter.

    Returns:
        ``df`` inner-joined with ``mimic_core.admissions`` on ``hadm_id``,
        reduced to one row per admission (the first one is kept).

    Uses the module-level ``cursor``.
    """
    cursor.execute('SELECT * FROM mimic_core.admissions where hadm_id = any(%s)', [hadm_ids])
    records = cursor.fetchall()
    header = [column[0] for column in cursor.description]
    admissions = pd.DataFrame(records, columns=header)

    # subject_id is removed from the admission side before merging
    # (presumably because df already carries it - verify against caller).
    admissions = admissions.loc[admissions["hadm_id"].isin(hadm_ids)].drop("subject_id", axis=1)

    merged = df.merge(admissions, on="hadm_id", how="inner")
    return merged.drop_duplicates("hadm_id")

In [None]:
#requires get_adms for admission/discharge location!
def get_transfers(df, hadm_ids):
    """Build the transfer history of the given admissions, joined with ``df``.

    Missing care units are filled in from the event type ("Unknown" for
    transfers, "Admit", "Discharge"), and discharge rows take the admission's
    ``discharge_location`` as care unit. One extra row is added per admission
    in front of its first transfer: its care unit is the
    ``admission_location``, it ends at the first transfer's ``intime`` and
    starts one second earlier.

    Args:
        df: DataFrame with ``hadm_id``, ``subject_id``, ``admission_location``
            and ``discharge_location`` columns (as produced by ``get_adms``).
        hadm_ids: list of admission ids, passed to the SQL ``any(%s)`` filter.

    Returns:
        DataFrame with one row per transfer plus the extra admission row,
        sorted by ``subject_id``, ``hadm_id`` and ``intime``.

    Uses the module-level ``cursor``.
    """
    # Query with the hadm_ids argument; this previously read the notebook
    # global `hadms`, silently ignoring the parameter.
    cursor.execute('SELECT * FROM mimic_core.transfers where hadm_id = any(%s)', [hadm_ids])
    transfers = cursor.fetchall()
    cols = [column[0] for column in cursor.description]
    transfers = pd.DataFrame(transfers, columns=cols)

    b_trans = transfers.loc[transfers["hadm_id"].isin(hadm_ids)]
    b_trans = b_trans.sort_values(["subject_id", "hadm_id", "intime"])

    # Give rows without a care unit a placeholder derived from the event type.
    no_unit = b_trans["careunit"].isna()
    b_trans.loc[no_unit & (b_trans["eventtype"] == "transfer"), "careunit"] = "Unknown"
    b_trans.loc[no_unit & (b_trans["eventtype"] == "admit"), "careunit"] = "Admit"
    b_trans.loc[no_unit & (b_trans["eventtype"] == "discharge"), "careunit"] = "Discharge"

    # subject_id comes from df after the merge.
    b_trans = b_trans.drop("subject_id", axis=1)
    b_trans = b_trans.merge(df, on="hadm_id", how="inner")
    b_trans.loc[b_trans["careunit"] == "Discharge", "careunit"] = b_trans["discharge_location"]
    b_trans = b_trans.sort_values(["subject_id", "hadm_id", "intime"])

    # Set first careunit to admission location: copy the first transfer of
    # every admission and turn the copy into a one-second row that ends where
    # the first transfer starts.
    is_later_transfer = b_trans.duplicated("hadm_id", keep="first")
    first_careunit = b_trans.loc[~is_later_transfer]
    admission_location = first_careunit.copy()
    admission_location["careunit"] = admission_location["admission_location"]
    admission_location["outtime"] = admission_location["intime"]
    admission_location["transfer_id"] = np.nan
    admission_location["intime"] = admission_location["outtime"] - pd.Timedelta(seconds=1)

    b_trans = pd.concat([b_trans.loc[is_later_transfer], first_careunit, admission_location])
    b_trans = b_trans.sort_values(["subject_id", "hadm_id", "intime"])
    return b_trans

In [None]:
#requires get_transfers
def get_patients(df):
    """Join transfer rows with patient data and derive age and year columns.

    Added columns, in this order: ``transfer_year``, ``transfer_age``,
    ``anchor_real_year``, ``transfer_real_year``, ``admission_age`` and
    ``admission_age_group``. The last two are only filled on the first row
    (by ``intime``) of every admission.

    Args:
        df: DataFrame with ``subject_id``, ``hadm_id`` and ``intime`` columns
            (as produced by ``get_transfers``).

    Returns:
        The enriched DataFrame, sorted by ``hadm_id`` and ``intime``.

    Uses the module-level ``cursor``.
    """
    cursor.execute("SELECT * FROM mimic_core.patients")
    patient_rows = cursor.fetchall()
    patient_cols = [column[0] for column in cursor.description]
    patients = pd.DataFrame(patient_rows, columns=patient_cols)

    enriched = df.merge(patients, on="subject_id", how="inner")

    # Age at transfer = anchor age plus the years elapsed since the anchor year.
    enriched["transfer_year"] = enriched.apply(lambda transfer: transfer["intime"].year, axis=1)
    years_since_anchor = enriched["transfer_year"] - enriched["anchor_year"]
    enriched["transfer_age"] = years_since_anchor + enriched["anchor_age"]

    # First four characters of anchor_year_group, as a number, plus one.
    group_start = enriched["anchor_year_group"].str.slice(0, 4)
    enriched["anchor_real_year"] = pd.to_numeric(group_start) + 1
    enriched["transfer_real_year"] = (
        enriched["anchor_real_year"] + enriched["transfer_year"] - enriched["anchor_year"]
    )
    # A computed year of 2021 is set to 2020.
    enriched.loc[enriched["transfer_real_year"] == 2021, "transfer_real_year"] = 2020
    enriched = enriched.sort_values(["hadm_id", "intime"])

    # set patient age for hospital admission according to first transfer in admission
    is_first_transfer = ~enriched.duplicated("hadm_id", keep="first")
    enriched.loc[is_first_transfer, "admission_age"] = enriched["transfer_age"]

    # Age groups: open-ended at both ends, (lower, upper] in between.
    age = enriched["admission_age"]
    enriched.loc[age <= 18, "admission_age_group"] = "0-18"
    for lower, upper in ((18, 25), (25, 35), (35, 45), (45, 55), (55, 65), (65, 75), (75, 85)):
        in_group = (age <= upper) & (age > lower)
        enriched.loc[in_group, "admission_age_group"] = str(lower + 1) + "-" + str(upper)
    enriched.loc[age > 85, "admission_age_group"] = "85+"
    return enriched

In [None]:
def create_log_for_disco(df, log_name):
    """Convert ``df`` to an event log and write it to ``Logs/<log_name>.csv``.

    ``hadm_id`` becomes the case identifier (``case:concept:name``) and
    ``careunit`` the activity (``concept:name``).

    Args:
        df: DataFrame with ``hadm_id`` and ``careunit`` columns.
        log_name: file name (without extension) inside the ``Logs/`` folder.
    """
    column_map = {'hadm_id': 'case:concept:name', 'careunit': 'concept:name'}
    renamed = df.rename(columns=column_map)
    event_log = log_converter.apply(renamed, variant=log_converter.Variants.TO_EVENT_LOG)
    target_path = "Logs/" + log_name + ".csv"
    # NOTE(review): confirm that the installed pm4py version provides write_csv.
    pm4py.write_csv(event_log, target_path)