## README

## Import Libraries

In [None]:
from IPython.display import display, HTML
import pandas as pd
import numpy as np
from io import StringIO

## Data Display

In [None]:
def data_display(data):
    """Show *data* in the notebook inside a scrollable, bordered HTML box.

    The frame is rendered with ``to_html`` without row/column truncation and
    wrapped in a 500px-high ``<div>`` that scrolls in both directions.
    """
    table_html = data.to_html(max_rows=None, max_cols=None)
    markup = f'''
    <div style="height: 500px; overflow-y: scroll; overflow-x: scroll; border: 1px solid black; padding: 5px;">
        {table_html}
    </div>
    '''
    display(HTML(markup))

## Import Data

In [None]:
# Load the MIMIC-IV-ED tables (triage, vital signs, ED stays) and the MIMIC-IV
# patients table from local disk.
# on_bad_lines='skip' makes pandas drop malformed rows of triage/vitalsign
# instead of raising; low_memory=False reads each file in one pass so column
# dtypes are inferred from the whole column.
triage = pd.read_csv("/data/local/llm-evaluation/mimic-iv-ed-2.2/ed/triage.csv", on_bad_lines='skip', low_memory=False)
vitalsigns = pd.read_csv("/data/local/llm-evaluation/mimic-iv-ed-2.2/ed/vitalsign.csv", on_bad_lines='skip', low_memory=False)
ed_stays = pd.read_csv("/data/local/llm-evaluation/mimic-iv-ed-2.2/ed/edstays.csv")
# The patients table is gzip-compressed on disk.
patients = pd.read_csv("/data/local/llm-evaluation/mimic-iv/mimic-iv-3.0/hosp/patients.csv.gz", compression='gzip', low_memory=False)

## Preprocessing csv file

In [None]:
import re

# Read the whole discharge-notes CSV into memory. The context manager
# guarantees the file handle is closed once the content has been read.
with open('/data/local/llm-evaluation/mimic-iv-note/discharge.csv') as notes_file:
    txt = notes_file.read()

# Free up '|' so it can serve as the record terminator further down.
txt = txt.replace('|', ',<vl>')
# Tag the position where the quoted note text begins
# (presumably the last CSV field of each record -- TODO confirm).
txt = txt.replace(',""""\n', ',<br>')

# Tag the end of each note and terminate the record with '|'.
txt = txt.replace('Followup Instructions:\n___\n""""','Followup Instructions:\n___\n</br>|')
# Inside <br>...</br> replace ',' with <comma> so the note text stays in a
# single CSV field.
# NOTE(review): [^<]* stops at the first '<', so a note that itself contains
# '<' is not matched and keeps its commas -- confirm this is acceptable.
txt = re.sub(r'<br>([^<]*)</br>', lambda x: x.group(0).replace(',', '<comma>'), txt)

# Drop every remaining double quote and terminate the header line with '|'.
txt = txt.replace('"', '')
txt = txt.replace('text\n', 'text|')

# Use pandas to read the modified txt content as a CSV with '|' as the record
# separator; malformed records are reported as warnings instead of raising.
df = pd.read_csv(StringIO(txt), lineterminator='|', on_bad_lines='warn')

## diagnosis notes and ed_stays are merged to get stay_ids values

In [None]:
# Use the hadm_id of every discharge note to look up the matching stay_id in
# ed_stays and store it in a new 'stay_id' column of df.
# A single keyed lookup replaces a per-row scan of ed_stays, and the column is
# always created, even when no note matches.
note_hadm_ids = pd.to_numeric(df['hadm_id'], errors='coerce').astype('float64')

# Report values that were present but could not be parsed as numbers.
unparsable = note_hadm_ids.isna() & df['hadm_id'].notna()
if unparsable.any():
    print(f"{int(unparsable.sum())} hadm_id values could not be parsed as numbers")

# First stay_id per hadm_id (first-match semantics). Rows without a hadm_id
# are removed so that NaN can never act as a lookup key.
stay_id_by_hadm = (
    ed_stays.dropna(subset=['hadm_id'])
    .drop_duplicates(subset='hadm_id', keep='first')
    .set_index('hadm_id')['stay_id']
)

# Notes with a missing, unparsable or unknown hadm_id get NaN.
df['stay_id'] = note_hadm_ids.map(stay_id_by_hadm)

In [None]:
# Keep only the notes for which a stay_id was found (stay_id is not NaN).
df = df[df['stay_id'].notnull()]

## Combine triage and diagnosis notes using stay_id

In [None]:
# Inner-join triage with the notes on stay_id: only stays present in both
# tables are kept. The merged frame then replaces df.
# NOTE(review): columns that exist in both inputs presumably receive pandas'
# default _x/_y suffixes (a later cell uses 'subject_id_x') -- confirm.
df_merged = pd.merge(triage, df, on="stay_id", how="inner")
df = df_merged.copy()

## merge with ed_stays on stay_id to get gender and race

In [None]:
# Join the ED-stay columns onto df via stay_id (pandas' default inner join),
# then continue with the merged frame as df.
df_merged_age = pd.merge(df, ed_stays, on='stay_id')
df = df_merged_age.copy()

## get unique on the subject_id

In [None]:
# Keep one row per patient: the first row of each subject_id_x wins.
# .copy() makes unique_df an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
unique_df = df.drop_duplicates(subset=['subject_id_x']).copy()

## getting tests from the text and store in new column named tests

In [None]:
def get_tests(text):
    """Return the lower-cased lab/test section of a discharge note.

    The section is the text following the first "pertinent results:" header
    (up to the next such header, if any), cut at "brief hospital course:".
    When it contains a "discharge labs" sub-section, only the part before
    that sub-section is returned.

    Args:
        text: Raw note text.

    Returns:
        The extracted section, or ``None`` when *text* is not a string or has
        no "pertinent results:" header.
    """
    # Missing notes arrive as NaN/None; treat them like notes without a section.
    if not isinstance(text, str):
        return None

    sections = text.lower().split("pertinent results:")
    if len(sections) < 2:
        return None

    results = sections[1].split('brief hospital course:')[0]
    # split(...)[0] returns the whole string when the marker is absent, so no
    # separate membership test is needed.
    return results.split('discharge labs')[0]

# Extract the test/lab section of every note into a new "tests" column.
unique_df["tests"] = unique_df['text'].apply(get_tests)

## get age from patient dataframe

In [None]:
# Join the patients table onto unique_df via subject_id (default inner join).
# Note: no columns are dropped here.
unique_df = pd.merge(unique_df, patients, on='subject_id')

## get medication from the text volume

In [None]:
def get_medication(text):
    """Return the lower-cased "medications on admission" section of a note.

    The section is the text following the first "medications on admission:"
    header (up to the next such header, if any), cut at
    "discharge medications:".

    Args:
        text: Raw note text.

    Returns:
        The extracted section, or ``None`` when *text* is not a string or the
        header is absent.
    """
    # Missing notes arrive as NaN/None; treat them like notes without a section.
    if not isinstance(text, str):
        return None

    sections = text.lower().split("medications on admission:")
    if len(sections) < 2:
        return None
    return sections[1].split('discharge medications:')[0]

# Extract the admission-medication section of every note into "past-medication".
unique_df["past-medication"] = unique_df['text'].apply(get_medication)

## Dataframe Preprocessing and Preparation

In [None]:
## load the "final.csv"
## NOTE(review): final.csv is not written anywhere in the visible cells;
## presumably it is an export of the frame prepared above -- confirm.
df = pd.read_csv('/data/local/llm-evaluation/processed/second_opinion/final.csv')

## load diagnosis.csv which has the icd-code for each patient (malformed lines are skipped)
diagnostics = pd.read_csv('/data/local/llm-evaluation/mimic-iv-ed-2.2/ed/diagnosis.csv',on_bad_lines='skip')
## only the data where "seq_num" equals 1. "seq_num" provides a pseudo-order for the ICD codes, with a value of 1 usually indicating highest relevance and a value of 9 indicating least relevance.
diagnostics = diagnostics[diagnostics["seq_num"] == 1]

## merge diagnostics in df to include icd_code, icd_title and icd_version in df
## (left join: stays without a seq_num == 1 diagnosis get NaN in these columns)
df = df.merge(diagnostics[['stay_id', 'icd_code', 'icd_title', "icd_version"]],
              on='stay_id', how='left')

## delete the rows where icd_code is NaN
df = df.dropna(subset=['icd_code'])

## drop columns that are not needed
df = df.drop(columns=["note_id", "note_type", "note_seq", "charttime", "storetime", "intime", "outtime", "arrival_transport", "disposition", "anchor_year", "anchor_year_group", "dod" ])

In [None]:
## merge temperature, heartrate, resprate, o2sat, sbp, dbp
def create_vitals(row):
    """Build a one-line summary of the vital signs that are present in *row*.

    Each non-missing vital is rendered as "<label>: <value>"; the parts are
    joined with ", " in a fixed order. Missing values (per ``pd.isna``) are
    left out, so a row without any vitals yields an empty string.
    """
    labelled_columns = (
        ("Temperature", "temperature"),
        ("Heartrate", "heartrate"),
        ("resprate", "resprate"),
        ("o2sat", "o2sat"),
        ("sbp", "sbp"),
        ("dbp", "dbp"),
    )
    return ", ".join(
        f"{label}: {row[column]}"
        for label, column in labelled_columns
        if not pd.isna(row[column])
    )

# Row-wise: collapse the individual vital-sign columns into one text column.
df.loc[:,'initial_vitals'] = df.apply(create_vitals, axis=1)

In [None]:
## merge Gender, Race, Age
def create_patient_info(row):
    """Build a "Gender: ..., Race: ..., Age: ..." summary string for *row*.

    "F"/"M" are spelled out as Female/Male; any other gender value is printed
    as-is. Race and anchor_age are inserted without a missing-value check.
    """
    gender = row["gender"]
    if gender == "F":
        gender_text = "Female"
    elif gender == "M":
        gender_text = "Male"
    else:
        gender_text = gender

    return f"Gender: {gender_text}, Race: {row['race']}, Age: {row['anchor_age']}"

# Row-wise: collapse gender, race and anchor_age into one text column.
df.loc[:,'patient_info'] = df.apply(create_patient_info, axis=1)

In [None]:
## drop the raw columns that were folded into patient_info / initial_vitals
df = df.drop(columns=["gender", "race", "anchor_age", "temperature", "heartrate", "resprate", "o2sat", "sbp", "dbp"])

## rearrange the columns of the dataframe (columns not listed here are discarded)
df = df[['stay_id', 'subject_id', 'hadm_id', "text", 'patient_info', 'initial_vitals', 'pain', 'chiefcomplaint', 'preprocessed_text', 'past-medication', 'tests', 'acuity', 'icd_code', 'icd_title', 'icd_version']]

In [None]:
## remove rows that have nans in acuity, because acuity will be predicted and nans dont carry information
df = df.dropna(subset=['acuity'])
## also remove rows without an extracted tests section
df = df.dropna(subset=['tests'])

In [None]:
## convert nans to empty strings in the free-text columns
df["pain"] = df['pain'].fillna("")
df["chiefcomplaint"] = df['chiefcomplaint'].fillna("")
df["past-medication"] = df['past-medication'].fillna("")

In [None]:
## convert numpy.float64 to numpy.int64
## NOTE(review): astype(np.int64) raises if a column still contains NaN.
## acuity NaNs are dropped above; hadm_id and icd_version are not checked
## explicitly in the visible cells -- confirm they cannot be missing here.
df['acuity'] = df['acuity'].astype(np.int64)
df['hadm_id'] = df['hadm_id'].astype(np.int64)
df['icd_version'] = df['icd_version'].astype(np.int64)

In [None]:
## find the rows that have "history of present illness" in the "text" column and keep only these rows
## (case-insensitive; rows with missing text count as "not found")
hpi = df['text'].str.contains('history of present illness', case=False, na=False)
# Index labels of the matching rows, then select those rows by label.
hpi_index = hpi[hpi==True].index
df = df.loc[hpi_index]

In [None]:
## extract HPI from the raw text
def extract_hpi(text):
    """Return the part of *text* that precedes the first history section.

    The text is cut at the first header found, checked in this priority
    order: "past medical history:", "social history:", "family history:"
    (case-insensitive). The cut result is stripped of surrounding whitespace.

    Args:
        text: Note text that starts with the history of present illness.

    Returns:
        The truncated text; *text* unchanged when none of the headers occurs
        or when *text* is not a string (e.g. NaN).
    """
    # Missing values are passed through so they can be dropped downstream.
    if not isinstance(text, str):
        return text

    lowered = text.lower()
    for header in ('past medical history:', 'social history:', 'family history:'):
        position = lowered.find(header)
        if position != -1:
            return text[:position].strip()
    return text

# Derive the HPI column from the preprocessed note text.
df["HPI"] = df["preprocessed_text"].apply(extract_hpi)

In [None]:
## extract diagnosis from the raw text
def extract_diagnosis(text):
    """Return the discharge-diagnosis section of *text* with a fixed header.

    Takes what follows the last "Discharge Diagnosis:" marker (the whole text
    if the marker is absent), cuts it at the first "Discharge Condition:",
    turns every "<comma>" placeholder back into ", " and prefixes the result
    with "Discharge Diagnosis: ".
    """
    after_header = text.rpartition("Discharge Diagnosis:")[2]
    section = after_header.partition("Discharge Condition:")[0]
    section = section.replace('<comma>', ', ')
    return "Discharge Diagnosis: " + section

# Derive the diagnosis column from the raw note text.
df["diagnosis"] = df["text"].apply(extract_diagnosis)

## Keep only rows whose HPI is longer than 50 and shorter than 2000 characters and whose tests are shorter than 3000 characters

In [None]:
# Keep rows whose HPI has fewer than 2000 characters ...
string_lengths = df['HPI'].str.len()
mask = string_lengths<2000
df = df[mask]

# ... and more than 50 characters.
string_lengths = df['HPI'].str.len()
mask = string_lengths>50
df = df[mask]

# Keep rows whose tests section has fewer than 3000 characters.
# Rows with a missing value have a NaN length, fail the comparison and are
# dropped as well.
string_lengths = df['tests'].str.len()
mask = string_lengths<3000
df = df[mask]

In [None]:
# Notebook checkpoint: number of rows left after the length filters.
len(df)

In [None]:
# Character length of every tests entry (already a pandas Series).
lengths = df['tests'].str.len()

# Wrap in a Series (a no-op copy here, since lengths is a Series already)
lengths_series = pd.Series(lengths)

# quantile(1) is the maximum length; use e.g. quantile(0.95) for a
# 95th-percentile cap. The value is only displayed, not applied to df.
cap_value = lengths_series.quantile(1)
cap_value

## Process HPI

In [None]:
## delete rows whose HPI is NaN or an empty string
df = df.dropna(subset=['HPI'])
df = df[df['HPI'] != ""]

In [None]:
## HPI preprocess
from tqdm import tqdm
import re
def extract_only_hpi(text):
    """Cut *text* at the first mention of the emergency-department course.

    A series of case-insensitive markers is applied in order; each one
    removes everything from its first match to the end of the text
    (newlines included).
    """
    cutoff_markers = (
        r"in the ED, initial vital.*",
        r"in the ED initial vital.*",
        r"\bED Course.*",
        r"\bIn ED initial VS.*",
        r"in the ED, initial VS.*",
        r"\binitial VS.*",
        r"in the ED.*",
    )
    for marker in cutoff_markers:
        text = re.compile(marker, re.IGNORECASE | re.DOTALL).sub("", text)
    return text

# Register tqdm's progress_apply on pandas objects, then truncate every HPI
# while showing a progress bar.
tqdm.pandas()
df["HPI"] = df["HPI"].progress_apply(extract_only_hpi)

In [None]:
## Remove the ones that have ED in them
## NOTE(review): with case=False the pattern \bED matches any word that starts
## with "ed" (e.g. "edema", "education"), not only the abbreviation "ED".
## If only the abbreviation is meant, r'\bED\b' would be narrower -- confirm
## before changing, since it alters which rows are kept.
mask = df["HPI"].str.contains(r'\bED', case=False, na=False)
df = df[~mask]
## remove where test is nan to be able to compare between normal user and expert
df = df.dropna(subset=['tests'])

## remove header in diagnosis

In [None]:
## remove a header such as "discharge diagnosis:" from a text
def remove_header(text, header):
    """Delete every case-insensitive match of *header* from *text*.

    *header* is interpreted as a regular expression, not as a literal string.
    """
    header_pattern = re.compile(header, re.IGNORECASE)
    return header_pattern.sub("", text)
## Remove the "discharge diagnosis:" header from every diagnosis (case-insensitive)
df['diagnosis'] = df['diagnosis'].apply(lambda text: remove_header(text, "discharge diagnosis:"))

In [None]:
## delete everything up to and including "Facility:\n___"
def delete_before_string(text):
    """Drop everything before and including the last "Facility:\\n___" marker.

    The match is case-insensitive and spans newlines; text without the marker
    is returned unchanged.
    """
    leading_part = re.compile(r".*Facility:\n___", re.IGNORECASE | re.DOTALL)
    return leading_part.sub("", text)

# Strip the leading part of every diagnosis up to the marker handled by
# delete_before_string as defined in this cell.
df['diagnosis'] = df['diagnosis'].apply(delete_before_string)

In [None]:
## delete everything up to and including "___ Diagnosis:"
def delete_before_string(text):
    """Drop everything before and including the last "___ Diagnosis:" marker.

    The match is case-insensitive and spans newlines; text without the marker
    is returned unchanged. This definition replaces any earlier function of
    the same name.
    """
    leading_part = re.compile(r".*___ Diagnosis:", re.IGNORECASE | re.DOTALL)
    return leading_part.sub("", text)
# Apply the delete_before_string defined directly above to every diagnosis.
df['diagnosis'] = df['diagnosis'].apply(delete_before_string)

In [None]:
## delete "PMH" (past medical history) and everything after it
def delete_after_string(text):
    """Cut *text* at the first case-insensitive occurrence of "PMH".

    The marker itself and everything that follows (newlines included) are
    removed; text without the marker is returned unchanged.
    """
    trailing_part = re.compile(r"PMH.*", re.IGNORECASE | re.DOTALL)
    return trailing_part.sub("", text)

# Apply the delete_after_string defined in this cell to every diagnosis.
df['diagnosis'] = df['diagnosis'].apply(delete_after_string)

In [None]:
# Discard rows whose HPI still mentions the emergency setting or an
# "impression" (case-insensitive; a missing HPI counts as no match).
for hpi_term in (' ER ', 'Emergency room', 'Emergency department', 'impression'):
    mask = df["HPI"].str.contains(hpi_term, case=False, na=False)
    df = df[~mask]


# Discard rows whose diagnosis states that the patient died.
for diagnosis_term in ('deceased', 'died'):
    mask = df["diagnosis"].str.contains(diagnosis_term, case=False, na=False)
    df = df[~mask]

In [None]:
# Notebook checkpoint: number of rows left after the keyword filters.
len(df)

In [None]:
# Drop rows whose diagnosis still contains "history of present illness"
# (case-insensitive), then print the remaining row count.
mask_hpi = df["diagnosis"].str.contains('history of present illness', case=False, na=False)
df = df[~mask_hpi]
print(len(df))

In [None]:
## remove the ones that mention "primary" somewhere but not as a header at the very start
## mask  : rows whose diagnosis contains "primary" anywhere
## mask2 : rows whose diagnosis starts with optional whitespace, a newline and then "primary"
## rows in mask but not in mask2 are dropped (selected via their index labels)
mask = df["diagnosis"].str.contains('primary', case=False, na=False)
ind = df[mask].index.tolist()
mask2 = df['diagnosis'].str.contains(r'^\s*\nprimary', flags=re.IGNORECASE, regex=True)
ind2 = df[mask2].index.tolist()
ind_drop = set(ind) - set(ind2)
df = df[~df.index.isin(ind_drop)]

## remove the ones that mention "secondary" but never directly after a newline
## (i.e. never at the start of a line)
mask = df["diagnosis"].str.contains('secondary', case=False, na=False)
ind = df[mask].index.tolist()
mask2 = df['diagnosis'].str.contains('\nsecondary', flags=re.IGNORECASE, regex=True)
ind2 = df[mask2].index.tolist()
ind_drop = set(ind) - set(ind2)
df = df[~df.index.isin(ind_drop)]

In [None]:
# Notebook checkpoint: number of rows left after the primary/secondary filters.
len(df)

In [None]:
## divide discharge diagnosis into primary and secondary diagnosis if possible
## The columns are built in one pass over the diagnosis texts and assigned
## once, avoiding chained indexing (df["diagnosis"][i]) and per-cell writes.
primary_parts = []
secondary_parts = []
for diagnosis_text in df["diagnosis"]:
    # Case-insensitive search; the offset is applied to the original text.
    split_at = diagnosis_text.lower().find('secondary')
    if split_at == -1:
        # No secondary section: everything is the primary diagnosis.
        primary_parts.append(diagnosis_text)
        secondary_parts.append("")
    else:
        primary_parts.append(diagnosis_text[:split_at])
        secondary_parts.append(diagnosis_text[split_at:])

df["primary_diagnosis"] = primary_parts
df["secondary_diagnosis"] = secondary_parts

In [None]:
## delete "___ Condition:" and everything after it
def delete_after_string(text):
    """Cut *text* at the first case-insensitive "___ Condition:" marker.

    The marker itself and everything that follows (newlines included) are
    removed; text without the marker is returned unchanged. This definition
    replaces any earlier function of the same name.
    """
    trailing_part = re.compile(r"___ Condition:.*", re.IGNORECASE | re.DOTALL)
    return trailing_part.sub("", text)

# Apply the delete_after_string defined in this cell to every primary diagnosis.
df['primary_diagnosis'] = df['primary_diagnosis'].apply(delete_after_string)

In [None]:
## delete if primary_diagnosis has more than 15 single \n (\n are between the different diagnosis, therefore if you have more than 15 you have too many diagnosis)
def count_single_newlines(text):
    """Count the newlines in *text* that are not adjacent to another newline."""
    isolated_newline = re.compile(r'(?<!\n)\n(?!\n)')
    return sum(1 for _ in isolated_newline.finditer(text))

# Apply the function to the entire column and get a list of counts
newline_counts = df['primary_diagnosis'].apply(count_single_newlines).tolist()

# Keep rows with at most 15 single newlines; the list of booleans is used as
# a positional row mask.
mask = [value < 16 for value in newline_counts]
df = df[mask]

In [None]:
# Notebook checkpoint: number of rows left after the newline-count filter.
len(df)

In [None]:
# Drop the raw text columns that are no longer needed (returns a new frame).
df = df.drop(columns=['text', 'preprocessed_text', 'past-medication'], inplace=False)

In [None]:
# Keep only the first 2000 rows (positional slice).
df = df[:2000]

In [None]:
## put a single newline after every colon
def colon_replacement(text):
    """Replace each colon and the whitespace that follows it with ":\\n".

    Any run of whitespace after a colon (spaces, tabs, newlines) collapses
    into exactly one newline; a colon at the very end also gets one.
    """
    colon_with_whitespace = re.compile(r":\s*(?!\n)")
    return colon_with_whitespace.sub(':\n', text)

# Normalise the whitespace after colons in both diagnosis columns.
df['primary_diagnosis'] = df['primary_diagnosis'].apply(colon_replacement)
df['secondary_diagnosis'] = df['secondary_diagnosis'].apply(colon_replacement)

In [None]:
## make diagnosis into a list for each row
import re


def remove_number_prefix(item):
    """Strip a leading enumeration marker such as "1) " (digits 1-8 only)."""
    return re.sub(r'^[1-8]\)\s*', '', item)


def _diagnosis_items(text, section_keyword):
    """Split a diagnosis text into its individual entries.

    Blank lines, lines containing *section_keyword* (case-insensitive; this
    covers headers such as "Primary Diagnosis:" / "Primary Diagnoses:") and
    "====" separator lines are dropped. A leading "N) " marker is removed
    from the remaining lines.

    Args:
        text: Diagnosis text with one entry per line.
        section_keyword: Lower-case keyword identifying header lines.

    Returns:
        List of entry strings; an empty list when *text* is missing
        (None/NaN) or contains no entries.
    """
    if not isinstance(text, str):
        return []

    items = []
    for line in text.split('\n'):
        if not line.strip():
            continue
        if section_keyword in line.lower() or "====" in line:
            continue
        items.append(remove_number_prefix(line))
    return items


df["primary_diagnosis"] = df['primary_diagnosis'].apply(
    lambda text: _diagnosis_items(text, "primary")
)
df["secondary_diagnosis"] = df['secondary_diagnosis'].apply(
    lambda text: _diagnosis_items(text, "secondary")
)