## Import Libraries

In [None]:
import ast
import os
import re
import time

import boto3
import pandas as pd
from IPython.display import display, HTML
from langchain.prompts import PromptTemplate
from langchain_aws import ChatBedrock
from tqdm import tqdm

## Data Display

In [None]:
def data_display(data):
    """Show ``data`` as an HTML table inside a bordered, scrollable box.

    Args:
        data: Object exposing ``to_html`` (a pandas DataFrame in this
            notebook). It is rendered with ``max_rows=None`` and
            ``max_cols=None``.
    """
    # Build the table markup first so the wrapper template stays readable.
    table_html = data.to_html(max_rows=None, max_cols=None)
    display(HTML(f'''
    <div style="height: 500px; overflow-y: scroll; overflow-x: scroll; border: 1px solid black; padding: 5px;">
        {table_html}
    </div>
    '''))

# Triage

## Load Data Triage

In [None]:
## Load the triage results written by Claude_triage_diagnosis_specialty.ipynb.
## The triage_* columns processed in the cells below are read from this CSV.
df = pd.read_csv("MIMIC-IV-Ext-Triage.csv")

## Extract the Prediction from the <acuity> tag

In [None]:
def parse_triage(triage):
    """Return the text between the last ``<acuity>`` tag and its ``</acuity>``.

    Args:
        triage: Raw model response. Strings without the tags come back
            unchanged because splitting on a missing tag yields the whole text.

    Returns:
        The extracted text, or ``triage`` itself when it cannot be split
        (for example a NaN cell); in that case the value and the error are
        printed.
    """
    try:
        # Keep whatever follows the final opening tag ...
        after_open_tag = triage.split('<acuity>')[-1]
        # ... and cut it off at the first closing tag.
        acuity = after_open_tag.split('</acuity>')[0]
    except Exception as error:
        print(triage, f"{error}")
        return triage
    return acuity

# Replace every raw model response with the content of its <acuity> tag.
for column in (
    "triage_Claude3.5",
    "triage_Claude3",
    "triage_Haiku",
    "triage_Claude3.5_Clinical",
    "triage_Claude3_Clinical",
    "triage_Haiku_Clinical",
):
    df[column] = df[column].apply(parse_triage)


## remove "esi level" string before the triage prediction

In [None]:
def extract_esi_level(text):
    """Pull the number out of an "ESI Level <n>" phrase, if one is present.

    Args:
        text: Parsed acuity value. Usually a string, but may be NaN/None or an
            already-numeric value.

    Returns:
        The digits following "esi level" (case-insensitive) as a string when
        the phrase is found; otherwise ``text`` unchanged. Non-string inputs
        (NaN, None, ints, ...) are always returned unchanged instead of
        failing on ``.lower()``.
    """
    # Only strings can contain the phrase; pass everything else straight through.
    if not isinstance(text, str):
        return text

    match = re.search(r'esi level\s*(\d+)', text.lower())
    return match.group(1) if match else text
        
# Reduce "ESI Level <n>" answers to the bare number in every triage column.
for column in (
    "triage_Claude3.5",
    "triage_Claude3",
    "triage_Haiku",
    "triage_Claude3.5_Clinical",
    "triage_Claude3_Clinical",
    "triage_Haiku_Clinical",
):
    df[column] = df[column].apply(extract_esi_level)

## convert the prediction to "int"

In [None]:
def convert_to_int(text):
    """Convert ``text`` to ``int`` when possible, otherwise return it unchanged.

    Args:
        text: Value to convert (typically a string holding a triage level).

    Returns:
        ``int(text)`` on success. Values that ``int`` rejects are returned
        as-is: non-numeric strings and NaN (``ValueError``), ``None`` or
        containers (``TypeError``), and infinities (``OverflowError``).
    """
    try:
        return int(text)
    except (ValueError, TypeError, OverflowError):
        # Best-effort conversion: leave unconvertible cells untouched.
        return text
    
# Turn the numeric predictions into ints; anything else stays as it was.
for column in (
    "triage_Claude3.5",
    "triage_Claude3",
    "triage_Haiku",
    "triage_Claude3.5_Clinical",
    "triage_Claude3_Clinical",
    "triage_Haiku_Clinical",
):
    df[column] = df[column].apply(convert_to_int)


## save file
df.to_csv('MIMIC-IV-Ext-Triage-prediction.csv', index=False)

# Specialty

## Load Data Specialty

In [None]:
## Load the diagnosis/specialty results written by Claude_triage_diagnosis_specialty.ipynb.
## The diag_spec_* columns parsed in the cells below are read from this CSV.
df = pd.read_csv("MIMIC-IV-Ext-Diagnosis-Specialty.csv")

## Extract the prediction from the <specialty> tag

In [None]:
def parse_specialty(specialty):
    """Extract the predicted specialties from ``<specialty>`` tags.

    Args:
        specialty: Raw model response.

    Returns:
        A list with the contents of the first three ``<specialty>`` tags when
        at least three opening tags are present. With fewer tags, the content
        of the last tag as a string (the whole text if there is no tag).
        Inputs that cannot be split (e.g. NaN) are returned unchanged after
        the error is printed.
    """
    open_tag, close_tag = '<specialty>', '</specialty>'
    try:
        segments = specialty.split(open_tag)
    except Exception as error:
        print(f"{error}")
        return specialty

    # segments[0] is the text before the first tag, so three tags need four parts.
    if len(segments) >= 4:
        return [segment.split(close_tag)[0] for segment in segments[1:4]]
    return segments[-1].split(close_tag)[0]

# Derive each specialty_* column from the matching raw diag_spec_* response.
for target_column, source_column in (
    ("specialty_Claude3.5", "diag_spec_Claude3.5"),
    ("specialty_Claude3", "diag_spec_Claude3"),
    ("specialty_Haiku", "diag_spec_Haiku"),
    ("specialty_Claude3.5_Clinical", "diag_spec_Claude3.5_Clinical"),
    ("specialty_Claude3_Clinical", "diag_spec_Claude3_Clinical"),
    ("specialty_Haiku_Clinical", "diag_spec_Haiku_Clinical"),
):
    df[target_column] = df[source_column].apply(parse_specialty)


## remove leading newline 

In [None]:
def remove_leading_newline(text):
    """Strip a leading newline from ``text``.

    A single leading literal backslash-n (two characters) is removed if
    present; otherwise all leading real newline characters are stripped.
    Non-string values (lists, NaN, ...) are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    escaped_newline = "\\n"  # backslash followed by 'n', not a real newline
    if text.startswith(escaped_newline):
        return text[len(escaped_newline):]
    return text.lstrip('\n')

# Strip leading newlines from every specialty prediction column.
# The clinical variants are named "*_Clinical" (as created by the
# parse_specialty cell and used by the later cells); the previous "*_exp"
# names do not exist in this notebook and raised a KeyError.
for column in (
    "specialty_Claude3.5",
    "specialty_Claude3",
    "specialty_Haiku",
    "specialty_Claude3.5_Clinical",
    "specialty_Claude3_Clinical",
    "specialty_Haiku_Clinical",
):
    df[column] = df[column].apply(remove_leading_newline)

## create a list of the predicted specialties for each patient

In [None]:
def create_list(text):
    """Split a multi-line prediction string into its first three entries.

    The text is split on real newlines first and, if that gives fewer than
    three parts, on the literal two-character sequence backslash-n.

    Returns:
        A list of the first three parts when either split succeeds. Otherwise
        the original string (it is printed together with the error).
        Non-``str`` inputs are returned unchanged.
    """
    if type(text) is not str:
        return text

    last_error = None
    for separator in ('\n', '\\n'):
        pieces = text.split(separator)
        try:
            return [pieces[position] for position in range(3)]
        except Exception as error:
            # Fewer than three parts with this separator; try the next one.
            last_error = error

    print(text, f"{last_error}")
    return text

# Turn newline-separated specialty strings into three-element lists.
for column in (
    "specialty_Claude3.5",
    "specialty_Claude3",
    "specialty_Haiku",
    "specialty_Claude3.5_Clinical",
    "specialty_Claude3_Clinical",
    "specialty_Haiku_Clinical",
):
    df[column] = df[column].apply(create_list)

## remove the numeration in some of the predictions

In [None]:
def remove_numeration(entry):
    """Drop a leading "1.", "2." or "3." (plus following whitespace) from ``entry``.

    Entries that do not start with such a prefix are returned unchanged.
    """
    prefix = re.match(r'^[1-3]\.\s*', entry)
    if prefix is None:
        return entry
    # Keep everything after the matched numbering prefix.
    return entry[prefix.end():]

# Strip the "1." / "2." / "3." prefixes from each entry of list-valued cells;
# cells that are not lists are left untouched.
for column in (
    "specialty_Claude3.5",
    "specialty_Claude3",
    "specialty_Haiku",
    "specialty_Claude3.5_Clinical",
    "specialty_Claude3_Clinical",
    "specialty_Haiku_Clinical",
):
    df[column] = df[column].apply(
        lambda lst: [remove_numeration(entry) for entry in lst] if isinstance(lst, list) else lst
    )


## Cleaning the Dataframe for clarity

In [None]:
# The raw diag_spec_* responses have been parsed into specialty_* columns above,
# so they are removed before writing the result.
df = df.drop(
    labels=[
        "diag_spec_Claude3.5",
        "diag_spec_Claude3",
        "diag_spec_Haiku",
        "diag_spec_Claude3.5_Clinical",
        "diag_spec_Claude3_Clinical",
        "diag_spec_Haiku_Clinical",
    ],
    axis="columns",
)

## save file
df.to_csv('MIMIC-IV-Ext-Specialty-prediction.csv', index=False)

# Diagnosis

## Load Data Diagnosis

In [None]:
## Reload the diagnosis/specialty results written by Claude_triage_diagnosis_specialty.ipynb.
## This rebinds df to the unprocessed CSV, so the diag_spec_* columns are available again.
df = pd.read_csv("MIMIC-IV-Ext-Diagnosis-Specialty.csv")

## extract the Prediction from the <diagnosis> tag

In [None]:
def parse_diagnosis(diagnosis):
    """Extract the predicted diagnoses from ``<diagnosis>`` tags.

    Args:
        diagnosis: Raw model response.

    Returns:
        A list with the contents of the first three ``<diagnosis>`` tags when
        at least three opening tags are present. With fewer tags, the content
        of the last tag as a string (the whole text if there is no tag).
        Inputs that cannot be split (e.g. NaN) are returned unchanged after
        the error is printed.
    """
    open_tag, close_tag = '<diagnosis>', '</diagnosis>'
    try:
        chunks = diagnosis.split(open_tag)
    except Exception as error:
        print(f"{error}")
        return diagnosis

    # chunks[0] is the text before the first tag, so three tags need four parts.
    if len(chunks) < 4:
        return chunks[-1].split(close_tag)[0]
    return [chunk.split(close_tag)[0] for chunk in chunks[1:4]]

# Derive each diagnosis_* column from the matching raw diag_spec_* response.
for target_column, source_column in (
    ("diagnosis_Claude3.5", "diag_spec_Claude3.5"),
    ("diagnosis_Claude3", "diag_spec_Claude3"),
    ("diagnosis_Haiku", "diag_spec_Haiku"),
    ("diagnosis_Claude3.5_Clinical", "diag_spec_Claude3.5_Clinical"),
    ("diagnosis_Claude3_Clinical", "diag_spec_Claude3_Clinical"),
    ("diagnosis_Haiku_Clinical", "diag_spec_Haiku_Clinical"),
):
    df[target_column] = df[source_column].apply(parse_diagnosis)



## remove leading newline 

In [None]:
def remove_leading_newline(text):
    """Strip a leading newline from ``text``.

    A single leading literal backslash-n (two characters) is removed if
    present; otherwise all leading real newline characters are stripped.
    Non-string values (lists, NaN, ...) are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    escaped_newline = "\\n"  # backslash followed by 'n', not a real newline
    if text.startswith(escaped_newline):
        return text[len(escaped_newline):]
    return text.lstrip('\n')
    
# Strip leading newlines from every diagnosis prediction column.
for column in (
    "diagnosis_Claude3.5",
    "diagnosis_Claude3",
    "diagnosis_Haiku",
    "diagnosis_Claude3.5_Clinical",
    "diagnosis_Claude3_Clinical",
    "diagnosis_Haiku_Clinical",
):
    df[column] = df[column].apply(remove_leading_newline)


## create a list of the predicted diagnoses for each patient

In [None]:
def create_list(text):
    """Split a multi-line prediction string into its first three entries.

    The text is split on real newlines first and, if that gives fewer than
    three parts, on the literal two-character sequence backslash-n.

    Returns:
        A list of the first three parts when either split succeeds. Otherwise
        the original string (it is printed together with the error).
        Non-``str`` inputs are returned unchanged.
    """
    if type(text) is not str:
        return text

    last_error = None
    for separator in ('\n', '\\n'):
        pieces = text.split(separator)
        try:
            return [pieces[position] for position in range(3)]
        except Exception as error:
            # Fewer than three parts with this separator; try the next one.
            last_error = error

    print(text, f"{last_error}")
    return text
    
# Turn newline-separated diagnosis strings into three-element lists.
for column in (
    "diagnosis_Claude3.5",
    "diagnosis_Claude3",
    "diagnosis_Haiku",
    "diagnosis_Claude3.5_Clinical",
    "diagnosis_Claude3_Clinical",
    "diagnosis_Haiku_Clinical",
):
    df[column] = df[column].apply(create_list)

## function to remove the numeration in some of the predictions

In [None]:
def remove_numeration(entry):
    """Drop a leading "1.", "2." or "3." (plus following whitespace) from ``entry``.

    Entries that do not start with such a prefix are returned unchanged.
    """
    prefix = re.match(r'^[1-3]\.\s*', entry)
    if prefix is None:
        return entry
    # Keep everything after the matched numbering prefix.
    return entry[prefix.end():]

# Strip the "1." / "2." / "3." prefixes from each entry of list-valued cells;
# cells that are not lists are left untouched.
for column in (
    "diagnosis_Claude3.5",
    "diagnosis_Claude3",
    "diagnosis_Haiku",
    "diagnosis_Claude3.5_Clinical",
    "diagnosis_Claude3_Clinical",
    "diagnosis_Haiku_Clinical",
):
    df[column] = df[column].apply(
        lambda lst: [remove_numeration(entry) for entry in lst] if isinstance(lst, list) else lst
    )


## Cleaning the Dataframe for clarity

In [None]:
# The raw diag_spec_* responses have been parsed into diagnosis_* columns above,
# so they are removed before writing the result.
df = df.drop(
    labels=[
        "diag_spec_Claude3.5",
        "diag_spec_Claude3",
        "diag_spec_Haiku",
        "diag_spec_Claude3.5_Clinical",
        "diag_spec_Claude3_Clinical",
        "diag_spec_Haiku_Clinical",
    ],
    axis="columns",
)


## save file
df.to_csv('MIMIC-IV-Ext-Diagnosis-prediction.csv', index=False)

# Additional Postprocessing

In [None]:
## load dataframes from above
## The file names must match the to_csv calls of the Specialty, Diagnosis and
## Triage sections; the triage predictions are written to
## 'MIMIC-IV-Ext-Triage-prediction.csv' (and saved back under that name below).
spec = pd.read_csv("MIMIC-IV-Ext-Specialty-prediction.csv")
diag = pd.read_csv("MIMIC-IV-Ext-Diagnosis-prediction.csv")
triage = pd.read_csv("MIMIC-IV-Ext-Triage-prediction.csv")

In [None]:
## Convert the specialty rows into lists - data in columns are stored as strings but actually represent lists.
## ast.literal_eval only accepts Python literals, so unlike eval() a malformed
## or malicious CSV cell cannot execute arbitrary code.
spec['specialty_primary_diagnosis'] = spec['specialty_primary_diagnosis'].apply(ast.literal_eval)


## delete empty specialties and initial vitals
## The mask is computed on `spec` only and then applied to all three frames.
## NOTE(review): wrapping it in pd.Index drops the Series index, presumably so
## the mask is applied by position - this assumes spec, diag and triage hold
## the same rows in the same order; confirm against the upstream notebook.
mask1 = spec["specialty_primary_diagnosis"].str.len() == 0
mask2 = spec["initial_vitals"].isna()
mask = pd.Index(mask1 | mask2)
spec = spec[~mask]
diag = diag[~mask]
triage = triage[~mask]

## delete these row due to no possible output from the LLM
## (index labels, not positions; drop raises KeyError if a label is already gone)
spec = spec.drop([795, 2176, 1208])
diag = diag.drop([795, 2176, 1208])
triage = triage.drop([795, 2176, 1208])


## delete where specialty gt is "no answer"
mask = ~spec["specialty_primary_diagnosis"].apply(lambda x: any(item == "no answer" for item in x))
spec = spec[mask]
diag = diag[mask]
triage = triage[mask]


## convert triage/acuity to type int
## A single astype call returns a new frame, which avoids assigning columns
## one by one into a filtered frame (chained-assignment warnings).
triage = triage.astype({
    "triage_Claude3.5": int,
    "triage_Claude3": int,
    "triage_Haiku": int,
    "triage_Claude3.5_Clinical": int,
    "triage_Claude3_Clinical": int,
    "triage_Haiku_Clinical": int,
})


## extract first 2000 values
spec = spec.iloc[:2000]
diag = diag.iloc[:2000]
triage = triage.iloc[:2000]


## save files
spec.to_csv('MIMIC-IV-Ext-Specialty-prediction.csv', index=False)
diag.to_csv('MIMIC-IV-Ext-Diagnosis-prediction.csv', index=False)
triage.to_csv('MIMIC-IV-Ext-Triage-prediction.csv', index=False)