In [None]:
%pip install pandas==1.5.3 numpy==1.26.4 openai==1.37.1 langchain_community==0.2.9 langchain_text_splitters==0.2.2 langchain_openai==0.1.17 statsmodels==0.14.3 scipy==1.14.0 replicate==0.32.1 Scikit-learn==1.5.1 python-dotenv==1.0.1


# DATA STANDARDIZATION

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Load RIS dataset (the file is read with ';' as the field separator)
df = pd.read_csv(filepath_or_buffer='/path/document.csv', sep=';')

# Delete unnecessary columns & reorder dataframe
# NOTE(review): the empty drop list and 'new order' look like placeholders for the
# real column names; reindex() keeps only the columns listed in new_order.
df = df.drop(columns=[])
new_order =['new order']
df_neworder = df.reindex(columns=new_order)

# Rename MRI devices with their respective magnetic flux density
# (value-level replacement across the whole frame: every cell equal to a key is swapped)
df_devices = df_neworder.replace({'MRI device':'magnetic flux density'})

In [None]:
# Function to extract paragraphs based on the starting keyword
def extract_paragraph(text, start_keywords):
    """Return the paragraph that follows the first matching start keyword.

    The paragraph starts right after the keyword and ends before the next line
    that begins with an uppercase letter and contains a colon (the next section
    header), or at the end of the text.

    Parameters
    ----------
    text : str or missing value
        Full report text. Non-string values (None, NaN from pandas) are treated
        as "no report" instead of raising a TypeError.
    start_keywords : iterable of str
        Literal keywords that may introduce the wanted paragraph.

    Returns
    -------
    str or None
        The stripped paragraph, or None when the text is missing, no keyword
        is given, or no keyword occurs in the text.
    """
    # Missing reports arrive as None/NaN when applied over a DataFrame column.
    if not isinstance(text, str):
        return None

    start_keywords = list(start_keywords)
    # An empty alternation would match at position 0 and return unrelated text.
    if not start_keywords:
        return None

    # Create a regex pattern that matches any of the start keywords
    start_pattern = '|'.join(re.escape(keyword) for keyword in start_keywords)
    pattern = re.compile(rf'({start_pattern})(.*?)(?=(\n[A-Z][^:]*:|\Z))', re.DOTALL)
    match = pattern.search(text)
    return match.group(2).strip() if match else None

# Keywords for clinical question & imaging procedure description
# NOTE(review): ['keywords'] looks like a placeholder for the real section headers.
klinik_keywords = ['keywords']
technik_keywords = ['keywords']

# Apply the function to extract the required paragraphs and store in a new column
# (rows without a matching keyword receive None from extract_paragraph)
df_devices['new column clinical question'] = df_devices['report'].apply(lambda x: extract_paragraph(x, klinik_keywords))
df_devices['new column sequences'] = df_devices['report'].apply(lambda x: extract_paragraph(x, technik_keywords))

# Delete the original report column
df_extracted = df_devices.drop(columns=['report'])

In [None]:
# Keywords for contrast medium administration
keywords = ['keywords']

# Function to check if any keyword is in the text
def contains_keywords(text, keywords):
    """Return True when at least one keyword occurs as a substring of text.

    Parameters
    ----------
    text : str or missing value
        Text to search. Non-string values (None, NaN) yield False, because a
        missing paragraph cannot mention any keyword.
    keywords : iterable of str
        Substrings to look for (case-sensitive).

    Returns
    -------
    bool
    """
    # extract_paragraph returns None when no paragraph was found; `in None` would raise.
    if not isinstance(text, str):
        return False
    return any(keyword in text for keyword in keywords)

# Flag, per row, whether the sequences text mentions any contrast-medium keyword
# (the result of apply() is a boolean Series aligned with df_extracted)
# NOTE(review): this reads column 'sequences', while the extraction cell creates
# 'new column sequences' — confirm the real column name, otherwise this is a KeyError.
contrast_medium = df_extracted['sequences'].apply(lambda x: contains_keywords(x, keywords))

# Add the Series as a new column to the DataFrame
df_extracted['new column contrast medium administration'] = contrast_medium

In [None]:
# Save result in a new CSV file
# (comma-separated; to_csv also writes the DataFrame index because index=False is not passed)
df_extracted.to_csv('/path/standardized document.csv',sep=',')

# PROTOCOL PREDICTION OPEN SOURCE MODEL

In [None]:
import pandas as pd
import replicate
import os
import openai
import re
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

## LLAMA 3.1 405B 

### PIPELINE

In [None]:
# Load data with clinical question (semicolon-separated)
df = pd.read_csv(filepath_or_buffer='/path/clinical_questions.csv', sep=';')

# Load environment variables (from a .env file, via python-dotenv)
load_dotenv()

# Set API token: read from the REPLICATE_API_KEY environment variable, never hard-coded
replicate_api_token = os.getenv('REPLICATE_API_KEY')
replicate_client = replicate.Client(api_token=replicate_api_token)

# Fixed vocabulary of sequence names; prediction_prompt interpolates this list into the prompt
standard_sequences = ['Axiale T1','Coronare T1','Axiale T2','Sagittale T2','Axiale T2 FS','Coronare T2 FS','Axiale T2*','Axiale STIR','Sagittale STIR','Coronare TIR','Axiale T2 BLADE','Axiale T1 FLASH','Axiale T2 FLASH','Sagittale T1 FLASH','Coronare T1 FLASH','Axiale T2 SPACE','Sagittale T2 SPACE','Coronare FLAIR','Axiale FLAIR','Sagittale FLAIR','Coronare FLAIR FAST','3D FLAIR','3D T1 SPACE FS','3D T2 SPACE FS','T1 MPRAGE','T1 VIBE','Coronare T1 Dynamik','Axiale SWI','Axiale DWI','Coronare DWI','Axiale DTI','Art. TOF-MRA']

# Prediction pipeline (Llama 3.1 405B without retrieval)
def prediction_prompt(row):
    """Query Llama 3.1 405B on Replicate for one clinical question.

    Parameters
    ----------
    row : mapping
        One row of the questions table; only row['Fragestellung'] is read.

    Returns
    -------
    The model answer. A list of streamed chunks is concatenated into a single
    string; any other return value of the client is passed through unchanged.
    """
    # German instruction text. The answer format requested here (Abkürzungen /
    # Diagnose / Sequenzen / Kontrastmittelgabe) is what the extraction cell parses.
    prompt_text = (
        f"Du bist Neuroradiologe. Du bekommst eine radiologische Fragestellung.\n"
        f"Die Fragestellung enthält Abkürzungen. Formuliere dafür die Abkürzungen aus und antworte so: Abkürzung:ausgeschriebene Abkürzung.\n"
        f"Wenn es bereits eine Diagnose gibt, nenne diese. Führe dann die 3 wahrscheinlichsten Differentialdiagnosen für diese Fragestellung auf.\n"
        f"Dir stehen standardisierte MRT Sequenzen zur Verfügung: {standard_sequences}.\n"
        f"Nenne alle Sequenzen aus der Liste der standardisierten MRT Sequenzen, die in der MRT Untersuchung nötig sind, um die Fragestellung zu beantworten. Falls eine Sequenz vor und nach der Kontrastmittelgabe geplant werden soll, nenne sie zweifach.\n"
        f"Bestimme außerdem, ob in der MRT Untersuchung Kontrastmittel gegeben werden soll oder nicht. Wenn ja, schreibe TRUE, wenn nicht, schreibe FALSE.\n"
        f"Befolge stets das Antwortformat. Erkläre dein Vorgehen nicht. Gib nur die erfragten Informationen wieder, füge keine weiteren Zeichen hinzu.\n"
        f"Deine Antwort soll folgendes Format haben:\n"
        f"Abkürzungen: Abkürzung:ausgeschriebene Abkürzung, Abkürzung:ausgeschriebene Abkürzung, ...\n"
        f"Diagnose: bereits bekannte Diagnose, Differentialdiagnosen: 1. wahrscheinlichste, 2. zweitwahrscheinlichste, 3. drittwahrscheinlichste\n"
        f"Sequenzen: Sequenz 1, Sequenz 2, Sequenz 3, ...\n"
        f"Kontrastmittelgabe: TRUE oder FALSE\n"
        f"Radiologische Fragestellung: {row['Fragestellung']}"
    )

    # Sampling parameters and chat template sent to the Replicate model
    model_input = {
        "top_p": 0.1,
        "prompt": prompt_text,
        "min_tokens": 0,
        "temperature": 0.1,
        "prompt_template": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
        "presence_penalty": 1.15,
    }

    # Using Replicate's API to run the prediction
    output = replicate_client.run('meta/meta-llama-3.1-405b-instruct', input=model_input)

    # Join streamed tokens into a coherent string
    return ''.join(output) if isinstance(output, list) else output

# Run the prediction for every clinical question, keeping the answers in row order.
# An explicit loop (not a comprehension) keeps the partial results if an API call fails.
results = []

for _, question_row in df.iterrows():
    results.append(prediction_prompt(question_row))


### EXTRACT THE DATA OF THE RESULTS

In [None]:
# One list per extracted section; each gets exactly one entry per model answer
abbreviations = []
differential_diagnosis = []
sequence_prediction = []
contrastmedium_prediction = []

# (regex, destination list) for every labelled section of the answer format.
# The 'Diagnose:' line also carries the differential diagnoses, because the
# requested answer format places both on the same line.
section_rules = [
    (r'Abkürzungen:\s*(.*?)\n', abbreviations),
    (r'Diagnose:\s*(.*?)\n', differential_diagnosis),
    (r'Sequenzen:\s*(.*?)\n', sequence_prediction),
    (r'Kontrastmittelgabe:\s*(TRUE|FALSE)', contrastmedium_prediction),
]

# Iterate through each entry in the results
for entry in results:
    for section_regex, destination in section_rules:
        section_match = re.search(section_regex, entry, re.DOTALL)
        # None marks a missing section so every list stays aligned with df
        destination.append(section_match.group(1).strip() if section_match else None)


# Add the extracted results to the dataframe (fixed column positions 8-11)
df.insert(8,'Abkürzungen',abbreviations)
df.insert(9,'Diagnose/DD',differential_diagnosis)
df.insert(10,'Vorhersage Sequenzen', sequence_prediction)
df.insert(11,'Vorhersage Kontrastmittelgabe', contrastmedium_prediction)

# Save the dataframe as csv
df.to_csv(path_or_buf='/path/results',sep=';')

## LLAMA 3.1 405B WITH RAG

### PIPELINE

In [None]:
# Load data with clinical question (semicolon-separated)
df = pd.read_csv(filepath_or_buffer='/path/clinical questions.csv', sep=';')

# Load environment variables (from a .env file, via python-dotenv)
load_dotenv()

# Set API token: read from the REPLICATE_API_KEY environment variable
replicate_api_token = os.getenv('REPLICATE_API_KEY')
replicate_client = replicate.Client(api_token=replicate_api_token)

# Set OpenAI API key (used below for the embedding model of the vector store)
openai_api_key = os.getenv('OPENAI_API_KEY')
openai.api_key = openai_api_key

# Function to load and split PDF documents
def get_docs():
    """Load the guideline PDF and return it split into chunks.

    The splitter is configured with chunk_size=400 (measured with len), no
    overlap, and tries the separators "\\n \\n", "\\n", " " and "" in that order.
    """
    guideline_pages = PyPDFLoader('/path/guidelines').load()

    chunker = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=0,
        separators=["\n \n", "\n", " ", ""],
        is_separator_regex=False,
        length_function=len,
    )

    return chunker.split_documents(guideline_pages)

# Function to create the vector store
def create_vector_store(docs):
    """Embed the given document chunks with OpenAI and index them in FAISS.

    Uses the module-level openai_api_key and the "text-embedding-ada-002" model.
    Returns the FAISS vector store built from docs.
    """
    embedder = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=openai_api_key)
    return FAISS.from_documents(docs, embedding=embedder)

# Build the chunk list and the vector store once; prediction_prompt reads vectorStore as a global
docs = get_docs()
vectorStore = create_vector_store(docs)


# Prediction pipeline (Llama 3.1 405B with retrieval)
def prediction_prompt(row):
    """Retrieve candidate protocols and let Llama 3.1 405B pick one for a clinical question.

    Parameters
    ----------
    row : mapping
        One row of the questions table; row['Fragestellung'] (clinical question)
        and row['Gerät'] (device) are read.

    Returns
    -------
    tuple
        (model answer, retrieved documents). A list of streamed chunks is joined
        into a single string; the documents are the four nearest chunks from the
        module-level vectorStore.
    """
    # Embed clinical question into query
    query = f"Das ist eine radiologische Fragestellung: {row['Fragestellung']}. Zu welcher 'Anwendung' gehört sie? Das benutzte 'Gerät' ist {row['Gerät']}."

    # Retrieve the 4 most similar chunks. `invoke` replaces the deprecated
    # `get_relevant_documents` and returns the same list of documents.
    retriever = vectorStore.as_retriever(search_kwargs={"k": 4})
    retrieved_docs = retriever.invoke(query)

    # German instruction text; the requested answer format is what the extraction cell parses.
    prompt_text = (
        f"Du bist Neuroradiologe. Du bekommst eine radiologische Fragestellung.\n"
        f"Die Fragestellung enthält Abkürzungen. Formuliere nur die Abkürzungen aus der Fragestellung aus und antworte so: Abkürzung:ausgeschriebene Abkürzung.\n"
        f"Wenn es bereits eine Diagnose gibt, nenne diese. Führe dann die 3 wahrscheinlichsten Differentialdiagnosen für diese Fragestellung auf.\n"
        f"Du hast 4 MRT-Protokolle zur Verfügung: {retrieved_docs}. Jedes Protokoll enthält einen Abschnitt zur Anwendung. Dieser behinhaltet für welche radiologischen Fragestellungen das jeweilige Protokoll geeignet ist.\n"
        f"Wähle das Protokoll aus, was am besten zur Beantwortung dieser radiologischen Fragestellung passt. Berücksichtige hierbei vorallem die Details aus Anwendungsbeschreibung des Protokolls.\n"
        f"Gib die Informationen des Protokolls unverändert wieder. Befolge stets das Antwortformat. Erkläre dein Vorgehen nicht, füge keine weiteren Zeichen hinzu.\n"
        f"Deine Antwort soll folgendes Format haben:\n"
        f"Abkürzungen: Abkürzung:ausgeschriebene Abkürzung, Abkürzung:ausgeschriebene Abkürzung, ...\n"
        f"Diagnose: bereits bekannte Diagnose, Differentialdiagnosen: 1. wahrscheinlichste, 2. zweitwahrscheinlichste, 3. drittwahrscheinlichste\n"
        f"Protokollname: Name des Protokolls\n"
        f"Sequenzen: Sequenzen des Protokolls\n"
        f"Kontrastmittelgabe: Kontrastmittelgabe im Protokoll\n"
        f"Radiologische Fragestellung: {row['Fragestellung']}"
    )

    # Sampling parameters and chat template sent to the Replicate model
    # (named model_input so the builtin input() is not shadowed)
    model_input = {
        "top_p": 0.1,
        "prompt": prompt_text,
        "min_tokens": 0,
        "temperature": 0.1,
        "prompt_template": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
        "presence_penalty": 1.15,
    }

    # Using Replicate's API to run the prediction
    output = replicate_client.run('meta/meta-llama-3.1-405b-instruct', input=model_input)

    # Join streamed tokens into a coherent string
    if isinstance(output, list):
        output = ''.join(output)

    return output, retrieved_docs

# Run retrieval + prediction for every clinical question, in row order.
# Answers and the documents retrieved for them are collected in parallel lists.
results = []
retrieved_documents = []

for _, question_row in df.iterrows():
    answer, docs_for_row = prediction_prompt(question_row)
    results.append(answer)
    retrieved_documents.append(docs_for_row)



### EXTRACT THE DATA OF THE RESULTS

In [None]:
# Protocol names found in the retrieved chunks: one list of names per question
retrieved_docs_name = []

for docs_for_row in retrieved_documents:
    names_for_row = []

    # Anything that is not a list of documents contributes an empty name list
    if isinstance(docs_for_row, list):
        for doc in docs_for_row:
            # Skip objects that do not carry text
            if not hasattr(doc, 'page_content'):
                continue

            for raw_line in doc.page_content.split('\n'):
                # Accept both 'Name :' and 'Name:' at the start of the line
                if re.match(r"Name\s*:", raw_line.strip()):
                    # Keep everything after the first colon
                    names_for_row.append(raw_line.split(":", 1)[1].strip())

    # Append the group of names (per row) to the main list
    retrieved_docs_name.append(names_for_row)



In [None]:
# One list per extracted section; each gets exactly one entry per model answer
abbreviations = []
differential_diagnosis = []
protocol_prediction = []
sequence_prediction = []
contrastmedium_prediction = []

# (regex, destination list) for every labelled section of the answer format.
# The 'Diagnose:' line also carries the differential diagnoses, because the
# requested answer format places both on the same line.
section_rules = [
    (r'Abkürzungen:\s*(.*?)\n', abbreviations),
    (r'Diagnose:\s*(.*?)\n', differential_diagnosis),
    (r'Protokollname:\s*(.*?)\n', protocol_prediction),
    (r'Sequenzen:\s*(.*?)\n', sequence_prediction),
    (r'Kontrastmittelgabe:\s*(ja|nein|gegebenenfalls)', contrastmedium_prediction),
]

# Iterate through each entry in the results
for entry in results:
    for section_regex, destination in section_rules:
        section_match = re.search(section_regex, entry, re.DOTALL)
        # None marks a missing section so every list stays aligned with df
        destination.append(section_match.group(1).strip() if section_match else None)

# Add extracted results to dataframe (fixed column positions 7-12)
df.insert(7,'Abkürzungen',abbreviations)
df.insert(8,'Diagnose/DD',differential_diagnosis)
df.insert(9,'Vorhersage Protokollname',protocol_prediction)
df.insert(10,'Vorhersage Sequenzen', sequence_prediction)
df.insert(11,'Vorhersage Kontrastmittelgabe', contrastmedium_prediction)
df.insert(12,'Retrieved Documents',retrieved_docs_name)

# Save the dataframe as CSV
df.to_csv(path_or_buf='/path/results',sep=';')

# PROTOCOL PREDICTION PROPRIETARY MODEL

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
import openai
import re
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


## GPT-4o 

### PIPELINE

In [None]:
# Load environment variables (from a .env file, via python-dotenv)
load_dotenv()

# Set OpenAI API key: read from the OPENAI_API_KEY environment variable
openai_api_key = os.getenv('OPENAI_API_KEY')
openai.api_key = openai_api_key

# Load the data with clinical questions
# NOTE(review): this cell reads with sep=',' while the other pipelines use ';' — confirm the file format.
df = pd.read_csv(filepath_or_buffer='/path/clinical question.csv', sep=',')

# Fixed vocabulary of sequence names passed to prediction_prompt and interpolated into the prompt
standard_sequences = ['Axiale T1','Coronare T1','Axiale T2','Sagittale T2','Axiale T2 FS','Coronare T2 FS','Axiale T2*','Axiale STIR','Sagittale STIR','Coronare TIR','Axiale T2 BLADE','Axiale T1 FLASH','Axiale T2 FLASH','Sagittale T1 FLASH','Coronare T1 FLASH','Axiale T2 SPACE','Sagittale T2 SPACE','Coronare FLAIR','Axiale FLAIR','Sagittale FLAIR','Coronare FLAIR FAST','3D FLAIR','3D T1 SPACE FS','3D T2 SPACE FS','T1 MPRAGE','T1 VIBE','Coronare T1 Dynamik','Axiale SWI','Axiale DWI','Coronare DWI','Axiale DTI','Art. TOF-MRA']

def prediction_prompt(standard_sequences,row):
    """Ask GPT-4o which sequences and whether contrast medium fit one clinical question.

    Parameters
    ----------
    standard_sequences : list
        Sequence names offered to the model; interpolated into the prompt as-is.
    row : mapping
        One row of the questions table; only row['Fragestellung'] is read.

    Returns
    -------
    str
        The stripped text of the first completion choice (also printed).
    """
    # German instruction text; asks only for 'Sequenzen:' and 'Kontrastmittelgabe:' lines
    user_prompt = (
        f"Du bist Neuroradiologe. Du bekommst eine radiologische Fragestellung.Radiologische Fragestellung: {row['Fragestellung']}\n"
        f"Dir stehen standardisierte MRT Sequenzen zur Verfügung: {standard_sequences}.\n"
        f"Nenne alle Sequenzen aus der Liste der standardisierten MRT Sequenzen, die in der MRT Untersuchung nötig sind, um die Fragestellung zu beantworten. Falls eine Sequenz vor und nach der Kontrastmittelgabe geplant werden soll, nenne sie zweifach.\n"
        f"Bestimme außerdem, ob in der MRT Untersuchung Kontrastmittel gegeben werden soll oder nicht. Wenn ja, schreibe TRUE, wenn nicht, schreibe FALSE.\n"
        f"Befolge stets das Antwortformat. Erkläre dein Vorgehen nicht. Gib nur die erfragten Informationen wieder, füge keine weiteren Zeichen hinzu.\n"
        f"Deine Antwort soll folgendes Format haben:\n"
        f"Sequenzen: Sequenz 1, Sequenz 2, Sequenz 3, ...\n"
        f"Kontrastmittelgabe: TRUE oder FALSE\n"
    )

    chat_messages = [
        {"role": "system", "content": "You are a neuroradiologist."},
        {"role": "user", "content": user_prompt},
    ]

    completion = openai.chat.completions.create(
        model="gpt-4o",
        temperature=0.1,
        messages=chat_messages,
    )

    answer = completion.choices[0].message.content.strip()

    print(answer)
    return answer

# Run the prediction for every clinical question, keeping the answers in row order.
# An explicit loop (not a comprehension) keeps the partial results if an API call fails.
results = []

for _, question_row in df.iterrows():
    results.append(prediction_prompt(standard_sequences, question_row))

print(results)
    

### EXTRACT DATA OF THE RESULTS

In [None]:
# Initialize the lists to store the extracted data
abbreviations = []
differential_diagnosis = []
sequence_prediction = []
contrastmedium_prediction= []


def _first_group(pattern, text):
    """Return the stripped first capture group of pattern in text, or None if it does not match."""
    found = re.search(pattern, text, re.DOTALL)
    return found.group(1).strip() if found else None


# Iterate through each entry in the results.
# Every list receives exactly one value per entry (None when the section is
# missing). Without that, a single answer lacking a section makes the list
# shorter than df and df.insert() below raises a length-mismatch ValueError.
for entry in results:
    # Match the 'Abkürzungen:' section
    abbreviations.append(_first_group(r'Abkürzungen:\s*(.*?)\n', entry))

    # Match the 'Diagnose:' line and split it into a list of diagnoses
    dd_text = _first_group(r'Diagnose:\s*(.*?)\n', entry)
    if dd_text is None:
        differential_diagnosis.append(None)
    else:
        differential_diagnosis.append([dd.strip() for dd in dd_text.split(',')])

    # Match the 'Sequenzen:' section
    sequence_prediction.append(_first_group(r'Sequenzen:\s*(.*?)\n', entry))

    # Match the 'Kontrastmittelgabe:' section
    contrastmedium_prediction.append(_first_group(r'Kontrastmittelgabe:\s*(TRUE|FALSE)', entry))

# Add extracted results to the dataframe (fixed column positions 8-11)
df.insert(8,'Abkürzungen', abbreviations)
df.insert(9,'Differentialdiagnosen', differential_diagnosis)
df.insert(10,'Vorhersage Sequenzen', sequence_prediction)
df.insert(11,'Vorhersage Kontrastmittelgabe', contrastmedium_prediction)

# Save as csv
df.to_csv(path_or_buf='/path/results.csv',sep=';')

## GPT-4o with RAG

### PIPELINE

In [None]:
# Load environment variables (from a .env file, via python-dotenv)
load_dotenv()

# Set OpenAI API key: read from the OPENAI_API_KEY environment variable
openai_api_key = os.getenv('OPENAI_API_KEY')
openai.api_key = openai_api_key

# Load the CSV file with the clinical questions (semicolon-separated)
# NOTE(review): this path has no leading '/', unlike the other cells — confirm it is intended to be relative.
df = pd.read_csv(filepath_or_buffer='path/clinical questions.csv', sep=';')

# Function to load and split the PDF document
def get_docs():
    """Load the guideline PDF and return it split into chunks.

    The splitter is configured with chunk_size=450 (measured with len), no
    overlap, and tries the separators "\\n \\n", "\\n", " " and "" in that order.
    """
    guideline_pages = PyPDFLoader('/path/guidelines.pdf').load()

    chunker = RecursiveCharacterTextSplitter(
        chunk_size=450,
        chunk_overlap=0,
        separators=["\n \n", "\n", " ", ""],
        is_separator_regex=False,
        length_function=len,
    )

    return chunker.split_documents(guideline_pages)

# Function to create the vector store
def create_vector_store(docs):
    """Embed the given document chunks with OpenAI and index them in FAISS.

    Uses the module-level openai_api_key and the "text-embedding-ada-002" model.
    Returns the FAISS vector store built from docs.
    """
    embedder = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=openai_api_key)
    return FAISS.from_documents(docs, embedding=embedder)

# Build the chunk list and the vector store once; prediction_prompt reads vectorStore as a global
docs = get_docs()
vectorStore = create_vector_store(docs)

# Prediction pipeline (GPT-4o with retrieval)
def prediction_prompt(row):
    """Retrieve candidate protocols and let GPT-4o pick one for a clinical question.

    Parameters
    ----------
    row : mapping
        One row of the questions table; row['Fragestellung'] (clinical question)
        and row['Gerät'] (device) are read.

    Returns
    -------
    tuple
        (stripped text of the first completion choice, retrieved documents).
        The documents are the four nearest chunks from the module-level vectorStore.
    """
    # Embed clinical question into query
    query = f"Das ist eine radiologische Fragestellung:{row['Fragestellung']}. Zu welcher 'Anwendung' gehört sie? Das benutzte 'Gerät' ist {row['Gerät']}."

    # Retrieve the 4 most similar chunks. `invoke` replaces the deprecated
    # `get_relevant_documents` and returns the same list of documents.
    retriever = vectorStore.as_retriever(search_kwargs={"k": 4})
    retrieved_docs = retriever.invoke(query)

    # German instruction text; the requested answer format is what the extraction cell parses.
    prompt = (
        f"Du bist Neuroradiologe. Du bekommst eine radiologische Fragestellung.\n"
        f"Die Fragestellung enthält Abkürzungen. Formuliere nur die Abkürzungen aus der Fragestellung aus und antworte so: Abkürzung:ausgeschriebene Abkürzung.\n"
        f"Wenn es bereits eine Diagnose gibt, nenne diese. Führe dann die 3 wahrscheinlichsten Differentialdiagnosen für diese Fragestellung auf.\n"
        f"Du hast 4 MRT-Protokolle zur Verfügung: {retrieved_docs}. Jedes Protokoll enthält einen Abschnitt zur Anwendung. Dieser behinhaltet für welche radiologischen Fragestellungen das jeweilige Protokoll geeignet ist.\n"
        f"Wähle das Protokoll aus, was am besten zur Beantwortung dieser radiologischen Fragestellung passt. Berücksichtige hierbei vorallem die Details aus Anwendungsbeschreibung des Protokolls.\n"
        f"Gib die Informationen des Protokolls unverändert wieder. Befolge stets das Antwortformat. Erkläre dein Vorgehen nicht, füge keine weiteren Zeichen hinzu.\n"
        f"Deine Antwort soll folgendes Format haben:\n"
        f"Abkürzungen: Abkürzung:ausgeschriebene Abkürzung, Abkürzung:ausgeschriebene Abkürzung, ...\n"
        f"Diagnose: bereits bekannte Diagnose, Differentialdiagnosen: 1. wahrscheinlichste, 2. zweitwahrscheinlichste, 3. drittwahrscheinlichste\n"
        f"Protokollname: Name des Protokolls\n"
        f"Sequenzen: Sequenzen des Protokolls\n"
        f"Kontrastmittelgabe: Kontrastmittelgabe im Protokoll\n"
        f"Radiologische Fragestellung: {row['Fragestellung']}"
        )

    completion = openai.chat.completions.create(
        model="gpt-4o",
        temperature=0.1,
        messages=[
            {"role": "system", "content": "You are a neuroradiologist."},
            {"role": "user", "content": prompt}
        ]
    )

    result = completion.choices[0].message.content.strip()

    return result, retrieved_docs

# Run retrieval + prediction for every clinical question, in row order.
# Answers and the documents retrieved for them are collected in parallel lists.
results = []
retrieved_documents = []

for _, question_row in df.iterrows():
    answer, docs_for_row = prediction_prompt(question_row)
    results.append(answer)
    retrieved_documents.append(docs_for_row)


### EXTRACT DATA OF THE RESULTS

In [None]:
# Protocol names found in the retrieved chunks: one list of names per question
retrieved_docs_name = []

for docs_for_row in retrieved_documents:
    names_for_row = []

    # Anything that is not a list of documents contributes an empty name list
    if isinstance(docs_for_row, list):
        for doc in docs_for_row:
            # Skip objects that do not carry text
            if not hasattr(doc, 'page_content'):
                continue

            for raw_line in doc.page_content.split('\n'):
                # Accept both 'Name :' and 'Name:' at the start of the line
                if re.match(r"Name\s*:", raw_line.strip()):
                    # Keep everything after the first colon
                    names_for_row.append(raw_line.split(":", 1)[1].strip())

    # Append the group of names (per row) to the main list
    retrieved_docs_name.append(names_for_row)

In [None]:
# Initialize the lists to store the extracted data
abbreviations = []
diagnosis = []
differential_diagnosis = []
protocol_prediction = []
sequence_prediction = []
contrastmedium_prediction = []


def _first_group(pattern, text, flags=re.DOTALL):
    """Return the stripped first capture group of pattern in text, or None if it does not match."""
    found = re.search(pattern, text, flags)
    return found.group(1).strip() if found else None


# Iterate through each entry in the results.
# Every list receives exactly one value per entry (None when the section is
# missing). Without that, a single answer lacking a section makes the list
# shorter than df and df.insert() below raises a length-mismatch ValueError.
for entry in results:
    # Match the 'Abkürzungen:' section (everything up to the following 'Diagnose:' line)
    abbreviations.append(
        _first_group(r'Abkürzungen:\s*(.*?)\n(?:Diagnose|Differentialdiagnosen):', entry)
    )

    # Match the 'Diagnose:' line; the requested answer format puts the
    # differential diagnoses on the same line, so both lists hold the same text.
    diagnosis_line = _first_group(r'Diagnose:\s*(.*?)\n', entry)
    diagnosis.append(diagnosis_line)
    differential_diagnosis.append(diagnosis_line)

    # Match the 'Protokollname:' section
    protocol_prediction.append(_first_group(r'Protokollname:\s*(.*?)\n', entry))

    # Match the 'Sequenzen:' section (everything up to the 'Kontrastmittelgabe:' line)
    sequence_prediction.append(
        _first_group(r'Sequenzen:\s*(.*?)\n(?:Kontrastmittelgabe):', entry)
    )

    # Match the 'Kontrastmittelgabe:' section (first non-whitespace token after the label)
    contrastmedium_prediction.append(_first_group(r'Kontrastmittelgabe:\s*(\S+)', entry, flags=0))

# Add extracted results to the dataframe at consecutive positions 7-12, the same
# layout as the Llama RAG results. Position 7 was previously used twice, which
# pushed 'Abkürzungen' behind all other new columns.
df.insert(7,'Abkürzungen', abbreviations)
df.insert(8,'Diagnosen', differential_diagnosis)
df.insert(9,'Vorhersage Protokollname',protocol_prediction)
df.insert(10,'Vorhersage Sequenzen', sequence_prediction)
df.insert(11,'Vorhersage Kontrastmittelgabe', contrastmedium_prediction)
df.insert(12,'Retrieved Documents',retrieved_docs_name)

# Save as csv
df.to_csv(path_or_buf='/path/results',sep=';')

# STATISTICAL ANALYSIS

In [None]:
from collections import Counter
from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.contingency_tables import mcnemar
from scipy.stats import wilcoxon
import numpy as np
import pandas as pd
import re 
import ast

In [None]:
# Load results of the four pipelines (semicolon-separated)
# NOTE(review): all four reads use the same '/path/results' value — presumably
# anonymised; each frame should point to its own result file.
df_OS_woRAG = pd.read_csv(filepath_or_buffer='/path/results',sep=';')
df_OS_RAG = pd.read_csv(filepath_or_buffer='/path/results',sep=';')
df_woRAG = pd.read_csv(filepath_or_buffer='/path/results',sep=';')
# df_RAG and df_final are two names bound to the same DataFrame object
df_RAG = df_final = pd.read_csv(filepath_or_buffer='/path/results',sep=';')

# Import ground truth
df_GT = pd.read_csv(filepath_or_buffer='data/GT', sep=';')

## LLAMA 3.1 405B 

In [None]:
# Tokenize sequences
def tokenize(text):
    """Split a comma-separated sequence string into stripped tokens.

    Parameters
    ----------
    text : str or missing value
        Comma-separated sequence names. Non-string values (None, or NaN for a
        missing prediction read back from CSV) yield an empty token list
        instead of raising AttributeError.

    Returns
    -------
    list of str
        Tokens in their original order, duplicates kept.
    """
    if not isinstance(text, str):
        return []
    tokens = [token.strip() for token in text.split(',')]
    return tokens

# Function for calculation of token-based accuracy
def calculate_symmetric_token_based_accuracy(ground_truth, result):
    """Score how well two comma-separated sequence lists agree, counting repetitions.

    Both strings are tokenized; the number of matching tokens is the size of
    the multiset intersection. That count is divided once by the number of
    ground-truth tokens and once by the number of result tokens, and the two
    ratios are averaged.

    Returns a float between 0 and 1.
    """
    gt_counts = Counter(tokenize(ground_truth))
    res_counts = Counter(tokenize(result))

    # Multiset intersection: for each shared token, the smaller of the two counts
    shared = sum((gt_counts & res_counts).values())

    n_gt = sum(gt_counts.values())
    n_res = sum(res_counts.values())

    # Share of ground-truth tokens found in the result, and vice versa
    gt_covered = shared / n_gt if n_gt > 0 else 0
    res_covered = shared / n_res if n_res > 0 else 0

    # Symmetric accuracy: average of both directions
    return (gt_covered + res_covered) / 2

# Ground-truth sequences and the Llama (no RAG) predictions, compared row by row
ground_truths = df_GT['Sequenzen'] 
results1 = df_OS_woRAG['Vorhersage Sequenzen']

accuracies_OS_woRAG = [
    calculate_symmetric_token_based_accuracy(gt, res)
    for gt, res in zip(ground_truths, results1)
]

# Mean accuracy over all compared rows
average_accuracy_seq = sum(accuracies_OS_woRAG) / len(accuracies_OS_woRAG)

# Bootstrap function to calculate confidence intervals
def bootstrap_confidence_interval(data, num_samples=1000, confidence_level=0.95, seed=None):
    """Percentile-bootstrap confidence interval for the mean of data.

    Parameters
    ----------
    data : sequence of numbers
        Observations to resample.
    num_samples : int, default 1000
        Number of bootstrap resamples.
    confidence_level : float, default 0.95
        Central coverage of the interval.
    seed : int or None, default None
        When given, resampling uses a private generator seeded with this value,
        so the interval is reproducible. When None, NumPy's global random state
        is used, exactly as before.

    Returns
    -------
    tuple
        (lower_bound, upper_bound) of the bootstrap distribution of the mean.
    """
    # np.random (module) and RandomState both expose the same choice() signature
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Resample with replacement and calculate means
    sample_means = [np.mean(rng.choice(data, size=len(data), replace=True)) for _ in range(num_samples)]

    # Calculate the percentiles for the given confidence level
    lower_bound = np.percentile(sample_means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(sample_means, (1 + confidence_level) / 2 * 100)

    return lower_bound, upper_bound
# 95% bootstrap interval of the per-row sequence accuracies (fractions, scaled to % below)
confidence_interval_seq = bootstrap_confidence_interval(accuracies_OS_woRAG)

# Contrast medium administration
# Replace values and change dtype
# NOTE(review): the mapping covers 'ja'/'nein'/'gegebenenfalls', while the Llama
# pipeline without RAG extracts TRUE/FALSE — confirm the mapping is needed here.
df_OS_woRAG['Vorhersage Kontrastmittelgabe'] = df_OS_woRAG['Vorhersage Kontrastmittelgabe'].replace({'ja': 1,'nein':0,'gegebenenfalls':0})
# NOTE(review): astype(bool) turns every truthy value into True; NaN (missing
# prediction) and any non-empty string are truthy — verify the column has no such values.
df_OS_woRAG['Vorhersage Kontrastmittelgabe'] = df_OS_woRAG['Vorhersage Kontrastmittelgabe'].astype(bool)

# Comparing the two columns row by row (True where prediction equals ground truth)
df_equal_OS_woRAG = df_OS_woRAG['Vorhersage Kontrastmittelgabe'] == df_GT['Kontrastmittelgabe'] 

# Calculating the number of correct predictions (where the comparison is True)
correct_predictions = df_equal_OS_woRAG.sum()

# Calculating accuracy in percent
average_accuracy_cm = correct_predictions / len(df_equal_OS_woRAG) * 100

# Set the number of bootstrap samples
n_iterations = 1000
bootstrap_accuracies = []

# Number of total samples
n = len(df_equal_OS_woRAG)

# Perform bootstrapping (uses NumPy's global random state; no seed is set in this cell)
for i in range(n_iterations):
    # Sample with replacement
    bootstrap_sample = np.random.choice(df_equal_OS_woRAG, size=n, replace=True)
    
    # Calculate accuracy (in percent) for this bootstrap sample
    accuracies = np.sum(bootstrap_sample) / len(bootstrap_sample) * 100
    bootstrap_accuracies.append(accuracies)

# Convert to a numpy array for easier manipulation
bootstrap_accuracies = np.array(bootstrap_accuracies)

# Calculate the 95% confidence interval
confidence_interval_cm = np.percentile(bootstrap_accuracies, [2.5, 97.5])

# Summary row for this model; '-' marks metrics that are not computed in this cell.
# Sequence values are scaled to percent here; contrast-medium values already are.
data_results = [{
    'Confidenz Interval Sequences': (confidence_interval_seq[0] * 100, confidence_interval_seq[1] * 100),
    'Average Accuracy Sequences': average_accuracy_seq * 100,
    'Kappa Score Sequences':'-',
    'Confidenz Interval Contrastmedium': (confidence_interval_cm[0], confidence_interval_cm[1]),
    'Average Accuracy Contrastmedium': average_accuracy_cm,
    'Kappa Score Contrastmedium':'-',
    'Number of Correct Retrieval': '-',
    'Number of Correct Protocol': '-',
    'Protocol/Retrieval': '-'
}]
index = ['LLama 3.1 wo RAG']

df_results = pd.DataFrame(data_results, index=index)

## LLAMA 3.1 405B WITH RAG

In [None]:
#Evaluation of Retrieval
# Function to normalize the protocol names 
def normalize_name(name):
    # Convert to lowercase
    name = name.lower()
    # Normalize spaces around dashes and parentheses, and remove extra spaces
    name = re.sub(r'\s*-\s*', '-', name)  # normalize spaces around dashes
    name = re.sub(r'\s*\(\s*', '(', name)  # normalize spaces before '('
    name = re.sub(r'\s*\)\s*', ')', name)  # normalize spaces after ')'
    name = re.sub(r"'", '', name)  # remove apostrophes

    name = re.sub(r'\s+', ' ', name).strip()  # Remove extra spaces
    return name

# Function to check if the normalized protocol name in ground truth is exactly contained in the normalized names of the corresponding row in results
def check_name_in_list2(list_one, list_two):
    """Check, row by row, whether the ground-truth name matches the result entry.

    Args:
        list_one: Sequence of ground-truth protocol names (list or Series).
        list_two: Sequence of result entries for the same rows. An entry is
            either a single name or a list/tuple/set of candidate names.

    Returns:
        List of booleans, one per element of ``list_one``. Rows that have no
        counterpart in ``list_two`` are reported as ``False``.
    """
    # Materialise the results so rows are paired by position. ``series[i]`` on
    # a pandas Series looks up the *label* i, which raises or selects another
    # row as soon as the index is not 0..n-1 (e.g. after filtering).
    rows_two = list(list_two)

    results = []
    for position, name_one in enumerate(list_one):
        target = normalize_name(name_one)
        if position >= len(rows_two):
            results.append(False)
            continue

        entry = rows_two[position]
        # A row may hold one name or a collection of candidate names.
        candidates = entry if isinstance(entry, (list, tuple, set)) else [entry]
        results.append(any(normalize_name(candidate) == target for candidate in candidates))

    return results

# Flag, row by row, whether the normalized ground-truth protocol name equals the normalized 'Retrieved Documents' entry
boolean_results = check_name_in_list2(df_GT['Protokolleinstufung'], df_OS_RAG['Retrieved Documents'])

# Number of rows with a match
# NOTE(review): this count is derived from the 'Retrieved Documents' column but stored as correct_protocol;
# the results row for this model also reads correctly_retrieved, which is not assigned in this cell — confirm
# that both names hold the intended counts.
correct_protocol = sum(boolean_results)

In [None]:
#MRI Sequences 
#Tokenize sequences
def tokenize(text):
    """Split a comma-separated string into whitespace-trimmed tokens."""
    return list(map(str.strip, text.split(',')))

#Function for calculation of token-based accuracy
def calculate_symmetric_token_based_accuracy(ground_truth, result):
    """Score the token overlap between two comma-separated sequence lists.

    Both strings are tokenized with ``tokenize`` and compared as multisets, so
    a repeated token only matches as often as it occurs on both sides. The
    overlap is expressed once as a share of the ground-truth tokens and once
    as a share of the result tokens; the mean of both shares is returned.
    """
    gt_counts = Counter(tokenize(ground_truth))
    res_counts = Counter(tokenize(result))

    # The multiset intersection keeps min(count_gt, count_res) per token,
    # which is the number of matches seen from either side.
    n_matched = sum((gt_counts & res_counts).values())

    n_gt = sum(gt_counts.values())
    n_res = sum(res_counts.values())

    share_of_gt = n_matched / n_gt if n_gt > 0 else 0
    share_of_res = n_matched / n_res if n_res > 0 else 0

    return (share_of_gt + share_of_res) / 2

# Per-case symmetric accuracy of the predicted sequences against the ground truth
ground_truths = df_GT['Sequenzen']
results2 = df_OS_RAG['Vorhersage Sequenzen']

accuracies_OS_RAG = [
    calculate_symmetric_token_based_accuracy(gt, res)
    for gt, res in zip(ground_truths, results2)
]

# Mean over all cases (a fraction; converted to percent when stored)
average_accuracy_seq = sum(accuracies_OS_RAG) / len(accuracies_OS_RAG)

# Bootstrap function to calculate confidence intervals
def bootstrap_confidence_interval(data, num_samples=1000, confidence_level=0.95, seed=None):
    """Estimate a percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: 1-D sequence of numeric values (here: per-case accuracies).
        num_samples: Number of bootstrap resamples to draw.
        confidence_level: Coverage of the interval, e.g. 0.95 for a 95% CI.
        seed: Optional seed for a private random generator, making the
            interval reproducible. With the default ``None`` the global NumPy
            random state is used, exactly as before.

    Returns:
        Tuple ``(lower_bound, upper_bound)`` on the same scale as ``data``.
    """
    # Convert once instead of on every resample.
    values = np.asarray(data)
    sampler = np.random if seed is None else np.random.default_rng(seed)

    # Resample with replacement and record the mean of every resample.
    sample_means = [
        np.mean(sampler.choice(values, size=len(values), replace=True))
        for _ in range(num_samples)
    ]

    # Cut off (1 - confidence_level) / 2 of the distribution on each side.
    lower_bound = np.percentile(sample_means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(sample_means, (1 + confidence_level) / 2 * 100)

    return lower_bound, upper_bound
# 95% bootstrap interval for the mean sequence accuracy (fraction, not percent)
confidence_interval_seq = bootstrap_confidence_interval(accuracies_OS_RAG)

# Contrast medium administration
# Encode the answers as booleans: 'ja' -> True, 'nein' and 'gegebenenfalls' -> False.
# NOTE(review): values outside this mapping are left unchanged by replace() and are then
# coerced by astype(bool) — confirm that only these three answers occur.
df_OS_RAG['Vorhersage Kontrastmittelgabe'] = (
    df_OS_RAG['Vorhersage Kontrastmittelgabe']
    .replace({'ja': 1,'nein':0,'gegebenenfalls':0})
    .astype(bool)
)

# Row-wise agreement between prediction and ground truth
df_equal_OS_RAG = df_OS_RAG['Vorhersage Kontrastmittelgabe'] == df_GT['Kontrastmittelgabe']

# Accuracy in percent
correct_predictions = df_equal_OS_RAG.sum()
average_accuracy_cm = correct_predictions / len(df_equal_OS_RAG) * 100

# Percentile bootstrap of the accuracy: resample the agreement flags with replacement
n_iterations = 1000
n = len(df_equal_OS_RAG)
bootstrap_accuracies = np.array([
    np.sum(draw) / len(draw) * 100
    for draw in (
        np.random.choice(df_equal_OS_RAG, size=n, replace=True)
        for _ in range(n_iterations)
    )
])

# 95% confidence interval (already in percent)
confidence_interval_cm = np.percentile(bootstrap_accuracies, [2.5, 97.5])

# Append this model's row to the results table
# NOTE(review): correctly_retrieved and accuracy_correctprotocol_out_of_right_retrieval are not
# assigned in the cells of this section — confirm they are defined before this cell runs.
df_results.loc['LLAMA 3.1 with RAG'] = {
    'Confidenz Interval Sequences': tuple(bound * 100 for bound in confidence_interval_seq),
    'Average Accuracy Sequences': average_accuracy_seq * 100,
    'Kappa Score Sequences': '-',
    'Confidenz Interval Contrastmedium': tuple(confidence_interval_cm),
    'Average Accuracy Contrastmedium': average_accuracy_cm,
    'Kappa Score Contrastmedium': '-',
    'Number of Correct Retrieval': correctly_retrieved,
    'Number of Correct Protocol': correct_protocol,
    'Protocol/Retrieval': accuracy_correctprotocol_out_of_right_retrieval,
}

df_results


## GPT-4o

In [None]:
#MRI sequences 
#Tokenize sequences
def tokenize(text):
    """Split a comma-separated string into whitespace-trimmed tokens."""
    return list(map(str.strip, text.split(',')))

#Function to calculate token-based accuracy
def calculate_symmetric_token_based_accuracy(ground_truth, result):
    """Score the token overlap between two comma-separated sequence lists.

    Both strings are tokenized with ``tokenize`` and compared as multisets, so
    a repeated token only matches as often as it occurs on both sides. The
    overlap is expressed once as a share of the ground-truth tokens and once
    as a share of the result tokens; the mean of both shares is returned.
    """
    gt_counts = Counter(tokenize(ground_truth))
    res_counts = Counter(tokenize(result))

    # The multiset intersection keeps min(count_gt, count_res) per token,
    # which is the number of matches seen from either side.
    n_matched = sum((gt_counts & res_counts).values())

    n_gt = sum(gt_counts.values())
    n_res = sum(res_counts.values())

    share_of_gt = n_matched / n_gt if n_gt > 0 else 0
    share_of_res = n_matched / n_res if n_res > 0 else 0

    return (share_of_gt + share_of_res) / 2

# Per-case symmetric accuracy of the predicted sequences against the ground truth
ground_truths = df_GT['Sequenzen']
results3 = df_woRAG['Vorhersage Sequenzen']

accuracies_woRAG = [
    calculate_symmetric_token_based_accuracy(gt, res)
    for gt, res in zip(ground_truths, results3)
]

# Mean over all cases (a fraction; converted to percent when stored)
average_accuracy_seq = sum(accuracies_woRAG) / len(accuracies_woRAG)

# Bootstrap function to calculate confidence intervals
def bootstrap_confidence_interval(data, num_samples=1000, confidence_level=0.95, seed=None):
    """Estimate a percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: 1-D sequence of numeric values (here: per-case accuracies).
        num_samples: Number of bootstrap resamples to draw.
        confidence_level: Coverage of the interval, e.g. 0.95 for a 95% CI.
        seed: Optional seed for a private random generator, making the
            interval reproducible. With the default ``None`` the global NumPy
            random state is used, exactly as before.

    Returns:
        Tuple ``(lower_bound, upper_bound)`` on the same scale as ``data``.
    """
    # Convert once instead of on every resample.
    values = np.asarray(data)
    sampler = np.random if seed is None else np.random.default_rng(seed)

    # Resample with replacement and record the mean of every resample.
    sample_means = [
        np.mean(sampler.choice(values, size=len(values), replace=True))
        for _ in range(num_samples)
    ]

    # Cut off (1 - confidence_level) / 2 of the distribution on each side.
    lower_bound = np.percentile(sample_means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(sample_means, (1 + confidence_level) / 2 * 100)

    return lower_bound, upper_bound
# 95% bootstrap interval for the mean sequence accuracy (fraction, not percent)
confidence_interval_seq = bootstrap_confidence_interval(accuracies_woRAG)

# Contrast medium administration
# Encode the answers as booleans: 'ja' -> True, 'nein' and 'gegebenenfalls' -> False.
# NOTE(review): values outside this mapping are left unchanged by replace() and are then
# coerced by astype(bool) — confirm that only these three answers occur.
df_woRAG['Vorhersage Kontrastmittelgabe'] = (
    df_woRAG['Vorhersage Kontrastmittelgabe']
    .replace({'ja': 1,'nein':0,'gegebenenfalls':0})
    .astype(bool)
)

# Row-wise agreement between prediction and ground truth
df_equal_woRAG = df_woRAG['Vorhersage Kontrastmittelgabe'] == df_GT['Kontrastmittelgabe']

# Accuracy in percent
correct_predictions = df_equal_woRAG.sum()
average_accuracy_cm = correct_predictions / len(df_equal_woRAG) * 100

# Percentile bootstrap of the accuracy: resample the agreement flags with replacement
n_iterations = 1000
n = len(df_equal_woRAG)
bootstrap_accuracies = np.array([
    np.sum(draw) / len(draw) * 100
    for draw in (
        np.random.choice(df_equal_woRAG, size=n, replace=True)
        for _ in range(n_iterations)
    )
])

# 95% confidence interval (already in percent)
confidence_interval_cm = np.percentile(bootstrap_accuracies, [2.5, 97.5])

# Append this model's row to the results table; retrieval metrics do not apply without RAG
df_results.loc['GPT-4o wo RAG'] = {
    'Confidenz Interval Sequences': tuple(bound * 100 for bound in confidence_interval_seq),
    'Average Accuracy Sequences': average_accuracy_seq * 100,
    'Kappa Score Sequences': '-',
    'Confidenz Interval Contrastmedium': tuple(confidence_interval_cm),
    'Average Accuracy Contrastmedium': average_accuracy_cm,
    'Kappa Score Contrastmedium': '-',
    'Number of Correct Retrieval': '-',
    'Number of Correct Protocol': '-',
    'Protocol/Retrieval': '-',
}

df_results

## GPT-4o WITH RAG

In [None]:
# Evaluation of Retrieval
# Function to normalize the protocol names 
def normalize_name(name):
    """Bring a protocol name into a canonical form for comparison.

    The name is lower-cased, whitespace around '-', '(' and ')' is removed,
    apostrophes are dropped, and remaining whitespace runs are collapsed to a
    single space with leading/trailing whitespace stripped.
    """
    # Applied in this exact order; each entry is (regex, replacement).
    substitutions = (
        (r'\s*-\s*', '-'),   # no whitespace around dashes
        (r'\s*\(\s*', '('),  # no whitespace around '('
        (r'\s*\)\s*', ')'),  # no whitespace around ')'
        (r"'", ''),          # drop apostrophes
        (r'\s+', ' '),       # collapse whitespace runs
    )
    normalized = name.lower()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

# Function to check if the normalized protocol name in ground truth is exactly contained in the normalized names of the corresponding row in results
def check_name_in_list2(list_one, list_two):
    """Check, row by row, whether the ground-truth name matches the result entry.

    Args:
        list_one: Sequence of ground-truth protocol names (list or Series).
        list_two: Sequence of result entries for the same rows. An entry is
            either a single name or a list/tuple/set of candidate names.

    Returns:
        List of booleans, one per element of ``list_one``. Rows that have no
        counterpart in ``list_two`` are reported as ``False``.
    """
    # Materialise the results so rows are paired by position. ``series[i]`` on
    # a pandas Series looks up the *label* i, which raises or selects another
    # row as soon as the index is not 0..n-1 (e.g. after filtering).
    rows_two = list(list_two)

    results = []
    for position, name_one in enumerate(list_one):
        target = normalize_name(name_one)
        if position >= len(rows_two):
            results.append(False)
            continue

        entry = rows_two[position]
        # A row may hold one name or a collection of candidate names.
        candidates = entry if isinstance(entry, (list, tuple, set)) else [entry]
        results.append(any(normalize_name(candidate) == target for candidate in candidates))

    return results

# Flag, row by row, whether the normalized ground-truth protocol name equals the normalized 'Retrieved Documents' entry
boolean_results = check_name_in_list2(df_GT['Protokolleinstufung'], df_RAG['Retrieved Documents'])

# Number of rows with a match
# NOTE(review): this count is derived from the 'Retrieved Documents' column but stored as correct_protocol;
# the results row for this model also reads correctly_retrieved, which is not assigned in this cell — confirm
# that both names hold the intended counts.
correct_protocol = sum(boolean_results)

In [None]:
#MRI Sequences 
#Tokenize data
def tokenize(text):
    """Split a comma-separated string into whitespace-trimmed tokens."""
    return list(map(str.strip, text.split(',')))

#Function to calculate token-based accuracy
def calculate_symmetric_token_based_accuracy(ground_truth, result):
    """Score the token overlap between two comma-separated sequence lists.

    Both strings are tokenized with ``tokenize`` and compared as multisets, so
    a repeated token only matches as often as it occurs on both sides. The
    overlap is expressed once as a share of the ground-truth tokens and once
    as a share of the result tokens; the mean of both shares is returned.
    """
    gt_counts = Counter(tokenize(ground_truth))
    res_counts = Counter(tokenize(result))

    # The multiset intersection keeps min(count_gt, count_res) per token,
    # which is the number of matches seen from either side.
    n_matched = sum((gt_counts & res_counts).values())

    n_gt = sum(gt_counts.values())
    n_res = sum(res_counts.values())

    share_of_gt = n_matched / n_gt if n_gt > 0 else 0
    share_of_res = n_matched / n_res if n_res > 0 else 0

    return (share_of_gt + share_of_res) / 2

# Per-case symmetric accuracy of the predicted sequences against the ground truth
ground_truths = df_GT['Sequenzen']
results4 = df_RAG['Vorhersage Sequenzen']

accuracies_RAG = [
    calculate_symmetric_token_based_accuracy(gt, res)
    for gt, res in zip(ground_truths, results4)
]

# Mean over all cases (a fraction; converted to percent when stored)
average_accuracy_seq = sum(accuracies_RAG) / len(accuracies_RAG)

# Bootstrap function to calculate confidence intervals
def bootstrap_confidence_interval(data, num_samples=1000, confidence_level=0.95, seed=None):
    """Estimate a percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: 1-D sequence of numeric values (here: per-case accuracies).
        num_samples: Number of bootstrap resamples to draw.
        confidence_level: Coverage of the interval, e.g. 0.95 for a 95% CI.
        seed: Optional seed for a private random generator, making the
            interval reproducible. With the default ``None`` the global NumPy
            random state is used, exactly as before.

    Returns:
        Tuple ``(lower_bound, upper_bound)`` on the same scale as ``data``.
    """
    # Convert once instead of on every resample.
    values = np.asarray(data)
    sampler = np.random if seed is None else np.random.default_rng(seed)

    # Resample with replacement and record the mean of every resample.
    sample_means = [
        np.mean(sampler.choice(values, size=len(values), replace=True))
        for _ in range(num_samples)
    ]

    # Cut off (1 - confidence_level) / 2 of the distribution on each side.
    lower_bound = np.percentile(sample_means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(sample_means, (1 + confidence_level) / 2 * 100)

    return lower_bound, upper_bound
# 95% bootstrap interval for the mean sequence accuracy (fraction, not percent)
confidence_interval_seq = bootstrap_confidence_interval(accuracies_RAG)

# Contrast medium administration
# Encode the answers as booleans: 'ja' -> True, 'nein' and 'gegebenenfalls' -> False.
# NOTE(review): values outside this mapping are left unchanged by replace() and are then
# coerced by astype(bool) — confirm that only these three answers occur.
df_RAG['Vorhersage Kontrastmittelgabe'] = (
    df_RAG['Vorhersage Kontrastmittelgabe']
    .replace({'ja': 1,'nein':0,'gegebenenfalls':0})
    .astype(bool)
)

# Row-wise agreement between prediction and ground truth
df_equal_RAG = df_RAG['Vorhersage Kontrastmittelgabe'] == df_GT['Kontrastmittelgabe']

# Accuracy in percent
correct_predictions = df_equal_RAG.sum()
average_accuracy_cm = correct_predictions / len(df_equal_RAG) * 100

# Percentile bootstrap of the accuracy: resample the agreement flags with replacement
n_iterations = 1000
n = len(df_equal_RAG)
bootstrap_accuracies = np.array([
    np.sum(draw) / len(draw) * 100
    for draw in (
        np.random.choice(df_equal_RAG, size=n, replace=True)
        for _ in range(n_iterations)
    )
])

# 95% confidence interval (already in percent)
confidence_interval_cm = np.percentile(bootstrap_accuracies, [2.5, 97.5])

# Append this model's row to the results table
# NOTE(review): correctly_retrieved and accuracy_correctprotocol_out_of_right_retrieval are not
# assigned in the cells of this section — confirm they are defined before this cell runs.
df_results.loc['GPT with RAG'] = {
    'Confidenz Interval Sequences': tuple(bound * 100 for bound in confidence_interval_seq),
    'Average Accuracy Sequences': average_accuracy_seq * 100,
    'Kappa Score Sequences': '-',
    'Confidenz Interval Contrastmedium': tuple(confidence_interval_cm),
    'Average Accuracy Contrastmedium': average_accuracy_cm,
    'Kappa Score Contrastmedium': '-',
    'Number of Correct Retrieval': correctly_retrieved,
    'Number of Correct Protocol': correct_protocol,
    'Protocol/Retrieval': accuracy_correctprotocol_out_of_right_retrieval,
}

df_results

## RADIOLOGISTS

In [None]:
# Load the ground truth and the four readers' selections (semicolon-separated CSV files)
# NOTE(review): df_GT is already referenced by the LLM evaluation cells further up — confirm it is
# also loaded before those cells on a fresh kernel.
df1 = pd.read_csv('/path/GT', sep=';')
df2 = pd.read_csv('/path/selection resident 2', sep=';')
df3 = pd.read_csv('/path/selection resident 1', sep=';')
df4 = pd.read_csv('/path/selection radiologist 2', sep=';')
# NOTE(review): this path has no leading '/', unlike the four above — confirm it is intended
df5 = pd.read_csv('path/selection radiologist 1', sep=';')

# Replace missing values with empty strings in every table
df_GT, df_resident2, df_resident1, df_radiologist2, df_radiologist1 = (
    frame.fillna('') for frame in (df1, df2, df3, df4, df5)
)

### Radiologist 1

In [None]:
#MRI Sequences
#Tokenize sequences
def tokenize(text):
    """Split a comma-separated string into whitespace-trimmed tokens."""
    return list(map(str.strip, text.split(',')))

#Function to calculate token-based accuracy
def calculate_symmetric_token_based_accuracy(ground_truth, result):
    """Score the token overlap between two comma-separated sequence lists.

    Both strings are tokenized with ``tokenize`` and compared as multisets, so
    a repeated token only matches as often as it occurs on both sides. The
    overlap is expressed once as a share of the ground-truth tokens and once
    as a share of the result tokens; the mean of both shares is returned.
    """
    gt_counts = Counter(tokenize(ground_truth))
    res_counts = Counter(tokenize(result))

    # The multiset intersection keeps min(count_gt, count_res) per token,
    # which is the number of matches seen from either side.
    n_matched = sum((gt_counts & res_counts).values())

    n_gt = sum(gt_counts.values())
    n_res = sum(res_counts.values())

    share_of_gt = n_matched / n_gt if n_gt > 0 else 0
    share_of_res = n_matched / n_res if n_res > 0 else 0

    return (share_of_gt + share_of_res) / 2

# Per-case symmetric accuracy of radiologist 1's sequences against the ground truth
ground_truths = df_GT['Sequenzen']
results = df_radiologist1['Sequenzen']

accuracies_rad1 = [
    calculate_symmetric_token_based_accuracy(gt, res)
    for gt, res in zip(ground_truths, results)
]

average_accuracy_seq = sum(accuracies_rad1) / len(accuracies_rad1)

# Cohen's Kappa on the sequence selections
# Each case gets one categorical label per rater: its tokens sorted and joined by spaces,
# so the same sequences listed in a different order fall into the same category.
agreement_labels_ground_truth = []
agreement_labels_result = []

for gt, res in zip(ground_truths, results):
    agreement_labels_ground_truth.append(' '.join(sorted(tokenize(gt))))
    agreement_labels_result.append(' '.join(sorted(tokenize(res))))

kappa_score_seq = cohen_kappa_score(agreement_labels_ground_truth, agreement_labels_result)

# Bootstrap function to calculate confidence intervals
def bootstrap_confidence_interval(data, num_samples=1000, confidence_level=0.95, seed=None):
    """Estimate a percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: 1-D sequence of numeric values (here: per-case accuracies).
        num_samples: Number of bootstrap resamples to draw.
        confidence_level: Coverage of the interval, e.g. 0.95 for a 95% CI.
        seed: Optional seed for a private random generator, making the
            interval reproducible. With the default ``None`` the global NumPy
            random state is used, exactly as before.

    Returns:
        Tuple ``(lower_bound, upper_bound)`` on the same scale as ``data``.
    """
    # Convert once instead of on every resample.
    values = np.asarray(data)
    sampler = np.random if seed is None else np.random.default_rng(seed)

    # Resample with replacement and record the mean of every resample.
    sample_means = [
        np.mean(sampler.choice(values, size=len(values), replace=True))
        for _ in range(num_samples)
    ]

    # Cut off (1 - confidence_level) / 2 of the distribution on each side.
    lower_bound = np.percentile(sample_means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(sample_means, (1 + confidence_level) / 2 * 100)

    return lower_bound, upper_bound
# Bootstrap interval (default 95%) for the mean sequence accuracy; bounds are fractions, not percent
confidence_interval_seq = bootstrap_confidence_interval(accuracies_rad1)

In [None]:
# Contrast medium administration
# Cast both ratings to bool (written back into the frames) and flag the rows where they agree.
# NOTE(review): the tables were passed through fillna(''); astype(bool) turns any non-empty string
# into True — confirm the column holds numeric/boolean values.
df_radiologist1['Kontrastmittelgabe'] = df_radiologist1['Kontrastmittelgabe'].astype(bool)
df_GT['Kontrastmittelgabe'] = df_GT['Kontrastmittelgabe'].astype(bool)
df_equal_rad1 = df_radiologist1['Kontrastmittelgabe'] == df_GT['Kontrastmittelgabe']

# Accuracy in percent
correct_predictions = df_equal_rad1.sum()
average_accuracy_cm = correct_predictions / len(df_equal_rad1) * 100

# Percentile bootstrap of the accuracy: resample the agreement flags with replacement
n_iterations = 1000
n = len(df_equal_rad1)
bootstrap_accuracies = np.array([
    np.sum(draw) / len(draw) * 100
    for draw in (
        np.random.choice(df_equal_rad1, size=n, replace=True)
        for _ in range(n_iterations)
    )
])

# 95% confidence interval (already in percent)
confidence_interval_cm = np.percentile(bootstrap_accuracies, [2.5, 97.5])

# Chance-corrected agreement between the reader and the ground truth
kappa_score_cm = cohen_kappa_score(df_radiologist1['Kontrastmittelgabe'], df_GT['Kontrastmittelgabe'])

# Append this reader's row to the results table; retrieval metrics do not apply
df_results.loc['Radiologist 1'] = {
    'Confidenz Interval Sequences': tuple(bound * 100 for bound in confidence_interval_seq),
    'Average Accuracy Sequences': average_accuracy_seq * 100,
    'Kappa Score Sequences': kappa_score_seq,
    'Confidenz Interval Contrastmedium': tuple(confidence_interval_cm),
    'Average Accuracy Contrastmedium': average_accuracy_cm,
    'Kappa Score Contrastmedium': kappa_score_cm,
    'Number of Correct Retrieval': '-',
    'Number of Correct Protocol': '-',
    'Protocol/Retrieval': '-',
}



### Radiologist 2

In [None]:
#MRI Sequences 
#Tokenize sequences
def tokenize(text):
    """Split a comma-separated string into whitespace-trimmed tokens."""
    return list(map(str.strip, text.split(',')))

#Function to calculate token-based accuracy
def calculate_symmetric_token_based_accuracy(ground_truth, result):
    """Score the token overlap between two comma-separated sequence lists.

    Both strings are tokenized with ``tokenize`` and compared as multisets, so
    a repeated token only matches as often as it occurs on both sides. The
    overlap is expressed once as a share of the ground-truth tokens and once
    as a share of the result tokens; the mean of both shares is returned.
    """
    gt_counts = Counter(tokenize(ground_truth))
    res_counts = Counter(tokenize(result))

    # The multiset intersection keeps min(count_gt, count_res) per token,
    # which is the number of matches seen from either side.
    n_matched = sum((gt_counts & res_counts).values())

    n_gt = sum(gt_counts.values())
    n_res = sum(res_counts.values())

    share_of_gt = n_matched / n_gt if n_gt > 0 else 0
    share_of_res = n_matched / n_res if n_res > 0 else 0

    return (share_of_gt + share_of_res) / 2

# Per-case symmetric accuracy of radiologist 2's sequences against the ground truth
ground_truths = df_GT['Sequenzen']
results = df_radiologist2['Sequenzen']

accuracies_rad2 = [
    calculate_symmetric_token_based_accuracy(gt, res)
    for gt, res in zip(ground_truths, results)
]

average_accuracy_seq = sum(accuracies_rad2) / len(accuracies_rad2)

# Cohen's Kappa on the sequence selections
# Each case gets one categorical label per rater: its tokens sorted and joined by spaces,
# so the same sequences listed in a different order fall into the same category.
agreement_labels_ground_truth = []
agreement_labels_result = []

for gt, res in zip(ground_truths, results):
    agreement_labels_ground_truth.append(' '.join(sorted(tokenize(gt))))
    agreement_labels_result.append(' '.join(sorted(tokenize(res))))

kappa_score_seq = cohen_kappa_score(agreement_labels_ground_truth, agreement_labels_result)

# Bootstrap function to calculate confidence intervals
def bootstrap_confidence_interval(data, num_samples=1000, confidence_level=0.95, seed=None):
    """Estimate a percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: 1-D sequence of numeric values (here: per-case accuracies).
        num_samples: Number of bootstrap resamples to draw.
        confidence_level: Coverage of the interval, e.g. 0.95 for a 95% CI.
        seed: Optional seed for a private random generator, making the
            interval reproducible. With the default ``None`` the global NumPy
            random state is used, exactly as before.

    Returns:
        Tuple ``(lower_bound, upper_bound)`` on the same scale as ``data``.
    """
    # Convert once instead of on every resample.
    values = np.asarray(data)
    sampler = np.random if seed is None else np.random.default_rng(seed)

    # Resample with replacement and record the mean of every resample.
    sample_means = [
        np.mean(sampler.choice(values, size=len(values), replace=True))
        for _ in range(num_samples)
    ]

    # Cut off (1 - confidence_level) / 2 of the distribution on each side.
    lower_bound = np.percentile(sample_means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(sample_means, (1 + confidence_level) / 2 * 100)

    return lower_bound, upper_bound
# Bootstrap interval (default 95%) for the mean sequence accuracy; bounds are fractions, not percent
confidence_interval_seq = bootstrap_confidence_interval(accuracies_rad2)

In [None]:
# Contrast medium administration
# Flag the rows where the reader's rating and the ground truth agree (both cast to bool).
# NOTE(review): the tables were passed through fillna(''); astype(bool) turns any non-empty string
# into True — confirm the column holds numeric/boolean values.
df_equal_rad2 = df_radiologist2['Kontrastmittelgabe'].astype(bool) == df_GT['Kontrastmittelgabe'].astype(bool)

# Accuracy in percent
correct_predictions = df_equal_rad2.sum()
average_accuracy_cm = correct_predictions / len(df_equal_rad2) * 100

# Percentile bootstrap of the accuracy: resample the agreement flags with replacement
n_iterations = 1000
n = len(df_equal_rad2)
bootstrap_accuracies = np.array([
    np.sum(draw) / len(draw) * 100
    for draw in (
        np.random.choice(df_equal_rad2, size=n, replace=True)
        for _ in range(n_iterations)
    )
])

# 95% confidence interval (already in percent)
confidence_interval_cm = np.percentile(bootstrap_accuracies, [2.5, 97.5])

# Chance-corrected agreement between the reader and the ground truth
kappa_score_cm = cohen_kappa_score(
    df_radiologist2['Kontrastmittelgabe'].astype(bool),
    df_GT['Kontrastmittelgabe'].astype(bool),
)

# Append this reader's row to the results table; retrieval metrics do not apply
df_results.loc['Radiologist 2'] = {
    'Confidenz Interval Sequences': tuple(bound * 100 for bound in confidence_interval_seq),
    'Average Accuracy Sequences': average_accuracy_seq * 100,
    'Kappa Score Sequences': kappa_score_seq,
    'Confidenz Interval Contrastmedium': tuple(confidence_interval_cm),
    'Average Accuracy Contrastmedium': average_accuracy_cm,
    'Kappa Score Contrastmedium': kappa_score_cm,
    'Number of Correct Retrieval': '-',
    'Number of Correct Protocol': '-',
    'Protocol/Retrieval': '-',
}




### Resident 1

In [None]:
#MRI Sequences
#Tokenize sequences
def tokenize(text):
    """Split a comma-separated string into whitespace-trimmed tokens."""
    return list(map(str.strip, text.split(',')))

#Function to calculate token-based accuracy
def calculate_symmetric_token_based_accuracy(ground_truth, result):
    """Score the token overlap between two comma-separated sequence lists.

    Both strings are tokenized with ``tokenize`` and compared as multisets, so
    a repeated token only matches as often as it occurs on both sides. The
    overlap is expressed once as a share of the ground-truth tokens and once
    as a share of the result tokens; the mean of both shares is returned.
    """
    gt_counts = Counter(tokenize(ground_truth))
    res_counts = Counter(tokenize(result))

    # The multiset intersection keeps min(count_gt, count_res) per token,
    # which is the number of matches seen from either side.
    n_matched = sum((gt_counts & res_counts).values())

    n_gt = sum(gt_counts.values())
    n_res = sum(res_counts.values())

    share_of_gt = n_matched / n_gt if n_gt > 0 else 0
    share_of_res = n_matched / n_res if n_res > 0 else 0

    return (share_of_gt + share_of_res) / 2

# Per-case symmetric accuracy of resident 1's sequences against the ground truth
ground_truths = df_GT['Sequenzen']
results = df_resident1['Sequenzen']

accuracies_res1 = [
    calculate_symmetric_token_based_accuracy(gt, res)
    for gt, res in zip(ground_truths, results)
]

average_accuracy_seq = sum(accuracies_res1) / len(accuracies_res1)

# Cohen's Kappa on the sequence selections
# Each case gets one categorical label per rater: its tokens sorted and joined by spaces,
# so the same sequences listed in a different order fall into the same category.
agreement_labels_ground_truth = []
agreement_labels_result = []

for gt, res in zip(ground_truths, results):
    agreement_labels_ground_truth.append(' '.join(sorted(tokenize(gt))))
    agreement_labels_result.append(' '.join(sorted(tokenize(res))))

kappa_score_seq = cohen_kappa_score(agreement_labels_ground_truth, agreement_labels_result)

# Bootstrap function to calculate confidence intervals
def bootstrap_confidence_interval(data, num_samples=1000, confidence_level=0.95, seed=None):
    """Estimate a percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: 1-D sequence of numeric values (here: per-case accuracies).
        num_samples: Number of bootstrap resamples to draw.
        confidence_level: Coverage of the interval, e.g. 0.95 for a 95% CI.
        seed: Optional seed for a private random generator, making the
            interval reproducible. With the default ``None`` the global NumPy
            random state is used, exactly as before.

    Returns:
        Tuple ``(lower_bound, upper_bound)`` on the same scale as ``data``.
    """
    # Convert once instead of on every resample.
    values = np.asarray(data)
    sampler = np.random if seed is None else np.random.default_rng(seed)

    # Resample with replacement and record the mean of every resample.
    sample_means = [
        np.mean(sampler.choice(values, size=len(values), replace=True))
        for _ in range(num_samples)
    ]

    # Cut off (1 - confidence_level) / 2 of the distribution on each side.
    lower_bound = np.percentile(sample_means, (1 - confidence_level) / 2 * 100)
    upper_bound = np.percentile(sample_means, (1 + confidence_level) / 2 * 100)

    return lower_bound, upper_bound
# Bootstrap interval (default 95%) for the mean sequence accuracy; bounds are fractions, not percent
confidence_interval_seq = bootstrap_confidence_interval(accuracies_res1)

In [None]:
# Contrast medium administration
# Flag the rows where the reader's rating and the ground truth agree (both cast to bool).
# NOTE(review): the tables were passed through fillna(''); astype(bool) turns any non-empty string
# into True — confirm the column holds numeric/boolean values.
df_equal_res1 = df_resident1['Kontrastmittelgabe'].astype(bool) == df_GT['Kontrastmittelgabe'].astype(bool)

# Accuracy in percent
correct_predictions = df_equal_res1.sum()
average_accuracy_cm = correct_predictions / len(df_equal_res1) * 100

# Percentile bootstrap of the accuracy: resample the agreement flags with replacement
n_iterations = 1000
n = len(df_equal_res1)
bootstrap_accuracies = np.array([
    np.sum(draw) / len(draw) * 100
    for draw in (
        np.random.choice(df_equal_res1, size=n, replace=True)
        for _ in range(n_iterations)
    )
])

# 95% confidence interval (already in percent)
confidence_interval_cm = np.percentile(bootstrap_accuracies, [2.5, 97.5])

# Chance-corrected agreement between the reader and the ground truth
kappa_score_cm = cohen_kappa_score(
    df_resident1['Kontrastmittelgabe'].astype(bool),
    df_GT['Kontrastmittelgabe'].astype(bool),
)

# Append this reader's row to the results table; retrieval metrics do not apply
df_results.loc['Resident 1'] = {
    'Confidenz Interval Sequences': tuple(bound * 100 for bound in confidence_interval_seq),
    'Average Accuracy Sequences': average_accuracy_seq * 100,
    'Kappa Score Sequences': kappa_score_seq,
    'Confidenz Interval Contrastmedium': tuple(confidence_interval_cm),
    'Average Accuracy Contrastmedium': average_accuracy_cm,
    'Kappa Score Contrastmedium': kappa_score_cm,
    'Number of Correct Retrieval': '-',
    'Number of Correct Protocol': '-',
    'Protocol/Retrieval': '-',
}


### Resident 2

In [None]:
#MRI Sequences
#Tokenize sequences
def tokenize(text):
    """Split a comma-separated string into whitespace-trimmed tokens."""
    return list(map(str.strip, text.split(',')))

#Function to calculate token-based accuracy
def calculate_symmetric_token_based_accuracy(ground_truth, result):
    """Score the token overlap between two comma-separated strings.

    Both strings are tokenized with ``tokenize`` and treated as multisets, so
    repeated tokens count as often as they occur. The score is the mean of
    (shared tokens / ground-truth tokens) and (shared tokens / result tokens).

    Parameters
    ----------
    ground_truth : str
        Reference string of comma-separated tokens.
    result : str
        Candidate string of comma-separated tokens.

    Returns
    -------
    float
        Value between 0 and 1; 1 only when both token multisets are identical.
    """
    gt_counts = Counter(tokenize(ground_truth))
    res_counts = Counter(tokenize(result))

    # Multiset intersection keeps, per token, the smaller of the two counts,
    # i.e. the number of occurrences that can be matched one-to-one.
    shared = sum((gt_counts & res_counts).values())

    n_gt = sum(gt_counts.values())
    n_res = sum(res_counts.values())

    # Share of the ground truth that was reproduced, and share of the result
    # that is backed by the ground truth.
    share_of_gt = shared / n_gt if n_gt > 0 else 0
    share_of_res = shared / n_res if n_res > 0 else 0

    return (share_of_gt + share_of_res) / 2

# Per-case sequence accuracy of Resident 2 against the ground truth
ground_truths = df_GT['Sequenzen']
results = df_resident2['Sequenzen']

accuracies_res2 = [
    calculate_symmetric_token_based_accuracy(gt, res)
    for gt, res in zip(ground_truths, results)
]

average_accuracy_seq = sum(accuracies_res2) / len(accuracies_res2)

# Cohen's Kappa on whole protocols: every case is represented by a single
# categorical label - its tokens sorted and joined with spaces - so token order
# is ignored and two labels are equal only when the token multisets are equal.
label_pairs = [
    (' '.join(sorted(tokenize(gt))), ' '.join(sorted(tokenize(res))))
    for gt, res in zip(ground_truths, results)
]
agreement_labels_ground_truth = [gt_label for gt_label, _ in label_pairs]
agreement_labels_result = [res_label for _, res_label in label_pairs]

kappa_score_seq = cohen_kappa_score(agreement_labels_ground_truth, agreement_labels_result)

# Bootstrap function to calculate confidence intervals
def bootstrap_confidence_interval(data, num_samples=1000, confidence_level=0.95):
    """Percentile-bootstrap confidence interval for the mean of ``data``.

    Parameters
    ----------
    data : 1-D sequence of numbers
        Observations to resample.
    num_samples : int, default 1000
        Number of bootstrap resamples.
    confidence_level : float, default 0.95
        Central coverage of the interval, as a fraction.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)`` in the same units as ``data``.

    Notes
    -----
    Draws from NumPy's global random state (``np.random.choice``), so the
    result varies between runs unless that state is seeded by the caller.
    """
    n_obs = len(data)

    # Mean of each resample drawn with replacement at the original sample size
    sample_means = []
    for _ in range(num_samples):
        resample = np.random.choice(data, size=n_obs, replace=True)
        sample_means.append(np.mean(resample))

    # Percentile ranks that cut off equal tails on both sides
    lower_q = (1 - confidence_level) / 2 * 100
    upper_q = (1 + confidence_level) / 2 * 100
    lower_bound, upper_bound = np.percentile(sample_means, [lower_q, upper_q])

    return lower_bound, upper_bound
# Bootstrap interval for the mean of Resident 2's per-case sequence accuracies,
# using the function defaults (1000 resamples, 95% level). The bounds are
# fractions; they are multiplied by 100 when written to df_results.
confidence_interval_seq = bootstrap_confidence_interval(accuracies_res2)

In [None]:
#Contrast Medium Administration
# Per-case correctness: True where Resident 2's yes/no contrast-medium decision
# equals the ground truth (both columns are cast to bool before comparing).
df_equal_res2 = df_resident2['Kontrastmittelgabe'].astype(bool) == df_GT['Kontrastmittelgabe'].astype(bool)

# Point estimate: share of correct decisions, expressed in percent
n = len(df_equal_res2)
correct_predictions = df_equal_res2.sum()
average_accuracy_cm = correct_predictions / n * 100

# Non-parametric bootstrap: draw n cases with replacement, n_iterations times,
# and recompute the accuracy (in percent) on every resample.
n_iterations = 1000
resamples = (
    np.random.choice(df_equal_res2, size=n, replace=True)
    for _ in range(n_iterations)
)
bootstrap_accuracies = np.array(
    [np.sum(resample) / len(resample) * 100 for resample in resamples]
)

# 95% percentile interval of the bootstrap distribution
confidence_interval_cm = np.percentile(bootstrap_accuracies, [2.5, 97.5])

# Chance-corrected agreement between resident and ground truth
kappa_score_cm = cohen_kappa_score(
    df_resident2['Kontrastmittelgabe'].astype(bool),
    df_GT['Kontrastmittelgabe'].astype(bool),
)

# Assemble the summary row. The sequence metrics are fractions and are scaled
# to percent here; the contrast-medium metrics are already in percent.
resident2_row = {
    'Confidenz Interval Sequences': (confidence_interval_seq[0] * 100, confidence_interval_seq[1] * 100),
    'Average Accuracy Sequences': average_accuracy_seq * 100,
    'Kappa Score Sequences': kappa_score_seq,
    'Confidenz Interval Contrastmedium': (confidence_interval_cm[0], confidence_interval_cm[1]),
    'Average Accuracy Contrastmedium': average_accuracy_cm,
    'Kappa Score Contrastmedium': kappa_score_cm,
    'Number of Correct Retrieval': '-',
    'Number of Correct Protocol': '-',
    'Protocol/Retrieval': '-',
}
df_results.loc['Resident 2'] = resident2_row

# Persist the full results table
df_results.to_csv(path_or_buf='/Users/laranoellereiner/Projekte/Promotionsdaten/Promotion/Results/Final Results.csv')

## COMPARISON BETWEEN MODEL RESULTS

### Radiologists

In [None]:
# Per-case mean sequence accuracy over the four human readers
# (two residents, two radiologists)
mean_accuracies = [
    (res2 + res1 + rad1 + rad2) / 4
    for res2, res1, rad1, rad2 in zip(accuracies_res2, accuracies_res1, accuracies_rad1, accuracies_rad2)
]

# Per-case mean of the four readers' contrast-medium outcomes. The df_equal_*
# series are booleans (presumably per-case agreement with the ground truth, as
# for the residents), so each value is the fraction of readers with True.
mean_accuracies_CM = [
    (rad1 + rad2 + res1 + res2) / 4
    for rad1, rad2, res1, res2 in zip(df_equal_rad1, df_equal_rad2, df_equal_res1, df_equal_res2)
]

# Overall means across cases
mean_accuracy = sum(mean_accuracies) / len(mean_accuracies)
mean_accuracy_CM = sum(mean_accuracies_CM) / len(mean_accuracies_CM)

# Spread across cases; np.std defaults to the population SD (ddof=0)
sd_mean_accuracy = np.std(mean_accuracies)
sd_mean_accuracy_CM = np.std(mean_accuracies_CM)

# Output order: mean (sequences), mean (contrast medium), SD (sequences), SD (contrast medium)
for value in (mean_accuracy, mean_accuracy_CM, sd_mean_accuracy, sd_mean_accuracy_CM):
    print(value)


### LLAMA 3.1 Sequences non-RAG to RAG

In [None]:
# Paired per-case sequence accuracies: LLaMA 3.1 without RAG (a) vs. with RAG (b)
model_a_accuracies, model_b_accuracies = accuracies_OS_woRAG, accuracies_OS_RAG

# Wilcoxon signed-rank test on the paired differences
wilcoxon_result = wilcoxon(model_a_accuracies, model_b_accuracies)
stat, p_value = wilcoxon_result.statistic, wilcoxon_result.pvalue

print(f"Wilcoxon test statistic: {stat}")
print(f"P-value: {p_value}")

# Start the comparison table with this first row; later cells add rows via .loc
index = ['LLama 3.1 Sequences non-RAG to RAG']
comparison_results = [{'P-Value': p_value}]
df_comparison_results = pd.DataFrame(comparison_results, index=index)

### LLAMA 3.1 Contrastmedium non-RAG to RAG

In [None]:
# Per-case correctness of the contrast-medium prediction for both LLaMA 3.1 variants
df_equal_OS_woRAG = df_OS_woRAG['Vorhersage Kontrastmittelgabe'] == df_GT['Kontrastmittelgabe']
df_equal_OS_RAG = df_OS_RAG['Vorhersage Kontrastmittelgabe'] == df_GT['Kontrastmittelgabe']

# Paired outcomes: model 1 = with RAG, model 2 = without RAG
llm1_predictions = df_equal_OS_RAG
llm2_predictions = df_equal_OS_woRAG

# Tally the four possible (model 1, model 2) outcome pairs
outcome_pairs = list(zip(llm1_predictions, llm2_predictions))
a = outcome_pairs.count((True, True))    # both correct
b = outcome_pairs.count((True, False))   # only model 1 correct
c = outcome_pairs.count((False, True))   # only model 2 correct
d = outcome_pairs.count((False, False))  # both wrong

# 2x2 contingency table; the test is driven by the discordant cells b and c
table = np.array([[a, b], [c, d]])

# McNemar test with the chi-square approximation (exact=False);
# set exact=True for small samples
result = mcnemar(table, exact=False)

print(f'Contingency table: \n{table}')
print(f'Chi-squared: {result.statistic}, p-value: {result.pvalue}')
p_value = result.pvalue

df_comparison_results.loc['LLama 3.1 Contrastmedium non-RAG to RAG'] = {'P-Value': p_value}



### GPT Sequences non-RAG to RAG

In [None]:
# Paired per-case sequence accuracies: GPT without RAG (a) vs. with RAG (b)
model_a_accuracies, model_b_accuracies = accuracies_woRAG, accuracies_RAG

# Wilcoxon signed-rank test on the paired differences
wilcoxon_result = wilcoxon(model_a_accuracies, model_b_accuracies)
stat, p_value = wilcoxon_result.statistic, wilcoxon_result.pvalue

print(f"Wilcoxon test statistic: {stat}")
print(f"P-value: {p_value}")

# Add this comparison as a new row of the comparison table
df_comparison_results.loc['GPT Sequences non-RAG to RAG'] = {'P-Value': p_value}



### GPT Contrastmedium non-RAG to RAG

In [None]:
# Per-case correctness of the contrast-medium prediction for both GPT variants
df_equal_woRAG = df_woRAG['Vorhersage Kontrastmittelgabe'] == df_GT['Kontrastmittelgabe']
df_equal_RAG = df_RAG['Vorhersage Kontrastmittelgabe'] == df_GT['Kontrastmittelgabe']

# Paired outcomes: model 1 = without RAG, model 2 = with RAG
llm1_predictions = df_equal_woRAG
llm2_predictions = df_equal_RAG

# Tally the four possible (model 1, model 2) outcome pairs
outcome_pairs = list(zip(llm1_predictions, llm2_predictions))
a = outcome_pairs.count((True, True))    # both correct
b = outcome_pairs.count((True, False))   # only model 1 correct
c = outcome_pairs.count((False, True))   # only model 2 correct
d = outcome_pairs.count((False, False))  # both wrong

# 2x2 contingency table; the test is driven by the discordant cells b and c
table = np.array([[a, b], [c, d]])

# McNemar test with the chi-square approximation (exact=False);
# set exact=True for small samples
result = mcnemar(table, exact=False)

print(f'Contingency table: \n{table}')
print(f'Chi-squared: {result.statistic}, p-value: {result.pvalue}')
p_value = result.pvalue

df_comparison_results.loc['GPT Contrastmedium non-RAG to RAG'] = {'P-Value': p_value}


### LLAMA 3.1 RAG to GPT RAG Sequences

In [None]:
# Paired per-case sequence accuracies: LLaMA 3.1 with RAG (a) vs. GPT with RAG (b)
model_a_accuracies, model_b_accuracies = accuracies_OS_RAG, accuracies_RAG

# Wilcoxon signed-rank test on the paired differences
wilcoxon_result = wilcoxon(model_a_accuracies, model_b_accuracies)
stat, p_value = wilcoxon_result.statistic, wilcoxon_result.pvalue

print(f"Wilcoxon test statistic: {stat}")
print(f"P-value: {p_value}")

# Add this comparison as a new row of the comparison table
df_comparison_results.loc['LLama RAG to GPT RAG Sequences'] = {'P-Value': p_value}

### LLAMA 3.1 RAG to GPT RAG Contrastmedium

In [None]:
# Per-case correctness of the contrast-medium prediction for both RAG models
df_equal_OS_RAG = df_OS_RAG['Vorhersage Kontrastmittelgabe'] == df_GT['Kontrastmittelgabe']
df_equal_RAG = df_RAG['Vorhersage Kontrastmittelgabe'] == df_GT['Kontrastmittelgabe']

# Paired outcomes: model 1 = LLaMA 3.1 with RAG, model 2 = GPT with RAG
llm1_predictions = df_equal_OS_RAG
llm2_predictions = df_equal_RAG

# Tally the four possible (model 1, model 2) outcome pairs
outcome_pairs = list(zip(llm1_predictions, llm2_predictions))
a = outcome_pairs.count((True, True))    # both correct
b = outcome_pairs.count((True, False))   # only model 1 correct
c = outcome_pairs.count((False, True))   # only model 2 correct
d = outcome_pairs.count((False, False))  # both wrong

# 2x2 contingency table; the test is driven by the discordant cells b and c
table = np.array([[a, b], [c, d]])

# McNemar test with the chi-square approximation (exact=False);
# set exact=True for small samples
result = mcnemar(table, exact=False)

print(f'Contingency table: \n{table}')
print(f'Chi-squared: {result.statistic}, p-value: {result.pvalue}')
p_value = result.pvalue

df_comparison_results.loc['LLama RAG to GPT RAG Contrastmedium'] = {'P-Value': p_value}

### Radiologists to GPT-4o with RAG

In [None]:
# Paired per-case sequence accuracies: mean of the human readers (a) vs. GPT with RAG (b)
model_a_accuracies, model_b_accuracies = mean_accuracies, accuracies_RAG

# Wilcoxon signed-rank test on the paired differences
wilcoxon_result = wilcoxon(model_a_accuracies, model_b_accuracies)
stat, p_value = wilcoxon_result.statistic, wilcoxon_result.pvalue

print(f"Wilcoxon test statistic: {stat}")
print(f"P-value: {p_value}")

# Add this comparison as a new row of the comparison table
df_comparison_results.loc['Radiologists to GPT RAG Sequences'] = {'P-Value': p_value}

In [None]:
# Per-case correctness (1.0 = correct, 0.0 = wrong) of GPT with RAG on the
# contrast-medium decision. The prediction must be compared with the ground
# truth first: mean_accuracies_CM holds per-case reader correctness, so pairing
# it with the raw yes/no prediction would test two different quantities.
accuracy_CM_RAG = (
    df_RAG['Vorhersage Kontrastmittelgabe'] == df_GT['Kontrastmittelgabe']
).astype(float)

# Paired per-case values: mean reader correctness (a) vs. model correctness (b)
model_a_accuracies = mean_accuracies_CM
model_b_accuracies = accuracy_CM_RAG

# Perform Wilcoxon signed-rank test
stat, p_value = wilcoxon(model_a_accuracies, model_b_accuracies)

# Output the results
print(f"Wilcoxon test statistic: {stat}")
print(f"P-value: {p_value}")

# Store the Wilcoxon p-value computed above. It must not be replaced by
# `result.pvalue`, which is left over from an earlier McNemar cell and belongs
# to a different comparison.
df_comparison_results.loc['Radiologists to GPT RAG Contrastmedium'] = {'P-Value': p_value}

### Radiologists to LLama 3.1 with RAG

In [None]:
# Paired per-case sequence accuracies: mean of the human readers (a) vs. LLaMA 3.1 with RAG (b)
model_a_accuracies, model_b_accuracies = mean_accuracies, accuracies_OS_RAG

# Wilcoxon signed-rank test on the paired differences
wilcoxon_result = wilcoxon(model_a_accuracies, model_b_accuracies)
stat, p_value = wilcoxon_result.statistic, wilcoxon_result.pvalue

print(f"Wilcoxon test statistic: {stat}")
print(f"P-value: {p_value}")

# Add this comparison as a new row of the comparison table
df_comparison_results.loc['Radiologists to LLama RAG Sequences'] = {'P-Value': p_value}


In [None]:
# Per-case correctness (1.0 = correct, 0.0 = wrong) of LLaMA 3.1 with RAG on
# the contrast-medium decision. The prediction must be compared with the ground
# truth first: mean_accuracies_CM holds per-case reader correctness, so pairing
# it with the raw yes/no prediction would test two different quantities.
accuracy_CM_OS_RAG = (
    df_OS_RAG['Vorhersage Kontrastmittelgabe'] == df_GT['Kontrastmittelgabe']
).astype(float)

# Paired per-case values: mean reader correctness (a) vs. model correctness (b)
model_a_accuracies = mean_accuracies_CM
model_b_accuracies = accuracy_CM_OS_RAG

# Perform Wilcoxon signed-rank test
stat, p_value = wilcoxon(model_a_accuracies, model_b_accuracies)

# Output the results
print(f"Wilcoxon test statistic: {stat}")
print(f"P-value: {p_value}")

# Store the Wilcoxon p-value computed above. It must not be replaced by
# `result.pvalue`, which is left over from an earlier McNemar cell and belongs
# to a different comparison.
df_comparison_results.loc['Radiologists to LLama RAG Contrastmedium'] = {'P-Value': p_value}

# Persist all pairwise comparison p-values (semicolon-separated)
df_comparison_results.to_csv(path_or_buf='/Users/laranoellereiner/Projekte/Promotionsdaten/Promotion/Results/Comparison Results.csv', sep=';')