## Swiss MedQA - A study in LLMs answering medical questions in Swiss languages


#### Process the question data to be executed

In [None]:
# Load question data 
import pandas as pd

# Workbook containing the study questions and the worksheet to read from it.
filename = "dataset-20241228/MKDS_SwissMedQA_Spreadsheets_1024.xlsx"
sheetName = "Spreadsheet 2 text only"

# Read the worksheet into a DataFrame.
df = pd.read_excel(
    filename,
    sheet_name=sheetName,
    engine="openpyxl",
)

# Header cells may carry stray leading/trailing whitespace; trim it so that
# columns can be addressed by their plain names.
df.columns = df.columns.str.strip()

# Display the loaded table.
df

In [None]:
# Load information about the questions
# Worksheet (in the same workbook) that holds the per-question metadata.
questioninfosheetname = "Spreadsheet 1"

df_questioninfo = pd.read_excel(
    filename,
    sheet_name=questioninfosheetname,
    engine="openpyxl",
)

# Trim stray whitespace from the header names before they are used as keys.
df_questioninfo.columns = df_questioninfo.columns.str.strip()

# Display the loaded table.
df_questioninfo

In [None]:
def _normaliseModality(label):
    """Return a modality label trimmed of whitespace, with "textonly" spelled "text only".

    The label is trimmed *before* it is compared, so padded variants such as
    "textonly " are unified as well. Non-string cells (e.g. missing values) are
    returned unchanged instead of failing on ``.strip()``.
    """
    if not isinstance(label, str):
        return label
    label = label.strip()
    return "text only" if label == "textonly" else label


df_questioninfo["Modality Type"] = [_normaliseModality(m) for m in df_questioninfo["Modality Type"]]

# Number of questions per modality.
df_questioninfo[ ["Unique ID","Modality Type"] ].groupby("Modality Type").count()

In [None]:
# Question counts per modality and medical category (non-null "Unique ID" values per group).
(
    df_questioninfo
    .groupby(["Modality Type", "Category (1=General medicine, 2=Surgery/Traumatology, 3=Pediatrics, 4=gynocology, 5=Public Health and others"])[["Unique ID"]]
    .count()
)

In [None]:
# Question counts per modality and acute/chronic flag (non-null "Unique ID" values per group).
(
    df_questioninfo
    .groupby(["Modality Type", "acute (=1) vs chronic (=0)"])[["Unique ID"]]
    .count()
)

In [None]:
# Question counts per modality and task type (non-null "Unique ID" values per group).
(
    df_questioninfo
    .groupby(["Modality Type", "Diagnosis (=1), Treatment (=2), Other(=3)"])[["Unique ID"]]
    .count()
)

#### Set up language models for execution via APIs

In [None]:
# Imports for language model setup

#from mlx_lm import load, generate

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from openai import OpenAI
import anthropic


In [None]:
# Models used in the study, keyed by the short alias used throughout the notebook.
# The values are the model identifiers passed to the provider APIs. Disabled
# entries are kept as comments so they can be switched back on.
model_fullnames = {
    # "llama32-3b": "mlx-community/Llama-3.2-3B-Instruct",  # On server use: "llama32-3b":"meta-llama/Llama-3.2-3B-Instruct",
    # "gpt35": "gpt-3.5-turbo",
    # "qwen72b": "Qwen/Qwen2.5-72B-Instruct",
    # "yi-34b": "01-ai/Yi-34B-Chat",
    # "llama31-70b": "meta-llama/Meta-Llama-3.1-70B-Instruct",
    "llama33-70b": "meta-llama/Llama-3.3-70B-Instruct",
    "llama31-405b": "meta-llama/Meta-Llama-3.1-405B-Instruct",
    "deepseek-v3": "deepseek-ai/DeepSeek-V3",
    # "deepseek-r1": "deepseek-ai/DeepSeek-R1",
    # "mixtral-8x22b": "mistralai/Mixtral-8x22B-Instruct-v0.1",
    "claude-sonnet-3.7": "claude-3-7-sonnet-20250219",
    # "claude-haiku-3.5": "claude-3-5-haiku-20241022",
    "gpt4o": "gpt-4o",
    # "o3-mini": "o3-mini",
}

In [None]:
# Set up the models
import os
# Cache for the currently active backend. generateFromPrompt declares these
# names ``global`` and fills them in when a model is first requested.
model = modelname = client = tokenizer = None

def generateFromPrompt(modeln, systemMessage, prompt):
    """Send one system/user message pair to the model ``modeln`` and return its reply text.

    The backend of the most recently used model is cached in the module-level
    ``modelname``/``model``/``client``/``tokenizer`` globals and is rebuilt only
    when ``modeln`` differs from the cached name.

    Args:
        modeln: Short alias of the model; must be a key of ``model_fullnames``.
        systemMessage: System instruction for the model.
        prompt: User message for the model.

    Returns:
        The generated text, or ``None`` when the local model's tokenizer has
        no chat template (an error is printed in that case).

    Raises:
        KeyError: If ``modeln`` is not a key of ``model_fullnames``.
    """
    global modelname
    global model
    global client
    global tokenizer

    local_models = ["llama32-3b"]
    openai_models = ["gpt35", "gpt4o", "o3-mini"]
    anthropic_models = ["claude-sonnet-3.7", "claude-haiku-3.5"]

    # Fail before touching the cached backend: every code path below looks the
    # alias up in model_fullnames, so an unknown alias would raise KeyError
    # anyway, but only after the cache had already been modified.
    if modeln not in model_fullnames:
        raise KeyError(f"Model '{modeln}' is not enabled in model_fullnames")

    if modelname is None or modelname != modeln:
        if modeln in local_models:  # Local model
            # NOTE: ``load`` (and ``generate`` below) come from mlx_lm, whose
            # import is commented out at the top of this notebook.
            model, tokenizer = load(model_fullnames[modeln])
        elif modeln in openai_models:  # OpenAI models
            client = OpenAI(
                api_key=os.environ.get("OPENAI_KEY"),
            )
        elif modeln in anthropic_models:  # Anthropic models
            client = anthropic.Anthropic(
                api_key=os.environ.get("ANTHROPIC_KEY")
            )
        else:  # DeepInfra models (OpenAI-compatible endpoint)
            client = OpenAI(
                api_key=os.environ.get("DEEPINFRA_KEY"),
                base_url="https://api.deepinfra.com/v1/openai",
            )
        # Record the active model only once its backend was set up successfully,
        # so a failed setup cannot leave the cache claiming a model it never loaded.
        modelname = modeln

    if modeln in local_models:  # Local model
        if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
            messages = [{"role": "system", "content": systemMessage},
                        {"role": "user", "content": prompt}]
            prompt = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            return generate(model, tokenizer, prompt=prompt, verbose=False)
        print("Error! Local model appears not set up correctly")
        return None

    if modeln in anthropic_models:  # Anthropic models
        # The system message is passed separately from the user turn, and the
        # reply is capped at 10 tokens.
        response = client.messages.create(
            model=model_fullnames[modeln],
            messages=[{"role": "user", "content": prompt}],
            system=systemMessage,
            max_tokens=10,
        )
        return response.content[0].text

    # Via one of the OpenAI-style chat completion APIs
    completion = client.chat.completions.create(
        model=model_fullnames[modeln],
        messages=[
            {"role": "system", "content": systemMessage},
            {"role": "user", "content": prompt},
        ],
    )
    return completion.choices[0].message.content


# Smoke-test one model per provider. Aliases that are currently commented out of
# model_fullnames are skipped, because looking them up would raise KeyError.
for smoke_model in ("llama33-70b", "o3-mini", "claude-sonnet-3.7"):
    if smoke_model not in model_fullnames:
        print("Skipping", smoke_model, "- not enabled in model_fullnames")
        continue
    res = generateFromPrompt(smoke_model, "You are a helpful assistant.", "Hello!")
    print(res)

In [None]:
# Define the processing functions to answer the questions and clean the responses
def processQuestion(modelName, systemMsg, promptTemplate, question, A, B, C, D, E):
    """Fill the prompt template with a question and its five options and query the model.

    ``promptTemplate`` must contain six positional ``{}`` placeholders, filled in
    the order question, A, B, C, D, E. Returns whatever ``generateFromPrompt``
    returns for the resulting user message.
    """
    return generateFromPrompt(
        modelName,
        systemMsg,
        promptTemplate.format(question, A, B, C, D, E),
    )

def cleanResponse(response):
    """Reduce a raw model reply to the part before the first hyphen, trimmed.

    Non-string values are converted with ``str`` first. A reply such as
    "A - some explanation" therefore becomes "A".
    """
    text = response if isinstance(response, str) else str(response)
    # partition() yields the whole string as the head when no '-' is present.
    head, _, _ = text.partition('-')
    return head.strip()

import re

# Matches every character that is punctuation or whitespace.
_ANSWER_NOISE = re.compile(r'[^\w\s]|\s')


def _answerLetters(text):
    """Return ``text`` as a string with punctuation and whitespace removed."""
    return _ANSWER_NOISE.sub('', str(text))


def isCorrect(response,groundTruth):
    """Tell whether a cleaned model response matches the ground-truth answer.

    Both arguments are normalised the same way (punctuation and whitespace
    removed) before comparing, so "A, C" and "AC" are equivalent on either side
    and a stray "A." or "A " is read as "A".

    Args:
        response: The model's answer letter(s).
        groundTruth: The expected answer letter(s).

    Returns:
        For a multi-letter ground truth, True when the response contains exactly
        the same set of letters (order and repetition are ignored). For a
        single-letter ground truth, True when the normalised response equals it.
    """
    expected = _answerLetters(groundTruth)
    given = _answerLetters(response)
    if len(expected) > 1:
        return set(expected) == set(given)
    return expected == given

In [None]:
# Test the prompt on a single question
# When True, the English system message and prompt template are used regardless
# of the question language (see getSystemMessage / getPromptTemplate).
ENG_PROMPTS = True

# System messages per language code.
# The German text asks for the "Antwortbuchstaben" (answer letter of the
# alphabet); the earlier wording "Antwortbrief" denotes a letter in the postal sense.
# NOTE(review): results collected with the earlier German wording may not be
# directly comparable - confirm before mixing runs.
LANG_SYSTEM_MSGS = {"en": "You are a helpful medical question answering assistant. Please carefully follow the exact instructions, return only the answer letter, and do not provide explanations.",
                    "de": "Du bist ein hilfreicher Assistent bei der Beantwortung medizinischer Fragen. Bitte befolge die genauen Anweisungen sorgfältig, gib nur den Antwortbuchstaben zurück und füge keine Erklärungen bei.",
                    "fr": "Tu es un assistant qui répond à des questions médicales utiles. Merci de suivre scrupuleusement les instructions exactes, de ne renvoyer que la lettre de réponse et de ne pas fournir d'explications.",
                    "it": "Sei un utile assistente medico che risponde alle domande. Ti preghiamo di seguire attentamente le istruzioni, di restituire solo la lettera di risposta e di non fornire spiegazioni."}

# User prompt templates per language code. Each has six positional placeholders:
# the question followed by the options A-E. The backslash continuation keeps the
# following line's leading spaces inside the string.
LANG_PROMPT_TEMPLS = {"en": "Please answer the following multiple choice question by selecting the correct response option or options. \
    Question: {}. Response options: A - {}, B - {}, C - {}, D - {}, E - {}. Return only the letter or letters (A,B,C,D,E) corresponding to the correct answer. The correct answer to the question is: ",
                      "de": "Bitte beantworte die folgende Multiple-Choice-Frage, indem du die richtige(n) Antwortoption(en) auswählst. \
    Frage: {}. Antwortoptionen: A - {}, B - {}, C - {}, D - {}, E - {}. Gib nur den/die Buchstaben (A, B, C, D, E) an, die der richtigen Antwort entsprechen. Die richtige Antwort auf die Frage lautet:",
                      "fr": "Réponds à la question à choix multiples suivante en sélectionnant la ou les bonnes options de réponse. \
    Question : {}. Options de réponse : A - {}, B - {}, C - {}, D - {}, E - {}. Ne renvoie que la ou les lettres (A,B,C,D,E) correspondant à la bonne réponse. La bonne réponse à la question est :",
                      "it": "Rispondi alla seguente domanda a scelta multipla selezionando la o le opzioni di risposta corrette. \
    Domanda: {}. Opzioni di risposta: A - {}, B - {}, C - {}, D - {}, E - {}. Riporta solo la lettera o le lettere (A,B,C,D,E) corrispondenti alla risposta corretta. La risposta corretta alla domanda è:" }

def getSystemMessage(language):
    """Return the system message for ``language``, or the English one while ENG_PROMPTS is set."""
    selected = 'en' if ENG_PROMPTS else language
    return LANG_SYSTEM_MSGS[selected]

def getPromptTemplate(language):
    """Return the prompt template for ``language``, or the English one while ENG_PROMPTS is set."""
    selected = 'en' if ENG_PROMPTS else language
    return LANG_PROMPT_TEMPLS[selected]
        
# Ask every configured model one trivial question to check the prompt end to end.
# While ENG_PROMPTS is True, the 'it' argument still yields the English texts.
for model_name in model_fullnames:
    response = processQuestion(
        modelName=model_name,
        systemMsg=getSystemMessage('it'),
        promptTemplate=getPromptTemplate('it'),
        question="How many fingers does a human hand have?",
        A="Z",
        B="3",
        C="2",
        D="5",
        E="4",
    )
    print(model_name, response)


In [None]:
# Prepare the results storage - either start a new, empty table (new run) or reload the stored results file (when resuming a partially completed study)
# RELOAD=True resumes from the stored results file; RELOAD=False starts with an empty table.
RELOAD = True
resultsDirectory = "../results/results-20250308/"
resultsFilename = "dfresults.csv"

import os
os.makedirs(resultsDirectory, exist_ok=True)

if RELOAD:
    # Load from stored file
    df_results = pd.read_csv(resultsDirectory+resultsFilename,index_col=0)
    # Remove calculated columns; they are recomputed by the cleaning step.
    # errors='ignore' tolerates files that do not contain them.
    df_results.drop(["DE_clean", "FR_clean", "IT_clean","DE_correct","FR_correct","IT_correct","AnswerType","Category (1=General medicine, 2=Surgery/Traumatology, 3=Pediatrics, 4=gynocology, 5=Public Health and others","acute (=1) vs chronic (=0)","Diagnosis (=1), Treatment (=2), Other(=3)"],axis=1,inplace=True,errors='ignore')
else:
    # A fresh table needs every column the main loop filters on and writes:
    # besides the question columns and the per-language answers, that includes
    # "EngPrompt" and "Run". Without them the first lookup of
    # df_results['EngPrompt'] raises KeyError.
    df_results = pd.DataFrame(columns= list(df.columns.values)+ ["EngPrompt","Model","Run","DE","FR","IT"])

print(df_results.shape)
df_results.head()

In [None]:
import datetime

# MAIN EXECUTION: Run the models and collect the results. This is going to be quite slow.
# For each prompt-language setting, model and run index, every question is asked
# in German, French and Italian. Combinations already present in df_results are skipped.
for eng_prompts in [True,False]:
    # getSystemMessage / getPromptTemplate read this global.
    ENG_PROMPTS = eng_prompts

    for modeln in model_fullnames:
        for i in range(10):

            matching_rows = ((df_results['Model'] == modeln) & (df_results['EngPrompt'] == eng_prompts) & (df_results['Run'] == i)).any()
            if matching_rows:
                print("Not executing model:", modeln, "EngPrompt:",eng_prompts,"Run:",i,"as already non-empty.")
                continue

            print("Starting execution of model:", modeln, "EngPrompt:",eng_prompts,"Run:",i)

            # Stage the rows of this run locally and add them to df_results only
            # once the whole run has finished. If a model call fails midway,
            # df_results then holds no partial rows for this run, so re-executing
            # the cell repeats the run instead of skipping it as "already non-empty".
            runRows = []
            for index, row in df.iterrows():
                rowResults = {"EngPrompt":eng_prompts,"Model":modeln,"Run":i}
                rowResults.update(row)

                for language in ['de','fr','it']:
                    question = row["Question "+language]
                    optA = row["Answer A "+language]
                    optB = row["Answer B "+language]
                    optC = row["Answer C "+language]
                    optD = row["Answer D "+language]
                    optE = row["Answer E "+language]

                    # Run the LLM
                    result = processQuestion(modeln,getSystemMessage(language),getPromptTemplate(language),question,optA,optB,optC,optD,optE)
                    rowResults[language.upper()] = result

                runRows.append(rowResults)

            # Commit the completed run and persist it.
            for rowResults in runRows:
                df_results.loc[len(df_results)] = rowResults

            df_results.to_csv(resultsDirectory+resultsFilename)
            print(f"{datetime.datetime.now().strftime('%H:%M:%S')} Finished execution of (Eng?{eng_prompts}), model {modeln}, run {i}. Results table size: {df_results.shape} ")

        

In [None]:
# Clean and save the results  
import os

# Reduce each raw model reply to its answer letter(s).
for language in ["DE","FR","IT"]:
    df_results[f"{language}_clean"] = list(map(cleanResponse, df_results[language]))

# Score each cleaned reply against the ground truth in "MC modality".
for language in ["DE","FR","IT"]:
    df_results[f"{language}_correct"] = [
        isCorrect(cleaned, truth)
        for cleaned, truth in zip(df_results[f"{language}_clean"], df_results["MC modality"])
    ]

# Answer type: 1 when the ground truth has more than one character (multiple choices), else 0.
df_results["AnswerType"] = [int(len(truth) > 1) for truth in df_results["MC modality"]]

# Attach the per-question metadata from the question-info sheet, joined on "Unique ID":
# first the question category, then the acute/chronic flag and the task type.
for infoColumns in (
    ["Category (1=General medicine, 2=Surgery/Traumatology, 3=Pediatrics, 4=gynocology, 5=Public Health and others"],
    ["acute (=1) vs chronic (=0)", "Diagnosis (=1), Treatment (=2), Other(=3)"],
):
    df_results = df_results.merge(
        df_questioninfo[["Unique ID"] + infoColumns],
        on="Unique ID",
        how="left",
    )

# Save the augmented table next to the raw results; "-clean.csv" is appended to the raw file name.
df_results.to_csv(f"{resultsDirectory}{resultsFilename}-clean.csv", index=False)