In [None]:
# import necessary libraries 
import pandas as pd
import os
import textstat
from openai import OpenAI
import json
import re
import requests
from dotenv import load_dotenv
import math
import unicodedata

In [None]:
import autogen
from autogen import ConversableAgent

In [None]:
# Never commit a real key in this cell. A key that is already present in the
# environment is kept; the empty placeholder is only used when none is set
# (the previous unconditional assignment wiped an existing key).
os.environ.setdefault('OPENAI_API_KEY', "")

In [None]:
# Model name shared by the direct OpenAI client and the agent configuration.
OPENAI_MODEL = "gpt-4o-mini"

# Read the key back from the environment and build the shared client with it.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
CLIENT = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# LLM settings handed to every agent and to the group chat manager.
llm_config = dict(model=OPENAI_MODEL)

In [None]:
# import prompts
from jh_pfx_prompts import example, icd10_example, single_fewshot_icd10_labeling_prompt, writer_prompt,doctor_prompt, readability_checker_prompt, ICD10_LABELER_INSTRUCTION

In [None]:
# Reading-level labels, listed from easiest to hardest text.
# N_A is the placeholder label for "no level".
N_A = "N/A"
FIFTH_GRADE = "5th grade"
SIXTH_GRADE = "6th grade"
SEVENTH_GRADE = "7th grade"
EIGTH_TO_NINTH_GRADE = "8th to 9th grade"
TENTH_TO_TWELTH_GRADE = "10th to 12th grade"
COLLEGE = "College"
COLLEGE_GRADUATE = "College Graduate"
PROFESSIONAL = "Professional"

In [None]:
# Target reading-ease score per reading level (a higher score means easier
# text), listed from hardest to easiest.
professional = 5
college_graduate = 20
college = 40
tenth_to_twelfth_grade = 55
eigth_and_ninth_grade = 65
seventh_grade = 75
sixth_grade = 85
fifth_grade = 95

In [None]:
def adjust_difference(diff, threshold):
    """Return the amount by which ``diff`` exceeds ``threshold``, or 0 if it does not exceed it."""
    return diff - threshold if diff > threshold else 0

In [None]:
def extract_json_gpt4o(groupchat):
    """
    Return the most recent JSON object found in a chat's messages.

    Messages are scanned newest-first. For each one the text is NFKC-normalised
    and Markdown code fences are removed; the whole text is then tried as JSON,
    and failing that every ``{`` is tried as the start of an embedded object
    (so nested objects surrounded by prose are found too).

    Args:
        groupchat: any object exposing ``messages``, a list of dicts that may
            carry a ``"content"`` entry.

    Returns:
        dict | None: the first JSON object met when walking the messages from
        last to first, or ``None`` when no message contains one. Non-object
        JSON (numbers, strings, arrays) is not returned as such, because the
        callers in this notebook use the result as a dict.
    """
    decoder = json.JSONDecoder()

    for msg in reversed(groupchat.messages):
        raw = msg.get("content")
        # "content" may be missing, None or non-text; there is nothing to parse then.
        if not isinstance(raw, str):
            continue

        # Normalize encoding, then drop markdown fences if they exist.
        content = unicodedata.normalize("NFKC", raw.strip())
        content = re.sub(r"```json|```", "", content).strip()

        # Try direct JSON parsing first.
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed

        # Extract JSON from mixed text. raw_decode parses one complete value
        # starting at the given index and ignores whatever follows it, which a
        # non-greedy regex cannot do for nested braces.
        start = content.find("{")
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(content, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            start = content.find("{", start + 1)

    return None

In [None]:
def label_icd10s(pfx_outputs_json):
    """
    Ask the OpenAI model for an ICD-10 label for each patient-friendly explanation.

    Uses notebook-level names: ``df_fewshot`` (few-shot rows), the
    ``icd10_example`` and ``single_fewshot_icd10_labeling_prompt`` templates,
    ``CLIENT`` / ``OPENAI_MODEL`` and ``extract_json_gpt4o``.

    Args:
        pfx_outputs_json: iterable of dicts, each with a ``'PFx'`` entry
            holding the explanation text.

    Returns:
        list | None: one entry per input, each being whatever
        ``extract_json_gpt4o`` returns for the model's reply (``None`` when the
        reply holds no JSON). The whole call returns ``None`` if a prompt
        cannot be built for some item.
    """
    # The few-shot section is the same for every item, so render it once.
    pfx_icd10_fewshot_examples = "".join(
        icd10_example.format(**row) for _, row in df_fewshot.iterrows()
    )

    replies = []
    for pfx_output in pfx_outputs_json:
        try:
            prompt = single_fewshot_icd10_labeling_prompt.format(
                examples=pfx_icd10_fewshot_examples,
                PFx=pfx_output['PFx']
            )
        except Exception as e:
            # Report the cause as well as the offending item, then give up on
            # the batch (same early exit as before).
            print("ERROR: %s (%r)" % (pfx_output, e))
            return None

        # NOTE(review): the task prompt is sent as a second system message
        # rather than a user message -- confirm this is intended.
        response = CLIENT.chat.completions.create(
            model=OPENAI_MODEL,
            temperature=0.0,
            messages=[
                {"role": "system", "content": "You are an ICD10 medical coder for incidental findings."},
                {"role": "system", "content": prompt}
            ],
            stream=False,
        )

        reply = response.choices[0].message

        # extract_json_gpt4o expects an object with a ``messages`` list of
        # dicts, so wrap the single reply in one.
        wrapper = type("Wrapper", (), {})()
        wrapper.messages = [{
            "role": reply.role,
            # A reply without text has content None; pass "" so the extractor
            # reports "no JSON" instead of failing on None.
            "content": reply.content or ""
        }]

        replies.append(wrapper)

    return [extract_json_gpt4o(wrapper) for wrapper in replies]

In [None]:
def get_last_agent_response(messages, agent_name):
    """
    Return the content of the most recent message sent by ``agent_name``.

    Args:
        messages: iterable of message dicts; entries without a ``'name'`` key
            are skipped.
        agent_name: value to match against each message's ``'name'``.

    Returns:
        The ``"content"`` of the last matching message (``None`` if that
        message has no content), or ``None`` when the agent never spoke.
    """
    # Walk newest-first: the previous forward scan returned the agent's FIRST
    # message, contradicting the function name.
    for message in reversed(list(messages)):
        if message.get('name') == agent_name:
            return message.get("content")
    return None

In [None]:
# Evaluation slice: keep the header (line 0), skip file lines 1-30, then read
# at most the next 20 rows.
df_eval = pd.read_csv(
    'pfx_incidental_findings.csv',
    skiprows=range(1, 31),
    nrows=20,
)

In [None]:
# Display the evaluation slice (at most 20 rows, per nrows=20 in the load cell).
df_eval

In [None]:
# Few-shot examples; label_icd10s iterates over these rows to build its prompt.
df_fewshot = pd.read_csv('pfx_fewshot_examples_college.csv')

In [None]:
def state_transition(last_speaker, groupchat):
    """
    Choose the next speaker of the group chat (fixed review pipeline).

    Order: manager -> writer -> icd10_labeler -> doctor -> readability_checker.
    The doctor hands the draft back to the writer when the latest message
    contains "INACCURATE"; the readability checker ends the chat when the
    latest message contains "All done!" and otherwise hands back to the writer.

    The agents are looked up as notebook-level names and compared by identity,
    so ``manager``, ``writer``, ``icd10_labeler``, ``doctor`` and
    ``readability_checker`` must be bound before the chat starts.

    Args:
        last_speaker: the agent that produced the latest message.
        groupchat: object exposing ``messages``, a list of message dicts.

    Returns:
        The next agent, or ``None`` when the readability checker signs off or
        the speaker is not one of the known agents.
    """
    messages = groupchat.messages
    # The latest content can be absent or None; treat that as empty text so the
    # substring checks below cannot raise.
    last_content = (messages[-1].get("content") if messages else None) or ""

    if last_speaker is manager:
        return writer
    if last_speaker is writer:
        return icd10_labeler
    if last_speaker is icd10_labeler:
        return doctor
    if last_speaker is doctor:
        return writer if "INACCURATE" in last_content else readability_checker
    if last_speaker is readability_checker:
        return None if "All done!" in last_content else writer
    return None

In [None]:
# Collected outputs: one row per evaluated finding, indexed like df_eval.
results = pd.DataFrame(columns=["finding", "ICD10_code", "PFx", "PFx_ICD10_code"])


def _make_agent(name, system_message):
    """Build an LLM-backed agent that never asks for human input or runs code."""
    return ConversableAgent(
        name = name,
        system_message = system_message,
        llm_config = llm_config,
        code_execution_config = False,
        human_input_mode = "NEVER",
    )


for i, row in df_eval.iterrows():
    # These agents must stay notebook-level names: state_transition compares
    # the last speaker against them by identity.
    writer = _make_agent(
        "Writer",
        writer_prompt.format(Incidental_Finding = row['Incidental_Finding'], Reading_Level = SIXTH_GRADE),
    )
    doctor = _make_agent(
        "Doctor",
        doctor_prompt.format(Incidental_Finding = row['Incidental_Finding'], ICD10_code = row["ICD10_code"]),
    )
    readability_checker = _make_agent(
        "Readability_Checker",
        readability_checker_prompt.format(reading_level = SIXTH_GRADE),
    )
    icd10_labeler = _make_agent("ICD10_Labeler", ICD10_LABELER_INSTRUCTION)

    # create agent groupchat
    groupchat = autogen.GroupChat(
        agents = [writer, icd10_labeler, doctor, readability_checker],
        messages = [],
        max_round = 20,
        speaker_selection_method = state_transition,
    )

    manager = autogen.GroupChatManager(
        groupchat = groupchat, llm_config = llm_config,
    )

    groupchat_result = manager.initiate_chat(manager, message = """Please play your specified role in 
    generating a patient friendly explanation of an inicidental MRI finding.""")

    # extract_json_gpt4o returns None when no message holds a JSON object;
    # record blanks for that finding instead of failing on None.get(...) and
    # aborting the whole run.
    chat = extract_json_gpt4o(groupchat)
    if not isinstance(chat, dict):
        print("WARNING: no JSON object found in the chat for row %s" % i)
        chat = {}

    results.loc[i] = {
        "finding": row['Incidental_Finding'],
        "ICD10_code": row['ICD10_code'],
        "PFx": chat.get("PFx", ""),
        "PFx_ICD10_code": chat.get("PFx_ICD10_code", "")
    }
        
    


In [None]:
"""
# PARALLEL API CALLS
from concurrent.futures import ThreadPoolExecutor, as_completed

# Prepare result collection
results = pd.DataFrame(columns=["finding", "ICD10_code", "PFx", "PFx_ICD10_code"])

# Thread-safe collection for results
from threading import Lock
result_lock = Lock()

def process_row(i, row):
    writer = ConversableAgent(
        name="Writer",
        system_message=writer_prompt.format(Incidental_Finding=row['Incidental_Finding'], Reading_Level=SIXTH_GRADE),
        llm_config=llm_config,
        code_execution_config=False,
        human_input_mode="NEVER",
    )

    doctor = ConversableAgent(
        name="Doctor",
        system_message=doctor_prompt.format(Incidental_Finding=row['Incidental_Finding'], ICD10_code=row["ICD10_code"]),
        llm_config=llm_config,
        code_execution_config=False,
        human_input_mode="NEVER",
    )

    readability_checker = ConversableAgent(
        name="Readability_Checker",
        system_message=readability_checker_prompt.format(reading_level=SIXTH_GRADE),
        llm_config=llm_config,
        code_execution_config=False,
        human_input_mode="NEVER",
    )

    icd10_labeler = ConversableAgent(
        name="ICD10_Labeler",
        system_message=ICD10_LABELER_INSTRUCTION,
        llm_config=llm_config,
        code_execution_config=False,
        human_input_mode="NEVER",
    )

    groupchat = autogen.GroupChat(
        agents=[writer, icd10_labeler, doctor, readability_checker],
        messages=[],
        max_round=20,
        speaker_selection_method=state_transition,
    )

    manager = autogen.GroupChatManager(
        groupchat=groupchat,
        llm_config=llm_config,
    )

    manager.initiate_chat(manager, message="""Please play your specified role in 
        generating a patient friendly explanation of an incidental MRI finding.""")

    chat = extract_json_gpt4o(groupchat)

    with result_lock:
        results.loc[i] = {
            "finding": row['Incidental_Finding'],
            "ICD10_code": row['ICD10_code'],
            "PFx": chat.get("PFx", ""),
            "PFx_ICD10_code": chat.get("PFx_ICD10_code", "")
        }

# Parallel execution
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(process_row, i, row) for i, row in df_eval.iterrows()]
    for future in as_completed(futures):
        future.result()  # Propagate exceptions if any
"""

In [None]:
# Inspect the generated explanations and codes collected by the chat loop.
results

In [None]:
# Re-label every generated explanation in results['PFx'] with label_icd10s,
# one single-item call per explanation; each call's return value becomes one
# entry of the list.
labeled_icd10_responses = []
labeled_icd10_responses.extend(
    label_icd10s([{'PFx': pfx_text}]) for pfx_text in results['PFx']
)

In [None]:
# Inspect the raw per-explanation results returned by label_icd10s.
labeled_icd10_responses

In [None]:
def _first_code(labeled):
    """Return the first value of the first JSON object in a label_icd10s result, or "" if there is none."""
    # label_icd10s may return None, and its entries may be None (no JSON found)
    # or an empty object; all of those count as "no code".
    if not isinstance(labeled, list) or not labeled:
        return ""
    first = labeled[0]
    if not isinstance(first, dict) or not first:
        return ""
    return next(iter(first.values()))


# Per-row metrics, in the same order as the rows of `results`.
agent_icd10_codes = [_first_code(x) for x in labeled_icd10_responses]
icd10_matches = []
pfx_icd10_matches = []
flesch_scores = []

# Pair codes and rows by position rather than by index label, so this does not
# depend on `results` having a 0..n-1 index.
for agent_icd10_code, (_, row) in zip(agent_icd10_codes, results.iterrows()):
    # Code assigned by the separate labeling call vs. the reference code.
    icd10_matches.append(row["ICD10_code"] == agent_icd10_code)

    # Code produced inside the agent chat vs. the reference code.
    pfx_icd10_matches.append(row["PFx_ICD10_code"] == row["ICD10_code"])

    # Flesch Reading Ease of the explanation; non-text values are scored as
    # empty text instead of being passed to textstat.
    pfx_text = row['PFx'] if isinstance(row['PFx'], str) else ""
    flesch_scores.append(textstat.flesch_reading_ease(pfx_text))

# Add the results to the DataFrame
results['_0_agent_icd10_codes'] = agent_icd10_codes
results['_0_icd10_matches'] = icd10_matches
results['_0_pfx_icd10_matches'] = pfx_icd10_matches
results['_0_flesch'] = flesch_scores

In [None]:
desired_reading_ease = sixth_grade
# Tolerance (in reading-ease points) before a readability penalty applies:
# 10 for targets at or above 55, 20 below that.
threshold = 10 if desired_reading_ease >= 55 else 20

# Create lists to store the results
accuracy_icd10_matches_list = []
accuracy_pfx_matches_list = []
readability_difference_list = []
overall_score_list = []
log_overall_score_list = []

# Iterate over each row in the DataFrame
for _, row in results.iterrows():
    accuracy_icd10_matches = row["_0_icd10_matches"]
    accuracy_pfx_matches = row["_0_pfx_icd10_matches"]
    flesch_score = row["_0_flesch"]

    # Number of ICD-10 matches (0, 1 or 2). Cast explicitly so the sum counts
    # matches whatever boolean type the cells hold.
    total_icd10_matches = int(bool(accuracy_icd10_matches)) + int(bool(accuracy_pfx_matches))

    # Absolute distance between the measured and the desired reading ease.
    readability_difference = abs(flesch_score - desired_reading_ease)

    # Overall score: 0.8 per match plus up to 0.2 for readability, shrinking
    # as the distance grows.
    overall_score = total_icd10_matches * 0.8 + 0.2 * (1 / (readability_difference + 1))

    # Log-scaled penalty: only text that scores below the target by more than
    # `threshold` is penalised (adjust_difference returns 0 otherwise, and
    # log(1 + 0) == 0).
    readability_difference_log = desired_reading_ease - flesch_score
    readability_difference_with_threshold = adjust_difference(readability_difference_log, threshold)
    readability_difference_p = math.log(1 + readability_difference_with_threshold) / math.log(20)

    # The penalty computed above was previously never used: the raw signed
    # difference was added instead, which rewarded harder-than-target text.
    # NOTE(review): the penalty is subtracted here; confirm the intended
    # weighting of the readability term.
    log_overall_score = total_icd10_matches * 0.8 - readability_difference_p * 0.2

    # Append results to lists
    accuracy_icd10_matches_list.append(float(accuracy_icd10_matches))
    accuracy_pfx_matches_list.append(float(accuracy_pfx_matches))
    readability_difference_list.append(float(readability_difference))
    overall_score_list.append(float(overall_score))
    log_overall_score_list.append(float(log_overall_score))


# Create a DataFrame with the results, sharing the index of `results` so the
# column-wise concat lines the rows up.
grades_data = {
    "accuracy_agent_icd10": accuracy_icd10_matches_list,
    "accuracy_pfx_icd10": accuracy_pfx_matches_list,
    "readability_difference": readability_difference_list,
    "overall_score": overall_score_list,
    "log_overall_score": log_overall_score_list,
}
grades = pd.DataFrame(grades_data, index=results.index)
# Drop grade columns left by an earlier run of this cell so that re-running it
# does not duplicate them.
results = pd.concat(
    [results.drop(columns=list(grades.columns), errors="ignore"), grades],
    axis=1,
)


In [None]:
# Inspect the per-row accuracy, readability and overall scores.
grades

In [None]:
# Inspect the results table after the score columns were appended.
results

In [None]:
# Write the scored results to CSV without the index column.
results.to_csv('autogen_4omini_test_30_50_part.csv', index = False)