In [3]:
# import necessary libraries 
import pandas as pd
import os
import textstat
from openai import OpenAI
import json
import re
import requests
from dotenv import load_dotenv
import math
import unicodedata

In [4]:
from autogen import LLMConfig
from autogen import ConversableAgent, LLMConfig
from autogen.agentchat import initiate_group_chat
from autogen.agentchat.group.patterns import RoundRobinPattern
from autogen.agentchat.group import OnCondition, StringLLMCondition
from autogen.agentchat.group import AgentTarget
from autogen.agentchat.group import TerminateTarget

In [5]:
from pydantic import BaseModel, Field
from typing import Optional
from typing import Annotated

In [None]:
# Read OPENAI_API_KEY from the process environment or a local .env file instead of
# hardcoding it in the notebook. `setdefault` only fills in the empty placeholder when
# no key is configured, so a real key is never overwritten by this cell.
load_dotenv()
os.environ.setdefault('OPENAI_API_KEY', "")

In [7]:
# Model used for the direct (non-agent) ICD-10 labeling calls.
OPENAI_MODEL = "gpt-4o"

# API key as currently present in the environment (None if the variable is unset).
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Shared OpenAI client for the direct chat-completion requests.
CLIENT = OpenAI(api_key=OPENAI_API_KEY)

In [8]:
# Base agent configuration without structured output.
# NOTE(review): an identical `llm_config` is assigned again further down, next to the
# per-agent configurations; one of the two definitions is redundant.
llm_config = LLMConfig(api_type="openai", model="gpt-4o", api_key=OPENAI_API_KEY)

In [9]:
# import prompts
from jh_pfx_prompts import example, icd10_example, single_fewshot_icd10_labeling_prompt, writer_prompt,doctor_prompt, readability_checker_prompt, ICD10_LABELER_INSTRUCTION

In [10]:
#reading levels
# Human-readable reading-level labels, ordered from hardest to easiest text.
# These strings are interpolated into the writer and readability-checker prompts
# (the generation loop below passes SIXTH_GRADE).
# NOTE(review): "TWELTH" and "EIGTH" are misspelled in the constant names; they are
# left unchanged because other code may reference them.
PROFESSIONAL = "Professional"
COLLEGE_GRADUATE = "College Graduate"
COLLEGE = "College"
TENTH_TO_TWELTH_GRADE = "10th to 12th grade"
EIGTH_TO_NINTH_GRADE = "8th to 9th grade"
SEVENTH_GRADE = "7th grade"
SIXTH_GRADE = "6th grade"
FIFTH_GRADE = "5th grade"
N_A = "N/A"  # placeholder label for "no reading level"

In [11]:
# reading ease variables
# Target Flesch Reading Ease scores, one per reading level (higher = easier text).
# Each value falls inside the matching band used by calculate_fres below
# (e.g. 85 lies in the 80-90 "6th grade" band). The scoring cell uses `sixth_grade`
# as its desired reading ease.
fifth_grade = 95
sixth_grade = 85
seventh_grade = 75
eigth_and_ninth_grade = 65
tenth_to_twelfth_grade = 55
college = 40
college_graduate = 20
professional = 5

In [12]:
def adjust_difference(diff, threshold):
    """Return how far `diff` exceeds `threshold`, or 0 when it does not exceed it."""
    return diff - threshold if diff > threshold else 0

In [13]:
def extract_json_gpt4o(chat_result, verbose=False):
    """Return the JSON payload of the most recent 'icd10_labeler' message.

    Args:
        chat_result: Object exposing the conversation as a list of message dicts
            through a `chat_history` attribute or, failing that, `messages`.
        verbose: When True, print the raw labeler content and a warning if it
            contains no parseable JSON.

    Returns:
        The decoded JSON value (normally a dict) from the latest message whose
        `name` is "icd10_labeler" (case-insensitive), or None when there is no
        such message or it holds no valid JSON. Only the latest labeler message
        is inspected; earlier ones are not used as a fallback.
    """
    messages = (
        getattr(chat_result, "chat_history", None)
        or getattr(chat_result, "messages", None)
        or []
    )
    decoder = json.JSONDecoder()

    for msg in reversed(messages):
        # `name` may be missing or explicitly None.
        name = (msg.get("name") or "").lower()
        if name != "icd10_labeler":
            continue

        # `content` may be None; treat it as empty text.
        content = (msg.get("content") or "").strip()
        content = unicodedata.normalize("NFKC", content)

        if verbose:
            print(f"[DEBUG] Raw content from {name}:\n{content}")

        # Strip markdown code fences such as ```json ... ```.
        content = re.sub(r"```(?:json)?", "", content).strip("` \n")

        try:
            return json.loads(content)
        except json.JSONDecodeError:
            pass

        # Fallback: the JSON is embedded in surrounding text. Try to decode an object
        # starting at each "{"; unlike a non-greedy regex this handles nested objects
        # and braces that appear inside string values.
        for brace in re.finditer(r"\{", content):
            try:
                candidate, _ = decoder.raw_decode(content, brace.start())
            except json.JSONDecodeError:
                continue
            return candidate

        if verbose:
            print(f"[WARN] No valid JSON in {name}'s message.")
        return None

    print("[WARN] No message from 'icd10_labeler' found.")
    return None


In [14]:
def extract_json_icd10(groupchat):
    """
    Extracts the first valid JSON value found in the 'content' of messages,
    scanning from the newest message to the oldest and handling GPT-4o's
    potential formatting quirks (markdown fences, JSON embedded in prose).

    Args:
        groupchat: Object with a `messages` attribute holding a list of message
            dicts; each dict may carry a 'content' string (or None).

    Returns:
        The decoded JSON value (normally a dict) from the most recent message
        that contains one, or None if no message yields valid JSON.
    """
    decoder = json.JSONDecoder()

    for msg in reversed(groupchat.messages):
        # 'content' may be missing or None; treat both as empty text.
        content = (msg.get("content") or "").strip()

        # Normalize encoding
        content = unicodedata.normalize("NFKC", content)

        # Remove markdown blocks if they exist
        content = re.sub(r"```json|```", "", content).strip()

        # Try direct JSON parsing first
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            pass  # Continue if direct parsing fails

        # Extract JSON from mixed text: attempt a decode at every "{" so that nested
        # objects and braces inside string values are handled correctly.
        for brace in re.finditer(r"\{", content):
            try:
                candidate, _ = decoder.raw_decode(content, brace.start())
            except json.JSONDecodeError:
                continue  # Not a valid object at this position; try the next brace
            return candidate

    return None


In [15]:
def label_icd10s(pfx_outputs_json):
    """Ask the OpenAI chat model for an ICD-10 code for each explanation.

    Args:
        pfx_outputs_json: Iterable of dicts, each holding the explanation text
            under the 'PFx' key.

    Returns:
        A list with one entry per input, each being whatever
        `extract_json_icd10` recovers from the model reply (parsed JSON or None).
        Returns None, after printing the offending item, if a prompt cannot be
        built for one of the inputs.
    """
    # Render every few-shot row through the example template into one text block.
    fewshot_block = "".join(
        icd10_example.format(**fewshot_row)
        for _, fewshot_row in df_fewshot.iterrows()
    )

    wrapped_replies = []
    for pfx_output in pfx_outputs_json:
        try:
            prompt = single_fewshot_icd10_labeling_prompt.format(
                examples=fewshot_block,
                PFx=pfx_output['PFx'],
            )
        except Exception:
            print("ERROR: %s" % pfx_output)
            return

        response = CLIENT.chat.completions.create(
            model=OPENAI_MODEL,
            temperature=0.0,
            messages=[
                {"role": "system", "content": "You are an ICD10 medical coder for incidental findings."},
                {"role": "user", "content": prompt},
            ],
            stream=False,
        )

        # extract_json_icd10 expects an object with a `messages` list of dicts, so the
        # single reply is wrapped in a throwaway object exposing exactly that.
        reply = response.choices[0].message
        wrapper = type("Wrapper", (), {})()
        wrapper.messages = [{"role": reply.role, "content": reply.content}]
        wrapped_replies.append(wrapper)

    return [extract_json_icd10(wrapped) for wrapped in wrapped_replies]

In [88]:
# Evaluation slice: keep the header line, skip file lines 1-270, then read 14 rows.
df_eval = pd.read_csv(
    'pfx_incidental_findings.csv',
    skiprows=range(1, 271),
    nrows=14,
)


In [89]:
# Display the loaded evaluation slice as a visual sanity check.
df_eval

Unnamed: 0,Body Part,Organ,Incidental_Finding,ICD10_code,ICD-10 Code Description
0,Head,Pharynx,Pharyngeal Wall Mass,R22.1,"Localized swelling, mass and lump, neck"
1,Head,Pituitary,Pituitary Microadenoma,D35.2,Benign neoplasm of pituitary gland
2,Head,Salivary Glands,Parotid Cyst,K11.6,Mucocele of salivary gland
3,Head,Salivary Glands,Sialadenitis,K11.20,"Sialoadenitis, unspecified"
4,Head,Salivary Glands,Salivary Gland Stone,K11.5,Sialolithiasis
5,Head,Salivary Glands,Pleomorphic Adenoma,D11.0,Benign neoplasm of parotid gland
6,Head,Scalp,Scalp Hematoma,S00.03XA,"Contusion of scalp, initial encounter"
7,Head,Sinuses,Mucosal Thickening in Sinuses,J32.9,"Chronic sinusitis, unspecified"
8,Head,Sinuses,Sinus Polyp,J33.8,Other polyp of sinus
9,Head,Sinuses,Mucosal Thickening in Maxillary Sinus,J32.0,Chronic maxillary sinusitis


In [18]:
# Few-shot examples; label_icd10s formats each row into the `icd10_example` template.
# This cell must run before label_icd10s is called, since that function reads this global.
df_fewshot = pd.read_csv('pfx_fewshot_examples_college.csv')

In [19]:
def calculate_fres(
    pfx_text: Annotated[str, "A patient-friendly explanation string."]
) -> dict:
    """Calculate the Flesch Reading Ease Score and estimated reading level for a given explanation."""
    # The docstring above is kept short and unchanged on purpose: this function is passed
    # to an agent via `functions=[...]`, so the docstring is presumably surfaced to the
    # model as the tool description (confirm against the agent framework).
    #
    # Returns {"FRES": <score rounded to 2 decimals>, "Reading_Level": <label>} or
    # {"error": <message>} when the text has no sentence or no word.

    def count_syllables(word):
        # Heuristic: one syllable per run of vowels, minus a silent final "e".
        word = re.sub(r'[^a-z]', '', word.lower())
        if not word:
            return 0
        vowel_groups = re.findall(r'[aeiouy]+', word)
        # Only a lone "e" after a consonant is silent ("make"); a trailing vowel run such
        # as "ee"/"ue" ("agree", "tissue") is voiced and must not be dropped. Words ending
        # in "le" ("table") keep their final syllable.
        if re.search(r'[^aeiouy]e$', word) and not word.endswith("le"):
            vowel_groups = vowel_groups[:-1]
        return max(1, len(vowel_groups))

    # A sentence ends at terminal punctuation followed by whitespace or end of text, so
    # decimals and codes such as "3.5" or "H05.01" do not create extra sentences.
    sentences = re.split(r'[.!?]+(?=\s|$)', pfx_text)
    sentences = [s.strip() for s in sentences if s.strip()]
    num_sentences = len(sentences)

    words = re.findall(r'\b\w+\b', pfx_text)
    num_words = len(words)
    num_syllables = sum(count_syllables(word) for word in words)

    if num_sentences == 0 or num_words == 0:
        return {"error": "Input must contain at least one sentence and one word."}

    # Flesch Reading Ease formula.
    fres = 206.835 - 1.015 * (num_words / num_sentences) - 84.6 * (num_syllables / num_words)

    # Map the score to a reading-level band (higher score = easier text).
    if fres >= 90:
        grade_level = "5th grade"
    elif fres >= 80:
        grade_level = "6th grade"
    elif fres >= 70:
        grade_level = "7th grade"
    elif fres >= 60:
        grade_level = "8th–9th grade"
    elif fres >= 50:
        grade_level = "10th–12th grade"
    elif fres >= 30:
        grade_level = "College"
    elif fres >= 10:
        grade_level = "College graduate"
    else:
        grade_level = "Professional"

    return {
        "FRES": round(fres, 2),
        "Reading_Level": grade_level
    }

In [20]:
# Structured-output schema for the writer agent (used as `response_format` in writer_config).
# All three fields are required. Documentation is kept in `#` comments because a class
# docstring would be folded into the generated JSON schema.
class WriterOutput(BaseModel):
    # Name of the incidental finding being explained.
    finding: str = Field(..., description="Name of incidental finding")
    # ICD-10 code the writer associates with the finding.
    ICD10_Code: str = Field(..., description="The ICD-10 code for the incidental finding")
    # The patient-friendly explanation text itself.
    PFx: str = Field(..., description="Patient-friendly explanation of the finding")

# Structured-output schema for the ICD-10 labeler agent (used as `response_format` in
# labeler_config). It repeats the writer's three fields and adds the labeler's own code;
# the generation loop reads "PFx" and "PFx_ICD10_Code" from this payload.
class LabelerOutput(BaseModel):
    # Name of the incidental finding.
    finding: str = Field(..., description="Name of incidental finding")
    # ICD-10 code as reported by the writer.
    ICD10_Code: str = Field(..., description="The ICD-10 code given by writer")
    # Explanation text as produced by the writer.
    PFx: str = Field(..., description="The patient-friendly explanation given by writer")
    # ICD-10 code the labeler derives from the explanation text.
    PFx_ICD10_Code: str = Field(..., description="The ICD-10 code you determine based off of PFx")

# Structured-output schema shared by the doctor and readability-checker agents (both
# doctor_config and readability_config use it as `response_format`).
class DoctorReadabilityOutput(BaseModel):
    # Required overall judgment of the explanation.
    Verdict: str = Field(..., description="Overall judgment about the explanation, accurate or inaccurate")
    # Optional justification for the verdict (defaults to None).
    Explanation: Optional[str] = Field(None, description="Why the verdict was given")
    # Optional suggestions for the writer (defaults to None).
    Improvements: Optional[str] = Field(None, description="Suggested changes to improve readability or accuracy")


In [76]:
# Connection settings common to every agent configuration in this cell.
_shared_llm_kwargs = {
    "api_type": "openai",
    "model": "gpt-4o",
    "api_key": OPENAI_API_KEY,
}

# Default configuration without structured output; used as the enclosing context
# when the agents are created.
llm_config = LLMConfig(**_shared_llm_kwargs)

# Per-agent configurations: same connection settings, each with its own response schema.
writer_config = LLMConfig(**_shared_llm_kwargs, response_format=WriterOutput)
labeler_config = LLMConfig(**_shared_llm_kwargs, response_format=LabelerOutput)
doctor_config = LLMConfig(**_shared_llm_kwargs, response_format=DoctorReadabilityOutput)
readability_config = LLMConfig(**_shared_llm_kwargs, response_format=DoctorReadabilityOutput)

In [None]:
# Generate one patient-friendly explanation (PFx) per evaluation row with a four-agent
# chat: writer -> icd10_labeler -> Doctor -> Readability_Checker. The agents are rebuilt
# for every row because the writer and doctor prompts embed that row's finding and code.
results = pd.DataFrame(columns=["finding", "ICD10_code", "PFx", "PFx_ICD10_Code"])
for i, row in df_eval.iterrows():
    with llm_config:
        writer = ConversableAgent(
            name = "writer",
            system_message = writer_prompt.format(Incidental_Finding = row['Incidental_Finding'], Reading_Level = SIXTH_GRADE),
            llm_config = writer_config,
        )

        icd10_labeler = ConversableAgent(
            name = "icd10_labeler",
            system_message = ICD10_LABELER_INSTRUCTION,
            llm_config = labeler_config,
            code_execution_config=False,
        )

        doctor = ConversableAgent(
            name = "Doctor",
            system_message = doctor_prompt.format(Incidental_Finding = row['Incidental_Finding'], ICD10_code = row["ICD10_code"]),
            llm_config = doctor_config,
            code_execution_config=False,
        )

        readability_checker = ConversableAgent(
            name = "Readability_Checker",
            system_message = readability_checker_prompt.format(reading_level = SIXTH_GRADE),
            llm_config = readability_config,
            code_execution_config=False,
            functions=[calculate_fres],
        )

        pattern = RoundRobinPattern(
            initial_agent = writer,
            agents = [writer, icd10_labeler, doctor, readability_checker],
        )

        # Fixed hand-offs: writer -> labeler -> doctor.
        writer.handoffs.set_after_work(AgentTarget(icd10_labeler))

        icd10_labeler.handoffs.set_after_work(AgentTarget(doctor))

        # Conditional hand-offs: the doctor forwards to the readability checker or
        # sends the draft back to the writer.
        doctor.handoffs.add_llm_conditions([
            OnCondition(
                target=AgentTarget(readability_checker),
                condition=StringLLMCondition(prompt="If the response is medically accurate, send the response to the readability_checker."),
            ),
            OnCondition(
                target=AgentTarget(writer),
                condition=StringLLMCondition(prompt="""If the response is medically inaccuare or the original and pfx_icd10_codes are signifigantly different, 
                send the response back to the writer agent with an explanation of why it was sent back and suggestions for improvement in medical accuracy."""),
            ),
        ])

        # The readability checker either sends the draft back to the writer or ends the chat.
        readability_checker.handoffs.add_llm_conditions([
            OnCondition(
                target=AgentTarget(writer),
                condition=StringLLMCondition("""If the response does not meet the criteria for the desired reading level, send it back to the writer agent
                with an explanation of why it wasn't readable enough and suggestions for improving the readability."""),
            ),
            OnCondition(
                target=TerminateTarget(),
                condition=StringLLMCondition("If the response meets the readability criteria, send it to TerminateTarget."),
            ),
        ])

        result, context, last_agent = initiate_group_chat(
            pattern = pattern,
            messages = """Please play your specified role in generating a patient friendly explanation of an inicidental MRI finding.""",
            max_rounds = 20,
        )

        # extract_json_gpt4o returns None when the labeler produced no parseable JSON.
        # Record an empty row instead of crashing, so one bad chat does not discard the
        # results already collected for the other findings.
        chat = extract_json_gpt4o(result)
        if not isinstance(chat, dict):
            print(f"[WARN] No labeler JSON for finding {row['Incidental_Finding']!r}; recording empty PFx.")
            chat = {}

        results.loc[i] = {
            "finding": row['Incidental_Finding'],
            "ICD10_code": row['ICD10_code'],
            "PFx": chat.get("PFx", ""),
            "PFx_ICD10_Code": chat.get("PFx_ICD10_Code", "")
        }
        

[33m_User[0m (to chat_manager):

Please play your specified role in generating a patient friendly explanation of an inicidental MRI finding.

--------------------------------------------------------------------------------
[32m
Next speaker: writer
[0m
[31m
>>>>>>>> USING AUTO REPLY...[0m
[33mwriter[0m (to chat_manager):

{"finding":"Pharyngeal Wall Mass","ICD10_Code":"C10.9","PFx":"The doctors found something a little unexpected when they were looking at your test results. They noticed a mass, or a lump, on the wall of your throat. This area is known as the pharyngeal wall, which is just a fancy term for the part of your throat that helps you swallow and speak. \n\nIt’s like if you were exploring a cave, and you found a small bump on the wall that you hadn’t noticed before. It might be nothing, but it’s a good idea to check it out more closely to see what it might be."}

--------------------------------------------------------------------------------
[32m
Next speaker: icd10_

In [78]:
# Display the generated explanations with the original and labeler-assigned codes.
results

Unnamed: 0,finding,ICD10_code,PFx,PFx_ICD10_Code
0,Orbital Cellulitis,H05.01,Orbital cellulitis is when the area around you...,H05.01
1,Orbital Fracture,S02.82,An orbital fracture means that a bone around t...,S02.82
2,Tonsillar Asymmetry,J35.8,Tonsillar asymmetry means one of your tonsils ...,J35.8
3,Tonsillar Hypertrophy,J35.1,Tonsillar hypertrophy means your tonsils are b...,J35.1


In [79]:
# Create a new list to store the labeled ICD10 responses
labeled_icd10_responses = []

# Iterate over each response in pfx_zeroshot_output_all_df and apply the label_icd10s function
for response in results['PFx']:
    labeled_icd10_responses.append(label_icd10s([{'PFx': response}]))

In [80]:
# Inspect the raw labeling responses (one single-item list per explanation).
labeled_icd10_responses

[[{'ICD10_code': 'H05.01'}],
 [{'ICD10_code': 'S02.3'}],
 [{'ICD10_code': 'J35.8'}],
 [{'ICD10_code': 'J35.1'}]]

In [81]:
# Create lists to store the results
agent_icd10_codes = []
icd10_matches = []
pfx_icd10_matches = []
flesch_scores = []

agent_icd10_codes.extend([ list(x[0].values())[0] if x and isinstance(x, list) and len(x) > 0 else "" for x in labeled_icd10_responses])  

for index, row in results.iterrows():
    agent_icd10_code = agent_icd10_codes[index]
    icd10_match = (row["ICD10_code"] == agent_icd10_code)
    icd10_matches.append(icd10_match)

    # compare 
    pfx_icd10_match = (row["PFx_ICD10_Code"] == row["ICD10_code"])
    pfx_icd10_matches.append(pfx_icd10_match)

    # Calculate the Flesch Reading Ease score
    flesch_score = textstat.flesch_reading_ease(row['PFx'])
    flesch_scores.append(flesch_score)

# Add the results to the DataFrame
results['_0_agent_icd10_codes'] = agent_icd10_codes
results['_0_icd10_matches'] = icd10_matches
results['_0_pfx_icd10_matches'] = pfx_icd10_matches
results['_0_flesch'] = flesch_scores

In [82]:
# Score each explanation on ICD-10 accuracy and on distance from the target reading ease.
desired_reading_ease = sixth_grade
# Calculate threshold for penalty: a narrower tolerance (10) for targets at or above 55,
# a wider one (20) for harder target levels.
if desired_reading_ease >= 55:
    threshold = 10
else:
    threshold = 20

# Create lists to store the results
accuracy_icd10_matches_list = []
accuracy_pfx_matches_list = []
readability_difference_list = []
overall_score_list = []
log_overall_score_list = []

# Iterate over each row in the DataFrame
for index, row in results.iterrows():
    # Calculate accuracy scores
    accuracy_icd10_matches = row["_0_icd10_matches"]
    accuracy_pfx_matches = row["_0_pfx_icd10_matches"]
    flesch_score = row["_0_flesch"]

    # total number of icd10 matches (0, 1 or 2)
    total_icd10_matches = accuracy_icd10_matches + accuracy_pfx_matches

    # Absolute distance between the measured and the desired reading ease
    readability_score = flesch_score
    readability_difference = abs(readability_score - desired_reading_ease)

    # Compute the overall score: 0.8 per code match plus a readability term that is
    # 0.2 at zero distance and shrinks as the distance grows.
    overall_score = total_icd10_matches * 0.8 + 0.2 * (1 / (readability_difference + 1))

    # Signed difference: positive when the text is harder to read than the target
    readability_difference_log = desired_reading_ease - flesch_score
    if readability_difference_log <= threshold:  # No penalty if difference is within the threshold
        readability_difference_p = 0
    else:  # Apply penalty only if readability exceeds the threshold
        readability_difference_with_threshold = readability_difference_log - threshold
        readability_difference_p = math.log(1 + readability_difference_with_threshold) / math.log(20)

    # NOTE(review): `readability_difference_p` is computed above but never used; the
    # score below adds the raw signed difference instead, which raises the score for
    # text that is harder than the target. The intended formula cannot be determined
    # from this notebook, so the computation is left unchanged - please confirm.
    log_overall_score = total_icd10_matches * 0.8 + readability_difference_log * 0.2

    # Append results to lists
    accuracy_icd10_matches_list.append(float(accuracy_icd10_matches))
    accuracy_pfx_matches_list.append(float(accuracy_pfx_matches))
    readability_difference_list.append(float(readability_difference))
    overall_score_list.append(float(overall_score))
    log_overall_score_list.append(float(log_overall_score))


# Create a DataFrame with the results, aligned row-for-row with `results`
grades_data = {
    "accuracy_agent_icd10": accuracy_icd10_matches_list,
    "accuracy_pfx_icd10": accuracy_pfx_matches_list,
    "readability_difference": readability_difference_list,
    "overall_score": overall_score_list,
    "log_overall_score": log_overall_score_list,
}
grades = pd.DataFrame(grades_data, index=results.index)

# Drop grade columns left over from a previous run of this cell before attaching the
# new ones, so re-running the cell does not create duplicate columns.
results = pd.concat(
    [results.drop(columns=list(grades.columns), errors="ignore"), grades],
    axis=1,
)


In [83]:
# Display the per-explanation accuracy, readability and overall scores.
grades

Unnamed: 0,accuracy_agent_icd10,accuracy_pfx_icd10,readability_difference,overall_score,log_overall_score
0,1.0,1.0,9.76,1.618587,-0.352
1,0.0,1.0,8.03,0.822148,-0.806
2,1.0,1.0,0.28,1.75625,1.544
3,1.0,1.0,3.37,1.645767,2.274


In [84]:
# Display the full results table, now including the match, Flesch and score columns.
results

Unnamed: 0,finding,ICD10_code,PFx,PFx_ICD10_Code,_0_agent_icd10_codes,_0_icd10_matches,_0_pfx_icd10_matches,_0_flesch,accuracy_agent_icd10,accuracy_pfx_icd10,readability_difference,overall_score,log_overall_score
0,Orbital Cellulitis,H05.01,Orbital cellulitis is when the area around you...,H05.01,H05.01,True,True,94.76,1.0,1.0,9.76,1.618587,-0.352
1,Orbital Fracture,S02.82,An orbital fracture means that a bone around t...,S02.82,S02.3,False,True,93.03,0.0,1.0,8.03,0.822148,-0.806
2,Tonsillar Asymmetry,J35.8,Tonsillar asymmetry means one of your tonsils ...,J35.8,J35.8,True,True,85.28,1.0,1.0,0.28,1.75625,1.544
3,Tonsillar Hypertrophy,J35.1,Tonsillar hypertrophy means your tonsils are b...,J35.1,J35.1,True,True,81.63,1.0,1.0,3.37,1.645767,2.274


In [85]:
# Persist the scored results; index=False leaves the DataFrame index out of the file.
results.to_csv('autogen_missing_2.csv', index = False)