In [8]:
# import necessary libraries
import pandas as pd
import os
import textstat
from openai import OpenAI
import json
import re
import requests
from dotenv import load_dotenv
import math

In [9]:
# import prompts 
from jh_pfx_prompts import example, icd10_example, baseline_zeroshot_prompt, single_fewshot_prompt, single_fewshot_icd10_labeling_prompt

In [10]:
os.environ['OPENAI_API_KEY'] = ''

In [11]:
# api key
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
CLIENT = OpenAI(api_key = OPENAI_API_KEY)
OPENAI_MODEL = "gpt-4o"

In [12]:
#reading levels
PROFESSIONAL = "Professional"
COLLEGE_GRADUATE = "College Graduate"
COLLEGE = "College"
TENTH_TO_TWELTH_GRADE = "10th to 12th grade"
EIGTH_TO_NINTH_GRADE = "8th to 9th grade"
SEVENTH_GRADE = "7th grade"
SIXTH_GRADE = "6th grade"
FIFTH_GRADE = "5th grade"
N_A = "N/A"

In [13]:
# https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
def map_reading_level(flesch_reading_ease):
    if flesch_reading_ease < 10:
        return PROFESSIONAL
    elif 10.0 <= flesch_reading_ease < 30.0:
        return COLLEGE_GRADUATE
    elif 30.0 <= flesch_reading_ease < 50.0:
        return COLLEGE
    elif 50.0 <= flesch_reading_ease < 60.0:
        return TENTH_TO_TWELTH_GRADE
    elif 60.0 <= flesch_reading_ease < 70.0:
        return EIGTH_TO_NINTH_GRADE
    elif 70.0 <= flesch_reading_ease < 80.0:
        return SEVENTH_GRADE
    elif 80.0 <= flesch_reading_ease < 90.0:
        return SIXTH_GRADE
    elif 90.0 <= flesch_reading_ease < 100.0:
        return FIFTH_GRADE 
    else:
        return N_A

In [14]:
# reading ease variables
fifth_grade = 95
sixth_grade = 85
seventh_grade = 75
eigth_and_ninth_grade = 65
tenth_to_twelfth_grade = 55
college = 40
college_graduate = 20
professional = 5

In [15]:
def adjust_difference(diff, threshold):
    """Adjust the readability difference based on the threshold."""
    if diff > threshold:
        return diff - threshold
    return 0

In [16]:
# import fewshot examples
df_fewshot = pd.read_csv('pfx_fewshot_examples_college.csv')

In [17]:
# import evaluation data 
df_eval = pd.read_csv('pfx_evaluation_data.csv')

In [18]:
def extract_json(openai_response):
    if openai_response:  # Ensure the response is not None
        # Directly search for JSON within the string response
        json_match = re.search(r'```.*?(\{.*?\}).*?```', openai_response, re.DOTALL)
        if json_match:
            json_str = json_match.group(1)  # Extract JSON-like content
            try:
                # Convert extracted string to a JSON object
                json_object = json.loads(json_str.replace('\n', ''))
                return json_object
            except json.JSONDecodeError as e:
                # Handle JSON decoding errors
                print("JSON decoding failed: ", e)
                return {}
        else:
            print("No JSON object found in the response.")
            return {}
    else:
        return None

In [19]:
def label_icd10s(pfx_outputs_json):
    labels = []
    for response in pfx_outputs_json:
        try:
            # Directly get the ICD10_code from the dictionary
            labels.append(response.get("ICD10_code", "Unknown"))
        except Exception as e:
            print(f"Error processing response: {e}")
            labels.append("Error")
    return labels

In [20]:
load_dotenv()

True

In [21]:
pfx_fewshot_examples = ""
for i, row in df_fewshot.iterrows():
    pfx_fewshot_examples += example.format(**row)

pfx_fewshot_outputs = []

for run in range(5):
    for i, row in df_eval.iloc[24:25].iterrows():
        prompt = single_fewshot_prompt.format(
            Examples=pfx_fewshot_examples,
            Incidental_Finding=row["Incidental_Finding"],
            Reading_Level=SIXTH_GRADE
        )
        pfx_response = CLIENT.chat.completions.create(
            model=OPENAI_MODEL,
            temperature=0.0,
            messages=[
                {"role": "system", "content": "You are a medical doctor rephrasing and explaining medical terminology to a patient in an understandable manner."},
                {"role": "user", "content": prompt}
            ],
            stream=False,
        )
        # Corrected append
        pfx_fewshot_outputs.append(pfx_response.choices[0].message.content)



In [22]:
pfx_fewshot_outputs_json = list(map(extract_json, pfx_fewshot_outputs))

In [23]:
pfx_fewshot_outputs_json

[{'finding': 'Bladder diverticulum',
  'ICD10_code': 'N32.3',
  'PFx': "A bladder diverticulum is a small pouch that forms in the bladder wall. It is often found by accident during imaging tests for other reasons. Most of the time, these pouches do not cause any problems and do not need treatment. However, if a diverticulum becomes large or causes symptoms like frequent urinary tract infections or difficulty emptying the bladder, it might need to be checked further. In some cases, a doctor might suggest treatment to prevent complications. It's important to keep an eye on any symptoms and discuss them with your healthcare provider.",
  'PFx_ICD10_code': 'N32.3'},
 {'finding': 'Bladder diverticulum',
  'ICD10_code': 'N32.3',
  'PFx': "A bladder diverticulum is a small pouch that forms in the bladder wall. It is often found by accident during imaging tests for other reasons. Most of the time, these pouches do not cause any problems and do not need treatment. However, if a diverticulum bec

In [32]:
pfx_fewshot_output_df = pd.DataFrame(pfx_fewshot_outputs_json)

In [33]:
pfx_fewshot_output_df

Unnamed: 0,finding,ICD10_code,PFx,PFx_ICD10_code
0,Bladder diverticulum,N32.3,A bladder diverticulum is a small pouch that f...,N32.3
1,Bladder diverticulum,N32.3,A bladder diverticulum is a small pouch that f...,N32.3
2,Bladder diverticulum,N32.3,A bladder diverticulum is a small pouch that f...,N32.3
3,Bladder diverticulum,N32.3,A bladder diverticulum is a small pouch that f...,N32.3
4,Bladder diverticulum,N32.3,A bladder diverticulum is a small pouch that f...,N32.3


In [34]:
pfx_fewshot_outputs_icd10_labels = label_icd10s(pfx_fewshot_outputs_json)

In [35]:
pfx_fewshot_outputs_icd10_labels

['N32.3', 'N32.3', 'N32.3', 'N32.3', 'N32.3']

In [36]:
# Initialize variables
highest_grade = float('-inf')
best_response = None
grades_data = []  # List to temporarily store data for creating the DataFrame
desired_reading_ease = sixth_grade

for index, response in enumerate(pfx_fewshot_output_df):
    # Create a temporary DataFrame for grading the current response
    temp_df = pd.DataFrame([{
        "PFx": response,  # Add PFx response
        "ICD10_code": pfx_fewshot_output_df.loc[index, "ICD10_code"],  # Ensure ICD10_code is included
        "PFx_ICD10_code": pfx_fewshot_output_df.loc[index, "PFx_ICD10_code"]  # Include PFx_ICD10_code if available
    }])
    
    # Extract ICD-10 codes for the current response
    icd10_codes = pfx_fewshot_outputs_icd10_labels[index]  # Directly assign the string value
    temp_df["_0_icd10_codes"] = icd10_codes  # Assign single value to the single row
    
    # Apply grading logic for ICD-10 matches
    temp_df["_0_icd10_matches"] = temp_df["ICD10_code"] == temp_df["_0_icd10_codes"]
    temp_df["_0_pfx_icd10_matches"] = temp_df["ICD10_code"] == temp_df["PFx_ICD10_code"]
    temp_df["_0_flesch"] = temp_df["PFx"].apply(textstat.flesch_reading_ease)
    
    # Calculate accuracy scores
    accuracy_icd10_matches = temp_df["_0_icd10_matches"].mean()
    accuracy_pfx_matches = temp_df["_0_pfx_icd10_matches"].mean()

    flesch_score = temp_df["_0_flesch"].iloc[0]
    
    # Total number of ICD10 matches
    total_icd10_matches = accuracy_icd10_matches + accuracy_pfx_matches

    # Calculate readability score
    readability_difference = abs(flesch_score - desired_reading_ease)

    # Compute the overall score
    overall_score = total_icd10_matches * 0.8  + 0.2 * (1/(readability_difference + 1))

    # Store the grades for later DataFrame creation
    grades_data.append({
        "response_index": index,
        "accuracy_agent_icd10": accuracy_icd10_matches,
        "accuracy_pfx_icd10": accuracy_pfx_matches,
        "readability_score": flesch_score,
        "readability_difference": readability_difference,
        "overall_score": overall_score
    })
    
    # Check if this overall_score is the highest
    if overall_score > highest_grade:
        highest_grade = overall_score
        best_response = response

# Create the grades DataFrame from the collected data
grades = pd.DataFrame(grades_data)



In [38]:
grades

Unnamed: 0,response_index,accuracy_agent_icd10,accuracy_pfx_icd10,readability_score,readability_difference,overall_score
0,0,1.0,1.0,36.62,48.38,1.60405
1,1,1.0,1.0,36.62,48.38,1.60405
2,2,1.0,1.0,121.22,36.22,1.605373
3,3,1.0,1.0,36.62,48.38,1.60405


Head,Chiari I malformation,Q07.0,
Neck,Thyroid nodule,E04.1,
Neck,Cervical lymphadenopathy,R59.0,
Neck,Parotid gland cyst,K11.4,
Neck,Carotid artery stenosis,I65.2,
Neck,Cervical disc herniation,M50.20,
Chest,Pulmonary nodule,R91.1,
Chest,Mediastinal lymphadenopathy,R59.1,
Chest,Hiatal hernia,K44.9,
Chest,Coronary artery calcification,I25.10,
Chest,Pericardial effusion,I31.3,
Abdomen,Liver cyst,K76.89,
Abdomen,Renal cyst,N28.1,
Abdomen,Adrenal adenoma,D35.00,
Abdomen,Pancreatic cyst,K86.2,
Abdomen,Splenomegaly,R16.1,
Pelvis,Ovarian cyst,N83.20,
Pelvis,Uterine fibroid,D25.9,
Pelvis,Prostatic hypertrophy,N40.0,
Pelvis,Pelvic lymphadenopathy,R59.1,
Pelvis,Bladder diverticulum,N32.3,