In [1]:
# import necessary libraries
import pandas as pd
import os
import textstat
from openai import OpenAI
import json
import re
import requests
from dotenv import load_dotenv

In [2]:
# import prompts 
from jh_pfx_prompts import example, icd10_example, baseline_zeroshot_prompt, single_fewshot_prompt, single_fewshot_icd10_labeling_prompt

In [3]:
os.environ['OPENAI_API_KEY'] = 'enter API key'

In [4]:
# api key
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
CLIENT = OpenAI(api_key = OPENAI_API_KEY)
OPENAI_MODEL = "gpt-4o"

In [5]:
#reading levels
PROFESSIONAL = "Professional"
COLLEGE_GRADUATE = "College Graduate"
COLLEGE = "College"
TENTH_TO_TWELTH_GRADE = "10th to 12th grade"
EIGTH_TO_NINTH_GRADE = "8th to 9th grade"
SEVENTH_GRADE = "7th grade"
SIXTH_GRADE = "6th grade"
FIFTH_GRADE = "5th grade"
N_A = "N/A"

In [6]:
# https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
def map_reading_level(flesch_reading_ease):
    if flesch_reading_ease < 10:
        return PROFESSIONAL
    elif 10.0 <= flesch_reading_ease < 30.0:
        return COLLEGE_GRADUATE
    elif 30.0 <= flesch_reading_ease < 50.0:
        return COLLEGE
    elif 50.0 <= flesch_reading_ease < 60.0:
        return TENTH_TO_TWELTH_GRADE
    elif 60.0 <= flesch_reading_ease < 70.0:
        return EIGTH_TO_NINTH_GRADE
    elif 70.0 <= flesch_reading_ease < 80.0:
        return SEVENTH_GRADE
    elif 80.0 <= flesch_reading_ease < 90.0:
        return SIXTH_GRADE
    elif 90.0 <= flesch_reading_ease < 100.0:
        return FIFTH_GRADE 
    else:
        return N_A

In [7]:
# reading ease variables
fifth_grade = 95
sixth_grade = 85
seventh_grade = 75
eigth_and_ninth_grade = 65
tenth_to_twelfth_grade = 55
college = 40
college_graduate = 20
professional = 5

In [8]:
def adjust_difference(diff, threshold):
    """Adjust the readability difference based on the threshold."""
    if diff > threshold:
        return diff - threshold
    return 0

In [9]:
# import fewshot examples
df_fewshot = pd.read_csv('pfx_fewshot_examples_college.csv')

In [10]:
# import evaluation data 
df_eval = pd.read_csv('pfx_evaluation_data.csv')

In [11]:
def extract_json(openai_response):
    if openai_response:  # Ensure the response is not None
        # Directly search for JSON within the string response
        json_match = re.search(r'```.*?(\{.*?\}).*?```', openai_response, re.DOTALL)
        if json_match:
            json_str = json_match.group(1)  # Extract JSON-like content
            try:
                # Convert extracted string to a JSON object
                json_object = json.loads(json_str.replace('\n', ''))
                return json_object
            except json.JSONDecodeError as e:
                # Handle JSON decoding errors
                print("JSON decoding failed: ", e)
                return {}
        else:
            print("No JSON object found in the response.")
            return {}
    else:
        return None

In [12]:
def label_icd10s(pfx_outputs_json):
    labels = []
    for response in pfx_outputs_json:
        try:
            # Directly get the ICD10_code from the dictionary
            labels.append(response.get("ICD10_code", "Unknown"))
        except Exception as e:
            print(f"Error processing response: {e}")
            labels.append("Error")
    return labels

In [13]:
load_dotenv()

True

In [127]:
pfx_fewshot_examples = ""
for i, row in df_fewshot.iterrows():
    pfx_fewshot_examples += example.format(**row)

pfx_fewshot_outputs = []

for run in range(5):
    for i, row in df_eval.iloc[2:3].iterrows():
        prompt = single_fewshot_prompt.format(
            Examples=pfx_fewshot_examples,
            Incidental_Finding=row["Incidental_Finding"],
            Reading_Level=SIXTH_GRADE
        )
        pfx_response = CLIENT.chat.completions.create(
            model=OPENAI_MODEL,
            temperature=0.0,
            messages=[
                {"role": "system", "content": "You are a medical doctor rephrasing and explaining medical terminology to a patient in an understandable manner."},
                {"role": "user", "content": prompt}
            ],
            stream=False,
        )
        # Corrected append
        pfx_fewshot_outputs.append(pfx_response.choices[0].message.content)



In [128]:
pfx_fewshot_outputs_json = list(map(extract_json, pfx_fewshot_outputs))

In [129]:
pfx_fewshot_outputs_json

[{'finding': 'Pituitary microadenoma',
  'ICD10_code': 'D35.2',
  'PFx': 'A pituitary microadenoma is a small, non-cancerous growth in the pituitary gland, which is a tiny organ at the base of the brain. These growths are often found by chance during imaging tests for other reasons. Most of the time, they do not cause any symptoms and do not need treatment. However, in some cases, they can affect hormone levels, which might lead to symptoms like headaches or vision changes. If you experience any symptoms, your doctor might suggest further tests to check hormone levels and discuss possible treatment options.',
  'PFx_ICD10_code': 'D35.2'},
 {'finding': 'Pituitary microadenoma',
  'ICD10_code': 'D35.2',
  'PFx': 'A pituitary microadenoma is a small, non-cancerous growth in the pituitary gland, which is a tiny organ at the base of your brain. This gland makes hormones that help control many important functions in your body. Most of the time, these small growths do not cause any symptoms a

In [130]:
pfx_fewshot_output_df = pd.DataFrame(pfx_fewshot_outputs_json)

In [131]:
pfx_fewshot_output_df

Unnamed: 0,finding,ICD10_code,PFx,PFx_ICD10_code
0,Pituitary microadenoma,D35.2,"A pituitary microadenoma is a small, non-cance...",D35.2
1,Pituitary microadenoma,D35.2,"A pituitary microadenoma is a small, non-cance...",D35.2
2,Pituitary microadenoma,D35.2,"A pituitary microadenoma is a small, non-cance...",D35.2
3,Pituitary microadenoma,D35.2,"A pituitary microadenoma is a small, non-cance...",D35.2
4,Pituitary microadenoma,D35.2,"A pituitary microadenoma is a small, non-cance...",D35.2


In [132]:
pfx_fewshot_outputs_icd10_labels = label_icd10s(pfx_fewshot_outputs_json)

In [133]:
pfx_fewshot_outputs_icd10_labels

['D35.2', 'D35.2', 'D35.2', 'D35.2', 'D35.2']

In [134]:
# Initialize variables
highest_grade = float('-inf')
best_response = None
grades_data = []  # List to temporarily store data for creating the DataFrame
desired_reading_ease = sixth_grade

for index, response in enumerate(pfx_fewshot_outputs_json):
    # Create a temporary DataFrame for grading the current response
    temp_df = pd.DataFrame([response])  # Wrap response in a list to simulate grading for one response
    
    # Extract ICD-10 codes for the current response
    icd10_codes = pfx_fewshot_outputs_icd10_labels[index]  # Directly assign the string value
    temp_df["_0_icd10_codes"] = icd10_codes  # Assign single value to the single row
    
    # Apply grading logic for ICD-10 matches
    temp_df["_0_icd10_matches"] = temp_df.ICD10_code == temp_df["_0_icd10_codes"]
    temp_df["_0_pfx_icd10_matches"] = temp_df.ICD10_code == temp_df["PFx_ICD10_code"]
    temp_df["_0_flesch"] = temp_df["PFx"].apply(textstat.flesch_reading_ease)
    
    # Calculate accuracy scores
    accuracy_icd10_matches = sum(temp_df["_0_icd10_matches"]) / len(temp_df.index)
    accuracy_pfx_matches = sum(temp_df["_0_pfx_icd10_matches"]) / len(temp_df.index)
    
    readability_score = temp_df["_0_flesch"]
    readability_difference = abs(readability_score - desired_reading_ease)
    
    # Calculate readability difference
    if desired_reading_ease >= 55:
        # If bigger than 5, subtract 5; otherwise set to 0
        readability_difference = readability_difference.apply(lambda x: x - 5 if x > 5 else 0)
        readability_difference_p = readability_difference / 10
    elif desired_reading_ease >= 20:
        # If bigger than 10, subtract 10; otherwise set to 0
        readability_difference = readability_difference.apply(lambda x: x - 10 if x > 10 else 0)
        readability_difference_p = readability_difference / 20

    readability_difference_p_val = readability_difference_p.iloc[0]
        
    # Compute the overall score using adjusted weights
    overall_score =  (accuracy_icd10_matches * 0.35) + (accuracy_pfx_matches * 0.35) - (readability_difference_p_val * 0.3)
    
    # Store the grades for later DataFrame creation
    grades_data.append({
        "response_index": index,
        "accuracy_agent_icd10": accuracy_icd10_matches,
        "accuracy_pfx_icd10": accuracy_pfx_matches,
        "readability_score": readability_score,
        "readability_difference": readability_difference,
        "overall_score": overall_score
    })
    
    # Check if this overall_score is the highest
    if overall_score > highest_grade:
        highest_grade = overall_score
        best_response = response

# Create the grades DataFrame from the collected data
grades = pd.DataFrame(grades_data)


In [135]:
grades

Unnamed: 0,response_index,accuracy_agent_icd10,accuracy_pfx_icd10,readability_score,readability_difference,overall_score
0,0,1.0,1.0,"0 70.13 Name: _0_flesch, dtype: float64","0 9.87 Name: _0_flesch, dtype: float64",0.4039
1,1,1.0,1.0,"0 68.5 Name: _0_flesch, dtype: float64","0 11.5 Name: _0_flesch, dtype: float64",0.355
2,2,1.0,1.0,"0 70.73 Name: _0_flesch, dtype: float64","0 9.27 Name: _0_flesch, dtype: float64",0.4219
3,3,1.0,1.0,"0 70.13 Name: _0_flesch, dtype: float64","0 9.87 Name: _0_flesch, dtype: float64",0.4039
4,4,1.0,1.0,"0 70.73 Name: _0_flesch, dtype: float64","0 9.27 Name: _0_flesch, dtype: float64",0.4219
