In [None]:
# import necessary libraries
import pandas as pd
import os
import textstat
from openai import OpenAI
import openai
import json
import re
import requests
from dotenv import load_dotenv
import math
from IPython.display import FileLink
import concurrent.futures
import time
import logging


In [None]:
# import prompts 
from jh_pfx_prompts import example, icd10_example, baseline_zeroshot_prompt, single_fewshot_prompt, single_fewshot_icd10_labeling_prompt

In [None]:
# Do not hardcode the key in the notebook and do not clobber a key that is
# already exported in the environment: only fall back to the empty placeholder
# when OPENAI_API_KEY is not set at all.
os.environ.setdefault('OPENAI_API_KEY', '')

In [None]:
# api key
# Read the key from the environment and build the OpenAI client used by the
# sequential PFx-generation and ICD-10 labelling calls in this notebook.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
CLIENT = OpenAI(api_key = OPENAI_API_KEY)
# Model name passed to the chat-completion requests issued by this notebook.
OPENAI_MODEL = "gpt-4o"

In [None]:
# Reading-level labels returned by map_reading_level(); SIXTH_GRADE is also
# injected into the PFx prompts as the requested reading level.
# The identifier spellings (TWELTH, EIGTH) are referenced by other cells, so
# they are intentionally left unchanged here.
PROFESSIONAL = "Professional"
COLLEGE_GRADUATE = "College Graduate"
COLLEGE = "College"
TENTH_TO_TWELTH_GRADE = "10th to 12th grade"
EIGTH_TO_NINTH_GRADE = "8th to 9th grade"
SEVENTH_GRADE = "7th grade"
SIXTH_GRADE = "6th grade"
FIFTH_GRADE = "5th grade"
# Fallback label for scores that cannot be mapped to a band.
N_A = "N/A"

In [None]:
# https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
def map_reading_level(flesch_reading_ease):
    """Map a Flesch Reading Ease score to its reading-level label.

    Args:
        flesch_reading_ease: Numeric Flesch Reading Ease score. Higher means
            easier text; the formula can produce values below 0 and above 100.

    Returns:
        One of the reading-level label constants. Scores below 10 (including
        negative ones) map to PROFESSIONAL, scores of 90 and above (including
        exactly 100 and anything higher) map to FIFTH_GRADE, and NaN maps to
        N_A.
    """
    if flesch_reading_ease < 10:
        return PROFESSIONAL

    # (exclusive upper bound, label) for the bands between 10 and 90.
    bands = (
        (30.0, COLLEGE_GRADUATE),
        (50.0, COLLEGE),
        (60.0, TENTH_TO_TWELTH_GRADE),
        (70.0, EIGTH_TO_NINTH_GRADE),
        (80.0, SEVENTH_GRADE),
        (90.0, SIXTH_GRADE),
    )
    for upper_bound, label in bands:
        if flesch_reading_ease < upper_bound:
            return label

    # The easiest band has no upper limit: very simple text can score 100 or
    # more, and it is still "5th grade" rather than "N/A".
    if flesch_reading_ease >= 90.0:
        return FIFTH_GRADE

    # Only reachable when every comparison is False, i.e. the score is NaN.
    return N_A

In [None]:
# reading ease variables
# Target Flesch Reading Ease score per reading level. Each value falls inside
# the matching band of map_reading_level() (e.g. 85 lies in the 80-90
# "6th grade" band); sixth_grade is used as the desired score when grading.
fifth_grade = 95
sixth_grade = 85
seventh_grade = 75
eigth_and_ninth_grade = 65
tenth_to_twelfth_grade = 55
college = 40
college_graduate = 20
professional = 5

In [None]:
def adjust_difference(diff, threshold):
    """Return how far `diff` exceeds `threshold`, or 0 when it does not exceed it."""
    return diff - threshold if diff > threshold else 0

In [None]:
# Load the few-shot examples; each row is later expanded into the `example`
# and `icd10_example` prompt templates via str.format(**row), so the CSV
# columns must match the placeholders used by those templates.
df_fewshot = pd.read_csv('pfx_fewshot_examples_college.csv')

In [None]:
# Load the evaluation data. Later cells read the "Incidental_Finding" column
# from every row and, when present, the "ICD-10 Code" column.
df_eval = pd.read_csv('missingmf.csv')

In [None]:
# Show the available columns of the evaluation data as a quick sanity check.
print(df_eval.columns)

In [None]:
def extract_json(openai_response):
    """Extract the first JSON object contained in a model response.

    Text inside ``` fences is tried first (in order of appearance), then the
    whole response, so both fenced and bare JSON answers are accepted. Within
    each candidate, parsing starts at the first '{' and consumes one complete
    JSON object, which keeps nested objects intact and ignores trailing text.

    Args:
        openai_response: Raw text returned by the model, or None/empty.

    Returns:
        The decoded dict; {} when no JSON object could be found or decoded;
        None when the response itself is None or empty.
    """
    if not openai_response:  # Ensure the response is not None/empty
        return None

    fenced_segments = re.findall(r'```(.*?)```', openai_response, re.DOTALL)
    candidates = fenced_segments + [openai_response]

    decoder = json.JSONDecoder()
    last_error = None
    for candidate in candidates:
        # Newlines are stripped (as before) so that raw line breaks inside
        # string values do not make the payload invalid JSON.
        flattened = candidate.replace('\n', '')
        start = flattened.find('{')
        if start == -1:
            continue
        try:
            # raw_decode parses exactly one JSON value starting at `start`.
            json_object, _ = decoder.raw_decode(flattened, start)
            return json_object
        except json.JSONDecodeError as e:
            last_error = e

    if last_error is not None:
        # Handle JSON decoding errors
        print("JSON decoding failed: ", last_error)
    else:
        print("No JSON object found in the response.")
    return {}

In [None]:
def label_icd10s(pfx_output):
    """
    Takes a single PFx response (string or JSON) and returns
    a labeled ICD-10 result as a Python dictionary (or object).

    The few-shot examples are rebuilt from df_fewshot, inserted into
    single_fewshot_icd10_labeling_prompt together with `pfx_output`, and sent
    to the chat model. The reply is parsed with extract_json(), so the return
    value is whatever extract_json() yields for the model's message content.
    """

    # Build up the few-shot examples for ICD-10 labeling
    pfx_icd10_fewshot_examples = "".join(
        icd10_example.format(**row) for _, row in df_fewshot.iterrows()
    )

    # Generate the prompt for ICD-10 labeling
    # (Adjust the '{PFx}' if pfx_output is a dictionary with a specific key you need)
    prompt = single_fewshot_icd10_labeling_prompt.format(
        examples=pfx_icd10_fewshot_examples,
        PFx=pfx_output  # or PFx=pfx_output['key'] if needed
    )

    # Call the model to get ICD-10 codes
    pfx_icd10_response = CLIENT.chat.completions.create(
        model=OPENAI_MODEL,
        temperature=0.0,
        messages=[
            {
                "role": "system",
                "content": "You are an ICD10 medical coder for incidental findings. Always respond with a valid JSON object containing the ICD-10 code and its explanation."
            },
            {
                # The task itself is a user turn, matching the system + user
                # layout of the PFx-generation requests in this notebook
                # (it was previously sent as a second system message).
                "role": "user",
                "content": prompt
            }
        ],
        stream=False,
    )

    # Extract the JSON structure (or dictionary) from the LLM response
    return extract_json(pfx_icd10_response.choices[0].message.content)


In [None]:
# Initialize the results DataFrame
results_df = pd.DataFrame(columns=["finding", "ICD10_code", "PFx", "PFx_ICD10_code"])

# Generate few-shot examples
pfx_fewshot_examples = "".join(example.format(**row) for _, row in df_fewshot.iterrows())

# Generate PFx for each row in df_eval with 5 runs.
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: concatenating inside the loop copies the whole frame on every iteration.
pfx_records = []
for i, row in df_eval.iterrows():
    for run in range(5):  # Perform 5 runs
        # Format the prompt
        prompt = single_fewshot_prompt.format(
            Examples=pfx_fewshot_examples,
            Incidental_Finding=row["Incidental_Finding"],
            Reading_Level=SIXTH_GRADE
        )

        # Generate response from the client
        pfx_response = CLIENT.chat.completions.create(
            model=OPENAI_MODEL,
            temperature=0.0,
            messages=[
                {"role": "system", "content": "You are a medical doctor rephrasing and explaining medical terminology to a patient in an understandable manner."},
                {"role": "user", "content": prompt}
            ],
            stream=False,
        )

        # Extract JSON from the response
        extracted_response = extract_json(pfx_response.choices[0].message.content)

        # extract_json() returns None for an empty reply and {} when no JSON
        # could be parsed; record a placeholder instead of failing on the
        # missing "PFx" key and losing every row generated so far.
        if not extracted_response or "PFx" not in extracted_response:
            extracted_response = {"PFx": "ERROR: JSON parsing failed", "PFx_ICD10_code": "ERROR"}

        pfx_records.append({
            "finding": row["Incidental_Finding"],
            "ICD10_code": row.get("ICD-10 Code", None),  # Handle missing 'ICD10_code'
            "PFx": extracted_response["PFx"],  # Extracted explanation
            "PFx_ICD10_code": extracted_response.get("PFx_ICD10_code", None)  # Optional field
        })

# Build the results DataFrame in one step
results_df = pd.DataFrame(pfx_records, columns=["finding", "ICD10_code", "PFx", "PFx_ICD10_code"])

In [None]:
import pandas as pd
import json
import time
import openai
import concurrent.futures
import logging
from collections import defaultdict

# Configure logging
# Records at INFO level and above are written to openai_api.log; the functions
# and loops in this cell report retries and failures through this logger.
logging.basicConfig(filename="openai_api.log", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Initialize OpenAI Client
# NOTE(review): this is a second client object, separate from CLIENT, and it is
# built without an explicit api_key -- presumably the SDK falls back to the
# OPENAI_API_KEY environment variable; confirm.
client = openai.OpenAI()

# Initialize the results DataFrame
# (replaced at the end of this cell by a frame built from the collected rows)
results_df = pd.DataFrame(columns=["finding", "ICD10_code", "PFx", "PFx_ICD10_code"])

# Generate few-shot examples
# One formatted `example` block per row of df_fewshot, joined with newlines.
pfx_fewshot_examples = "\n".join(example.format(**row.to_dict()) for _, row in df_fewshot.iterrows())

# Function to generate a request
def create_task(row_idx, run):
    """Describe one chat-completion request for a (row, run) pair.

    Args:
        row_idx: Positional index of the finding in df_eval.
        run: Repetition number, only used to build the task id.

    Returns:
        A dict holding the task id, the model settings, the system and user
        messages, and the originating row position.
    """
    finding = df_eval.iloc[row_idx]["Incidental_Finding"]
    user_prompt = single_fewshot_prompt.format(
        Examples=pfx_fewshot_examples,
        Incidental_Finding=finding,
        Reading_Level=SIXTH_GRADE,
    )

    system_message = {
        "role": "system",
        "content": "You are a medical professional rephrasing and explaining medical terminology to a patient in an understandable manner.",
    }
    user_message = {"role": "user", "content": user_prompt}

    return dict(
        custom_id=f"task-{row_idx}-run-{run}",
        model=OPENAI_MODEL,
        temperature=0.0,
        messages=[system_message, user_message],
        row_idx=row_idx,
    )

# Prepare batch requests (5 per row), grouped by row and ordered by run number
batch_requests = []
for row_pos in range(len(df_eval)):
    for run_no in range(5):
        batch_requests.append(create_task(row_pos, run_no))

# Function to call OpenAI API with retry logic
def call_openai_api(task):
    """Calls OpenAI API and ensures a valid response with retries.

    Args:
        task: Dict with "custom_id", "model", "messages", "temperature" and
            "row_idx", as produced by create_task().

    Returns:
        A (custom_id, response, row_idx) tuple. `response` is None when all
        attempts raised openai.OpenAIError.
    """
    max_retries = 5
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=task["model"],
                messages=task["messages"],
                temperature=task["temperature"]
            )
            return task["custom_id"], response, task["row_idx"]
        except openai.OpenAIError as e:
            if attempt == max_retries - 1:
                # Last attempt: nothing left to retry, so do not block the
                # worker thread with a pointless back-off sleep.
                logging.warning(f"API error for {task['custom_id']}: {e}. No retries left.")
                break
            # Linear back-off: 2s, 4s, 6s, ...
            wait_time = (attempt + 1) * 2
            logging.warning(f"API error for {task['custom_id']}: {e}. Retrying in {wait_time}s...")
            time.sleep(wait_time)
    logging.error(f"Failed after {max_retries} attempts: {task['custom_id']}")
    return task["custom_id"], None, task["row_idx"]

# Run API calls in parallel
results = []
max_workers = 13
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Remember which task each future belongs to so failures can be reported by id.
    future_to_task = {}
    for task in batch_requests:
        future_to_task[executor.submit(call_openai_api, task)] = task

    # Collect outcomes in completion order (not submission order).
    for future in concurrent.futures.as_completed(future_to_task):
        task = future_to_task[future]
        try:
            custom_id, response, row_idx = future.result()
        except Exception as e:
            logging.error(f"Unexpected error with {task['custom_id']}: {e}")
        else:
            if not response:
                logging.warning(f"Missing response for {custom_id}")
            else:
                results.append({"custom_id": custom_id, "row_idx": row_idx, "response": response})

# Strict check: Ensure 5 responses per row
responses_per_row = defaultdict(list)
for item in results:
    responses_per_row[item["row_idx"]].append(item)

# Identify missing responses
missing_tasks = []
for row_idx in range(len(df_eval)):
    count = len(responses_per_row[row_idx])
    if count < 5:
        # Work out WHICH runs are missing from the ids that did arrive
        # ("task-<row>-run-<run>"). Assuming the first `count` runs succeeded
        # would re-issue ids that already exist and leave the real gaps unfilled.
        received_runs = {
            int(item["custom_id"].rsplit("-", 1)[-1])
            for item in responses_per_row[row_idx]
        }
        missing_runs = [run for run in range(5) if run not in received_runs]
        logging.warning(f"Row {row_idx} only received {count}/5 responses. Retrying {len(missing_runs)} requests.")
        for run in missing_runs:
            missing_tasks.append(create_task(row_idx, run))

# Retry missing responses (if any)
if missing_tasks:
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_task = {executor.submit(call_openai_api, task): task for task in missing_tasks}
        for future in concurrent.futures.as_completed(future_to_task):
            task = future_to_task[future]
            try:
                custom_id, response, row_idx = future.result()
                if response:
                    responses_per_row[row_idx].append({"custom_id": custom_id, "row_idx": row_idx, "response": response})
                else:
                    logging.warning(f"Missing response for retry {custom_id}")
            except Exception as e:
                logging.error(f"Unexpected error with retry {task['custom_id']}: {e}")

# Final verification: Log any remaining missing responses
for row_idx in range(len(df_eval)):
    count = len(responses_per_row[row_idx])
    if count < 5:
        logging.error(f"FINAL WARNING: Row {row_idx} still missing {5 - count} responses despite retries.")

# Process responses & update DataFrame
# Rows and runs are visited in sorted order so the resulting frame does not
# depend on the order in which the worker threads happened to finish.
new_rows = []
for row_idx in sorted(responses_per_row):
    for item in sorted(responses_per_row[row_idx], key=lambda entry: entry["custom_id"]):
        task_id = item["custom_id"]
        # The message content can be None; treat that like an empty reply.
        content = (item["response"].choices[0].message.content or "").strip()

        if not content:
            logging.warning(f"Empty response for {task_id}")
            extracted_response = {"PFx": "ERROR: No response", "PFx_ICD10_code": "ERROR"}
        else:
            try:
                extracted_response = extract_json(content)
            except Exception as e:
                logging.error(f"JSON extraction failed for {task_id}: {e}")
                extracted_response = {"PFx": "ERROR: JSON parsing failed", "PFx_ICD10_code": "ERROR"}

            # extract_json() reports parse failures by returning {} rather
            # than raising, so the missing "PFx" key has to be checked here.
            if not isinstance(extracted_response, dict) or "PFx" not in extracted_response:
                logging.error(f"No PFx field in parsed response for {task_id}")
                extracted_response = {"PFx": "ERROR: JSON parsing failed", "PFx_ICD10_code": "ERROR"}

        new_rows.append({
            "finding": df_eval.iloc[row_idx]["Incidental_Finding"],
            "ICD10_code": df_eval.iloc[row_idx].get("ICD-10 Code", None),
            "PFx": extracted_response["PFx"],
            "PFx_ICD10_code": extracted_response.get("PFx_ICD10_code", None)
        })

# Create DataFrame (explicit columns keep the schema even when no row was produced)
results_df = pd.DataFrame(new_rows, columns=["finding", "ICD10_code", "PFx", "PFx_ICD10_code"])


In [None]:
# Display the generated results for inspection.
results_df

In [None]:
# Reload results from disk; this replaces the results_df built in memory above.
# NOTE(review): mftest55.csv is not written by any visible cell -- presumably
# an export from an earlier run; confirm its provenance and column layout.
results_df = pd.read_csv("mftest55.csv")

In [None]:
# Number of result rows loaded.
len(results_df)

In [None]:
# Keep the rows from position 175 onward.
# NOTE(review): 175 looks like a resume point after a partially completed
# labelling run -- confirm, and consider deriving it instead of hardcoding.
results_2_df = results_df.iloc[175:]

In [None]:
# Display the slice that still has to be labelled.
results_2_df

In [None]:
# Label every PFx text of the slice with label_icd10s(), keeping the
# results in the same order as the rows of results_2_df.
labeled_icd10_responses_2 = [
    label_icd10s(pfx_text) for pfx_text in results_2_df['PFx']
]

In [None]:
# labeled_icd10_responses is not created by any visible cell, so on a fresh
# kernel this cell failed with a NameError. Build the head of the list here
# when it is missing.
# NOTE(review): assumes the first 175 rows (the complement of results_2_df)
# were originally labelled with the same label_icd10s() helper -- confirm.
if "labeled_icd10_responses" not in globals():
    labeled_icd10_responses = [label_icd10s(pfx_text) for pfx_text in results_df["PFx"].iloc[:175]]
labeled_icd10_responses

In [None]:
# Number of labelled responses available before the new tail is appended.
len(labeled_icd10_responses)

In [None]:
# Append the newly labelled tail only while the combined list still fits the
# number of result rows, so re-running this cell cannot duplicate entries
# (a longer list would no longer line up with results_df).
if len(labeled_icd10_responses) + len(labeled_icd10_responses_2) <= len(results_df):
    labeled_icd10_responses += labeled_icd10_responses_2

In [None]:
# Number of labelled responses after appending; the metrics cell below indexes
# this list once per results_df row, so it should equal len(results_df).
len(labeled_icd10_responses)

In [None]:
# The labelled responses are matched to the rows of results_df by position,
# so both sequences must have the same length. Fail early with a clear message
# instead of an IndexError or a silently shifted alignment.
if len(labeled_icd10_responses) != len(results_df):
    raise ValueError(
        f"Expected {len(results_df)} labeled ICD-10 responses, got {len(labeled_icd10_responses)}"
    )

# ICD-10 code assigned by the labelling agent: the first value of each parsed
# response, or "" when the response was empty or could not be parsed.
agent_icd10_codes = [list(x.values())[0] if x else "" for x in labeled_icd10_responses]

# Does the agent's code match the reference code of the finding?
icd10_matches = [
    reference_code == agent_code
    for reference_code, agent_code in zip(results_df["ICD10_code"], agent_icd10_codes)
]

# Does the code emitted together with the PFx match the reference code?
pfx_icd10_matches = [
    pfx_code == reference_code
    for pfx_code, reference_code in zip(results_df["PFx_ICD10_code"], results_df["ICD10_code"])
]

# Flesch Reading Ease of each explanation; a missing (non-string) PFx yields
# NaN instead of crashing inside textstat.
flesch_scores = [
    textstat.flesch_reading_ease(pfx_text) if isinstance(pfx_text, str) else float("nan")
    for pfx_text in results_df["PFx"]
]

# Add the results to the DataFrame
results_df['_0_agent_icd10_codes'] = agent_icd10_codes
results_df['_0_icd10_matches'] = icd10_matches
results_df['_0_pfx_icd10_matches'] = pfx_icd10_matches
results_df['_0_flesch'] = flesch_scores

In [None]:
desired_reading_ease = sixth_grade
# Calculate threshold for penalty
if desired_reading_ease >= 55:
    threshold = 10
else:
    threshold = 20

# Create lists to store the results
accuracy_icd10_matches_list = []
accuracy_pfx_matches_list = []
readability_difference_list = []
overall_score_list = []
log_overall_score_list = []

# Iterate over each row in the DataFrame
for index, row in results_df.iterrows():
    # Accuracy components (booleans) and the measured readability
    accuracy_icd10_matches = row["_0_icd10_matches"]
    accuracy_pfx_matches = row["_0_pfx_icd10_matches"]
    flesch_score = row["_0_flesch"]

    # total number of icd10 matches (0, 1 or 2)
    total_icd10_matches = accuracy_icd10_matches + accuracy_pfx_matches

    # Absolute distance between the measured and the desired reading ease
    readability_difference = abs(flesch_score - desired_reading_ease)

    # Overall score: accuracy plus a readability term that is 1 on target and
    # decays towards 0 as the distance grows
    overall_score = total_icd10_matches * 0.8 + 0.2 * (1 / (readability_difference + 1))

    # Log-scaled penalty: only text that is HARDER than desired by more than
    # `threshold` points is penalised, and the penalty grows logarithmically
    readability_difference_log = desired_reading_ease - flesch_score
    if readability_difference_log <= threshold:  # No penalty if difference is within the threshold
        readability_difference_p = 0
    else:  # Apply penalty only if readability exceeds the threshold
        readability_difference_with_threshold = readability_difference_log - threshold
        readability_difference_p = math.log(1 + readability_difference_with_threshold) / math.log(20)

    # Use the penalty in the score. Previously the raw signed difference was
    # added here, which left the penalty unused and rewarded harder text.
    # NOTE(review): the (1 - penalty) form mirrors overall_score's readability
    # term; confirm this is the intended weighting.
    log_overall_score = total_icd10_matches * 0.8 + 0.2 * (1 - readability_difference_p)

    # Append results to lists
    accuracy_icd10_matches_list.append(float(accuracy_icd10_matches))
    accuracy_pfx_matches_list.append(float(accuracy_pfx_matches))
    readability_difference_list.append(float(readability_difference))
    overall_score_list.append(float(overall_score))
    log_overall_score_list.append(float(log_overall_score))

# Create a DataFrame with the results
grades_data = {
    "accuracy_agent_icd10": accuracy_icd10_matches_list,
    "accuracy_pfx_icd10": accuracy_pfx_matches_list,
    "readability_difference": readability_difference_list,
    "overall_score": overall_score_list,
    "log_overall_score": log_overall_score_list,
}
# Share results_df's index so the column-wise concat lines up row by row, and
# drop grade columns left over from a previous run so that re-running this
# cell replaces them instead of duplicating them.
grades = pd.DataFrame(grades_data, index=results_df.index)
results_df = pd.concat(
    [results_df.drop(columns=list(grades_data), errors="ignore"), grades],
    axis=1,
)

In [None]:
# Display the results together with the added match and score columns.
results_df

In [None]:
# Sort by finding without mutating in place; a stable sort keeps the rows of
# one finding in their previous relative order, so the output is reproducible.
results_df = results_df.sort_values(by="finding", ascending=True, kind="stable")

In [None]:
# Write the scored results to CSV without the index column.
results_df.to_csv('mf_pfx_6.csv', index = False)

In [None]:
# Show a link to the exported CSV in the notebook output.
FileLink('mf_pfx_6.csv')