In [None]:
import pandas as pd
import requests
import openai
import os
import sys
import json
from tqdm import tqdm
import numpy as np
from environs import Env
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI
from openai import OpenAI
from pydantic import BaseModel
from tqdm import tqdm
from typing import Optional
from rapidfuzz import process, fuzz

sys.path.append('../')
from src.labelprocessor import LabelProcessor
from utils.loader import load_data
from utils.structs import IntervalHistoryOutput

In [None]:
# Study type; it selects the input data sub-folder below and is also
# interpolated into the output paths used by later cells.
# NOTE(review): `type` shadows the Python builtin of the same name for the rest
# of the notebook. Later cells read this variable, so a rename has to be applied
# to every cell that uses it.
type = "GI"
# Folder handed to load_data() in the next cell.
studies_folder = f"../data/{type}/CSV/"

In [None]:
# Load the two frames that utils.loader.load_data returns for the study folder.
# NOTE(review): j_df / k_df presumably hold the labels of two different
# annotators — confirm against load_data.
j_df, k_df = load_data(studies_folder)

In [None]:
# Process labels and calculate accuracy
# NOTE(review): the frames are passed as (k_df, j_df), i.e. in the opposite
# order to how load_data returned them — confirm this matches LabelProcessor's signature.
processor = LabelProcessor(k_df, j_df)
# result_df is not read again by the cells visible in this notebook.
result_df = processor.process()
# Keep only the samples without conflict and rearrange data
gt_df = processor.generate_gt()

In [None]:
# Instruction prefix sent to the local model. generate_hpi_intervals() appends
# the raw note text directly after it, and expects the reply to be a JSON object
# with the keys HPI_Interval_Hx_begin and HPI_Interval_Hx_end.
main_prompt = """You are a helpful assistant with a strong clinical background in oncology. 
You know that medical notes are generally organized in sections, and your task is to find 
the part of the note corresponding to the section containing the History of present illness and 
the Interval history. You should organize this information in a json file that will contain a 
dictionary with two keys: HPI_Interval_Hx_begin, and HPI_Interval_Hx_end. HPI_Interval_Hx_begin 
should contain the 5 first words of the HPI_Interval_Hx section, and HPI_Interval_Hx_end should 
contain the last 5 words of the HPI_Interval_Hx section.  Here is the medical note: """

In [None]:
def _boundary_words(text, count=5, from_end=False):
    """Return `count` whitespace-separated words from the start (or end) of `text`, or None if unusable."""
    if not isinstance(text, str):
        return None
    words = text.split()
    if not words:
        return None
    return ' '.join(words[-count:] if from_end else words[:count])


def generate_hpi_intervals(df, url="http://localhost:8080/completion", timeout=None):
    """
    Ask the completion server for the begin/end markers of the HPI / Interval
    History section of every note and store them on the frame.

    For each value of df['note_text'] the function POSTs {"prompt": main_prompt + note}
    to `url`, reads the 'content' field of the JSON reply and parses it as JSON.
    The first five words of 'HPI_Interval_Hx_begin' and the LAST five words of
    'HPI_Interval_Hx_end' are kept, matching what the prompt asks the model for.

    A note is recorded as (None, None) instead of aborting the run when:
    the note is not a string, the request fails, the reply is not valid JSON,
    the decoded reply is not a JSON object, or a marker is missing / empty.

    :param df: DataFrame with a 'note_text' column. It is modified in place.
    :param url: Completion endpoint to call.
    :param timeout: Timeout in seconds forwarded to requests.post (None waits indefinitely).
    :return: The same `df` object with 'start_pred_string' and 'end_pred_string' columns added.
    """
    request_headers = {"Content-Type": "application/json"}
    hpi_begin = []
    hpi_end = []
    for note in tqdm(df['note_text'], desc="Processing Notes"):
        begin_pred, end_pred = None, None
        if isinstance(note, str):
            try:
                response = requests.post(
                    url,
                    json={"prompt": main_prompt + note},
                    headers=request_headers,
                    timeout=timeout,
                )
                response.raise_for_status()
                body = response.json()
                result = body.get('content', None) if isinstance(body, dict) else None
                if result and isinstance(result, str):
                    result_dict = json.loads(result)
                    # The model may answer with valid JSON that is not an object (list, string, ...).
                    if isinstance(result_dict, dict):
                        begin_pred = _boundary_words(result_dict.get('HPI_Interval_Hx_begin', None))
                        # The end marker is the tail of the section, so keep its last words.
                        end_pred = _boundary_words(result_dict.get('HPI_Interval_Hx_end', None), from_end=True)
            except requests.exceptions.RequestException as e:
                print(f"Error: {e}")
            except json.JSONDecodeError as e:
                print(f"JSON Decode Error: {e}")
        # Exactly one entry per note keeps both lists aligned with df's rows.
        hpi_begin.append(begin_pred)
        hpi_end.append(end_pred)
    df['start_pred_string'] = hpi_begin
    df['end_pred_string'] = hpi_end
    return df

In [None]:
# Create the output folder before the (slow) generation run, so the predictions
# are not lost to a missing-directory error when the CSV is written.
output_path = f"../outputs/local_llm/{type}/8192/llama_3_1_8B_Q4.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
updated_df = generate_hpi_intervals(gt_df)
updated_df.to_csv(output_path)

In [None]:
# Reload saved predictions from disk, replacing the in-memory frame.
# NOTE(review): this reads ..._Q6.csv, whereas the generation cell above writes
# ..._Q4.csv — confirm which run is meant to be evaluated.
updated_df = pd.read_csv(f"../outputs/local_llm/{type}/8192/llama_3_1_8B_Q6.csv")

In [None]:
# Display the reloaded predictions.
updated_df

In [None]:
# Rebind gt_df to the frame produced by LabelProcessor.reorganize_outputs.
# NOTE(review): the cells below read the columns 'note', 'start', 'end',
# 'start_pred_string' and 'end_pred_string' from this frame — presumably
# reorganize_outputs provides them; confirm.
gt_df = processor.reorganize_outputs(updated_df, IntervalHistoryOutput)

In [None]:
# Preview the reorganized frame.
gt_df.head()

In [None]:
# Adjusted function to take 'note', 'start', and 'end' from the same DataFrame row with optional fuzzy matching
def extract_section_from_note_from_row(row, use_fuzzy: bool = False, fuzz_threshold: int = 80) -> tuple:
    """
    Locate the predicted section inside a clinical note and return its character span.

    The start marker is searched from the beginning of the note; the end marker
    is searched from the start position onwards. The returned end index is one
    past the last character of the end marker (i.e. note[start:end] is the section).

    With use_fuzzy=True each marker is first replaced by the note line that best
    matches it (rapidfuzz partial_ratio, score >= fuzz_threshold), so the span
    then begins/ends on whole lines. If no line reaches the threshold the
    original marker is used unchanged.

    :param row: Mapping / DataFrame row with 'note', 'start_pred_string' and 'end_pred_string'.
    :param use_fuzzy: Boolean flag to enable fuzzy matching.
    :param fuzz_threshold: The minimum similarity score for fuzzy matching (0-100).
    :return: (start_index, end_index), or (np.nan, np.nan) when the note or a marker
             is missing / not a string, or when the span cannot be located.
    """
    not_found = (np.nan, np.nan)
    note = row['note']
    start_string = row['start_pred_string']
    end_string = row['end_pred_string']

    # Missing values (None / NaN) and any other non-text value cannot be searched for.
    if not isinstance(start_string, str) or not isinstance(end_string, str):
        return not_found
    if not isinstance(note, str):
        return not_found

    if use_fuzzy:
        note_lines = note.splitlines()

        # Perform fuzzy matching for start_string
        start_match = process.extractOne(start_string, note_lines, scorer=fuzz.partial_ratio, score_cutoff=fuzz_threshold)
        if start_match:
            start_string = start_match[0]

        # Perform fuzzy matching for end_string
        end_match = process.extractOne(end_string, note_lines, scorer=fuzz.partial_ratio, score_cutoff=fuzz_threshold)
        if end_match:
            end_string = end_match[0]

    start_index = note.find(start_string)
    if start_index == -1:
        return not_found

    # Check the raw find() result before adding the marker length; otherwise a
    # "not found" (-1) is turned into a plausible-looking positive offset.
    end_offset = note.find(end_string, start_index)
    if end_offset == -1:
        return not_found

    end_index = end_offset + len(end_string)
    if start_index >= end_index:
        return not_found

    return start_index, end_index

In [None]:
# Map the predicted begin/end strings of every row to character offsets:
# first with exact substring search, then with fuzzy line matching (threshold 70).
strict_spans = gt_df.apply(extract_section_from_note_from_row, axis=1, use_fuzzy=False, fuzz_threshold=70)
gt_df['start_pred_strict'], gt_df['end_pred_strict'] = zip(*strict_spans)

fuzzy_spans = gt_df.apply(extract_section_from_note_from_row, axis=1, use_fuzzy=True, fuzz_threshold=70)
gt_df['start_pred_fuzzy'], gt_df['end_pred_fuzzy'] = zip(*fuzzy_spans)

In [None]:
# Preview the frame with the strict and fuzzy span columns added.
gt_df.head()

# Metrics

In [None]:
def compute_metrics(df, start_col, end_col, start_pred_col, end_pred_col):
    """
    Score predicted spans against ground-truth spans, row by row.

    Spans are treated as inclusive on both ends (length = end - start + 1).
    For every row the function computes:
      * EM        - 1 when both boundaries match exactly, else 0
      * Precision - overlap length / predicted length
      * Recall    - overlap length / ground-truth length
      * F1_Score  - harmonic mean of precision and recall

    Rows where any of the four boundaries is missing (NaN / None) score 0 on
    every metric; a missing prediction must not be credited with any overlap.

    NOTE(review): extract_section_from_note_from_row returns an end index that is
    one past the last character, while the "+ 1" here assumes an inclusive end —
    confirm which convention the ground-truth columns use.

    :param df: DataFrame holding the four boundary columns.
    :param start_col: Name of the ground-truth start column.
    :param end_col: Name of the ground-truth end column.
    :param start_pred_col: Name of the predicted start column.
    :param end_pred_col: Name of the predicted end column.
    :return: Copy of the four boundary columns plus 'EM', 'Precision', 'Recall', 'F1_Score'.
    """
    metric_names = ['EM', 'Precision', 'Recall', 'F1_Score']

    def calc_metrics(row):
        gt_start, gt_end = row[start_col], row[end_col]
        pred_start, pred_end = row[start_pred_col], row[end_pred_col]

        # Builtin min()/max() silently return the non-NaN operand here, which
        # would make the overlap equal to the whole ground-truth span.
        if pd.isna([gt_start, gt_end, pred_start, pred_end]).any():
            return pd.Series([0.0, 0.0, 0.0, 0.0], index=metric_names)

        em = int(gt_start == pred_start and gt_end == pred_end)
        intersection = max(0, min(gt_end, pred_end) - max(gt_start, pred_start) + 1)
        pred_len = pred_end - pred_start + 1
        gt_len = gt_end - gt_start + 1
        precision = intersection / pred_len if pred_len > 0 else 0
        recall = intersection / gt_len if gt_len > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        return pd.Series([float(em), float(precision), float(recall), float(f1_score)], index=metric_names)

    result_df = df[[start_col, end_col, start_pred_col, end_pred_col]].copy()
    if result_df.empty:
        # apply() on an empty frame does not produce the metric columns; add them explicitly.
        for name in metric_names:
            result_df[name] = pd.Series(dtype=float)
        return result_df

    result_df[metric_names] = df.apply(calc_metrics, axis=1)
    return result_df

# Score the strict and the fuzzy span predictions against the same ground truth
strict_df = compute_metrics(
    gt_df,
    start_col='start',
    end_col='end',
    start_pred_col='start_pred_strict',
    end_pred_col='end_pred_strict',
)
fuzzy_df = compute_metrics(
    gt_df,
    start_col='start',
    end_col='end',
    start_pred_col='start_pred_fuzzy',
    end_pred_col='end_pred_fuzzy',
)

In [None]:
# Summary statistics for both strict and fuzzy predictions in table format
def create_summary(df, label):
    """
    Average the EM, Precision, Recall and F1_Score columns of `df`.

    :param df: Frame containing the four metric columns.
    :param label: Name given to the column holding the averages.
    :return: Two-column frame: 'Metric' (metric name) and `label` (its mean).
    """
    metric_means = df[['EM', 'Precision', 'Recall', 'F1_Score']].mean()
    return metric_means.rename_axis('Metric').reset_index(name=label)

# Mean metrics over all notes, one summary per matching mode
strict_summary = create_summary(strict_df, 'Strict')
fuzzy_summary = create_summary(fuzzy_df, 'Fuzzy')

# Join the two summaries on the metric name and print the combined table
summary_table = strict_summary.merge(fuzzy_summary, on='Metric')
print(summary_table)

In [None]:
# Keep only the rows whose Precision, Recall and F1_Score are all non-zero
score_cols = ['Precision', 'Recall', 'F1_Score']
non_zero_strict = strict_df.loc[strict_df[score_cols].ne(0).all(axis=1)]
non_zero_fuzzy = fuzzy_df.loc[fuzzy_df[score_cols].ne(0).all(axis=1)]

# Share of notes that survive the filter (strict, fuzzy)
print(len(non_zero_strict)/len(gt_df), len(non_zero_fuzzy)/len(gt_df))

# Mean metrics restricted to those rows
strict_summary = create_summary(non_zero_strict, 'Strict')
fuzzy_summary = create_summary(non_zero_fuzzy, 'Fuzzy')

# Join the two summaries on the metric name and print the combined table
summary_table = strict_summary.merge(fuzzy_summary, on='Metric')
print(summary_table)