In [None]:
import pandas as pd
import openai
import os
import sys
import numpy as np
from environs import Env
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI
from openai import OpenAI
from pydantic import BaseModel
from tqdm import tqdm
from typing import Optional
from rapidfuzz import process, fuzz

# Add the parent directory to the import path so that `src` and `utils`
# (imported just below) can be resolved — presumably project-local packages.
sys.path.append('../')
from src.labelprocessor import LabelProcessor
from utils.loader import load_data

# Load settings from ../.env and read the STUDY_PATH entry used to build data paths.
env = Env()
env.read_env('../.env')
study_path = env("STUDY_PATH")

In [None]:
# Study subset to analyse. Named `study_type` rather than `type` so the builtin
# `type` is not shadowed for every later cell run in this kernel.
study_type = "GI"
# Folder holding the CSV exports for the selected study.
studies_folder = f"{study_path}/{study_type}/CSV/"

In [5]:
# Load the two label tables from the study CSV folder via the project loader.
# NOTE(review): presumably one table per annotator (j / k) — confirm against utils.loader.
j_df, k_df = load_data(studies_folder)

In [6]:
# Process labels and calculate accuracy.
# Note the argument order: LabelProcessor receives (k_df, j_df), the reverse of the
# order in which load_data returned them — TODO confirm this matches its signature.
processor = LabelProcessor(k_df, j_df)
result_df = processor.process()
# Keep only the samples without conflict and rearrange data
gt_df = processor.generate_gt()

In [None]:
# Display the frame returned by processor.generate_gt() for inspection.
gt_df

In [8]:
# Model name passed as `model=` on every chat completion request, and the API version
# handed to the client.
# NOTE(review): confirm this API version accepts the `tools` / `tool_choice`
# parameters used by openai_chat_completion_response.
model_name="gpt-4o-mini-2024-07-18-api"
api_version = "2023-05-15"
# Endpoint and token scope are read through `env` (no secrets are hard-coded here).
endpoint = env("ENDPOINT")
entra_scope = env("ENTRA_SCOPE")

# Token provider built from DefaultAzureCredential for the configured scope.
token_provider = get_bearer_token_provider(
    DefaultAzureCredential(), 
    entra_scope
)

# The client authenticates through the token provider rather than an API key.
client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    azure_ad_token_provider=token_provider,
)

In [None]:
# Structured output requested from the model: a start string and an end string,
# both optional and defaulting to None.
# Documented with comments rather than a class docstring on purpose:
# NOTE(review): openai.pydantic_function_tool derives the tool description from the
# class docstring, so adding one would change the request sent to the model — confirm
# before converting this comment into a docstring.
class IntervalHistoryOutput(BaseModel):
    start_string: Optional[str] = None
    end_string: Optional[str] = None

# Build the function-tool definition for IntervalHistoryOutput and print it for inspection.
tools = [openai.pydantic_function_tool(IntervalHistoryOutput)]
print(tools)

def remove_strict_field(data):
    """Strip the 'strict' entry from the 'function' mapping of every tool in ``data``.

    The tool dictionaries are modified in place and the same list object is returned.
    """
    for tool in data:
        # pop with a default does nothing when the key is absent
        tool['function'].pop('strict', None)
    return data

def extract_name_value(data):
    """Return the 'name' stored in the 'function' mapping of the first tool in ``data``."""
    first_tool = data[0]
    return first_tool['function']['name']

In [10]:
def openai_chat_completion_response(prompt, input_text):
    """Send one system/user exchange to the chat model and return the forced tool call's arguments.

    :param prompt: Text sent as the system message.
    :param input_text: Text sent as the user message.
    :return: The ``arguments`` of the first tool call on the first returned choice.
    """
    # improve input of schema
    tool_schema = remove_strict_field([openai.pydantic_function_tool(IntervalHistoryOutput)])
    # Force the model to call the single tool described by the schema.
    forced_tool = {"type": "function", "function": {"name": f"{extract_name_value(tool_schema)}"}}
    conversation = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": input_text},
    ]

    response = client.chat.completions.create(
        model=model_name,
        # do not change temperature all research in lab uses temp of 0 unless otherwise discussed
        temperature=0.0,
        messages=conversation,
        tools=tool_schema,
        tool_choice=forced_tool,
    )

    first_message = response.choices[0].message
    return first_message.tool_calls[0].function.arguments

In [11]:
# System prompt sent with every note: asks for the first and last five words of the
# 'Interval History' section, or None when the note has no such section.
prompt="You are a helpful clinical researcher. Your task is to identify the 'Interval History' section in clinical notes provide the first and last 5 words in the Interval History section. if no Interval History output None"

In [14]:
# input = gt_df['note_text'].iloc[2]
# print(openai_chat_completion_response(prompt,input))

In [None]:
def process_text_data(df, prompt, text_column, new_col):
    """
    Process text data in the specified column of the DataFrame using an API call.

    Each text is sent to ``openai_chat_completion_response`` together with ``prompt``.
    A failed call does not stop the run: the failure is reported and ``None`` is
    stored for that row.

    Args:
    df (pd.DataFrame): DataFrame containing the text data. It is modified in place.
    prompt (str): System prompt passed to the API with every text.
    text_column (str): Column name in DataFrame that contains the text data to process.
    new_col (str): Column name for storing the API responses.

    Returns:
    pd.DataFrame: The same DataFrame with the new column containing API responses.
    """
    chunks = df[text_column].tolist()
    responses = []  # One entry per row, in the same order as `chunks`

    for position, input_text in enumerate(tqdm(chunks, desc="Processing text data")):
        try:
            responses.append(openai_chat_completion_response(prompt, input_text))
        except Exception as e:
            # Report only the row position and the error: the text is a clinical note
            # and must not be echoed into the notebook output.
            print(f"An error occurred with text at position {position}: {e.__class__.__name__}: {e}")
            responses.append(None)  # Keep alignment with the DataFrame rows

    df[new_col] = responses
    return df

In [None]:
input_column='note_text'
output_column_name='gpt_hx'
# Work on a deep copy of the first 30 rows so that gt_df itself is not modified
# by process_text_data, which adds a column in place.
subgt_df = gt_df.iloc[0:30].copy(deep=True)
outputs_df = process_text_data(subgt_df,prompt,input_column,output_column_name)
# Persist the model outputs as a list of JSON records.
outputs_df.to_json('../outputs/gpt_outputs.json', orient='records')

# Reload from disk so the following cells operate on the saved outputs.
outputs_df = pd.read_json("../outputs/gpt_outputs.json", orient="records")

In [None]:
# Hand the model outputs and the output schema class back to the processor.
# NOTE: this rebinds gt_df, replacing the frame created by generate_gt(); later cells
# read the 'note', 'start_pred_string' and 'end_pred_string' columns from the result.
gt_df = processor.reorganize_outputs(outputs_df, IntervalHistoryOutput)

In [None]:
# Preview the first rows of the frame returned by reorganize_outputs.
gt_df.head()

In [None]:
# Locates the predicted section using the 'note', 'start_pred_string' and
# 'end_pred_string' values of one DataFrame row, with optional fuzzy matching
def extract_section_from_note_from_row(row, use_fuzzy: bool = False, fuzz_threshold: int = 80) -> tuple:
    """
    Finds the character span of a section in a clinical note, given the predicted
    start and end strings from the same DataFrame row, with optional fuzzy matching.

    With fuzzy matching, each predicted string is replaced by the best-scoring whole
    line of the note (when one reaches the threshold) before the exact search runs.

    :param row: A row (or mapping) with 'note', 'start_pred_string' and 'end_pred_string' entries.
    :param use_fuzzy: Boolean flag to enable fuzzy matching.
    :param fuzz_threshold: The minimum similarity score for fuzzy matching (0-100).
    :return: ``(start_index, end_index)`` such that ``note[start_index:end_index]`` runs
             from the start string through the end string, or ``(np.nan, np.nan)`` if
             either string is missing or the span cannot be found.
    """
    note = row['note']
    start_string = row['start_pred_string']
    end_string = row['end_pred_string']

    if pd.isna(start_string) or pd.isna(end_string):
        return np.nan, np.nan

    if use_fuzzy:
        # Split once and reuse the lines for both lookups.
        note_lines = note.splitlines()

        start_match = process.extractOne(start_string, note_lines, scorer=fuzz.partial_ratio, score_cutoff=fuzz_threshold)
        if start_match:
            start_string = start_match[0]

        end_match = process.extractOne(end_string, note_lines, scorer=fuzz.partial_ratio, score_cutoff=fuzz_threshold)
        if end_match:
            end_string = end_match[0]

    start_index = note.find(start_string)
    if start_index == -1:
        return np.nan, np.nan

    # The end string must occur at or after the start. Check the raw find() result
    # before adding the length, otherwise a "not found" (-1) is turned into a
    # plausible-looking offset.
    end_position = note.find(end_string, start_index)
    if end_position == -1:
        return np.nan, np.nan

    end_index = end_position + len(end_string)
    if start_index >= end_index:
        return np.nan, np.nan

    return start_index, end_index

In [None]:
# Apply the function to each row twice: first with exact matching only (fuzz_threshold
# is not used when use_fuzzy=False), then with fuzzy matching at a cutoff of 70.
gt_df['start_pred_strict'], gt_df['end_pred_strict'] = zip(*gt_df.apply(lambda row: extract_section_from_note_from_row(row, use_fuzzy=False, fuzz_threshold=70), axis=1))
gt_df['start_pred_fuzzy'], gt_df['end_pred_fuzzy'] = zip(*gt_df.apply(lambda row: extract_section_from_note_from_row(row, use_fuzzy=True, fuzz_threshold=70), axis=1))

In [None]:
# Preview the frame with the strict and fuzzy prediction columns added.
gt_df.head()

# Metrics

In [None]:
def compute_metrics(df, start_col, end_col, start_pred_col, end_pred_col):
    """
    Compute per-row span metrics between ground-truth and predicted offsets.

    Spans are treated as inclusive on both ends (lengths are ``end - start + 1``).
    NOTE(review): the extraction functions in this notebook return an exclusive end
    offset (``find(...) + len(...)``) — confirm the ground-truth convention matches.

    :param df: DataFrame holding the four offset columns.
    :param start_col: Column with the ground-truth start offset.
    :param end_col: Column with the ground-truth end offset.
    :param start_pred_col: Column with the predicted start offset.
    :param end_pred_col: Column with the predicted end offset.
    :return: A copy of the four offset columns plus 'EM', 'Precision', 'Recall' and
             'F1_Score'. Rows where any offset is missing score 0 on every metric.
    """
    metric_names = ['EM', 'Precision', 'Recall', 'F1_Score']

    def calc_metrics(row):
        gt_start, gt_end = row[start_col], row[end_col]
        pred_start, pred_end = row[start_pred_col], row[end_pred_col]

        # A missing offset means there is nothing to compare. Without this guard the
        # builtin min/max ignore a NaN in second position, so a missing prediction
        # would be scored as overlapping the whole ground truth (Recall = 1.0).
        if pd.isna(gt_start) or pd.isna(gt_end) or pd.isna(pred_start) or pd.isna(pred_end):
            return pd.Series([0, 0, 0, 0], index=metric_names)

        em = int(gt_start == pred_start and gt_end == pred_end)
        intersection = max(0, min(gt_end, pred_end) - max(gt_start, pred_start) + 1)
        pred_len = pred_end - pred_start + 1
        gt_len = gt_end - gt_start + 1
        precision = intersection / pred_len if pred_len > 0 else 0
        recall = intersection / gt_len if gt_len > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        return pd.Series([em, precision, recall, f1_score], index=metric_names)

    result_df = df[[start_col, end_col, start_pred_col, end_pred_col]].copy()
    result_df[metric_names] = df.apply(calc_metrics, axis=1)
    return result_df

# Apply to both strict and fuzzy predictions, scoring each against the
# ground-truth 'start' / 'end' columns of gt_df.
strict_df = compute_metrics(gt_df, 'start', 'end', 'start_pred_strict', 'end_pred_strict')
fuzzy_df = compute_metrics(gt_df, 'start', 'end', 'start_pred_fuzzy', 'end_pred_fuzzy')

In [None]:
# Mean of each metric column, laid out as a two-column table
def create_summary(df, label):
    """Average the four metric columns of ``df`` into a table with columns 'Metric' and ``label``."""
    metric_means = df[['EM', 'Precision', 'Recall', 'F1_Score']].mean()
    table = metric_means.reset_index()
    return table.set_axis(['Metric', label], axis=1)

# Create summaries for strict and fuzzy (means over all rows, unmatched rows included)
strict_summary = create_summary(strict_df, 'Strict')
fuzzy_summary = create_summary(fuzzy_df, 'Fuzzy')

# Merge and display both summaries side by side in table format
summary_table = pd.merge(strict_summary, fuzzy_summary, on='Metric')
print(summary_table)

In [None]:
# Keep only the rows where Precision, Recall and F1 are all non-zero
# (i.e. drop every row in which any of these metrics is zero)
non_zero_strict = strict_df[(strict_df['Precision'] != 0) & (strict_df['Recall'] != 0) & (strict_df['F1_Score'] != 0)]
non_zero_fuzzy = fuzzy_df[(fuzzy_df['Precision'] != 0) & (fuzzy_df['Recall'] != 0) & (fuzzy_df['F1_Score'] != 0)]

# Fraction of all rows that survive the filter, for strict and fuzzy respectively
print(len(non_zero_strict)/len(gt_df), len(non_zero_fuzzy)/len(gt_df))

# Create summaries for strict and fuzzy, restricted to the surviving rows
strict_summary = create_summary(non_zero_strict, 'Strict')
fuzzy_summary = create_summary(non_zero_fuzzy, 'Fuzzy')

# Merge and display both summaries side by side in table format
summary_table = pd.merge(strict_summary, fuzzy_summary, on='Metric')
print(summary_table)

In [None]:
import tiktoken
# Tokenizer used for all token counts below.
# NOTE(review): "o200k_base" is presumably chosen to match the gpt-4o model family — confirm.
enc = tiktoken.get_encoding("o200k_base")

def token_counter(text):
    """Return the number of tokens ``enc`` produces for ``text``."""
    token_ids = enc.encode(text)
    return len(token_ids)

# Function to compute the total ratio
def compute_total_token_ratio(df, counter=None):
    """
    Ratio of tokens inside the fuzzy-predicted spans to tokens in the full notes.

    :param df: DataFrame with 'note', 'start_pred_fuzzy' and 'end_pred_fuzzy' columns.
    :param counter: Optional callable mapping a string to its token count; defaults
                    to ``token_counter``.
    :return: Total span tokens divided by total note tokens, or 0 when the notes
             contain no tokens. Rows without a predicted span contribute 0 span tokens.
    """
    count = token_counter if counter is None else counter

    # Total tokens for the entire 'note' column
    total_full_tokens = sum(count(note) for note in df['note'])

    # Total tokens for the substring between start_pred_fuzzy and end_pred_fuzzy
    total_fuzzy_tokens = 0
    for note, start, end in zip(df['note'], df['start_pred_fuzzy'], df['end_pred_fuzzy']):
        if pd.isna(start) or pd.isna(end):
            continue  # no span was found for this note
        # A single NaN makes the whole column float, and floats cannot be used as
        # slice bounds, so convert back to int before slicing.
        total_fuzzy_tokens += count(note[int(start):int(end)])

    # Return the ratio of fuzzy tokens to full note tokens
    return total_fuzzy_tokens / total_full_tokens if total_full_tokens > 0 else 0

# Example usage: share of note tokens that fall inside the fuzzy-predicted spans
total_ratio = compute_total_token_ratio(gt_df)
print(f"Total token ratio (fuzzy vs full note): {total_ratio}")

# Timing Rapid Fuzz

In [None]:
# Approximate number of notes the benchmark below should process in total.
n_notes = 1000000

import timeit

def extract_section_from_note_from_row(row, use_fuzzy: bool = False, fuzz_threshold: int = 80) -> tuple:
    """
    Finds the character span of a section in a clinical note, given the predicted
    start and end strings from the same DataFrame row, with optional fuzzy matching.

    NOTE: this redefines the function of the same name from the earlier cell so the
    benchmark cell is self-contained.

    :param row: A row (or mapping) with 'note', 'start_pred_string' and 'end_pred_string' entries.
    :param use_fuzzy: Boolean flag to enable fuzzy matching.
    :param fuzz_threshold: The minimum similarity score for fuzzy matching (0-100).
    :return: ``(start_index, end_index)`` such that ``note[start_index:end_index]`` runs
             from the start string through the end string, or ``(np.nan, np.nan)`` if
             either string is missing or the span cannot be found.
    """
    note = row['note']
    start_string = row['start_pred_string']
    end_string = row['end_pred_string']

    if pd.isna(start_string) or pd.isna(end_string):
        return np.nan, np.nan

    if use_fuzzy:
        # Split once and reuse the lines for both lookups.
        note_lines = note.splitlines()

        # Replace each predicted string by the best-scoring whole line, when one qualifies.
        start_match = process.extractOne(start_string, note_lines, scorer=fuzz.partial_ratio, score_cutoff=fuzz_threshold)
        if start_match:
            start_string = start_match[0]

        end_match = process.extractOne(end_string, note_lines, scorer=fuzz.partial_ratio, score_cutoff=fuzz_threshold)
        if end_match:
            end_string = end_match[0]

    start_index = note.find(start_string)
    if start_index == -1:
        return np.nan, np.nan

    # Check the raw find() result before adding the length, otherwise a
    # "not found" (-1) is turned into a plausible-looking offset.
    end_position = note.find(end_string, start_index)
    if end_position == -1:
        return np.nan, np.nan

    end_index = end_position + len(end_string)
    if start_index >= end_index:
        return np.nan, np.nan

    return start_index, end_index

# Wrapper with no arguments so timeit can call the extraction over the whole DataFrame
def benchmark_function():
    """Run the fuzzy extraction on every row of gt_df and store the offsets in the fuzzy prediction columns."""
    spans = gt_df.apply(
        lambda row: extract_section_from_note_from_row(row, use_fuzzy=True, fuzz_threshold=70),
        axis=1,
    )
    starts, ends = zip(*spans)
    gt_df['start_pred_fuzzy'], gt_df['end_pred_fuzzy'] = starts, ends

# Repeat the full pass over gt_df int(n_notes / len(gt_df)) times, so that roughly
# n_notes notes are processed in total (slightly fewer when the division is not exact).
execution_time = timeit.timeit(benchmark_function, number=int(n_notes/len(gt_df)))

print(f"Execution time for {n_notes} notes: {execution_time:.2f} seconds")
# The per-note average divides by n_notes, not by the exact number of notes processed.
print(f"Average processing time per note: {(execution_time / n_notes) * 1000:.4f} ms")

In [None]:
# Approximate number of notes the benchmark below should process in total.
n_notes = 1000000

import timeit

def extract_section_from_note_from_row_fuzzy(row, use_fuzzy: bool = False, fuzz_threshold: int = 80) -> tuple:
    """
    Finds the character span of a section in a clinical note, given the predicted
    start and end strings from the same DataFrame row, with optional fuzzy matching.

    In fuzzy mode the end string is only matched against the lines from the matched
    start line onwards, and only when a start line was matched at all.

    :param row: A row (or mapping) with 'note', 'start_pred_string' and 'end_pred_string' entries.
    :param use_fuzzy: Boolean flag to enable fuzzy matching.
    :param fuzz_threshold: The minimum similarity score for fuzzy matching (0-100).
    :return: ``(start_index, end_index)`` such that ``note[start_index:end_index]`` runs
             from the start string through the end string, or ``(np.nan, np.nan)`` if
             either string is missing or the span cannot be found.
    """
    note = row['note']
    start_string = row['start_pred_string']
    end_string = row['end_pred_string']

    if pd.isna(start_string) or pd.isna(end_string):
        return np.nan, np.nan

    if use_fuzzy:
        # Perform fuzzy matching for start_string
        start_match = process.extractOne(start_string, note.splitlines(), scorer=fuzz.partial_ratio, score_cutoff=fuzz_threshold)
        if start_match:
            start_string = start_match[0]
            matched_start = note.find(start_string)

            # Perform fuzzy matching for end_string, restricted to the text from the start line on
            end_match = process.extractOne(end_string, note[matched_start:].splitlines(), scorer=fuzz.partial_ratio, score_cutoff=fuzz_threshold)
            if end_match:
                end_string = end_match[0]

    start_index = note.find(start_string)
    if start_index == -1:
        return np.nan, np.nan

    # Check the raw find() result before adding the length, otherwise a
    # "not found" (-1) is turned into a plausible-looking offset.
    end_position = note.find(end_string, start_index)
    if end_position == -1:
        return np.nan, np.nan

    end_index = end_position + len(end_string)
    if start_index >= end_index:
        return np.nan, np.nan

    return start_index, end_index

# Define a wrapper function for applying the extraction to your DataFrame
def benchmark_function():
    """Run the `_fuzzy` extraction variant defined in this cell over every row of gt_df.

    Stores the resulting offsets in 'start_pred_fuzzy' / 'end_pred_fuzzy'. The wrapper
    must call the `_fuzzy` variant: calling the earlier
    `extract_section_from_note_from_row` would time the wrong implementation.
    """
    gt_df['start_pred_fuzzy'], gt_df['end_pred_fuzzy'] = zip(
        *gt_df.apply(lambda row: extract_section_from_note_from_row_fuzzy(row, use_fuzzy=True, fuzz_threshold=70), axis=1)
    )

# Repeat the full pass over gt_df int(n_notes / len(gt_df)) times, so that roughly
# n_notes notes are processed in total (slightly fewer when the division is not exact).
execution_time = timeit.timeit(benchmark_function, number=int(n_notes/len(gt_df)))

print(f"Execution time for {n_notes} notes: {execution_time:.2f} seconds")
# The per-note average divides by n_notes, not by the exact number of notes processed.
print(f"Average processing time per note: {(execution_time / n_notes) * 1000:.4f} ms")

In [None]:
# Approximate number of notes the benchmark below should process in total.
n_notes = 1000000

import timeit

def extract_section_from_note_from_row_fuzzy(row, fuzz_threshold: int = 80) -> tuple:
    """
    Finds the character span of a section in a clinical note by fuzzy-matching the
    predicted start and end strings against the note's lines.

    The span runs from the beginning of the best-matching start line to the end of
    the best-matching end line; the end line is searched from the start line onwards.

    :param row: A row from a DataFrame containing 'note', 'start_pred_string', and 'end_pred_string' columns.
    :param fuzz_threshold: The minimum similarity score for fuzzy matching (0-100).
    :return: The start and end index of the extracted section, or (np.nan, np.nan) if
             either string is missing or no line reaches the threshold.
    """
    note = row['note']
    start_string = row['start_pred_string']
    end_string = row['end_pred_string']

    if pd.isna(start_string) or pd.isna(end_string):
        return np.nan, np.nan

    # Function-scope import under an alias: the bare name `utils` is not bound by this
    # notebook's imports, and a project-local `utils` package exists as well.
    from rapidfuzz import utils as rapidfuzz_utils

    # Lines without terminators are what gets matched; the same lines with their
    # terminators kept give exact character offsets for any line-ending style.
    note_lines = note.splitlines()
    raw_lines = note.splitlines(keepends=True)

    # Perform fuzzy matching for start_string
    start_match = process.extractOne(start_string, note_lines, scorer=fuzz.partial_ratio, score_cutoff=fuzz_threshold)
    if not start_match:
        return np.nan, np.nan
    start_line = start_match[2]

    # Perform fuzzy matching for end_string starting at the matched start line
    end_match = process.extractOne(end_string, note_lines[start_line:], scorer=fuzz.partial_ratio, score_cutoff=fuzz_threshold, processor=rapidfuzz_utils.default_process)
    if not end_match:
        # Without a matched end line there is no span to report.
        return np.nan, np.nan
    end_line = start_line + end_match[2]

    # Offset of the first character of the start line.
    start_index = sum(len(line) for line in raw_lines[:start_line])
    # Offset just past the last character of the end line (its terminator excluded).
    end_index = sum(len(line) for line in raw_lines[:end_line]) + len(note_lines[end_line])

    if start_index >= end_index:
        return np.nan, np.nan

    return start_index, end_index

# Wrapper with no arguments so timeit can call the extraction over the whole DataFrame
def benchmark_function():
    """Run the line-based fuzzy extraction on every row of gt_df and store the offsets in the fuzzy prediction columns."""
    spans = gt_df.apply(
        lambda row: extract_section_from_note_from_row_fuzzy(row, fuzz_threshold=70),
        axis=1,
    )
    starts, ends = zip(*spans)
    gt_df['start_pred_fuzzy'], gt_df['end_pred_fuzzy'] = starts, ends

# Repeat the full pass over gt_df int(n_notes / len(gt_df)) times, so that roughly
# n_notes notes are processed in total (slightly fewer when the division is not exact).
execution_time = timeit.timeit(benchmark_function, number=int(n_notes/len(gt_df)))

print(f"Execution time for {n_notes} notes: {execution_time:.2f} seconds")
# The per-note average divides by n_notes, not by the exact number of notes processed.
print(f"Average processing time per note: {(execution_time / n_notes) * 1000:.4f} ms")