### Importing utilities

In [1]:
import sys
from pathlib import Path
src_path = Path().resolve().parent / "src"
sys.path.append(str(src_path))

from preprocessing import Preprocessing
from inference import Inference, get_pred_indexes
from report import PDFGenerator, postprocessing

### CORAL - Unannotated dataset

The format is supported by default by the preprocessing module.

In [None]:
# Load one of the unannotated CORAL CSV files; the note text is read from the
# "note_text" column. Swap which line is commented out to switch between the
# breastca and pdac files.
# preprocessor = Preprocessing("../data/curated-oncology-reports/1.0/coral/unannotated/data/breastca_unannotated.csv", note_text_column="note_text")
preprocessor = Preprocessing("../data/curated-oncology-reports/1.0/coral/unannotated/data/pdac_unannotated.csv", note_text_column="note_text")
# `df` is the input handed to the inference engine in the Inference section.
df = preprocessor.get_processed_dataframe()

### CORAL - Annotated dataset

You can also manually import the data if your format differs.

In [None]:
import os
import pandas as pd

def extract_annotations(folder_path):
    """
    Extract information from .ann and .txt files and store it in a DataFrame.

    Every ``<name>.ann`` file in ``folder_path`` is paired with the
    ``<name>.txt`` file in the same folder. The note text comes from the .txt
    file; the section boundaries come from the .ann lines whose label is one
    of ``hpi_start``, ``hpi_end``, ``ap_start`` or ``ap_end``. Files are
    processed in sorted file-name order so the row order is deterministic.

    :param folder_path: Path to the folder containing .ann and .txt files.
    :return: DataFrame indexed by ``file_number`` (the .ann file name without
        its extension, as a string) with columns note, RCH_start_gt,
        RCH_end_gt, AP_start_gt, AP_end_gt. A boundary that is not annotated
        is left missing (None/NaN). A folder without .ann files yields an
        empty DataFrame that still has these columns and index name.
    :raises FileNotFoundError: If an .ann file has no matching .txt file.
    :raises ValueError: If a recognised annotation line carries a
        non-integer offset.
    """
    # Annotation label -> (output column, index of the offset token to keep).
    # "*_start" labels keep the span's start offset (token 2) and "*_end"
    # labels keep the span's end offset (token 3).
    offset_sources = {
        "hpi_start": ("RCH_start_gt", 2),
        "hpi_end": ("RCH_end_gt", 3),
        "ap_start": ("AP_start_gt", 2),
        "ap_end": ("AP_end_gt", 3),
    }
    columns = [
        "file_number",
        "note",
        "RCH_start_gt",
        "RCH_end_gt",
        "AP_start_gt",
        "AP_end_gt",
    ]
    data = []

    # sorted() makes the row order independent of the filesystem's listing order.
    for filename in sorted(os.listdir(folder_path)):
        # Process only .ann files
        if not filename.endswith(".ann"):
            continue

        file_number = os.path.splitext(filename)[0]  # e.g. "20" from "20.ann"
        txt_file = os.path.join(folder_path, f"{file_number}.txt")
        ann_file = os.path.join(folder_path, filename)

        # Explicit encoding: the annotations are character offsets, so the
        # text must decode the same way on every platform.
        with open(txt_file, "r", encoding="utf-8") as txt_f:
            # NOTE(review): strip() drops leading whitespace, which would shift
            # the note relative to the .ann offsets if a file starts with
            # whitespace — kept as-is to preserve existing behaviour.
            note = txt_f.read().strip()

        record = {
            "file_number": file_number,
            "note": note,
            "RCH_start_gt": None,
            "RCH_end_gt": None,
            "AP_start_gt": None,
            "AP_end_gt": None,
        }

        # Parse the .ann file: "<tag> <label> <start> <end> ..." per line
        with open(ann_file, "r", encoding="utf-8") as ann_f:
            for line in ann_f:
                parts = line.split()
                if len(parts) < 4:
                    continue  # not a span annotation line
                target = offset_sources.get(parts[1])
                if target is not None:
                    column, token_index = target
                    record[column] = int(parts[token_index])

        data.append(record)

    # Passing the columns explicitly keeps the schema (and makes set_index
    # work) even when no .ann file was found.
    df = pd.DataFrame(data, columns=columns)
    df.set_index("file_number", inplace=True)
    return df

In [None]:
# Folder with the annotated CORAL notes, read as paired .txt / .ann files by
# extract_annotations. Swap which line is commented out to change the folder.
folder_path = "../data/curated-oncology-reports/1.0/coral/annotated/pdac"
# folder_path = "../data/curated-oncology-reports/1.0/coral/annotated/breastca"
df = extract_annotations(folder_path)
# The index holds the file names as strings; cast to int so the sort below is
# numeric rather than lexicographic. Assumes every .ann file name is an integer.
df.index = df.index.astype(int)
df = df.sort_index()

### Inference

In [None]:
# Build the inference engine from a local model directory (path is relative to
# this notebook). NOTE(review): presumably the model is loaded here — confirm
# in src/inference.py.
inference_engine = Inference("../models/Meta-Llama-3.1-8B-Instruct")

In [None]:
# Run the engine over whichever `df` was built above (unannotated or annotated).
llm_output = inference_engine.generate(df)

In [None]:
# Rebind llm_output to the result of get_pred_indexes; the later cells refer to
# its 'RCH_start_pred', 'RCH_end_pred', 'AP_start_pred' and 'AP_end_pred' columns.
llm_output = get_pred_indexes(llm_output)

`llm_output` now contains the predicted start and end indexes for both the RCH and AP sections.

In [None]:
llm_output.head()

### Post processing

Get the extracted sections as a column of strings, or generate a PDF to visualize the results.

In [None]:
# Post-process the predictions; the result is what gets written to CSV in the
# last cell of the notebook.
postprocessed_df = postprocessing(llm_output)

In [None]:
# The four positional strings are the names of the prediction columns in
# llm_output (RCH start/end, then AP start/end).
# NOTE(review): only the generator object is constructed here; no call that
# writes the PDF is visible in this notebook — confirm whether construction
# alone produces the file.

# without ground truth overlay
generator = PDFGenerator(llm_output, 'RCH_start_pred', 'RCH_end_pred',
                                       'AP_start_pred', 'AP_end_pred')

# with ground truth overlay: additionally pass the four *_gt column names,
# which are only produced by the annotated-dataset path (extract_annotations)
# generator = PDFGenerator(llm_output, 'RCH_start_pred', 'RCH_end_pred',
#                                        'AP_start_pred', 'AP_end_pred',
#                                        "RCH_start_gt", "RCH_end_gt",
#                                         'AP_start_gt', 'AP_end_gt')

In [None]:
# Save the post-processed results as CSV (path is relative to this notebook).
# NOTE(review): assuming postprocessed_df is a pandas DataFrame, to_csv does not
# create missing directories, so ../outputs must already exist.
postprocessed_df.to_csv("../outputs/coral_pred.csv")