In [None]:
import os
import math
import pandas as pd
import sys
from environs import Env
sys.path.append('../')
from src.labelprocessor import LabelProcessor
from src.errorlogger import ErrorLogger
from src.pdfgenerator import PDFGenerator
from utils.loader import load_data
from utils.annotation_analysis import annotation_overview_text, annotation_overview_plot

# Read settings from the .env file one directory above the working directory.
env = Env()
env.read_env('../.env')
# Root folder of the study data; no default is supplied for STUDY_PATH here.
study_path = env("STUDY_PATH")

In [None]:
# Study to analyse. The value is interpolated into every input and output path below.
# NOTE(review): `type` shadows the Python builtin of the same name for the rest of
# the notebook; the name is kept because the following cells reference it.
type = "GI" # Breast, GI, Neuro
# Folder handed to load_data() in the next cell.
studies_folder = f"{study_path}/{type}/CSV/"

# Label Processor for computing scores

In [None]:
# load_data() yields two frames, unpacked in the order (j_df, k_df).
# NOTE(review): presumably one frame per annotator - confirm in utils.loader.
j_df, k_df = load_data(studies_folder)

In [None]:
# Process labels and calculate accuracy
processor = LabelProcessor(k_df, j_df)
result_df = processor.process()

In [None]:
# Mean of the 'HPI_Interval_Hx' column over all rows of result_df.
print(result_df['HPI_Interval_Hx'].mean())

In [None]:
# Mean of the 'A&P' column over all rows of result_df.
print(result_df['A&P'].mean())

# Error Logger for error analysis

In [None]:
# Build the error logger from result_df; its get_error_df() output is used further
# down to produce the conflicts PDF.
error_logger = ErrorLogger(result_df)
# Optional: uncomment to also write the conflicts to a text log for this study.
# error_logger.log_errors(f"../outputs/{type}_conflicts_log.txt")

In [None]:
# Render one PDF for the whole study from both input frames and the computed results.
# NOTE(review): assumes ../outputs/ already exists - confirm whether convert_to_pdf creates it.
generator = PDFGenerator(k_df, j_df, result_df)
generator.convert_to_pdf(f"../outputs/{type}.pdf")

In [None]:
# Restrict both input frames to the rows reported by the error logger (selected by
# index label), then render a second PDF containing only those rows.
error_df = error_logger.get_error_df()
# .loc raises KeyError if error_df carries an index label missing from k_df / j_df.
filtered_k_df = k_df.loc[error_df.index]
filtered_j_df = j_df.loc[error_df.index]
generator_error = PDFGenerator(filtered_k_df, filtered_j_df, error_df)
generator_error.convert_to_pdf(f"../outputs/{type}_conflicts.pdf")

In [None]:
# Overview of result_df for this study. 0.8 is the same value that is passed as
# `threshold` to create_subdataframe() further down.
# NOTE(review): presumably prints a textual summary - see utils.annotation_analysis.
annotation_overview_text(result_df, 0.8, type)

In [None]:
# Same inputs as the text overview above (result_df, 0.8, study name).
# NOTE(review): presumably draws the plotted counterpart - see utils.annotation_analysis.
annotation_overview_plot(result_df, 0.8, type)

# Creating adjudication batches

In [None]:
# First annotation batch of this study. In the cells shown here only its column
# names are used, as the list of columns to keep in the exported frames.
reference = pd.read_csv(f'{study_path}/notes_to_annotate/{type}/{type}_batch_1.csv')
def create_subdataframe(result_df, j_df, reference, threshold):
    """Select the rows whose score falls below ``threshold`` in either section.

    The 'HPI_Interval_Hx' and 'A&P' columns of ``result_df`` are joined onto
    ``j_df`` by index (inner join). A row is kept when at least one of the two
    scores is strictly less than ``threshold``; a missing (NaN) score never
    counts as below the threshold.

    Args:
        result_df: Frame providing the 'HPI_Interval_Hx' and 'A&P' columns.
        j_df: Frame the two score columns are joined onto.
        reference: Frame whose column names decide which columns of the
            joined frame are kept.
        threshold: Scores strictly below this value select the row.

    Returns:
        A new DataFrame holding ``reference``'s columns followed by the two
        score columns, restricted to the selected rows.
    """
    score_cols = ['HPI_Interval_Hx', 'A&P']
    joined = j_df.join(result_df[score_cols], how='inner')
    # True where either score is strictly below the threshold.
    needs_review = joined[score_cols].lt(threshold).any(axis=1)
    wanted = list(reference.columns) + score_cols
    return joined.loc[needs_review, wanted].copy()

In [None]:
# Rows with a score below 0.8 in 'HPI_Interval_Hx' or 'A&P'; these feed the
# adjudication batches written below.
sub_df = create_subdataframe(result_df, j_df, reference, threshold=0.8)

In [None]:
def save_to_csv_in_batches(df, type, batch_size=25, output_dir="../outputs/adjudication"):
    """Write ``df`` to disk in consecutive batches, each as a CSV plus a PDF.

    For batch number ``i`` (1-based) two files are produced under
    ``{output_dir}/{type}/``: ``{type}_batch_{i}.csv`` with the batch rows
    (index not written) and ``{type}_batch_{i}.pdf`` rendered from the rows
    that ``ErrorLogger`` reports for that batch.

    The function reads the notebook-level ``k_df`` and ``j_df`` frames, so the
    cells defining them must have been run first.

    Args:
        df: Frame to split; rows are taken in their current order.
        type: Study name, used for the sub-folder and the file-name prefix.
            (The name shadows the builtin inside this function; it is kept so
            that existing calls remain valid.)
        batch_size: Maximum number of rows per batch; must be positive.
        output_dir: Root folder for the batches. Created if it is missing.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # DataFrame.to_csv does not create missing directories, so make sure the
    # target folder exists before the first write.
    batch_dir = f"{output_dir}/{type}"
    os.makedirs(batch_dir, exist_ok=True)

    num_batches = math.ceil(len(df) / batch_size)
    for i in range(num_batches):
        batch_df = df.iloc[i * batch_size : (i + 1) * batch_size]
        file_stem = f"{batch_dir}/{type}_batch_{i + 1}"
        file_name = f"{file_stem}.csv"
        batch_df.to_csv(file_name, index=False)

        # Build the matching PDF from the rows the error logger reports,
        # looked up by index label in the notebook-level input frames.
        error_logger = ErrorLogger(batch_df)
        error_df = error_logger.get_error_df()
        filtered_k_df = k_df.loc[error_df.index]
        filtered_j_df = j_df.loc[error_df.index]
        generator_error = PDFGenerator(filtered_k_df, filtered_j_df, error_df)
        generator_error.convert_to_pdf(f"{file_stem}.pdf")
        print(f"Saved: {file_name}")

In [None]:
# Write the adjudication rows as CSV + PDF batches of at most 25 rows each.
save_to_csv_in_batches(sub_df, type, batch_size=25)

In [None]:
import json

# Output of the processor's generate_gt(); extract_labels() below reads its 'k_label' column.
gt_df = processor.generate_gt()

def _split_label_texts(raw):
    """Return ``(hpi_text, ap_text)`` parsed from one JSON-encoded label cell."""
    # Missing values (None / NaN) and any other non-text cell carry no labels.
    if not isinstance(raw, (str, bytes, bytearray)):
        return None, None
    try:
        items = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None, None
    if not isinstance(items, list):
        return None, None

    hpi_text = None
    ap_text = None
    for item in items:
        if not isinstance(item, dict):
            continue
        labels = item.get('labels')
        # Skip items without a usable label list instead of raising.
        if not isinstance(labels, (list, tuple)) or not labels:
            continue
        label = labels[0]  # Only the first label of an item is considered
        if label == "HPI_Interval_Hx":
            hpi_text = item.get('text')
        elif label == "A&P":
            ap_text = item.get('text')
    return hpi_text, ap_text


def extract_labels(df):
    """Split the JSON labels in ``df['k_label']`` into one text column per section.

    Each cell is expected to hold a JSON list of objects with a ``'text'``
    entry and a ``'labels'`` list. The text of the item whose first label is
    "HPI_Interval_Hx" or "A&P" goes into the column of the same name; when a
    section occurs more than once in a cell, the last occurrence wins.

    Cells that are missing, are not valid JSON, or do not have the expected
    structure yield ``None`` in both columns rather than raising.

    Args:
        df: Frame with a 'k_label' column.

    Returns:
        A DataFrame with columns 'HPI_Interval_Hx' and 'A&P', one row per row
        of ``df``, carrying ``df``'s index so that index-based joins line up
        with the source rows.
    """
    pairs = [_split_label_texts(raw) for raw in df['k_label']]
    return pd.DataFrame(
        {
            'HPI_Interval_Hx': [hpi for hpi, _ in pairs],
            'A&P': [ap for _, ap in pairs],
        },
        index=df.index,
    )

# Turn the ground-truth labels into one text column per section.
new_df = extract_labels(gt_df)

# Attach the two text columns to j_df by index and keep only the reference
# columns plus the two section columns.
new_df = j_df.join(new_df[['HPI_Interval_Hx', 'A&P']], how='inner')
columns_to_keep = reference.columns.tolist() + ['HPI_Interval_Hx', 'A&P']
new_df = new_df[columns_to_keep]

# DataFrame.to_csv does not create missing directories; without this the export
# fails with OSError when ../outputs/gt does not exist yet.
os.makedirs("../outputs/gt", exist_ok=True)
# The index is written as the first CSV column (no index=False here).
new_df.to_csv(f"../outputs/gt/{type}_gt.csv")