This notebook reuses the output files of the system message test to extract the metrics for the benchmark. This avoids an expensive recomputation of the eICU tasks.

In [10]:
import pandas as pd
import numpy as np
import os
import re
import json
import sys

sys.path.append(r"C:\Users\janbe\Documents\GitHub Repos\pulse")
from src.eval.metrics import calculate_all_metrics, calculate_auprc, calculate_auroc

In [11]:
# Folder of the run whose metadata files are post-processed below; the metrics
# report is also written into this folder.
# NOTE(review): hardcoded, machine-specific absolute path — adjust before running elsewhere.
outputfolder_path = r"C:\Users\janbe\Documents\GitHub Repos\pulse\output_to_keep\results_benchmark\20250613_112431_DeepseekR1Llama8b"

In [12]:
def categorize_files(outputfolder_path_list):
    """
    Categorize files in the output folders into metrics report files, metadata files, and log files.

    Every folder is walked recursively. A file is assigned to a category when
    its *file name* contains the category keyword ("metrics_report",
    "metadata" or "log"). Directory names are deliberately ignored so that a
    parent folder such as "catalog" does not turn every file below it into a
    log file. A file may appear in more than one category if its name contains
    several keywords.

    Args:
        outputfolder_path_list (list): List of output folder paths.

    Returns:
        dict: A dictionary with the keys "metrics_report_files",
            "metadata_files" and "log_files", each mapping to a list of full
            file paths.
    """
    file_list = []
    for outputfolder_path in outputfolder_path_list:
        # List all files in the folder and its subfolders
        for root, dirs, files in os.walk(outputfolder_path):
            # Sorting in place makes the traversal (and therefore the result
            # order) independent of the file system's listing order.
            dirs.sort()
            file_list.extend(os.path.join(root, file) for file in sorted(files))

    def _files_named(keyword):
        # Match against the base name only, never against parent directories.
        return [f for f in file_list if keyword in os.path.basename(f)]

    categorized_files = {
        "metrics_report_files": _files_named("metrics_report"),
        "metadata_files": _files_named("metadata"),
        "log_files": _files_named("log"),
    }

    print("Metrics Report Files:")
    for file in categorized_files["metrics_report_files"]:
        print(file)
    print("\nMetadata Files:")
    for file in categorized_files["metadata_files"]:
        print(file)
    print("\nLog Files:")
    for file in categorized_files["log_files"]:
        print(file)

    return categorized_files
    

In [13]:
def load_metadata(metadata_path_list):
    """
    Load metadata CSV files into a single DataFrame and tag every row with run details.

    The run details are parsed from each path, which is expected to end in
    one of the two layouts below (Windows and POSIX separators are both
    accepted):

        <YYYYMMDD_HHMMSS>_<folder suffix>/<model>_<task>_<dataset>_output_metadata.csv
        <YYYYMMDD_HHMMSS>_<folder suffix>/<model>_<task>_<dataset>_<date>_<time>_metadata.csv

    The timestamp is always taken from the parent folder name; the date/time
    embedded in the file name of the second layout is ignored. Paths matching
    neither layout get "Unknown" for all four fields. Files that cannot be
    loaded are reported on stdout and skipped.

    Args:
        metadata_path_list (list): Paths of the metadata CSV files to load.

    Returns:
        pd.DataFrame: Rows of all successfully loaded files (index reset),
            with the extra columns "model_name", "task", "dataset" and
            "timestamp". An empty DataFrame if nothing could be loaded.
    """
    # Accept "\" as well as "/" so the parsing also works outside Windows, and
    # keep the model name from spilling over a path separator.
    output_layout = re.compile(
        r"[\\/]([0-9]{8}_[0-9]{6})_[^\\/]+[\\/]([^_\\/]+)_([^_]+)_([^_]+)_output_metadata\.csv$"
    )
    stamped_layout = re.compile(
        r"[\\/]([0-9]{8}_[0-9]{6})_[^\\/]+[\\/]([^_\\/]+)_([^_]+)_([^_]+)_([^_]+)_([^_]+)_metadata\.csv$"
    )

    frames = []
    for m_path in metadata_path_list:
        try:
            df = pd.read_csv(m_path)

            path_text = str(m_path)
            match = output_layout.search(path_text) or stamped_layout.search(path_text)

            if match:
                # The first four groups are identical in both layouts; the
                # stamped layout's trailing date/time groups are not needed.
                timestamp, model_name, task, dataset = match.groups()[:4]
                print(f"Model Name: {model_name}, Task: {task}, Dataset: {dataset}, Timestamp: {timestamp}")
            else:
                print("Failed to extract metadata details from the path. Using default values.")
                timestamp = model_name = task = dataset = "Unknown"

            # Add extracted metadata to the DataFrame
            df["model_name"] = model_name
            df["task"] = task
            df["dataset"] = dataset
            df["timestamp"] = timestamp

            frames.append(df)

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"Error loading metadata: {e}")
            continue

    if not frames:
        # pd.concat raises on an empty list; callers expect an empty frame instead.
        return pd.DataFrame()

    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [14]:
# Collect the files of the run folder, grouped into metrics report, metadata
# and log files (the grouped paths are also printed).
cat_files = categorize_files([outputfolder_path])

Metrics Report Files:
C:\Users\janbe\Documents\GitHub Repos\pulse\output_to_keep\results_benchmark\20250613_112431_DeepseekR1Llama8b\DeepseekR1Llama8b_metrics_report.json

Metadata Files:
C:\Users\janbe\Documents\GitHub Repos\pulse\output_to_keep\results_benchmark\20250613_112431_DeepseekR1Llama8b\DeepseekR1Llama8b_aki_eicu_20250613_112431_metadata.csv
C:\Users\janbe\Documents\GitHub Repos\pulse\output_to_keep\results_benchmark\20250613_112431_DeepseekR1Llama8b\DeepseekR1Llama8b_mortality_eicu_20250613_112431_metadata.csv
C:\Users\janbe\Documents\GitHub Repos\pulse\output_to_keep\results_benchmark\20250613_112431_DeepseekR1Llama8b\DeepseekR1Llama8b_sepsis_eicu_20250616_152929_metadata.csv

Log Files:
C:\Users\janbe\Documents\GitHub Repos\pulse\output_to_keep\results_benchmark\20250613_112431_DeepseekR1Llama8b\log_20250613_112431.log
C:\Users\janbe\Documents\GitHub Repos\pulse\output_to_keep\results_benchmark\20250613_112431_DeepseekR1Llama8b\log_20250616_152929.log


In [15]:
# Load all metadata CSVs into one frame; load_metadata adds the
# model_name / task / dataset / timestamp columns parsed from each path.
df_metadata = load_metadata(cat_files["metadata_files"])

Model Name: DeepseekR1Llama8b, Task: aki, Dataset: eicu, Timestamp: 20250613_112431
Model Name: DeepseekR1Llama8b, Task: mortality, Dataset: eicu, Timestamp: 20250613_112431
Model Name: DeepseekR1Llama8b, Task: sepsis, Dataset: eicu, Timestamp: 20250613_112431


In [16]:
# Distinct values of the identifying columns; they drive the metric loops further down.
models, tasks, datasets = (
    df_metadata[column].unique() for column in ("model_name", "task", "dataset")
)

print(f"Models: {models}")
print(f"Tasks: {tasks}")
print(f"Datasets: {datasets}")

Models: ['DeepseekR1Llama8b']
Tasks: ['aki' 'mortality' 'sepsis']
Datasets: ['eicu']


In [17]:
# Display the combined metadata frame as a visual sanity check.
df_metadata

Unnamed: 0,Input Prompt,Target Label,Predicted Probability,Predicted Diagnosis,Predicted Explanation,Tokenization Time,Inference Time,Input Tokens,Output Tokens,System Message,System Message Index,model_name,task,dataset,timestamp
0,You are an experienced doctor in Intensive Car...,0,0.00,not-diagnosis,The patient's serum creatinine and blood urea ...,0.006719,18.120857,3125,762,You are a helpful assistant and experienced me...,0,DeepseekR1Llama8b,aki,eicu,20250613_112431
1,You are an experienced doctor in Intensive Car...,0,0.10,not-aki,Kidney function appears stable: creatinine rem...,0.005799,17.041536,3285,712,You are a helpful assistant and experienced me...,1,DeepseekR1Llama8b,aki,eicu,20250613_112431
2,You are an experienced doctor in Intensive Car...,0,0.20,not-aki,"Urine output remains above the threshold, and ...",0.005626,18.262427,3370,765,You are a helpful assistant and experienced me...,2,DeepseekR1Llama8b,aki,eicu,20250613_112431
3,You are an experienced doctor in Intensive Car...,0,0.20,not-aki,"Creatinine and BUN within normal limits, urine...",0.005679,14.153721,3401,592,You are a helpful assistant and experienced me...,3,DeepseekR1Llama8b,aki,eicu,20250613_112431
4,You are an experienced doctor in Intensive Car...,0,0.08,not-aki,Kidney function stable: creatinine 0.90 mg/dL ...,0.005858,14.778650,3472,616,You are a helpful assistant and experienced me...,4,DeepseekR1Llama8b,aki,eicu,20250613_112431
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10315,You are an experienced doctor in Intensive Car...,0,0.80,diagnosis,Elevated C-Reactive Protein (CRP) and high neu...,0.005817,29.650258,3134,990,You are a helpful assistant and experienced me...,0,DeepseekR1Llama8b,sepsis,eicu,20250613_112431
10316,You are an experienced doctor in Intensive Car...,0,0.12,not-sepsis,"The patient's SOFA score is 0, which does not ...",0.006450,23.098510,3323,768,You are a helpful assistant and experienced me...,1,DeepseekR1Llama8b,sepsis,eicu,20250613_112431
10317,You are an experienced doctor in Intensive Car...,0,0.85,sepsis,"Patient meets SOFA criteria with a score of 3,...",0.006494,36.840308,3408,1232,You are a helpful assistant and experienced me...,2,DeepseekR1Llama8b,sepsis,eicu,20250613_112431
10318,You are an experienced doctor in Intensive Car...,0,0.60,sepsis,Patient shows signs of sepsis: elevated C-Reac...,0.006686,67.152123,3439,2238,You are a helpful assistant and experienced me...,3,DeepseekR1Llama8b,sepsis,eicu,20250613_112431


In [18]:
# Compute the benchmark metrics per (model, task, dataset) from the rows of one
# system message variant and store them in the model's metrics report JSON.
# `json` and `os` come from the import cell at the top of the notebook.

results = []
prompting_id = "zhu_2024b_zero_shot_preprocessor"

# Only rows produced with this system message variant enter the benchmark metrics.
SYSTEM_MESSAGE_INDEX = 1

# Metrics copied into the report's "overall" summary.
OVERALL_METRIC_NAMES = [
    "auroc",
    "auprc",
    "normalized_auprc",
    "specificity",
    "f1_score",
    "accuracy",
    "balanced_accuracy",
    "precision",
    "recall",
    "mcc",
    "kappa",
    "minpse",
]

for model in models:
    for task in tasks:
        for dataset in datasets:
            df_subset = df_metadata[
                (df_metadata['model_name'] == model) &
                (df_metadata['task'] == task) &
                (df_metadata['dataset'] == dataset) &
                (df_metadata['System Message Index'] == SYSTEM_MESSAGE_INDEX)
            ]
            if df_subset.empty:
                # Nothing recorded for this combination.
                continue

            # NOTE(review): calculate_all_metrics is project code; it is assumed
            # to return a dict of metric name -> numeric value — confirm.
            metrics = calculate_all_metrics(
                y_true=df_subset['Target Label'],
                y_pred=df_subset['Predicted Probability'],
            )
            metrics['model_name'] = model
            metrics['task'] = task
            metrics['dataset'] = dataset
            results.append(metrics)

            run_id = df_subset["timestamp"].iloc[0]

            # Plain floats keep the summary JSON-serialisable.
            metrics_overall = {
                k: float(v)
                for k, v in metrics.items()
                if k in OVERALL_METRIC_NAMES
            }

            output_dict = {
                "model_id": model,
                "task_id": task,
                "dataset": dataset,
                "prompting_id": prompting_id,
                "run_id": run_id,
                "metrics_summary": {"overall": metrics_overall},
            }

            metrics_report_path = os.path.join(outputfolder_path, f"{model}_metrics_report.json")
            if os.path.exists(metrics_report_path):
                with open(metrics_report_path, "r") as f:
                    try:
                        existing_data = json.load(f)
                    except json.JSONDecodeError:
                        # Unreadable report: start over with the new entry only.
                        existing_data = []
                if not isinstance(existing_data, list):
                    # A report holding a single entry is stored as a bare object.
                    existing_data = [existing_data]
                # Re-running this cell must not duplicate entries: append only
                # when an identical entry is not already stored.
                if output_dict not in existing_data:
                    existing_data.append(output_dict)
                with open(metrics_report_path, "w") as f:
                    json.dump(existing_data, f, indent=2)
            else:
                with open(metrics_report_path, "w") as f:
                    json.dump(output_dict, f, indent=2)

df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,auroc,auprc,normalized_auprc,specificity,f1_score,accuracy,balanced_accuracy,precision,recall,mcc,kappa,minpse,model_name,task,dataset
0,0.586,0.392,1.264,0.82,0.342,0.655,0.554,0.419,0.289,0.123,0.119,0.382,DeepseekR1Llama8b,aki,eicu
1,0.661,0.234,3.344,0.613,0.208,0.62,0.664,0.122,0.714,0.17,0.101,0.135,DeepseekR1Llama8b,mortality,eicu
2,0.52,0.054,1.134,0.569,0.097,0.565,0.529,0.054,0.489,0.025,0.012,0.054,DeepseekR1Llama8b,sepsis,eicu
