<a href="https://colab.research.google.com/github/jasleenkaursandhu/Reproducing-chest-xray-report-generation-boag/blob/main/chexpert_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Candidate base paths, kept for reference only (the active base_path is assigned in later cells)
# base_path = '/Volumes/DATA/DATASET/untitled/content/mimic-cxr-project/' # For local machine
# base_path = '/content/drive/MyDrive/mimic-cxr-project' # For Google Colab

In [1]:
# Import necessary libraries and mount Google Drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import os
from google.colab import drive

# Mount Google Drive at /content/drive so the project folder under MyDrive can be read.
# NOTE: this relies on the google.colab import above, so it is expected to work only on Colab.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
base_path = '/content/drive/MyDrive/mimic-cxr-project'
output_dir = os.path.join(base_path, 'output')

# Sanity check: show what is actually present in the output folder
print("Files in output directory:")
print(os.listdir(output_dir))

# Locations of the labeled CSVs (reference reports plus the two baselines)
reference_csv = os.path.join(output_dir, 'labeled_reference.csv')
random_csv = os.path.join(output_dir, 'labeled_random.csv')
ngram_csv = os.path.join(output_dir, 'labeled_3gram.csv')

# Read each labeled file into its own frame
reference_df = pd.read_csv(reference_csv)
random_df = pd.read_csv(random_csv)
ngram_df = pd.read_csv(ngram_csv)

# Report the (rows, columns) of every frame so size mismatches are visible early
print()
print(f"Reference shape: {reference_df.shape}")
print(f"Random shape: {random_df.shape}")
print(f"N-gram shape: {ngram_df.shape}")

# Column names of the reference frame, in file order
categories = list(reference_df.columns)
print()
print(f"Categories: {categories}")

Files in output directory:
['reference_input.csv', 'random_input.csv', '.ipynb_checkpoints', 'densenet121_train.pkl', 'densenet121_test.pkl', 'top100_neighbors.pkl', 'cider_comparison.png', 'bleu_comparison.png', 'ngram_bleu_comparison.png', 'ngram_cider_comparison.png', 'reference.tsv', 'reference_chexpert_format.csv', 'labeled_reference.csv', 'labeled_3gram.csv', 'labeled_random.csv', 'random.tsv', '3-gram.tsv', 'knn.tsv', 'knn_bleu_comparison.png', 'knn_cider_comparison.png']

Reference shape: (940, 15)
Random shape: (382, 15)
N-gram shape: (380, 15)

Categories: ['Reports', 'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Lesion', 'Lung Opacity', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture', 'Support Devices']


In [None]:
import csv
from time import strftime, gmtime
import os

# Define the base path
base_path = '/Volumes/DATA/DATASET/untitled/content/mimic-cxr-project/'

# Source: tab-separated lines of the form "<dicom_id>\t<report text>"
input_file = os.path.join(base_path, "output", "reference.csv")

# Destination: single-column CSV of report texts, written without a header row
output_file = os.path.join(base_path, "output", "reference_headerless.csv")


def _parse_report_line(raw_line):
    """Split one input line into (dicom_id, text).

    Returns None when the stripped line has fewer than two tab-separated
    fields. Extra tabs inside the report text are preserved by re-joining
    everything after the first field.
    """
    fields = raw_line.strip().split('\t')
    if len(fields) < 2:
        return None
    return fields[0], '\t'.join(fields[1:])


# dicom_id -> report text (a later duplicate id overwrites an earlier one)
ref_reports = {}

with open(input_file, 'r') as f:
    # Peek at the first line to decide whether it is a header
    first_line = f.readline().strip()
    has_header = 'dicom_id' in first_line and '\t' in first_line

    # A header has already been consumed by readline(); otherwise the first
    # line is data and must be parsed like every other line.
    leading_lines = [] if has_header else [first_line]
    for line_source in (leading_lines, f):
        for raw_line in line_source:
            parsed = _parse_report_line(raw_line)
            if parsed is not None:
                dicom_id, text = parsed
                ref_reports[dicom_id] = text

# Emit one row per report, ordered by dicom_id, containing only the text column
with open(output_file, 'w', newline='') as f:
    writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
    writer.writerows([ref_reports[dicom_id]] for dicom_id in sorted(ref_reports))

print(f"Processed {len(ref_reports)} reports")
print(f"Output written to {output_file}")
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

Processed 940 reports
Output written to /Volumes/DATA/DATASET/untitled/content/mimic-cxr-project/output/reference_headerless.csv
2025-04-10 22:55:02


In [None]:
docker_image_name = 'uwizeye2/chexpert-labeler:amd64'
!docker run --platform linux/amd64 -v {base_path}output:/data {docker_image_name} python label.py --reports_path /data/reference_headerless.csv --output_path /data/labeled_reference.csv --verbose

100%|██████████| 940/940 [00:00<00:00, 1008.31it/s]
 53%|█████▎    | 500/940 [05:29<09:31,  1.30s/it]ERROR:root:No parse tree for sentence: 0
NoneType: None
 62%|██████▏   | 586/940 [06:37<03:08,  1.88it/s]ERROR:root:Cannot process sentence 152 in 586
Traceback (most recent call last):
  File "/app/chexpert-labeler/NegBio/negbio/pipeline/ptb2ud.py", line 120, in convert_doc
    has_lemmas=self._backend == 'jpype')
TypeError: 'NoneType' object is not iterable
 91%|█████████▏| 859/940 [10:11<01:03,  1.28it/s]ERROR:root:Cannot process sentence 31 in 859
Traceback (most recent call last):
  File "/app/chexpert-labeler/NegBio/negbio/pipeline/ptb2ud.py", line 120, in convert_doc
    has_lemmas=self._backend == 'jpype')
TypeError: 'NoneType' object is not iterable
100%|██████████| 940/940 [11:08<00:00,  1.57it/s]
100%|██████████| 940/940 [00:00<00:00, 71794.12it/s]
Loading mention phrases for 14 observations.
Loading unmention phrases for 3 observations.
Extracting mentions..

In [None]:
# Define evaluation function with sample matching
def _pair_rows(true_df, pred_df):
    """Pick the rows of ``true_df`` and ``pred_df`` that are scored against each other.

    Strategies are tried in this order:
      1. Rows whose index labels occur in both frames.
      2. If no index label is shared and both frames have a 'Reports' column,
         rows whose report text occurs in both frames (for duplicated text the
         last occurrence in each frame is used).
      3. Otherwise the first ``min(len(true_df), len(pred_df))`` rows of each frame.

    Returns:
        tuple: ``(true_subset, pred_subset)`` in matching row order.
    """
    common_indices = true_df.index.intersection(pred_df.index)
    if len(common_indices) > 0:
        # NOTE(review): when both frames carry a default 0..n-1 index (e.g. loaded
        # with pd.read_csv and no index column) this pairs rows purely by position,
        # which is only meaningful if both files list the same studies in the same
        # order -- confirm against how the labeled files were produced.
        print(f"Found {len(common_indices)} common indices")
        return true_df.loc[common_indices], pred_df.loc[common_indices]

    if 'Reports' in true_df.columns and 'Reports' in pred_df.columns:
        # Map report text -> positional row number (not index label)
        true_reports = {report: i for i, report in enumerate(true_df['Reports'].values)}
        pred_reports = {report: i for i, report in enumerate(pred_df['Reports'].values)}
        # Keep the order of first appearance in true_df so the pairing is deterministic
        common_reports = [report for report in true_reports if report in pred_reports]
        if common_reports:
            print(f"Found {len(common_reports)} common reports by text matching")
            # The stored values are positions, so they must be used with .iloc.
            # This branch is only reached when the index labels do not overlap,
            # which is exactly when .loc with positions would fail or mis-select.
            true_subset = true_df.iloc[[true_reports[report] for report in common_reports]]
            pred_subset = pred_df.iloc[[pred_reports[report] for report in common_reports]]
            return true_subset, pred_subset
        # No matching by text either, fall back to the first rows
        print("No common samples found. Using first min(len1, len2) rows.")
    else:
        # No reports column, fall back to matching row by row
        print("No common indices found. Using first min(len1, len2) rows.")

    min_rows = min(len(true_df), len(pred_df))
    return true_df.iloc[:min_rows], pred_df.iloc[:min_rows]


def calculate_f1(true_df, pred_df):
    """Compute per-category F1 scores plus their macro average.

    Rows of the two frames are paired first (shared index labels, then shared
    'Reports' text, then leading rows). Every column present in both frames,
    except 'Reports', is treated as a category. A cell counts as positive only
    when it equals 1; any other value, including missing values, counts as
    negative.

    Args:
        true_df: DataFrame holding the reference labels.
        pred_df: DataFrame holding the labels to evaluate.

    Returns:
        dict: category name -> F1 score, plus the key 'Macro Average' holding
        the unweighted mean of the per-category scores (NaN when the frames
        share no category column).
    """
    true_subset, pred_subset = _pair_rows(true_df, pred_df)

    # Ensure all categories exist in both dataframes
    categories = [col for col in true_subset.columns if col in pred_subset.columns and col != 'Reports']

    # Fill missing values with -2 (not mentioned)
    true_subset = true_subset[categories].fillna(-2)
    pred_subset = pred_subset[categories].fillna(-2)

    # For F1 score, only positive mentions (1) count as positive
    true_binary = (true_subset == 1).astype(int)
    pred_binary = (pred_subset == 1).astype(int)

    # Calculate F1 per category; zero_division=0 scores a category with no
    # positives on either side as 0 instead of emitting a warning
    f1_scores = {
        category: f1_score(true_binary[category], pred_binary[category], zero_division=0)
        for category in categories
    }

    # Add macro average (explicit NaN avoids numpy's empty-mean warning)
    f1_scores['Macro Average'] = np.mean(list(f1_scores.values())) if f1_scores else float('nan')

    return f1_scores

In [None]:
# Random baseline: score its labels against the reference labels
print("Evaluating Random Model...")
random_f1_scores = calculate_f1(reference_df, random_df)

# One line per category (the macro average is the last entry of the dict)
print("\nRandom model F1 scores:")
for label_name in random_f1_scores:
    print(f"{label_name}: {random_f1_scores[label_name]:.4f}")

# Macro-F1 value recorded from the paper for the random baseline
paper_random_f1 = 0.148
random_macro_f1 = random_f1_scores['Macro Average']
print("\nComparison with paper:")
print(f"Random model macro-F1: Ours = {random_macro_f1:.4f}, Paper = {paper_random_f1:.4f}")

Evaluating Random Model...
Found 382 common indices

Random model F1 scores:
No Finding: 0.2446
Enlarged Cardiomediastinum: 0.1124
Cardiomegaly: 0.3016
Lung Lesion: 0.1000
Lung Opacity: 0.4294
Edema: 0.1250
Consolidation: 0.0000
Pneumonia: 0.0000
Atelectasis: 0.2700
Pneumothorax: 0.0000
Pleural Effusion: 0.3448
Pleural Other: 0.0000
Fracture: 0.1081
Support Devices: 0.5160
Macro Average: 0.1823

Comparison with paper:
Random model macro-F1: Ours = 0.1823, Paper = 0.1480


In [None]:
# N-gram baseline: score its labels against the reference labels
print("Evaluating N-gram Model...")
ngram_f1_scores = calculate_f1(reference_df, ngram_df)

# One line per category (the macro average is the last entry of the dict)
print("\nN-gram model F1 scores:")
for label_name in ngram_f1_scores:
    print(f"{label_name}: {ngram_f1_scores[label_name]:.4f}")

# Macro-F1 value recorded from the paper for the n-gram baseline
paper_ngram_f1 = 0.185
ngram_macro_f1 = ngram_f1_scores['Macro Average']
print("\nComparison with paper:")
print(f"N-gram model macro-F1: Ours = {ngram_macro_f1:.4f}, Paper = {paper_ngram_f1:.4f}")

Evaluating N-gram Model...
Found 380 common indices

N-gram model F1 scores:
No Finding: 0.1455
Enlarged Cardiomediastinum: 0.0870
Cardiomegaly: 0.3217
Lung Lesion: 0.0000
Lung Opacity: 0.3295
Edema: 0.1818
Consolidation: 0.0444
Pneumonia: 0.0000
Atelectasis: 0.2485
Pneumothorax: 0.0476
Pleural Effusion: 0.2116
Pleural Other: 0.2222
Fracture: 0.0000
Support Devices: 0.4556
Macro Average: 0.1640

Comparison with paper:
N-gram model macro-F1: Ours = 0.1640, Paper = 0.1850
