<a href="https://colab.research.google.com/github/jasleenkaursandhu/Reproducing-chest-xray-report-generation-boag/blob/referencing-report-findings-with-dicom-files/create_reference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
# !pip install pydicom
from time import gmtime, strftime
import numpy as np
import pandas as pd
import os
import pydicom
import tqdm
import re

# Import the parser from the existing module
import sys
# Project root. Set the MIMIC_CXR_BASE_PATH environment variable to run this notebook
# on another machine (e.g. Colab); without it the original local path is used.
base_path = os.environ.get(
    "MIMIC_CXR_BASE_PATH",
    "/Users/simeon/Documents/DLH/content/mimic-cxr-project",
)
# Make report_parser importable; guard so re-running the cell does not add duplicates.
if base_path not in sys.path:
    sys.path.append(base_path)
from report_parser import parse_report, MIMIC_RE  # Import existing implementation

# Project directories, all rooted at base_path
data_dir, files_path, output_dir = (
    os.path.join(base_path, sub_dir) for sub_dir in ('data', 'new_files', 'output')
)
os.makedirs(output_dir, exist_ok=True)

# Load the tab-separated test split and show a quick preview
test_tsv = os.path.join(data_dir, 'test.tsv')
test_df = pd.read_csv(test_tsv, sep='\t')
print(f"Test data shape: {test_df.shape}")
display(test_df.head())

# Create reference reports: map each test image (dicom_id) to the 'findings'
# entry of the parsed report for its study.
ref_reports = {}

# Count every reason an image ends up without a reference, so gaps in the
# reference set are reported instead of being dropped silently.
n_missing_file = 0
n_no_findings = 0
n_errors = 0

# Loop-invariant root of the report tree: <base_path>/reports/files
reports_root = os.path.join(base_path, 'reports', 'files')

# Process each test case (itertuples avoids building a Series per row)
for row in tqdm.tqdm(test_df.itertuples(index=False), total=len(test_df)):
    dicom_id = row.dicom_id
    subject_id = row.subject_id
    study_id = row.study_id

    # Report location: p<first two chars of subject id>/p<subject id>/s<study id>.txt
    subject_prefix = f"p{str(subject_id)[:2]}"
    subject_dir = f"p{subject_id}"
    study_dir = f"s{study_id}"
    report_file = f"{study_dir}.txt"
    report_path = os.path.join(reports_root, subject_prefix, subject_dir, report_file)

    if not os.path.exists(report_path):
        n_missing_file += 1
        continue

    try:
        report = parse_report(report_path)
    except Exception as e:
        # Best effort: one unreadable report must not abort the whole run.
        n_errors += 1
        print(f"Error processing {dicom_id}: {e}")
        continue

    if 'findings' in report:
        ref_reports[dicom_id] = report['findings']
    else:
        n_no_findings += 1

print(
    f"Skipped images: {n_missing_file} with no report file, "
    f"{n_no_findings} with no findings section, {n_errors} with parse errors"
)
print(f"Created references for {len(ref_reports)} test images")

# Save reference reports as a two-column TSV (dicom_id, text), one image per row
timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
print(timestamp)

pred_file = os.path.join(output_dir, 'reference.tsv')
print(f"Saving references to {pred_file}")

# Tabs would add columns and CR/LF would split a record over several rows,
# so each of them is replaced by a single space before writing.
tsv_unsafe_to_space = str.maketrans({'\t': ' ', '\r': ' ', '\n': ' '})

# Explicit UTF-8 so the file does not depend on the platform's default encoding
with open(pred_file, 'w', encoding='utf-8') as f:
    print('dicom_id\ttext', file=f)
    for dicom_id, text in sorted(ref_reports.items()):
        cleaned_text = text.translate(tsv_unsafe_to_space)
        print(f'{dicom_id}\t{cleaned_text}', file=f)

print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))


Test data shape: (1757, 3)


Unnamed: 0,subject_id,study_id,dicom_id
0,11697435,51704799,20386a2d-1f7a8868-f12e22ac-0d625d27-4c38c8e2
1,17555214,53755869,63100eab-9e8a8d90-392bc822-325de482-69a64e3b
2,17555214,57596800,17269efa-b016a94d-1361e8df-ac428071-d1133672
3,17555214,57596800,247d5e7b-66c77989-ca5fec41-608aaa71-eab4c699
4,17555214,58623720,f0924084-7fde1c46-0709acb5-8273482e-b9d7de1b


100%|██████████| 1757/1757 [00:00<00:00, 4066.23it/s]

Created references for 1757 test images
2025-04-20 00:31:41
Saving references to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/reference.tsv
2025-04-20 00:31:41



