<a href="https://colab.research.google.com/github/jasleenkaursandhu/Reproducing-chest-xray-report-generation-boag/blob/referencing-report-findings-with-dicom-files/create_reference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Import necessary libraries
!pip install pydicom
from time import gmtime, strftime
import numpy as np
import pandas as pd
import os
import pydicom
import tqdm
import re

# Mount Google Drive so the project folder under /content/drive is reachable.
# NOTE(review): despite the original "if in Colab" wording, this runs
# unconditionally -- the import below presumably fails outside a Colab
# runtime; confirm before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

# Define the regex helper class and the report parser function
class MIMIC_RE(object):
    """Regex helper that memoises compiled patterns and adds tag utilities.

    Compiled patterns are cached per ``(pattern, flags)`` pair, so repeated
    calls with the same arguments reuse one compiled pattern object.
    """

    def __init__(self):
        # (pattern, flags) -> compiled pattern object.
        self._cached = {}

    def get(self, pattern, flags=0):
        """Return *pattern* compiled with *flags*, compiling it only once."""
        # Key on the tuple itself rather than on its hash, so two different
        # (pattern, flags) pairs whose hashes collide can never share an entry.
        key = (pattern, flags)
        if key not in self._cached:
            self._cached[key] = re.compile(pattern, flags=flags)

        return self._cached[key]

    def sub(self, pattern, repl, string, flags=0):
        """Replace every match of *pattern* in *string* with *repl*."""
        return self.get(pattern, flags=flags).sub(repl, string)

    def rm(self, pattern, string, flags=0):
        """Delete every match of *pattern* from *string*."""
        # Forward ``flags``; previously the argument was accepted but ignored.
        return self.sub(pattern, '', string, flags=flags)

    def get_id(self, tag, flags=0):
        """Return a compiled pattern matching a ``[** ... **]`` marker containing *tag*.

        *tag* is inserted into the pattern verbatim, so it may itself be a
        regex fragment (e.g. an alternation group).
        """
        # NOTE(review): the leading ``.*`` is greedy, so when several markers
        # share a line the match can start at the first ``[**`` and run to the
        # marker containing the tag. Left unchanged to keep existing output.
        return self.get(r'\[\*\*.*{:s}.*?\*\*\]'.format(tag), flags=flags)

    def sub_id(self, tag, repl, string, flags=0):
        """Replace every ``[** ... **]`` marker containing *tag* with *repl*."""
        # Forward ``flags``; previously the argument was accepted but ignored.
        return self.get_id(tag, flags=flags).sub(repl, string)

def parse_report(path):
    """Read a report file, normalise its de-identification markers and split it into sections.

    The text is lower-cased, ``[** ... **]`` markers are replaced by
    upper-case placeholders (LOC, DATE, HOSPITAL, ...), clock times become
    ``TIME``, runs of two or more underscores and one boilerplate sentence are
    removed, and what remains is split on ``title:`` headers that start a line.

    Returns a dict mapping each stripped header title to its paragraph, with
    whitespace runs collapsed to one space and any remaining newline written
    as a literal backslash followed by ``n``. Text before the first header is
    not returned; a repeated title keeps only its last paragraph.
    """
    rx = MIMIC_RE()
    with open(path,'r') as handle:
        text = handle.read().lower()

    # (tag pattern, placeholder) pairs, applied strictly in this order.
    tagged_placeholders = (
        (r'(?:location|address|university|country|state|unit number)', 'LOC'),
        (r'(?:year|month|day|date)', 'DATE'),
        (r'(?:hospital)', 'HOSPITAL'),
        (r'(?:identifier|serial number|medical record number|social security number|md number)', 'ID'),
        (r'(?:age)', 'AGE'),
        (r'(?:phone|pager number|contact info|provider number)', 'PHONE'),
        (r'(?:name|initial|dictator|attending)', 'NAME'),
        (r'(?:company)', 'COMPANY'),
        (r'(?:clip number)', 'CLIP_NUM'),
    )
    for tag_pattern, placeholder in tagged_placeholders:
        text = rx.sub_id(tag_pattern, placeholder, text)

    # Markers whose whole content is a bare numeric date.
    bracketed_date = (
        r'\[\*\*(?:'
            r'\d{4}'  # 1970
            r'|\d{0,2}[/-]\d{0,2}'  # 01-01
            r'|\d{0,2}[/-]\d{4}'  # 01-1970
            r'|\d{0,2}[/-]\d{0,2}[/-]\d{4}'  # 01-01-1970
            r'|\d{4}[/-]\d{0,2}[/-]\d{0,2}'  # 1970-01-01
        r')\*\*\]'
    )
    text = rx.sub(bracketed_date, 'DATE', text)
    # Any marker still left after the specific rules becomes OTHER.
    text = rx.sub(r'\[\*\*.*\*\*\]', 'OTHER', text)
    text = rx.sub(r'(?:\d{1,2}:\d{2})', 'TIME', text)

    text = rx.rm(r'_{2,}', text, flags=re.MULTILINE)
    text = rx.rm(r'the study and the report were reviewed by the staff radiologist.', text)

    # Each header's paragraph runs from the end of that header to the start of
    # the next one (or to the end of the text for the last header).
    headers = list(rx.get(r'^(?P<title>[ \w()]+):', flags=re.MULTILINE).finditer(text))
    sections = {}
    for idx, header in enumerate(headers):
        body_start = header.end()
        if idx + 1 < len(headers):
            body_end = headers[idx + 1].start()
        else:
            body_end = None

        body = rx.sub(r'\s{2,}', ' ', text[body_start:body_end]).strip()
        sections[header.group('title').strip()] = body.replace('\n', '\\n')

    return sections

# Define paths
# Everything is read from and written to one project folder on the mounted Drive.
base_path = '/content/drive/MyDrive/mimic-cxr-project'
data_dir = os.path.join(base_path, 'data')      # holds test.tsv (read below)
files_path = os.path.join(base_path, 'files')   # root of the per-subject report tree
output_dir = os.path.join(base_path, 'output')  # reference.tsv is written here
# Create the output folder up front; exist_ok avoids an error on re-runs.
os.makedirs(output_dir, exist_ok=True)

# Get test data
# Tab-separated file; the loop further down reads its 'dicom_id',
# 'subject_id' and 'study_id' columns.
test_df = pd.read_csv(os.path.join(data_dir, 'test.tsv'), sep='\t')
print(f"Test data shape: {test_df.shape}")
# NOTE(review): `display` is not imported in this file -- presumably the
# IPython/Colab builtin; confirm before running as a plain script.
display(test_df.head())

# Create reference reports: dicom_id -> text of the parsed 'findings' section.
ref_reports = {}

# Record why a row produced no reference, so dropped rows are visible in the
# cell output instead of disappearing silently.
missing_reports = []    # report paths that do not exist on disk
no_findings = []        # reports that parsed but have no 'findings' section
failed_reports = []     # (report path, exception) pairs raised while parsing

# Process each test case
for i, row in tqdm.tqdm(test_df.iterrows(), total=len(test_df)):
    dicom_id = row['dicom_id']
    subject_id = row['subject_id']
    study_id = row['study_id']

    # Construct path to the report:
    # <files_path>/p<first two chars of subject_id>/p<subject_id>/s<study_id>.txt
    subject_prefix = f"p{str(subject_id)[:2]}"
    subject_dir = f"p{subject_id}"
    study_dir = f"s{study_id}"
    report_file = f"{study_dir}.txt"
    report_path = os.path.join(files_path, subject_prefix, subject_dir, report_file)

    if not os.path.exists(report_path):
        missing_reports.append(report_path)
        continue

    try:
        report = parse_report(report_path)
    except Exception as e:
        # Stay best-effort (one unreadable report must not abort the run),
        # but keep the error so it can be reported below.
        failed_reports.append((report_path, e))
        continue

    if 'findings' in report:
        ref_reports[dicom_id] = report['findings']
    else:
        no_findings.append(report_path)

print(f"Created references for {len(ref_reports)} test images")
print(
    f"Skipped {len(missing_reports)} rows with no report file, "
    f"{len(no_findings)} reports without a 'findings' section, "
    f"{len(failed_reports)} reports that failed to parse"
)
for failed_path, failed_error in failed_reports:
    print(f"  parse failure: {failed_path}: {failed_error!r}")

# Write the references to <output_dir>/reference.tsv, printing a UTC
# timestamp before and after the write.
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

pred_file = os.path.join(output_dir, 'reference.tsv')
print(f"Saving references to {pred_file}")

# Build every data row first, ordered by dicom_id, then write them in one go.
tsv_rows = []
for image_id in sorted(ref_reports):
    # Tabs inside the text would add columns, so replace each with a space.
    findings = ref_reports[image_id].replace('\t', ' ')
    tsv_rows.append(f'{image_id}\t{findings}\n')

with open(pred_file, 'w') as out:
    out.write('dicom_id\ttext\n')
    out.writelines(tsv_rows)

print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

Collecting pydicom
  Downloading pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pydicom-3.0.1-py3-none-any.whl (2.4 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/2.4 MB[0m [31m18.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-3.0.1
Mounted at /content/drive
Test data shape: (382, 3)


Unnamed: 0,subject_id,study_id,dicom_id
0,14113248,55638009,fea90b5d-a059ecc8-b5e68f8d-b7f33ed9-1d32d429
1,10094629,54014505,06ba097a-bf8b917c-851d0e95-886936a1-90964781
2,15038651,55622460,648449fa-5e173b2a-87663d57-4a0fbfc0-138e42c7
3,17555813,51695858,7aadbf77-54eda0e2-91288c15-83113d92-e414ded5
4,14175615,50500226,1c50eb4e-cea17351-b9ba5502-b20d7a71-ea827c80


100%|██████████| 382/382 [05:04<00:00,  1.25it/s]


Created references for 349 test images
2025-04-06 02:26:31
Saving references to /content/drive/MyDrive/mimic-cxr-project/output/reference.tsv
2025-04-06 02:26:32
