<a href="https://colab.research.google.com/github/jasleenkaursandhu/Reproducing-chest-xray-report-generation-boag/blob/main/random.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Random Baseline Model for Report Generation
# This notebook implements a simple random baseline model for chest X-ray report generation.
# For each test image, the baseline randomly selects a training-set report that has a findings section and uses those findings as the prediction.

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from PIL import Image
import tqdm
from collections import defaultdict
import pickle
import gzip
import random
import re
import warnings
!pip install pydicom
import pydicom
from collections import Counter, defaultdict
from time import gmtime, strftime

Collecting pydicom
  Downloading pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pydicom-3.0.1-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-3.0.1


In [3]:
# Mount Google Drive and make sure the project working directories exist
from google.colab import drive
drive.mount('/content/drive')
base_path = '/content/drive/MyDrive/mimic-cxr-project'

# Create the sub-directories with os.makedirs rather than `!mkdir -p {base_path}/...`:
# the shell form interpolates the path unquoted, so it would break if base_path
# ever contained spaces or shell metacharacters. exist_ok=True keeps the cell
# safe to re-run.
for _subdir in ('data', 'output', 'features'):
    os.makedirs(os.path.join(base_path, _subdir), exist_ok=True)

Mounted at /content/drive


In [4]:
# Import the report parser module
import sys

# Make the project's `modules` directory importable. The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate sys.path entry.
modules_dir = f"{base_path}/modules"
if modules_dir not in sys.path:
    sys.path.append(modules_dir)

# parse_report is used later to read a report file; MIMIC_RE is imported alongside it.
from report_parser import parse_report, MIMIC_RE
print("Successfully imported report parser module")

Successfully imported report parser module


In [5]:
# Resolve the project sub-directories used by the rest of the notebook
data_dir, files_path, output_dir, features_dir = (
    os.path.join(base_path, folder)
    for folder in ('data', 'files', 'output', 'features')
)

# Read the tab-separated train / test tables (train first, then test)
train_df, test_df = (
    pd.read_csv(os.path.join(data_dir, tsv_name), sep='\t')
    for tsv_name in ('train.tsv', 'test.tsv')
)

# Report the size of each split
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

Train data shape: (2243, 3)
Test data shape: (871, 3)


In [6]:
print("Implementing Random Baseline Model")

# Directory under which the report text files are looked up
files_path = os.path.join(base_path, 'files')

# Build the dicom_id -> report identifier mapping from the training table.
# If the identifier column is absent, warn and fall back to an empty mapping.
report_id_column = 'study_id'
if report_id_column not in train_df.columns:
    print(f"Warning: {report_id_column} not found in columns: {train_df.columns.tolist()}")
    report_lookup = {}
else:
    report_lookup = {
        dicom: report_id
        for dicom, report_id in train_df[['dicom_id', report_id_column]].values
    }
    print(f"Created lookup using {report_id_column}")

# Show the first five entries as a sanity check
print("Sample of lookup dictionary:")
first_entries = list(report_lookup.items())[:5]
print(dict(first_entries))

Implementing Random Baseline Model
Created lookup using study_id
Sample of lookup dictionary:
{'f214e3d8-00d5e538-62592a72-37cf2660-0ec426a1': 58033516, 'c6f2ef94-1c46417f-1cdb50d7-bbae3a47-5dd45401': 57744330, 'f7a4e18f-004ac053-81024d1b-568dbb86-b20f308b': 50301279, '500bbbdf-966e91a0-474e045e-81e494ac-7c6124f7': 57457041, 'c0eb8f9c-b404b698-4b47abf9-cea216fd-27bea26f': 59435834}


In [7]:
# Generate random reports for each test image.
#
# For every test image, training images are drawn uniformly at random until one
# is found whose report file exists, parses, and contains a 'findings' section;
# that section becomes the prediction. At most `max_attempts` draws are made per
# test image.

RANDOM_SEED = 42     # fixed seed so the baseline is reproducible across runs
max_attempts = 100   # limit attempts per test image to avoid infinite loops

# Private RNG instance: seeding it does not disturb the global `random` state
# that other cells may rely on.
rng = random.Random(RANDOM_SEED)

# Candidate pool, materialised once instead of on every draw.
train_dicoms = train_df.dicom_id.tolist()

# dicom_id -> subject_id, built once so each draw is a dict lookup rather than a
# full DataFrame scan. setdefault keeps the FIRST row per dicom_id, matching the
# previous `subject_row.iloc[0]` behaviour.
subject_lookup = {}
for _dicom, _subject in zip(train_df['dicom_id'], train_df['subject_id']):
    subject_lookup.setdefault(_dicom, _subject)

findings_cache = {}   # dicom_id -> findings text, or None if unusable
parse_failures = {}   # report_path -> repr(exception) for reports that failed to parse


def _load_findings(dicom_id):
    """Return the 'findings' section for a training image, or None if unavailable.

    None is returned when the dicom has no entry in `report_lookup` or
    `subject_lookup`, when the report file does not exist, when parsing raises,
    or when the parsed report has no 'findings' key. Results (including None)
    are cached per dicom_id so a report is read and parsed at most once.
    """
    if dicom_id in findings_cache:
        return findings_cache[dicom_id]

    findings = None
    if dicom_id in report_lookup and dicom_id in subject_lookup:
        report_id = report_lookup[dicom_id]
        subject_id = subject_lookup[dicom_id]

        # Report path layout: <files>/p<first two chars of subject>/p<subject>/s<study>.txt
        report_path = os.path.join(
            files_path,
            f"p{str(subject_id)[:2]}",
            f"p{subject_id}",
            f"s{report_id}.txt",
        )

        try:
            if os.path.exists(report_path):
                report = parse_report(report_path)
                if 'findings' in report:
                    findings = report['findings']
        except Exception as e:
            # Best effort: skip this report and try another, but remember the
            # failure so it can be reported instead of vanishing silently.
            parse_failures[report_path] = repr(e)

    findings_cache[dicom_id] = findings
    return findings


generated_reports = {}

for pred_dicom in tqdm.tqdm(test_df.dicom_id.values):
    for _ in range(max_attempts):
        # Randomly select a training image
        candidate_dicom = rng.choice(train_dicoms)
        findings = _load_findings(candidate_dicom)
        if findings is not None:
            generated_reports[pred_dicom] = findings
            break
    else:
        # Loop finished without a break: no usable report was found.
        print(f"Warning: Could not find a valid report for {pred_dicom} after {max_attempts} attempts")

if parse_failures:
    _example_path, _example_error = next(iter(parse_failures.items()))
    print(f"Note: {len(parse_failures)} training report(s) raised while parsing and were skipped "
          f"(e.g. {_example_path}: {_example_error})")

print(f"Generated random reports for {len(generated_reports)}/{len(test_df)} test images")

100%|██████████| 871/871 [12:13<00:00,  1.19it/s]

Generated random reports for 871/871 test images





In [8]:
# Save the generated reports to a TSV file
pred_dir = os.path.join(base_path, 'output')
os.makedirs(pred_dir, exist_ok=True)

pred_file = os.path.join(pred_dir, 'random.tsv')
print(f"Saving predictions to {pred_file}")

# The file is written as one record per line with tab-separated fields, so a
# tab, newline or carriage return inside the text would corrupt the row.
# Each such character is replaced by a single space.
_tsv_unsafe = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})

# Explicit encoding so the output does not depend on the platform default.
with open(pred_file, 'w', encoding='utf-8') as f:
    print('dicom_id\tgenerated', file=f)
    # Sorted by dicom_id so the file contents are stable for a given set of predictions.
    for dicom_id, generated in sorted(generated_reports.items()):
        cleaned_text = generated.translate(_tsv_unsafe)
        print(f'{dicom_id}\t{cleaned_text}', file=f)

print(f"Saved random baseline predictions to {pred_file}")
print(f"Current time: {strftime('%Y-%m-%d %H:%M:%S', gmtime())}")

Saving predictions to /content/drive/MyDrive/mimic-cxr-project/output/random.tsv
Saved random baseline predictions to /content/drive/MyDrive/mimic-cxr-project/output/random.tsv
Current time: 2025-04-21 02:39:49


In [9]:
# Show a short preview of the first few generated reports (up to three)
sample_count = min(3, len(generated_reports))
sample_dicoms = list(generated_reports)[:sample_count]

preview_limit = 200  # maximum number of characters shown per report

for dicom_id in sample_dicoms:
    print(f"\nSample report for {dicom_id}:")
    report_text = generated_reports[dicom_id]

    # Long reports are cut at the limit and marked with an ellipsis
    is_long = len(report_text) > preview_limit
    print((report_text[:preview_limit] + "...") if is_long else report_text)


Sample report for 6b31c47c-641e9905-9713bf8c-5d916738-736686e1:
pa and lateral views of the chest provided. compared to prior study from 2 days ago, left lung and right upper lung consolidation have essentially cleared. there is persistent right lung base opacity....

Sample report for ea38e7bb-cba206f9-4189840d-16b60305-bf6c4b9a:
single ap view of the chest demonstrates clear lungs. mild left basilar atelectasis with no pleural effusion or pneumothorax. the cardiac, hilar, and mediastinal contours are normal. tracheal air colu...

Sample report for 0ff89f65-95a04de8-2fe6671c-5fcd970e-32c68b41:
portable ap semi-erect chest film TIME is submitted.
