<a href="https://colab.research.google.com/github/jasleenkaursandhu/Reproducing-chest-xray-report-generation-boag/blob/main/random.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Random Baseline Model for Report Generation
# This notebook implements a simple random baseline model for chest X-ray report generation.
# For each test image, the baseline randomly selects a report from the training set
# and uses that report's findings section as the generated text.

In [8]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from PIL import Image
import tqdm
from collections import defaultdict
import pickle
import gzip
import random
import re
import warnings
# !pip install pydicom
import pydicom
from collections import Counter, defaultdict
from time import gmtime, strftime

In [9]:
# Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# Project root. The default is the original local path; set the
# MIMIC_CXR_BASE_PATH environment variable to run the notebook on another
# machine (or against a mounted drive) without editing this cell.
base_path = os.environ.get(
    'MIMIC_CXR_BASE_PATH',
    '/Users/simeon/Documents/DLH/content/mimic-cxr-project',
)

# Create the working directories in pure Python rather than `!mkdir -p {...}`:
# the shell form interpolates the path unquoted (so it breaks on paths that
# contain spaces) and depends on a POSIX `mkdir` being available.
for subdir in ('data', 'output', 'features'):
    os.makedirs(os.path.join(base_path, subdir), exist_ok=True)

In [10]:
# Import the report parser module
# The parser is loaded from <base_path>/modules, so that directory is appended
# to sys.path before the import below.
import sys
sys.path.append(f"{base_path}/modules")
# parse_report is called later in the notebook on report file paths; MIMIC_RE is
# imported here but is not referenced in the cells visible in this notebook.
from report_parser import parse_report, MIMIC_RE
print("Successfully imported report parser module")

Successfully imported report parser module


In [11]:
# Resolve the project sub-directories, then read the train/test splits
data_dir, files_path, output_dir, features_dir = (
    os.path.join(base_path, subdir)
    for subdir in ('data', 'files', 'output', 'features')
)

# Both splits are tab-separated files that live side by side in data_dir
train_df, test_df = (
    pd.read_csv(os.path.join(data_dir, split_file), sep='\t')
    for split_file in ('train.tsv', 'test.tsv')
)

print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
# Last expression of the cell: rich display of the training frame
train_df

Train data shape: (4291, 3)
Test data shape: (1757, 3)


Unnamed: 0,subject_id,study_id,dicom_id
0,10294074,52201331,7e95cd84-a6e61229-709150ad-10e6ad91-b535ad52
1,10164170,54545361,42401e7d-fae7b2ef-87642157-68beaada-014bfcc9
2,13832352,56347818,1ef3083e-7ed9110c-e9df3d65-480e18a8-9181ebde
3,17654843,58559853,5000f8fd-684ea279-a1e1308e-cfce9b0c-e1eeae50
4,15803890,52888009,f349a7ef-ee518ad2-d5173f92-cbfa71b2-df530a25
...,...,...,...
4286,10326273,52411457,cd4bf94c-529fc978-8937acbb-998c4d16-e86ac884
4287,10326273,54913222,4b10ef76-4db02379-4f64fcb4-bb9d32bb-bf44e781
4288,10326273,54913222,7dbf38f4-3abd85d3-b6627477-b711c72f-9b5d5f0e
4289,17527219,56922368,d58c2048-57977f42-0e290b59-2ddaefba-8e7ad515


In [12]:
print("Implementing Random Baseline Model")

# Point files_path at the 'new_files' directory (this rebinds the name that the
# data-loading cell set to the 'files' directory)
files_path = os.path.join(base_path, 'new_files')

# Build a dicom_id -> report identifier mapping from the training frame
report_id_column = 'study_id'
if report_id_column not in train_df.columns:
    # Identifier column missing: warn and fall back to an empty mapping
    print(f"Warning: {report_id_column} not found in columns: {train_df.columns.tolist()}")
    report_lookup = {}
else:
    report_lookup = {
        dicom: report_id
        for dicom, report_id in train_df[['dicom_id', report_id_column]].values
    }
    print(f"Created lookup using {report_id_column}")

# Show the first five entries as a sanity check
print("Sample of lookup dictionary:")
print({dicom: report_lookup[dicom] for dicom in list(report_lookup)[:5]})

Implementing Random Baseline Model
Created lookup using study_id
Sample of lookup dictionary:
{'7e95cd84-a6e61229-709150ad-10e6ad91-b535ad52': 52201331, '42401e7d-fae7b2ef-87642157-68beaada-014bfcc9': 54545361, '1ef3083e-7ed9110c-e9df3d65-480e18a8-9181ebde': 56347818, '5000f8fd-684ea279-a1e1308e-cfce9b0c-e1eeae50': 58559853, 'f349a7ef-ee518ad2-d5173f92-cbfa71b2-df530a25': 52888009}


In [13]:
# Generate random reports for each test image.
# For every test dicom, training dicoms are sampled uniformly at random until
# one is found whose parsed report contains a 'findings' section; that text
# becomes the prediction. Sampling stops after max_attempts tries per image.

# Seed the sampler so the baseline (and the saved predictions) are reproducible
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

max_attempts = 100  # Limit attempts to avoid infinite loops

generated_reports = {}

# Define reports directory
reports_dir = os.path.join(base_path, 'reports')

# Build the dicom_id -> subject_id map once instead of filtering train_df on
# every attempt. drop_duplicates keeps the first row per dicom_id, which is the
# row the previous `.iloc[0]` lookup selected.
subject_lookup = (
    train_df.drop_duplicates('dicom_id')
    .set_index('dicom_id')['subject_id']
    .to_dict()
)
train_dicoms = train_df.dicom_id.tolist()

findings_cache = {}   # dicom_id -> findings text, or None when the report is unusable
parse_failures = {}   # report path -> repr of the exception raised while parsing


def _load_findings(dicom_id):
    """Return the findings text of the report for a training dicom, or None.

    None is returned when the dicom has no entry in report_lookup or
    subject_lookup, the report file does not exist, parsing raises, or the
    parsed report has no usable 'findings' entry. Results are cached per
    dicom_id so a report that is sampled repeatedly is only parsed once;
    parser errors are recorded in parse_failures instead of being discarded.
    """
    if dicom_id in findings_cache:
        return findings_cache[dicom_id]

    findings = None
    report_id = report_lookup.get(dicom_id)
    subject_id = subject_lookup.get(dicom_id)
    if report_id is not None and subject_id is not None:
        # Layout: <reports_dir>/files/p<first two chars of subject id>/p<subject id>/s<study id>.txt
        report_path = os.path.join(
            reports_dir,
            'files',
            f"p{str(subject_id)[:2]}",
            f"p{subject_id}",
            f"s{report_id}.txt",
        )
        if os.path.exists(report_path):
            try:
                report = parse_report(report_path)
            except Exception as exc:
                # Best effort: skip this report, but keep a record of why
                parse_failures[report_path] = repr(exc)
            else:
                if 'findings' in report:
                    findings = report['findings']

    findings_cache[dicom_id] = findings
    return findings


for pred_dicom in tqdm.tqdm(test_df.dicom_id.values):
    for _ in range(max_attempts):
        # Randomly select a training image and try to use its findings
        findings = _load_findings(random.choice(train_dicoms))
        if findings is not None:
            generated_reports[pred_dicom] = findings
            break
    else:
        # Loop finished without a break: every attempt hit an unusable report
        print(f"Warning: Could not find a valid report for {pred_dicom} after {max_attempts} attempts")

print(f"Generated random reports for {len(generated_reports)}/{len(test_df)} test images")
if parse_failures:
    print(f"Skipped {len(parse_failures)} training reports that failed to parse")

100%|██████████| 1757/1757 [00:00<00:00, 2636.49it/s]

Generated random reports for 1757/1757 test images





In [14]:
# Save the generated reports to a TSV file (header row, then one
# `dicom_id<TAB>generated` record per line, sorted by dicom_id)
pred_dir = os.path.join(base_path, 'output')
os.makedirs(pred_dir, exist_ok=True)

pred_file = os.path.join(pred_dir, 'random.tsv')
print(f"Saving predictions to {pred_file}")

# Each record must stay on one line with exactly one tab. Replace tabs AND line
# breaks inside the report text with spaces: a stray newline or carriage return
# would otherwise split a record across rows and corrupt the file.
tsv_unsafe_chars = str.maketrans({'\t': ' ', '\r': ' ', '\n': ' '})

# Explicit encoding so the file does not depend on the platform default
with open(pred_file, 'w', encoding='utf-8') as f:
    print('dicom_id\tgenerated', file=f)
    for dicom_id, generated in sorted(generated_reports.items()):
        cleaned_text = generated.translate(tsv_unsafe_chars)
        print(f'{dicom_id}\t{cleaned_text}', file=f)

print(f"Saved random baseline predictions to {pred_file}")
print(f"Current time: {strftime('%Y-%m-%d %H:%M:%S', gmtime())}")

Saving predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/random.tsv
Saved random baseline predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/random.tsv
Current time: 2025-04-20 00:36:32


In [15]:
# Preview up to three generated reports, in the dict's insertion order
sample_count = min(3, len(generated_reports))
sample_dicoms = list(generated_reports)[:sample_count]

for dicom_id in sample_dicoms:
    report_text = generated_reports[dicom_id]
    # Reports longer than 200 characters are cut there and marked with an ellipsis
    preview = (report_text[:200] + "...") if len(report_text) > 200 else report_text
    print(f"\nSample report for {dicom_id}:")
    print(preview)


Sample report for 20386a2d-1f7a8868-f12e22ac-0d625d27-4c38c8e2:
hyperinflated lungs. biapical, right greater than left pleural thickening/ scarring. multiple old left-sided rib fractures, small left pleural effusion, sternal fracture and retrosternal hematoma bett...

Sample report for 63100eab-9e8a8d90-392bc822-325de482-69a64e3b:
et tube ends 4.1 cm above carina. the patient had a recent left lower lobe lobectomy with the chest tube that projects in upper hemithorax without any visible pneumothorax. left pleural effusion is sm...

Sample report for 17269efa-b016a94d-1361e8df-ac428071-d1133672:
there is an et tube present in good position. other support and monitoring devices remain stable. compared to study from yesterday, there is little overall change with remaining haziness at the bases ...
