<a href="https://colab.research.google.com/github/jasleenkaursandhu/Reproducing-chest-xray-report-generation-boag/blob/main/3gram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# N-gram Model for Report Generation
# This notebook implements a conditional n-gram language model for chest X-ray report generation

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import tqdm
from collections import defaultdict, Counter
import pickle
import random
import re
from time import gmtime, strftime

# Set up paths
# The project root can be overridden with the MIMIC_CXR_PROJECT_DIR environment
# variable so the notebook runs on machines other than the original author's;
# when the variable is unset or empty, the original location is used.
base_path = (
    os.environ.get('MIMIC_CXR_PROJECT_DIR')
    or '/Users/simeon/Documents/DLH/content/mimic-cxr-project'
)
# !mkdir -p {base_path}/data
# !mkdir -p {base_path}/output

# Import the report parser module
import sys
modules_dir = f"{base_path}/modules"
# Re-running the cell must not keep appending the same entry to sys.path
if modules_dir not in sys.path:
    sys.path.append(modules_dir)
from report_parser import parse_report, MIMIC_RE
print("Successfully imported report parser module")

# Resolve the project sub-directories used by the rest of the notebook
data_dir, files_path, output_dir, reports_dir = (
    os.path.join(base_path, sub_dir)
    for sub_dir in ('data', 'new_files', 'output', 'reports')
)

# Read the tab-separated train/test splits
train_df = pd.read_csv(os.path.join(data_dir, 'train.tsv'), sep='\t')
test_df = pd.read_csv(os.path.join(data_dir, 'test.tsv'), sep='\t')

print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Load the pickled nearest-neighbor results, one file per k (10, 50, 100, 200).
# NOTE: pickle.load executes arbitrary code from the file; only load neighbor
# files produced by this project.
k_values = [10, 50, 100, 200]
neighbors_dict = {}

for k in k_values:
    neighbors_path = os.path.join(output_dir, f'{k}nn_neighbors.pkl')

    # A missing file is reported and that k is simply left out of the dict
    if not os.path.exists(neighbors_path):
        print(f"Warning: Neighbors file not found at {neighbors_path}")
        continue

    with open(neighbors_path, 'rb') as f:
        neighbors = pickle.load(f)

    neighbors_dict[k] = neighbors
    print(f"Loaded {k} neighbors for {len(neighbors)} test images")

# Build a dicom_id -> study_id dictionary from the training split
report_id_column = 'study_id'
report_lookup = dict(train_df[['dicom_id', report_id_column]].values)
print(f"Created lookup dictionary for {len(report_lookup)} training images")

# Define the n-gram model
class ConditionalNGramLM:
    """
    Conditional n-gram language model as described in the paper.

    For each test image, we build a language model based on
    the reports of its closest k training images.

    The model is a mapping from a context (a tuple of the previous n-1
    tokens) to a Counter of the tokens observed after that context. For a
    unigram model (n == 1) the only context is the empty tuple ``()``.
    """

    def __init__(self, n=3):
        """
        Initialize the n-gram model with specified n.

        Args:
            n (int): Order of the model (1 = unigram, 2 = bigram, ...)

        Raises:
            ValueError: If n is smaller than 1
        """
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n}")
        self.n = n
        self.START = "<s>"
        self.END = "</s>"

    def build_lm(self, reports):
        """
        Build an n-gram language model from a collection of reports.

        Reports are lower-cased and split on whitespace. Each report is
        padded with n-1 START tokens and one END token, so the same loop
        covers every order (for n == 1 the context is always ``()``).

        Args:
            reports (list): List of report texts; entries that are empty or
                not strings are ignored

        Returns:
            dict: Dictionary mapping n-gram contexts to next word distributions
        """
        if not reports:
            return {}

        # Language model dictionary
        lm = defaultdict(Counter)
        order = self.n - 1  # number of context tokens

        for report in reports:
            if not report or not isinstance(report, str):
                continue

            tokens = report.lower().split()
            padded = [self.START] * order + tokens + [self.END]

            # Position i holds the predicted word, the `order` tokens before
            # it form the context
            for i in range(order, len(padded)):
                lm[tuple(padded[i - order:i])][padded[i]] += 1

        return lm

    @staticmethod
    def _distribution(next_words):
        """Turn a word -> count mapping into (words, probabilities)."""
        words, counts = zip(*next_words.items())
        total = sum(counts)
        return words, [count / total for count in counts]

    def sample(self, lm, max_length=100):
        """
        Generate text by sampling from the language model.

        Sampling stops when the END token is drawn, when the current context
        has no continuation in the model, or when ``max_length`` tokens have
        been generated. The limit counts generated tokens only (START padding
        and the END token are not counted), so it means the same thing for
        every n.

        Args:
            lm (dict): Language model as returned by build_lm
            max_length (int): Maximum number of generated tokens; guards
                against endless generation on cyclic models

        Returns:
            str: Generated text (tokens joined by single spaces)
        """
        if not lm:
            return ""

        order = self.n - 1
        history = [self.START] * order
        output = []
        # Probabilities per context are computed once and reused; for a
        # unigram model this means a single computation for the whole sample
        distributions = {}

        while len(output) < max_length:
            # Slice from an explicit start index: history[-0:] would return
            # the whole list when order == 0
            context = tuple(history[len(history) - order:])

            if context not in distributions:
                next_words = lm.get(context)
                # Unknown or empty context: nothing to sample from
                if not next_words:
                    break
                distributions[context] = self._distribution(next_words)

            words, probs = distributions[context]
            # Draw an index rather than the word itself so the result is a
            # plain Python str
            word = words[np.random.choice(len(words), p=probs)]

            if word == self.END:
                break

            output.append(word)
            history.append(word)

        return " ".join(output)

    def generate_report(self, neighbor_reports, max_length=100):
        """
        Generate a report for a test image based on its neighbors' reports.

        Args:
            neighbor_reports (list): Reports from neighboring training images
            max_length (int): Maximum number of generated tokens

        Returns:
            str: Generated report
        """
        # Build language model from neighbor reports
        lm = self.build_lm(neighbor_reports)

        # Sample from the language model
        return self.sample(lm, max_length=max_length)

# Cache of the dicom_id -> subject_id mapping, tagged with the DataFrame it was
# built from so it is rebuilt if train_df is rebound to a new object.
# In-place edits of the same DataFrame object are NOT detected.
_subject_lookup_cache = {'frame': None, 'lookup': {}}


def _get_subject_lookup():
    """Return a dicom_id -> subject_id dict for the current train_df."""
    if _subject_lookup_cache['frame'] is not train_df:
        # keep='first' mirrors the previous behaviour of taking the first
        # matching row when a dicom_id appears more than once
        first_rows = train_df.drop_duplicates(subset='dicom_id', keep='first')
        _subject_lookup_cache['lookup'] = dict(
            zip(first_rows['dicom_id'], first_rows['subject_id'])
        )
        _subject_lookup_cache['frame'] = train_df
    return _subject_lookup_cache['lookup']


# Function to retrieve reports for a list of DICOM IDs
def get_reports_for_dicoms(dicom_ids):
    """
    Get the reports for a list of DICOM IDs.

    For every DICOM ID known to ``report_lookup`` and ``train_df`` the report
    file ``<reports_dir>/files/p<first two digits of subject_id>/p<subject_id>/s<study_id>.txt``
    is parsed with ``parse_report`` and its 'findings' section is collected.
    IDs without a lookup entry, without an existing report file, or without a
    'findings' section are skipped. Reports that fail to parse are skipped
    with a printed warning.

    Args:
        dicom_ids (list): List of DICOM IDs

    Returns:
        list: List of report texts (findings sections), in input order
    """
    reports = []
    # One dictionary lookup per ID instead of scanning train_df for each ID
    subject_lookup = _get_subject_lookup()

    for dicom_id in dicom_ids:
        # Skip if no report lookup available
        if dicom_id not in report_lookup or dicom_id not in subject_lookup:
            continue

        report_id = report_lookup[dicom_id]
        subject_id = subject_lookup[dicom_id]

        # Construct path to report
        subject_prefix = f"p{str(subject_id)[:2]}"
        subject_dir = f"p{subject_id}"
        study_dir = f"s{report_id}"
        report_path = os.path.join(reports_dir, 'files', subject_prefix, subject_dir, f"{study_dir}.txt")

        if not os.path.exists(report_path):
            continue

        # Parsing stays best-effort (one bad report must not abort the run),
        # but failures are reported instead of being silently discarded
        try:
            report = parse_report(report_path)

            # Add findings section if available
            if 'findings' in report:
                reports.append(report['findings'])
        except Exception as exc:
            print(f"Warning: could not parse report {report_path}: {exc}")

    return reports

# Generate reports using different n-gram sizes and different numbers of neighbors.
# For every (n, k) combination a TSV file '<n>-gram_<k>nn.tsv' with the columns
# 'dicom_id' and 'generated' is written to output_dir.
for n_value in [1, 2, 3]:  # The paper tested 1-gram, 2-gram, and 3-gram models
    for k in k_values:  # Different numbers of neighbors
        print(f"\nGenerating reports with {n_value}-gram model using {k} nearest neighbors...")

        # Skip if this k value is not available
        if k not in neighbors_dict:
            print(f"Skipping {k} neighbors as the data is not available")
            continue

        # Get neighbors for this k value
        neighbors = neighbors_dict[k]

        # Initialize n-gram model
        ngram_model = ConditionalNGramLM(n=n_value)

        # Generate reports for test images
        generated_reports = {}
        # Test images dropped because none of their neighbors had a usable report
        skipped_no_reports = 0

        for pred_dicom in tqdm.tqdm(test_df.dicom_id.values):
            # Skip if no neighbors
            if pred_dicom not in neighbors:
                print(f"Warning: No neighbors for {pred_dicom}")
                continue

            # Get closest k training images
            nn_dicoms = neighbors[pred_dicom][:k]  # Ensure we use only k neighbors

            # Get reports for these neighbors
            neighbor_reports = get_reports_for_dicoms(nn_dicoms)

            # Skip if no reports found
            if not neighbor_reports:
                skipped_no_reports += 1
                continue

            # Generate report
            generated_text = ngram_model.generate_report(neighbor_reports)
            generated_reports[pred_dicom] = generated_text

        print(f"Generated reports for {len(generated_reports)}/{len(test_df)} test images")
        if skipped_no_reports:
            print(f"Warning: {skipped_no_reports} test images skipped because no neighbor reports were found")

        # Save the generated reports (timestamps are UTC, from gmtime)
        print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

        pred_file = os.path.join(output_dir, f'{n_value}-gram_{k}nn.tsv')
        print(f"Saving predictions to {pred_file}")

        # Explicit encoding: the platform default is locale-dependent and may
        # fail on, or mis-encode, non-ASCII characters in report text
        with open(pred_file, 'w', encoding='utf-8') as f:
            print('dicom_id\tgenerated', file=f)
            for dicom_id, generated in sorted(generated_reports.items()):
                # Clean up the text (remove any tabs) so the row stays two columns
                cleaned_text = generated.replace('\t', ' ')
                print(f'{dicom_id}\t{cleaned_text}', file=f)

        print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

        # Display sample reports for 3-gram model with each k value
        if n_value == 3:
            print(f"\nSample reports from 3-gram model with {k} nearest neighbors:")
            sample_count = min(3, len(generated_reports))
            sample_dicoms = list(generated_reports.keys())[:sample_count]

            for dicom_id in sample_dicoms:
                print(f"\nSample report for {dicom_id}:")
                report_text = generated_reports[dicom_id]

                # Print preview of the report (first 200 characters)
                if len(report_text) > 200:
                    print(report_text[:200] + "...")
                else:
                    print(report_text)

Successfully imported report parser module
Train data shape: (4291, 3)
Test data shape: (1757, 3)
Loaded 10 neighbors for 1757 test images
Loaded 50 neighbors for 1757 test images
Loaded 100 neighbors for 1757 test images
Loaded 200 neighbors for 1757 test images
Created lookup dictionary for 4291 training images

Generating reports with 1-gram model using 10 nearest neighbors...


100%|██████████| 1757/1757 [00:08<00:00, 202.73it/s]


Generated reports for 1757/1757 test images
2025-04-25 17:34:54
Saving predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/1-gram_10nn.tsv
2025-04-25 17:34:54

Generating reports with 1-gram model using 50 nearest neighbors...


100%|██████████| 1757/1757 [00:30<00:00, 57.05it/s]


Generated reports for 1757/1757 test images
2025-04-25 17:35:25
Saving predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/1-gram_50nn.tsv
2025-04-25 17:35:25

Generating reports with 1-gram model using 100 nearest neighbors...


100%|██████████| 1757/1757 [01:03<00:00, 27.46it/s]


Generated reports for 1757/1757 test images
2025-04-25 17:36:29
Saving predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/1-gram_100nn.tsv
2025-04-25 17:36:29

Generating reports with 1-gram model using 200 nearest neighbors...


100%|██████████| 1757/1757 [02:01<00:00, 14.52it/s]


Generated reports for 1757/1757 test images
2025-04-25 17:38:30
Saving predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/1-gram_200nn.tsv
2025-04-25 17:38:30

Generating reports with 2-gram model using 10 nearest neighbors...


100%|██████████| 1757/1757 [00:05<00:00, 331.82it/s]


Generated reports for 1757/1757 test images
2025-04-25 17:38:35
Saving predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/2-gram_10nn.tsv
2025-04-25 17:38:35

Generating reports with 2-gram model using 50 nearest neighbors...


100%|██████████| 1757/1757 [00:24<00:00, 72.99it/s]


Generated reports for 1757/1757 test images
2025-04-25 17:38:59
Saving predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/2-gram_50nn.tsv
2025-04-25 17:38:59

Generating reports with 2-gram model using 100 nearest neighbors...


100%|██████████| 1757/1757 [00:47<00:00, 36.97it/s]


Generated reports for 1757/1757 test images
2025-04-25 17:39:47
Saving predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/2-gram_100nn.tsv
2025-04-25 17:39:47

Generating reports with 2-gram model using 200 nearest neighbors...


100%|██████████| 1757/1757 [01:32<00:00, 18.99it/s]


Generated reports for 1757/1757 test images
2025-04-25 17:41:19
Saving predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/2-gram_200nn.tsv
2025-04-25 17:41:19

Generating reports with 3-gram model using 10 nearest neighbors...


100%|██████████| 1757/1757 [00:05<00:00, 335.74it/s]


Generated reports for 1757/1757 test images
2025-04-25 17:41:24
Saving predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/3-gram_10nn.tsv
2025-04-25 17:41:24

Sample reports from 3-gram model with 10 nearest neighbors:

Sample report for 20386a2d-1f7a8868-f12e22ac-0d625d27-4c38c8e2:
heart size is normal. peribronchial opacities in the region of the left pleural drainage catheter. the left pleural effusion appears minimally decreased in size with decreased, adjacent compressive at...

Sample report for 63100eab-9e8a8d90-392bc822-325de482-69a64e3b:
portable ap upright chest film at time is submitted

Sample report for 17269efa-b016a94d-1361e8df-ac428071-d1133672:
as compared to prior there continues to be volume loss at both bases. there is no pneumothorax. the cardiomediastinal and hilar contours are unremarkable. mild interstitial prominence with peribronchi...

Generating reports with 3-gram model using 50 nearest neighbors...


100%|██████████| 1757/1757 [00:24<00:00, 71.98it/s]


Generated reports for 1757/1757 test images
2025-04-25 17:41:49
Saving predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/3-gram_50nn.tsv
2025-04-25 17:41:49

Sample reports from 3-gram model with 50 nearest neighbors:

Sample report for 20386a2d-1f7a8868-f12e22ac-0d625d27-4c38c8e2:
there is no pleural effusion. chronic posttraumatic change right posterior fifth rib is stable.

Sample report for 63100eab-9e8a8d90-392bc822-325de482-69a64e3b:
portable ap upright chest radiograph at time

Sample report for 17269efa-b016a94d-1361e8df-ac428071-d1133672:
compared the prior study there is redemonstration of free intra-abdominal air, likely related to prior chest radiograph from , lung volumes are slightly reduced. the heart is normal in size.

Generating reports with 3-gram model using 100 nearest neighbors...


100%|██████████| 1757/1757 [00:48<00:00, 36.27it/s]


Generated reports for 1757/1757 test images
2025-04-25 17:42:37
Saving predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/3-gram_100nn.tsv
2025-04-25 17:42:37

Sample reports from 3-gram model with 100 nearest neighbors:

Sample report for 20386a2d-1f7a8868-f12e22ac-0d625d27-4c38c8e2:
there is some increased perihilar fullness, particularly on the prior study. there is no pleural effusion or pneumothorax. visualized osseous structures demonstrates no acute osseous abnormalities.

Sample report for 63100eab-9e8a8d90-392bc822-325de482-69a64e3b:
elevation of left hemidiaphragm. cardiomediastinal silhouette is normal. the hilar and mediastinal contours are unchanged. no free air below the right costophrenic sulcus is unchanged. the right lung ...

Sample report for 17269efa-b016a94d-1361e8df-ac428071-d1133672:
trace layering pleural effusions are identified. there is no significant interval change.

Generating reports with 3-gram model using 200 nearest neighbors..

100%|██████████| 1757/1757 [01:36<00:00, 18.24it/s]

Generated reports for 1757/1757 test images
2025-04-25 17:44:14
Saving predictions to /Users/simeon/Documents/DLH/content/mimic-cxr-project/output/3-gram_200nn.tsv
2025-04-25 17:44:14

Sample reports from 3-gram model with 200 nearest neighbors:

Sample report for 20386a2d-1f7a8868-f12e22ac-0d625d27-4c38c8e2:
tiny left pneumothorax is seen. staple lines project over the prevascular region of the costophrenic angles are not included. there is a pleural line projecting over the lung bases likely reflect edem...

Sample report for 63100eab-9e8a8d90-392bc822-325de482-69a64e3b:
the heart is normal. imaged osseous structures are unremarkable in appearance.

Sample report for 17269efa-b016a94d-1361e8df-ac428071-d1133672:
patchy, streaky opacities in both lung fields suggest subsegmental atelectasis is noted in the lower cervical spine. et tube with tip in the left lung volume. possible small left and small pleural flu...



