In [7]:
!which python3

/shared/centos7/anaconda3/2022.05/bin/python3


In [3]:
# Importing the required libraries

import os
from glob import glob
from pathlib import Path
from collections import defaultdict

import spacy
import evaluate
import numpy as np
import pandas as pd
from tqdm import tqdm

from src.full_model.train_full_model import get_tokenizer
from src.full_model.generate_reports_for_images import get_model
from src.full_model.generate_reports_for_images import get_image_tensor
from src.full_model.generate_reports_for_images import get_report_for_image
from src.full_model.generate_reports_for_images import write_generated_reports_to_txt

ModuleNotFoundError: No module named 'spacy'

In [2]:
# DATA URLs

BASE_DIR = Path().cwd()
DATA_DIR = (
    BASE_DIR.parent / "data" /
    "mimic-eye-integrating-mimic-datasets-with-reflacx-and-eye-gaze-for-multimodal-deep-learning-applications-1.0.0" /
    "mimic-eye"
)
DATA_DIR

PosixPath('/home/kishoresampath/report-gen-project/data/mimic-eye-integrating-mimic-datasets-with-reflacx-and-eye-gaze-for-multimodal-deep-learning-applications-1.0.0/mimic-eye')

In [3]:
# Number of patient folders

PATIENTS_DIR = DATA_DIR / "patient_*"
patient_folders = glob(str(PATIENTS_DIR))

len(patient_folders)

3192

In [4]:
# Number of X-ray reports

CXR_DICOM_DIR = DATA_DIR / "patient_*" / "CXR-DICOM"

REFERENCE_REPORT_PATHS = glob(str(CXR_DICOM_DIR / "*.txt"))

len(REFERENCE_REPORT_PATHS)

3645

In [5]:
IMAGE_PATHS = []

for ref_report_path in tqdm(REFERENCE_REPORT_PATHS):
    patient_id = ref_report_path.split("/")[-3].split("_")[-1]
    xray_id = ref_report_path.split("/")[-1].split(".")[0]

    meta_file_url = DATA_DIR / f"patient_{patient_id}" / "CXR-JPG" / "cxr_meta.csv"
    
    df = pd.read_csv(meta_file_url)
    studied_xrays = set(df["dicom_id"].values)

    study_xray_images = glob(str(DATA_DIR / f"patient_{patient_id}" / "CXR-JPG" / xray_id / "*.jpg"))

    for xray_image in study_xray_images:
        image_id = xray_image.split("/")[-1].split(".")[0]
        
        if image_id in studied_xrays:
            IMAGE_PATHS.append(xray_image)
    
len(IMAGE_PATHS)

100%|██████████████████████████████████████| 3645/3645 [00:08<00:00, 431.56it/s]


3689

In [6]:
cnt = set()

patient_ids = []

for img_path in IMAGE_PATHS:
    study_id = img_path.split("/")[-2]

    if study_id in cnt:
        patient_id = img_path.split("/")[-4].split("_")[1]
        patient_ids.append(patient_id)
    
    cnt.add(study_id)

len(cnt)

3645

In [7]:
patient_ids

['10896351',
 '15710368',
 '19298963',
 '15165193',
 '11110923',
 '16603694',
 '13739802',
 '18426683',
 '15945590',
 '18962500',
 '11580463',
 '11580463',
 '10490475',
 '16538543',
 '18505436',
 '19371566',
 '13554447',
 '10129815',
 '19381010',
 '13537167',
 '13186935',
 '19808040',
 '14584470',
 '12008763',
 '19693764',
 '10533101',
 '12058581',
 '15325060',
 '14247006',
 '13707073',
 '13306384',
 '11482871',
 '16239007',
 '18065146',
 '17862049',
 '17350587',
 '11878216',
 '19541033',
 '10850680',
 '19391089',
 '18043096',
 '16168308',
 '13528989',
 '11609895']

In [8]:
# Value counts of number of reports per patient

xray_report_counts = defaultdict(int)

for ref_file_path in REFERENCE_REPORT_PATHS:
    patient_id = ref_file_path.split("/")[-3].split("_")[-1]
    xray_report_counts[patient_id] += 1

In [9]:
values, counts = np.unique(list(xray_report_counts.values()), return_counts=True)

In [10]:
list(zip(list(values), list(counts)))

[(1, 2970),
 (2, 157),
 (3, 17),
 (4, 13),
 (5, 8),
 (6, 9),
 (7, 8),
 (8, 4),
 (9, 2),
 (12, 2),
 (14, 1),
 (20, 1)]

In [11]:
checkpoint_path = "full_model_checkpoint_val_loss_19.793_overall_steps_155252.pt"
model = get_model(checkpoint_path)
print("Model instantiated.")

Model instantiated.


In [12]:
bert_score = evaluate.load("bertscore")
sentence_tokenizer = spacy.load("en_core_web_trf")
tokenizer = get_tokenizer()



In [13]:
random_image_paths = np.random.choice(IMAGE_PATHS, size=100, replace=False)
len(random_image_paths)

100

In [23]:
import torch

from src.full_model.generate_reports_for_images import convert_generated_sentences_to_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BERTSCORE_SIMILARITY_THRESHOLD = 0.9
IMAGE_INPUT_SIZE = 512
MAX_NUM_TOKENS_GENERATE = 300
NUM_BEAMS = 4
mean = 0.471  # see get_transforms in src/dataset/compute_mean_std_dataset.py
std = 0.302

def get_report_for_images(model, images_tensor, tokenizer, bert_score, sentence_tokenizer):
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        output = model.generate(
            images_tensor.to(device, non_blocking=True),
            max_length=MAX_NUM_TOKENS_GENERATE,
            num_beams=NUM_BEAMS,
            early_stopping=True,
        )

    beam_search_output, _, _, _ = output

    beam_search_output = beam_search_output[:15, :]
    print(beam_search_output.shape)

    generated_sents_for_selected_regions = tokenizer.batch_decode(
        beam_search_output, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )  # list[str]

    generated_report = convert_generated_sentences_to_report(
        generated_sents_for_selected_regions, bert_score, sentence_tokenizer
    )  # str

    return generated_report

In [24]:
# import shutil

# for image_path in tqdm(random_image_paths):
#     generated_reports = []
    
#     xray_id = image_path.split("/")[-1].split(".")[0]
#     patient_id = image_path.split("/")[-4].split("_")[1]
#     generated_reports_txt_path = os.path.join("gen_reports", f"gen_{xray_id}.txt")

#     subject_id = image_path.split("/")[-2]
#     original_reports_txt_path = os.path.join("orig_reports", f"orig_{xray_id}.txt")
#     report_path = DATA_DIR / f"patient_{patient_id}" / "CXR-DICOM" / f"{subject_id}.txt"

#     shutil.copy(report_path, original_reports_txt_path)
    
#     image_tensor = get_image_tensor(image_path)  # shape (1, 1, 512, 512)
#     generated_report = get_report_for_images(model, image_tensor, tokenizer, bert_score, sentence_tokenizer)
#     generated_reports.append(generated_report)

#     write_generated_reports_to_txt([image_path], generated_reports, generated_reports_txt_path)

#     break

In [25]:
import cv2
import albumentations as A
from albumentations.pytorch import ToTensorV2


def get_image_tensor(image_path):
    # cv2.imread by default loads an image with 3 channels
    # since we have grayscale images, we only have 1 channel and thus use cv2.IMREAD_UNCHANGED to read in the 1 channel
    image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)  # shape (3056, 2544)

    val_test_transforms = A.Compose(
        [
            A.LongestMaxSize(max_size=IMAGE_INPUT_SIZE, interpolation=cv2.INTER_AREA),
            A.PadIfNeeded(min_height=IMAGE_INPUT_SIZE, min_width=IMAGE_INPUT_SIZE, border_mode=cv2.BORDER_CONSTANT),
            A.Normalize(mean=mean, std=std),
            ToTensorV2(),
        ]
    )

    transform = val_test_transforms(image=image)
    image_transformed = transform["image"]  # shape (1, 512, 512)
    # image_transformed_batch = image_transformed.unsqueeze(0)  # shape (1, 1, 512, 512)

    return image_transformed

In [None]:
import shutil

BATCH_SIZE = 32

size = 1
images_tensor = []
gen_file_paths = []
orig_file_paths = []
image_paths = []


for image_path in tqdm(random_image_paths):
    generated_reports = []
    
    xray_id = image_path.split("/")[-1].split(".")[0]
    patient_id = image_path.split("/")[-4].split("_")[1]
    generated_reports_txt_path = os.path.join("gen_reports", f"gen_{xray_id}.txt")

    subject_id = image_path.split("/")[-2]
    original_reports_txt_path = os.path.join("orig_reports", f"orig_{xray_id}.txt")
    report_path = DATA_DIR / f"patient_{patient_id}" / "CXR-DICOM" / f"{subject_id}.txt"

    shutil.copy(report_path, original_reports_txt_path)
    
    image_tensor = get_image_tensor(image_path)  # shape (1, 1, 512, 512)
    images_tensor.append(image_tensor)

    image_paths.append(image_path)
    gen_file_paths.append(generated_reports_txt_path)
    orig_file_paths.append(original_reports_txt_path)

    if size == BATCH_SIZE:
        images_tensor = torch.stack(images_tensor)
        
        generated_report = get_report_for_images(model, images_tensor, tokenizer, bert_score, sentence_tokenizer)
        generated_reports.append(generated_report)
    
        write_generated_reports_to_txt(image_paths, generated_reports, gen_file_paths[-1])

        size = 0
        images_tensor = []
        gen_file_paths = []
        orig_file_paths = []
        image_paths = []
        
        break

    size += 1

 30%|████████████▌                             | 30/100 [00:15<00:04, 15.76it/s]

In [None]:
def read_txt_file(file_path: str) -> str:
    with open(file_path, "r") as file:
        content = file.read()

    sents = content.split("\n")
    sents = list(map(lambda x: x.strip(), sents))

    return sents

In [46]:
def preprocess_reference(sents: list[str]) -> str:
    def isHeader(sent: str) -> bool:
        if sent == "FINAL REPORT":
            return True

    preprocessed_sents = []

    for sent in sents:
        if sent != '' and not isHeader(sent):
            if ':' in sent:
                sent = sent.split(":")[-1].strip()
                
            preprocessed_sents.append(sent)

    return " ".join(preprocessed_sents)

In [47]:
file_path = os.path.join("orig_reports", "orig_034afc43-2fad16bb-0c7f7afd-78ac6a4f-b2a8e2e2.txt")

reference = read_txt_file(file_path)
reference

['FINAL REPORT',
 'PORTABLE CHEST ___',
 '',
 'COMPARISON:  Radiograph of one day earlier.',
 '',
 'FINDINGS:  Right chest tube remains in place, with a small right apicolateral',
 'pneumothorax, which is decreased in size since the recent radiograph.',
 'Otherwise, little change in the appearance of the chest since the previous',
 'study.',
 '']

In [48]:
reference = preprocess_reference(reference)
reference

'PORTABLE CHEST ___ Radiograph of one day earlier. Right chest tube remains in place, with a small right apicolateral pneumothorax, which is decreased in size since the recent radiograph. Otherwise, little change in the appearance of the chest since the previous study.'

In [58]:
def preprocess_prediction(sents: list[str]) -> str:
    def isHeader(sent: str) -> bool:
        if "Image path:" in sent or '=' in sent:
            return True

    preprocessed_sents = []

    for sent in sents:
        if sent != '' and not isHeader(sent):
            if "Generated report:" in sent:
                sent = sent.split(":")[-1].strip()
                
            preprocessed_sents.append(sent)

    return " ".join(preprocessed_sents)

In [59]:
file_path = os.path.join("gen_reports", "gen_034afc43-2fad16bb-0c7f7afd-78ac6a4f-b2a8e2e2.txt")

prediction = read_txt_file(file_path)
prediction

['Image path: /home/kishoresampath/report-gen-project/data/mimic-eye-integrating-mimic-datasets-with-reflacx-and-eye-gaze-for-multimodal-deep-learning-applications-1.0.0/mimic-eye/patient_17478604/CXR-JPG/s55974954/034afc43-2fad16bb-0c7f7afd-78ac6a4f-b2a8e2e2.jpg',
 'Generated report: In comparison with the study of ___, there is no change in the extent of the right pleural effusion or pneumothorax. Small right apical pneumothorax is unchanged. There is no pneumothorax or pleural effusion. Bibasilar atelectasis is unchanged. There is no pulmonary edema. The cardiomediastinal silhouette is normal. Heart size is normal. No free air below the right hemidiaphragm.',
 '',
 '',
 '']

In [60]:
prediction = preprocess_prediction(prediction)
prediction

'In comparison with the study of ___, there is no change in the extent of the right pleural effusion or pneumothorax. Small right apical pneumothorax is unchanged. There is no pneumothorax or pleural effusion. Bibasilar atelectasis is unchanged. There is no pulmonary edema. The cardiomediastinal silhouette is normal. Heart size is normal. No free air below the right hemidiaphragm.'

In [61]:
def get_reference(file_path: str) -> str:
    reference = read_txt_file(file_path)
    reference = preprocess_reference(reference)

    return reference

In [62]:
def get_prediction(file_path: str) -> str:
    prediction = read_txt_file(file_path)
    prediction = preprocess_prediction(prediction)

    return prediction

In [63]:
xray_ids = []

for file_path in glob(str(Path().cwd() / "gen_reports" / "gen_*.txt")):
    xray_ids.append(
        file_path.split("/")[-1].split(".")[0].split("_")[1]
    )

len(xray_ids)

100

In [64]:
references = []
predictions = []

for id in xray_ids:
    ref_file_path = os.path.join("orig_reports", f"orig_{id}.txt")
    gen_file_path = os.path.join("gen_reports", f"gen_{id}.txt")

    reference = get_reference(ref_file_path)
    prediction = get_prediction(gen_file_path)

    references.append(reference)
    predictions.append(prediction)

In [65]:
references

['___ year old woman with positive PPD/TST and hx cough  // any sign of active or latent TB? Chest PA and lateral Radiograph from ___.  There is a granuloma projecting over the heart on the left.  Otherwise the lungs are well expanded and clear.  No pleural abnormality is seen.  The mediastinum and hilar contours are normal.  Heavily calcified thyroid nodule is again seen, unchanged from prior.  Cardiomegaly appear stable.  1. No acute cardiopulmonary abnormality or acute TB infection.  One calcified granuloma. 2. Cardiomegaly, stable.',
 '___-year-old male with chest pain. PA and lateral chest radiographs. ___. The heart size within normal limits.  The mediastinal contours are not widened and demonstrate a mildly tortuous aorta.  The lungs are clear of consolidation.  There is no pleural effusion or pneumothorax. No acute cardiopulmonary process.',
 '55 PM Nondisplaced right ___ and ___ lateral rib fractures.  Retrocardiac atelectasis.  No definite radiographic evidence for pneumonia.

In [66]:
predictions

['There is no pleural effusion or pneumothorax. No acute cardiopulmonary process. There is mild bibasilar atelectasis. There are no acute osseous abnormalities. The cardiac and mediastinal silhouettes are unremarkable. The cardiomediastinal silhouette is within normal limits. Moderate cardiomegaly.',
 'No acute cardiopulmonary process. Linear opacities in the right lower lobe are consistent with atelectasis. There is no focal consolidation, effusion, or pneumothorax. No free air below the right hemidiaphragm is seen. There is mild bibasilar atelectasis. The mediastinal and hilar contours are unremarkable. There are no acute osseous abnormalities. The cardiomediastinal silhouette is within normal limits.',
 'There is no pleural effusion or pneumothorax. No acute cardiopulmonary process. The mediastinal and hilar contours are unremarkable. The lungs are hyperinflated with flattening of the diaphragms. There are no acute osseous abnormalities. The cardiomediastinal silhouette is within no

In [70]:
rouge = evaluate.load("rouge")

results = rouge.compute(predictions=predictions,
                        references=references)

results

[INFO]: Using default tokenizer.


{'rouge1': 0.346463494134467,
 'rouge2': 0.13622509093055463,
 'rougeL': 0.2060864108410731,
 'rougeLsum': 0.20635480794145408}

In [71]:
bleu = evaluate.load("bleu")

results = bleu.compute(predictions=predictions,
                        references=references)

results

{'bleu': 0.061934602481411825,
 'precisions': [0.47310126582278483,
  0.2022508038585209,
  0.1022875816993464,
  0.050996677740863784],
 'brevity_penalty': 0.4143630909566283,
 'length_ratio': 0.531628532974428,
 'translation_length': 6320,
 'reference_length': 11888}