<a href="https://colab.research.google.com/github/jasleenkaursandhu/Reproducing-chest-xray-report-generation-boag/blob/main/extract_densenet_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pydicom
!pip install tensorflow
import numpy as np
import pandas as pd
import os
import pydicom
import tensorflow as tf
from tensorflow.keras.applications.densenet import DenseNet121, preprocess_input
from PIL import Image
import tqdm
import pickle
import gc

Collecting pydicom
  Downloading pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pydicom-3.0.1-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-3.0.1


In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Define paths
base_path = '/content/drive/MyDrive/mimic-cxr-project'
data_dir = os.path.join(base_path, 'data')
files_path = os.path.join(base_path, 'files')
output_dir = os.path.join(base_path, 'output')

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Load train and test data
train_df = pd.read_csv(os.path.join(data_dir, 'train.tsv'), sep='\t')
test_df = pd.read_csv(os.path.join(data_dir, 'test.tsv'), sep='\t')

print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Load pre-trained DenseNet121 model
model = DenseNet121(weights='imagenet', include_top=False, pooling='avg')
print("Loaded DenseNet121 model")

# Function to extract features from a DICOM image
def extract_features(dicom_path):
    try:
        # Read the DICOM file
        ds = pydicom.dcmread(dicom_path)

        # Convert to image format
        pixel_array = ds.pixel_array

        # Normalize pixel values
        pixel_array = pixel_array / np.max(pixel_array)

        # Convert to uint8
        img = np.uint8(pixel_array * 255)

        # Convert to RGB (DenseNet expects 3 channels)
        if len(img.shape) == 2:
            # Grayscale to RGB
            img_rgb = np.stack([img, img, img], axis=2)
        elif img.shape[2] == 1:
            # Single channel to RGB
            img_rgb = np.concatenate([img, img, img], axis=2)
        else:
            img_rgb = img

        # Resize to 224x224 (expected by DenseNet)
        pil_img = Image.fromarray(img_rgb)
        pil_img = pil_img.resize((224, 224))

        # Convert to numpy array and preprocess
        img_array = np.array(pil_img)
        img_array = preprocess_input(img_array)

        # Add batch dimension
        img_array = np.expand_dims(img_array, axis=0)

        # Extract features
        features = model.predict(img_array, verbose=0)

        return features.flatten()
    except Exception as e:
        print(f"Error processing {dicom_path}: {e}")
        return None

# Process train images in batches to avoid memory issues
batch_size = 50
densenet_vecs = {}

# Process train images
for idx, row in tqdm.tqdm(train_df.iterrows(), total=len(train_df), desc="Processing train images"):
    if idx % batch_size == 0:
        print(f"Processed {idx}/{len(train_df)} train images")

    dicom_id = row['dicom_id']
    subject_id = row['subject_id']
    study_id = row['study_id']

    # Construct path to the DICOM file
    subject_prefix = f"p{str(subject_id)[:2]}"
    subject_dir = f"p{subject_id}"
    study_dir = f"s{study_id}"
    dicom_file = f"{dicom_id}.dcm"
    dicom_path = os.path.join(files_path, subject_prefix, subject_dir, study_dir, dicom_file)

    if os.path.exists(dicom_path):
        features = extract_features(dicom_path)
        if features is not None:
            densenet_vecs[dicom_id] = features

    # Save intermediate results
    if (idx + 1) % 500 == 0 or idx == len(train_df) - 1:
        print(f"Saving intermediate results: {len(densenet_vecs)} vectors")
        with open(os.path.join(output_dir, 'densenet121_train.pkl'), 'wb') as f:
            pickle.dump(densenet_vecs, f)

    # Clean up memory
    if idx % 100 == 0:
        gc.collect()

# Save the final vectors
with open(os.path.join(output_dir, 'densenet121_train.pkl'), 'wb') as f:
    pickle.dump(densenet_vecs, f)

print(f"Saved DenseNet features for {len(densenet_vecs)} train images")

# Process test images
test_densenet_vecs = {}

for idx, row in tqdm.tqdm(test_df.iterrows(), total=len(test_df), desc="Processing test images"):
    if idx % batch_size == 0:
        print(f"Processed {idx}/{len(test_df)} test images")

    dicom_id = row['dicom_id']
    subject_id = row['subject_id']
    study_id = row['study_id']

    # Construct path to the DICOM file
    subject_prefix = f"p{str(subject_id)[:2]}"
    subject_dir = f"p{subject_id}"
    study_dir = f"s{study_id}"
    dicom_file = f"{dicom_id}.dcm"
    dicom_path = os.path.join(files_path, subject_prefix, subject_dir, study_dir, dicom_file)

    if os.path.exists(dicom_path):
        features = extract_features(dicom_path)
        if features is not None:
            test_densenet_vecs[dicom_id] = features

    # Save intermediate results
    if (idx + 1) % 100 == 0 or idx == len(test_df) - 1:
        print(f"Saving intermediate results: {len(test_densenet_vecs)} vectors")
        with open(os.path.join(output_dir, 'densenet121_test.pkl'), 'wb') as f:
            pickle.dump(test_densenet_vecs, f)

    # Clean up memory
    if idx % 50 == 0:
        gc.collect()

# Save the test vectors
with open(os.path.join(output_dir, 'densenet121_test.pkl'), 'wb') as f:
    pickle.dump(test_densenet_vecs, f)

print(f"Saved DenseNet features for {len(test_densenet_vecs)} test images")

Mounted at /content/drive
Train data shape: (2243, 3)
Test data shape: (871, 3)
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m29084464/29084464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Loaded DenseNet121 model


Processing train images:   0%|          | 0/2243 [00:00<?, ?it/s]

Processed 0/2243 train images


Processing train images:   2%|▏         | 50/2243 [06:29<59:51,  1.64s/it]

Processed 50/2243 train images


Processing train images:   4%|▍         | 100/2243 [08:05<1:33:02,  2.60s/it]

Processed 100/2243 train images


Processing train images:   7%|▋         | 150/2243 [09:27<1:00:15,  1.73s/it]

Processed 150/2243 train images


Processing train images:   9%|▉         | 200/2243 [10:56<56:46,  1.67s/it]

Processed 200/2243 train images


Processing train images:  10%|▉         | 217/2243 [11:25<38:58,  1.15s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p17/p17585916/s51774836/7f7346e9-c1f9639f-8e83f5bc-f166c421-69f3b162.dcm: The number of bytes of pixel data is less than expected (1274412 vs 15548928 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing train images:  11%|█         | 250/2243 [12:18<58:24,  1.76s/it]

Processed 250/2243 train images


Processing train images:  13%|█▎        | 300/2243 [13:38<49:45,  1.54s/it]

Processed 300/2243 train images


Processing train images:  16%|█▌        | 350/2243 [14:59<58:06,  1.84s/it]  

Processed 350/2243 train images


Processing train images:  18%|█▊        | 400/2243 [16:28<1:15:34,  2.46s/it]

Processed 400/2243 train images


Processing train images:  20%|██        | 450/2243 [18:02<1:04:27,  2.16s/it]

Processed 450/2243 train images


Processing train images:  22%|██▏       | 499/2243 [19:24<52:38,  1.81s/it]  

Saving intermediate results: 499 vectors


Processing train images:  22%|██▏       | 500/2243 [19:26<56:08,  1.93s/it]

Processed 500/2243 train images


Processing train images:  25%|██▍       | 550/2243 [20:52<43:29,  1.54s/it]

Processed 550/2243 train images


Processing train images:  27%|██▋       | 600/2243 [22:18<46:29,  1.70s/it]

Processed 600/2243 train images


Processing train images:  29%|██▉       | 650/2243 [23:44<1:01:56,  2.33s/it]

Processed 650/2243 train images


Processing train images:  29%|██▉       | 652/2243 [23:46<44:14,  1.67s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p18/p18426683/s56692948/4c7a69d2-8396ed79-2b88c121-cfb19206-e72369d6.dcm: The number of bytes of pixel data is less than expected (2699910 vs 15548928 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing train images:  31%|███       | 700/2243 [25:09<46:37,  1.81s/it]

Processed 700/2243 train images


Processing train images:  33%|███▎      | 750/2243 [26:32<34:32,  1.39s/it]

Processed 750/2243 train images


Processing train images:  36%|███▌      | 800/2243 [28:02<56:15,  2.34s/it]  

Processed 800/2243 train images


Processing train images:  38%|███▊      | 850/2243 [29:32<39:18,  1.69s/it]

Processed 850/2243 train images


Processing train images:  40%|████      | 900/2243 [30:50<31:35,  1.41s/it]

Processed 900/2243 train images


Processing train images:  41%|████      | 910/2243 [31:05<28:36,  1.29s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p19/p19286907/s51318449/b584431a-4c8ff7f0-3927d5fd-e5135a78-88274d2e.dcm: The number of bytes of pixel data is less than expected (5763910 vs 15548928 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing train images:  42%|████▏     | 950/2243 [32:14<37:09,  1.72s/it]

Processed 950/2243 train images


Processing train images:  45%|████▍     | 999/2243 [33:38<36:24,  1.76s/it]

Saving intermediate results: 997 vectors


Processing train images:  45%|████▍     | 1000/2243 [33:40<37:33,  1.81s/it]

Processed 1000/2243 train images


Processing train images:  46%|████▋     | 1039/2243 [34:47<29:11,  1.46s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p14/p14187001/s54676352/f0d713e0-4db1800a-4ae3600b-c365a2a5-4985ec73.dcm: The number of bytes of pixel data is less than expected (1094284 vs 15548928 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing train images:  47%|████▋     | 1050/2243 [35:05<30:40,  1.54s/it]

Processed 1050/2243 train images


Processing train images:  49%|████▉     | 1100/2243 [36:30<35:25,  1.86s/it]

Processed 1100/2243 train images


Processing train images:  51%|█████▏    | 1150/2243 [37:55<35:10,  1.93s/it]

Processed 1150/2243 train images


Processing train images:  53%|█████▎    | 1200/2243 [39:27<30:28,  1.75s/it]

Processed 1200/2243 train images


Processing train images:  56%|█████▌    | 1250/2243 [40:50<27:35,  1.67s/it]

Processed 1250/2243 train images


Processing train images:  58%|█████▊    | 1300/2243 [42:14<28:17,  1.80s/it]

Processed 1300/2243 train images


Processing train images:  60%|██████    | 1350/2243 [43:44<22:59,  1.54s/it]

Processed 1350/2243 train images


Processing train images:  62%|██████▏   | 1383/2243 [44:39<18:54,  1.32s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p10/p10067702/s52093291/7b27c1fb-f3a20e71-a3633833-c56fdbd1-59ceb4c6.dcm: The number of bytes of pixel data is less than expected (7844238 vs 15548928 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing train images:  62%|██████▏   | 1400/2243 [45:07<25:05,  1.79s/it]

Processed 1400/2243 train images


Processing train images:  65%|██████▍   | 1450/2243 [46:29<20:38,  1.56s/it]

Processed 1450/2243 train images


Processing train images:  67%|██████▋   | 1500/2243 [47:49<19:25,  1.57s/it]

Saving intermediate results: 1495 vectors
Processed 1500/2243 train images


Processing train images:  69%|██████▉   | 1550/2243 [49:21<17:34,  1.52s/it]

Processed 1550/2243 train images


Processing train images:  71%|███████▏  | 1600/2243 [50:45<18:05,  1.69s/it]

Processed 1600/2243 train images


Processing train images:  74%|███████▎  | 1650/2243 [52:18<14:51,  1.50s/it]

Processed 1650/2243 train images


Processing train images:  76%|███████▌  | 1700/2243 [53:43<14:55,  1.65s/it]

Processed 1700/2243 train images


Processing train images:  78%|███████▊  | 1750/2243 [55:11<16:45,  2.04s/it]

Processed 1750/2243 train images


Processing train images:  80%|████████  | 1800/2243 [56:32<10:42,  1.45s/it]

Processed 1800/2243 train images


Processing train images:  82%|████████▏ | 1850/2243 [57:52<11:18,  1.73s/it]

Processed 1850/2243 train images


Processing train images:  83%|████████▎ | 1859/2243 [58:07<08:58,  1.40s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p10/p10862862/s51465215/adedf30a-e7c7f6bf-5f84d327-6a28ec36-1884d9fa.dcm: The number of bytes of pixel data is less than expected (7811632 vs 15548928 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing train images:  84%|████████▎ | 1875/2243 [58:37<09:44,  1.59s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p12/p12472552/s50755415/1e5ca153-998b2a9b-25707745-bd6ac2ab-24d336a4.dcm: The number of bytes of pixel data is less than expected (5567114 vs 15548928 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing train images:  85%|████████▍ | 1900/2243 [59:19<10:36,  1.86s/it]

Processed 1900/2243 train images


Processing train images:  87%|████████▋ | 1950/2243 [1:00:43<08:03,  1.65s/it]

Processed 1950/2243 train images


Processing train images:  89%|████████▉ | 1999/2243 [1:02:02<06:26,  1.58s/it]

Saving intermediate results: 1993 vectors


Processing train images:  89%|████████▉ | 2000/2243 [1:02:04<06:54,  1.71s/it]

Processed 2000/2243 train images


Processing train images:  91%|█████████▏| 2050/2243 [1:03:23<05:26,  1.69s/it]

Processed 2050/2243 train images


Processing train images:  93%|█████████▎| 2078/2243 [1:04:07<03:13,  1.17s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p13/p13374841/s58911904/50ea8464-0aa978d0-82d052ac-c25caaa6-f794da21.dcm: The number of bytes of pixel data is less than expected (3994490 vs 14298480 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing train images:  94%|█████████▎| 2100/2243 [1:04:49<04:59,  2.09s/it]

Processed 2100/2243 train images


Processing train images:  96%|█████████▌| 2150/2243 [1:06:07<02:14,  1.44s/it]

Processed 2150/2243 train images


Processing train images:  98%|█████████▊| 2200/2243 [1:07:35<01:13,  1.72s/it]

Processed 2200/2243 train images


Processing train images: 100%|██████████| 2243/2243 [1:08:56<00:00,  1.84s/it]

Saving intermediate results: 2235 vectors





Saved DenseNet features for 2235 train images


Processing test images:   0%|          | 0/871 [00:00<?, ?it/s]

Processed 0/871 test images


Processing test images:   6%|▌         | 50/871 [01:20<22:15,  1.63s/it]

Processed 50/871 test images


Processing test images:   8%|▊         | 67/871 [01:45<15:48,  1.18s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p16/p16741986/s56541794/fdba0667-faa73efd-da3746a5-2a72a1fa-f5b292b7.dcm: The number of bytes of pixel data is less than expected (3535498 vs 15548928 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing test images:  11%|█▏        | 99/871 [02:39<18:32,  1.44s/it]

Saving intermediate results: 99 vectors


Processing test images:  11%|█▏        | 100/871 [02:41<20:39,  1.61s/it]

Processed 100/871 test images


Processing test images:  16%|█▌        | 141/871 [03:50<22:18,  1.83s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p10/p10068304/s53442406/a235b65b-e777f854-a98de6d6-2f3d33d8-bca255ec.dcm: The number of bytes of pixel data is less than expected (8892968 vs 15487900 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing test images:  16%|█▋        | 143/871 [03:54<20:46,  1.71s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p10/p10068304/s54217246/4e5eebbe-0eddc029-b15d4941-d49ec4dd-f18951c0.dcm: The number of bytes of pixel data is less than expected (5796396 vs 15311000 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing test images:  17%|█▋        | 150/871 [04:04<18:55,  1.58s/it]

Processed 150/871 test images


Processing test images:  23%|██▎       | 200/871 [05:29<20:46,  1.86s/it]

Saving intermediate results: 197 vectors
Processed 200/871 test images


Processing test images:  29%|██▊       | 250/871 [06:52<15:32,  1.50s/it]

Processed 250/871 test images


Processing test images:  34%|███▍      | 300/871 [08:10<14:24,  1.51s/it]

Saving intermediate results: 297 vectors
Processed 300/871 test images


Processing test images:  40%|████      | 350/871 [09:35<15:19,  1.77s/it]

Processed 350/871 test images


Processing test images:  46%|████▌     | 400/871 [11:07<12:42,  1.62s/it]

Saving intermediate results: 397 vectors
Processed 400/871 test images


Processing test images:  52%|█████▏    | 450/871 [12:39<12:00,  1.71s/it]

Processed 450/871 test images


Processing test images:  57%|█████▋    | 500/871 [14:03<10:14,  1.66s/it]

Saving intermediate results: 497 vectors
Processed 500/871 test images


Processing test images:  63%|██████▎   | 550/871 [15:24<08:22,  1.56s/it]

Processed 550/871 test images


Processing test images:  69%|██████▉   | 600/871 [16:44<07:17,  1.61s/it]

Saving intermediate results: 597 vectors
Processed 600/871 test images


Processing test images:  75%|███████▍  | 650/871 [18:05<06:04,  1.65s/it]

Processed 650/871 test images


Processing test images:  80%|████████  | 700/871 [19:25<04:00,  1.41s/it]

Saving intermediate results: 697 vectors
Processed 700/871 test images


Processing test images:  86%|████████▌ | 750/871 [20:51<02:59,  1.48s/it]

Processed 750/871 test images


Processing test images:  92%|█████████▏| 800/871 [22:13<01:36,  1.37s/it]

Saving intermediate results: 797 vectors
Processed 800/871 test images


Processing test images:  98%|█████████▊| 850/871 [23:35<00:34,  1.65s/it]

Processed 850/871 test images


Processing test images: 100%|██████████| 871/871 [24:07<00:00,  1.66s/it]

Saving intermediate results: 868 vectors
Saved DenseNet features for 868 test images





In [3]:
# Find nearest neighbors based on feature similarity
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the feature vectors if you're running this in a separate session
train_features_path = os.path.join(output_dir, 'densenet121_train.pkl')
test_features_path = os.path.join(output_dir, 'densenet121_test.pkl')

with open(train_features_path, 'rb') as f:
    densenet_vecs = pickle.load(f)

with open(test_features_path, 'rb') as f:
    test_densenet_vecs = pickle.load(f)

print(f"Loaded {len(densenet_vecs)} train features and {len(test_densenet_vecs)} test features")

# Convert train features to numpy array for faster processing
train_dicom_ids = list(densenet_vecs.keys())
train_features = np.array([densenet_vecs[dicom_id] for dicom_id in train_dicom_ids])

# Find top 100 nearest neighbors for each test image
top100_neighbors = {}

for test_dicom, test_features in tqdm.tqdm(test_densenet_vecs.items(), desc="Finding neighbors"):
    # Reshape test features for cosine similarity
    test_features_reshaped = test_features.reshape(1, -1)

    # Compute similarity to all training images
    similarities = cosine_similarity(test_features_reshaped, train_features)[0]

    # Get indices of top 100 most similar images
    top_indices = similarities.argsort()[-100:][::-1]

    # Get the corresponding dicom IDs
    top_dicom_ids = [train_dicom_ids[i] for i in top_indices]

    # Store in our neighbors dictionary
    top100_neighbors[test_dicom] = top_dicom_ids

# Save the neighbors
neighbors_path = os.path.join(output_dir, 'top100_neighbors.pkl')
with open(neighbors_path, 'wb') as f:
    pickle.dump(top100_neighbors, f)

print(f"Saved top 100 neighbors for {len(top100_neighbors)} test images")

Loaded 2235 train features and 868 test features


Finding neighbors: 100%|██████████| 868/868 [00:24<00:00, 34.73it/s]


Saved top 100 neighbors for 868 test images
