<a href="https://colab.research.google.com/github/jasleenkaursandhu/Reproducing-chest-xray-report-generation-boag/blob/densenet121-features-nmodels/extract_densenet_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install pydicom
!pip install tensorflow
import numpy as np
import pandas as pd
import os
import pydicom
import tensorflow as tf
from tensorflow.keras.applications.densenet import DenseNet121, preprocess_input
from PIL import Image
import tqdm
import pickle
import gc



In [15]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Define paths
base_path = '/content/drive/MyDrive/mimic-cxr-project'
data_dir = os.path.join(base_path, 'data')
files_path = os.path.join(base_path, 'files')
output_dir = os.path.join(base_path, 'output')

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Load train and test data
train_df = pd.read_csv(os.path.join(data_dir, 'train.tsv'), sep='\t')
test_df = pd.read_csv(os.path.join(data_dir, 'test.tsv'), sep='\t')

print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Load pre-trained DenseNet121 model
model = DenseNet121(weights='imagenet', include_top=False, pooling='avg')
print("Loaded DenseNet121 model")

# Function to extract features from a DICOM image
def extract_features(dicom_path):
    try:
        # Read the DICOM file
        ds = pydicom.dcmread(dicom_path)

        # Convert to image format
        pixel_array = ds.pixel_array

        # Normalize pixel values
        pixel_array = pixel_array / np.max(pixel_array)

        # Convert to uint8
        img = np.uint8(pixel_array * 255)

        # Convert to RGB (DenseNet expects 3 channels)
        if len(img.shape) == 2:
            # Grayscale to RGB
            img_rgb = np.stack([img, img, img], axis=2)
        elif img.shape[2] == 1:
            # Single channel to RGB
            img_rgb = np.concatenate([img, img, img], axis=2)
        else:
            img_rgb = img

        # Resize to 224x224 (expected by DenseNet)
        pil_img = Image.fromarray(img_rgb)
        pil_img = pil_img.resize((224, 224))

        # Convert to numpy array and preprocess
        img_array = np.array(pil_img)
        img_array = preprocess_input(img_array)

        # Add batch dimension
        img_array = np.expand_dims(img_array, axis=0)

        # Extract features
        features = model.predict(img_array, verbose=0)

        return features.flatten()
    except Exception as e:
        print(f"Error processing {dicom_path}: {e}")
        return None

# Process train images in batches to avoid memory issues
batch_size = 50
densenet_vecs = {}

# Process train images
for idx, row in tqdm.tqdm(train_df.iterrows(), total=len(train_df), desc="Processing train images"):
    if idx % batch_size == 0:
        print(f"Processed {idx}/{len(train_df)} train images")

    dicom_id = row['dicom_id']
    subject_id = row['subject_id']
    study_id = row['study_id']

    # Construct path to the DICOM file
    subject_prefix = f"p{str(subject_id)[:2]}"
    subject_dir = f"p{subject_id}"
    study_dir = f"s{study_id}"
    dicom_file = f"{dicom_id}.dcm"
    dicom_path = os.path.join(files_path, subject_prefix, subject_dir, study_dir, dicom_file)

    if os.path.exists(dicom_path):
        features = extract_features(dicom_path)
        if features is not None:
            densenet_vecs[dicom_id] = features

    # Save intermediate results
    if (idx + 1) % 500 == 0 or idx == len(train_df) - 1:
        print(f"Saving intermediate results: {len(densenet_vecs)} vectors")
        with open(os.path.join(output_dir, 'densenet121_train.pkl'), 'wb') as f:
            pickle.dump(densenet_vecs, f)

    # Clean up memory
    if idx % 100 == 0:
        gc.collect()

# Save the final vectors
with open(os.path.join(output_dir, 'densenet121_train.pkl'), 'wb') as f:
    pickle.dump(densenet_vecs, f)

print(f"Saved DenseNet features for {len(densenet_vecs)} train images")

# Process test images
test_densenet_vecs = {}

for idx, row in tqdm.tqdm(test_df.iterrows(), total=len(test_df), desc="Processing test images"):
    if idx % batch_size == 0:
        print(f"Processed {idx}/{len(test_df)} test images")

    dicom_id = row['dicom_id']
    subject_id = row['subject_id']
    study_id = row['study_id']

    # Construct path to the DICOM file
    subject_prefix = f"p{str(subject_id)[:2]}"
    subject_dir = f"p{subject_id}"
    study_dir = f"s{study_id}"
    dicom_file = f"{dicom_id}.dcm"
    dicom_path = os.path.join(files_path, subject_prefix, subject_dir, study_dir, dicom_file)

    if os.path.exists(dicom_path):
        features = extract_features(dicom_path)
        if features is not None:
            test_densenet_vecs[dicom_id] = features

    # Save intermediate results
    if (idx + 1) % 100 == 0 or idx == len(test_df) - 1:
        print(f"Saving intermediate results: {len(test_densenet_vecs)} vectors")
        with open(os.path.join(output_dir, 'densenet121_test.pkl'), 'wb') as f:
            pickle.dump(test_densenet_vecs, f)

    # Clean up memory
    if idx % 50 == 0:
        gc.collect()

# Save the test vectors
with open(os.path.join(output_dir, 'densenet121_test.pkl'), 'wb') as f:
    pickle.dump(test_densenet_vecs, f)

print(f"Saved DenseNet features for {len(test_densenet_vecs)} test images")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train data shape: (824, 3)
Test data shape: (382, 3)
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m29084464/29084464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step
Loaded DenseNet121 model


Processing train images:   0%|          | 0/824 [00:00<?, ?it/s]

Processed 0/824 train images


Processing train images:   4%|▎         | 29/824 [03:39<29:27,  2.22s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p14/p14187001/s54676352/f0d713e0-4db1800a-4ae3600b-c365a2a5-4985ec73.dcm: The number of bytes of pixel data is less than expected (1094284 vs 15548928 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing train images:   6%|▌         | 50/824 [04:57<34:05,  2.64s/it]

Processed 50/824 train images


Processing train images:  12%|█▏        | 100/824 [07:00<31:13,  2.59s/it]

Processed 100/824 train images


Processing train images:  18%|█▊        | 150/824 [09:24<28:32,  2.54s/it]

Processed 150/824 train images


Processing train images:  24%|██▍       | 200/824 [11:31<26:42,  2.57s/it]

Processed 200/824 train images


Processing train images:  30%|███       | 250/824 [13:36<20:43,  2.17s/it]

Processed 250/824 train images


Processing train images:  36%|███▋      | 300/824 [15:32<17:01,  1.95s/it]

Processed 300/824 train images


Processing train images:  42%|████▏     | 350/824 [17:35<19:15,  2.44s/it]

Processed 350/824 train images


Processing train images:  49%|████▊     | 400/824 [19:37<15:20,  2.17s/it]

Processed 400/824 train images


Processing train images:  55%|█████▍    | 450/824 [21:45<16:52,  2.71s/it]

Processed 450/824 train images


Processing train images:  61%|██████    | 500/824 [23:49<13:12,  2.45s/it]

Saving intermediate results: 499 vectors
Processed 500/824 train images


Processing train images:  67%|██████▋   | 550/824 [25:44<11:54,  2.61s/it]

Processed 550/824 train images


Processing train images:  73%|███████▎  | 600/824 [27:52<09:45,  2.61s/it]

Processed 600/824 train images


Processing train images:  76%|███████▌  | 628/824 [29:00<07:32,  2.31s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p15/p15065955/s57440443/097b23d1-80c1433e-72156058-d40e3101-d41cb853.dcm: The number of bytes of pixel data is less than expected (10384010 vs 15548928 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing train images:  79%|███████▉  | 650/824 [29:50<06:30,  2.25s/it]

Processed 650/824 train images


Processing train images:  85%|████████▍ | 700/824 [31:51<04:37,  2.24s/it]

Processed 700/824 train images


Processing train images:  91%|█████████ | 750/824 [33:59<03:03,  2.48s/it]

Processed 750/824 train images


Processing train images:  97%|█████████▋| 800/824 [36:01<00:56,  2.33s/it]

Processed 800/824 train images


Processing train images: 100%|██████████| 824/824 [36:55<00:00,  2.69s/it]


Saving intermediate results: 822 vectors
Saved DenseNet features for 822 train images


Processing test images:   0%|          | 0/382 [00:00<?, ?it/s]

Processed 0/382 test images


Processing test images:  12%|█▏        | 44/382 [01:45<09:37,  1.71s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p17/p17585916/s51774836/7f7346e9-c1f9639f-8e83f5bc-f166c421-69f3b162.dcm: The number of bytes of pixel data is less than expected (1274412 vs 15548928 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing test images:  13%|█▎        | 50/382 [01:58<11:34,  2.09s/it]

Processed 50/382 test images


Processing test images:  26%|██▌       | 100/382 [04:10<12:47,  2.72s/it]

Saving intermediate results: 99 vectors
Processed 100/382 test images


Processing test images:  39%|███▉      | 150/382 [06:16<10:31,  2.72s/it]

Processed 150/382 test images


Processing test images:  52%|█████▏    | 200/382 [08:19<08:49,  2.91s/it]

Saving intermediate results: 199 vectors
Processed 200/382 test images


Processing test images:  65%|██████▌   | 250/382 [10:16<05:41,  2.59s/it]

Processed 250/382 test images


Processing test images:  79%|███████▊  | 300/382 [12:15<03:14,  2.37s/it]

Saving intermediate results: 299 vectors
Processed 300/382 test images


Processing test images:  90%|████████▉ | 343/382 [13:57<01:27,  2.23s/it]

Error processing /content/drive/MyDrive/mimic-cxr-project/files/p16/p16741986/s56541794/fdba0667-faa73efd-da3746a5-2a72a1fa-f5b292b7.dcm: The number of bytes of pixel data is less than expected (3535498 vs 15548928 bytes) - the dataset may be corrupted, have an invalid group 0028 element value, or the transfer syntax may be incorrect


Processing test images:  92%|█████████▏| 350/382 [14:12<01:04,  2.02s/it]

Processed 350/382 test images


Processing test images: 100%|██████████| 382/382 [15:36<00:00,  2.45s/it]

Saving intermediate results: 380 vectors
Saved DenseNet features for 380 test images





In [16]:
# Find nearest neighbors based on feature similarity
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the feature vectors if you're running this in a separate session
train_features_path = os.path.join(output_dir, 'densenet121_train.pkl')
test_features_path = os.path.join(output_dir, 'densenet121_test.pkl')

with open(train_features_path, 'rb') as f:
    densenet_vecs = pickle.load(f)

with open(test_features_path, 'rb') as f:
    test_densenet_vecs = pickle.load(f)

print(f"Loaded {len(densenet_vecs)} train features and {len(test_densenet_vecs)} test features")

# Convert train features to numpy array for faster processing
train_dicom_ids = list(densenet_vecs.keys())
train_features = np.array([densenet_vecs[dicom_id] for dicom_id in train_dicom_ids])

# Find top 100 nearest neighbors for each test image
top100_neighbors = {}

for test_dicom, test_features in tqdm.tqdm(test_densenet_vecs.items(), desc="Finding neighbors"):
    # Reshape test features for cosine similarity
    test_features_reshaped = test_features.reshape(1, -1)

    # Compute similarity to all training images
    similarities = cosine_similarity(test_features_reshaped, train_features)[0]

    # Get indices of top 100 most similar images
    top_indices = similarities.argsort()[-100:][::-1]

    # Get the corresponding dicom IDs
    top_dicom_ids = [train_dicom_ids[i] for i in top_indices]

    # Store in our neighbors dictionary
    top100_neighbors[test_dicom] = top_dicom_ids

# Save the neighbors
neighbors_path = os.path.join(output_dir, 'top100_neighbors.pkl')
with open(neighbors_path, 'wb') as f:
    pickle.dump(top100_neighbors, f)

print(f"Saved top 100 neighbors for {len(top100_neighbors)} test images")

Loaded 822 train features and 380 test features


Finding neighbors: 100%|██████████| 380/380 [00:02<00:00, 134.00it/s]

Saved top 100 neighbors for 380 test images



