In [1]:
# !pipenv install pydicom
# !pipenv install tensorflow
import numpy as np
import pandas as pd
import os
import pydicom
import tensorflow as tf  # type: ignore
from tensorflow.keras.applications.densenet import DenseNet121, preprocess_input
from PIL import Image
import tqdm
import pickle
import gc

In [2]:
# SECURITY WARNING: everything in this cell switches off HTTPS certificate
# verification for the whole Python process (not recommended for production).
# Any HTTPS request made after this point is open to man-in-the-middle
# tampering. NOTE(review): presumably this exists so a later download succeeds
# on a machine with a broken CA store -- confirm, and remove once fixed.
import ssl

# Make the ssl module hand out non-verifying contexts by default
# (relies on private, underscore-prefixed ssl attributes).
ssl._create_default_https_context = ssl._create_unverified_context

# Also set the environment flag for this process.
os.environ.update({'PYTHONHTTPSVERIFY': '0'})

In [3]:
# Project location.
# Set the MIMIC_CXR_PROJECT_DIR environment variable to point the notebook at
# another checkout instead of editing this cell; when it is unset the original
# local path is used, so existing runs behave exactly as before.
#
# On Google Colab, mount Drive first and use the Drive copy of the project:
# from google.colab import drive
# drive.mount('/content/drive')
# base_path = '/content/drive/MyDrive/mimic-cxr-project'
base_path = os.environ.get(
    'MIMIC_CXR_PROJECT_DIR',
    '/Users/simeon/Documents/DLH/content/mimic-cxr-project',
)
data_dir = os.path.join(base_path, 'data')
files_path = os.path.join(base_path, 'new_files')
output_dir = os.path.join(base_path, 'output')

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Load train and test data (tab-separated)
train_df = pd.read_csv(os.path.join(data_dir, 'train.tsv'), sep='\t')
test_df = pd.read_csv(os.path.join(data_dir, 'test.tsv'), sep='\t')

# Fail fast with a clear message if a split lacks a column that the feature
# extraction cells read, rather than failing partway through a long run.
required_columns = {'dicom_id', 'subject_id', 'study_id'}
for split_name, split_df in (('train', train_df), ('test', test_df)):
    missing_columns = required_columns - set(split_df.columns)
    if missing_columns:
        raise KeyError(
            f"{split_name}.tsv is missing required columns: {sorted(missing_columns)}"
        )

print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

Train data shape: (4291, 3)
Test data shape: (1757, 3)


In [4]:
# Build DenseNet121 with ImageNet weights as a feature extractor:
# `include_top=False` leaves out the classification layer and `pooling='avg'`
# requests average pooling of the network output.
model = DenseNet121(
    include_top=False,
    pooling='avg',
    weights='imagenet',
)
print("Loaded DenseNet121 model")

Loaded DenseNet121 model


In [5]:
def extract_features(dicom_path):
    """Extract a DenseNet feature vector from a single DICOM image.

    The pixel data is scaled by its maximum value to 8-bit, expanded to three
    channels when needed, resized to 224x224, run through ``preprocess_input``
    and finally through the module-level ``model``.

    Args:
        dicom_path: Path of the DICOM file to read.

    Returns:
        The flattened output of ``model.predict`` for the image, or ``None``
        when any step fails (the error is printed, not raised, so one bad
        file does not abort a long extraction run).
    """
    try:
        # Read the DICOM file
        ds = pydicom.dcmread(dicom_path)

        # Work in float64, the same precision true division produced before.
        pixel_array = ds.pixel_array.astype(np.float64)

        # Scale to [0, 1] by the maximum value. A blank (all-zero) image used
        # to divide by zero here and push NaN into the uint8 cast; negative
        # values would wrap around in that cast. Guard both cases.
        max_value = np.max(pixel_array)
        if max_value > 0:
            pixel_array = np.clip(pixel_array / max_value, 0.0, 1.0)
        else:
            pixel_array = np.zeros_like(pixel_array)

        # Convert to uint8
        img = np.uint8(pixel_array * 255)

        # Convert to RGB (DenseNet expects 3 channels)
        if img.ndim == 2:
            # Grayscale (H, W) -> (H, W, 3)
            img_rgb = np.stack([img, img, img], axis=2)
        elif img.shape[2] == 1:
            # Single channel (H, W, 1) -> (H, W, 3)
            img_rgb = np.concatenate([img, img, img], axis=2)
        else:
            img_rgb = img

        # Resize to 224x224 (expected by DenseNet)
        pil_img = Image.fromarray(img_rgb).resize((224, 224))

        # Back to numpy, apply the DenseNet input preprocessing, add batch axis
        img_array = preprocess_input(np.array(pil_img))
        img_array = np.expand_dims(img_array, axis=0)

        # Extract features
        features = model.predict(img_array, verbose=0)

        return features.flatten()
    except Exception as e:
        # Deliberate best-effort: report and let the caller skip this image.
        print(f"Error processing {dicom_path}: {e}")
        return None

# A progress line is printed every `batch_size` rows.
batch_size = 50


def _save_vectors(vectors, output_path):
    """Pickle `vectors` to `output_path` without leaving a half-written file.

    The data is written to a sibling temporary file and then moved over the
    target with os.replace, so an interruption during the write cannot
    destroy the previous checkpoint.
    """
    tmp_path = output_path + '.tmp'
    with open(tmp_path, 'wb') as f:
        pickle.dump(vectors, f)
    os.replace(tmp_path, output_path)


def extract_split_features(df, split_name, output_path, checkpoint_every, gc_every):
    """Extract DenseNet features for every row of one data split.

    Args:
        df: DataFrame with `dicom_id`, `subject_id` and `study_id` columns.
        split_name: Label used in progress messages (e.g. 'train').
        output_path: Pickle file that receives the {dicom_id: features} dict.
        checkpoint_every: Save an intermediate pickle after this many rows.
        gc_every: Run `gc.collect()` every this many rows.

    Returns:
        Dict mapping dicom_id to the feature vector returned by
        `extract_features`. Rows whose DICOM file does not exist, or for
        which `extract_features` returns None, are left out and counted.
    """
    vectors = {}
    n_rows = len(df)
    n_missing = 0  # DICOM file not found on disk
    n_failed = 0   # extract_features returned None

    rows = zip(df['dicom_id'], df['subject_id'], df['study_id'])
    progress = tqdm.tqdm(rows, total=n_rows, desc=f"Processing {split_name} images")

    # `pos` is a true positional counter; the DataFrame index label is not
    # used, so filtered or re-indexed frames are handled correctly.
    for pos, (dicom_id, subject_id, study_id) in enumerate(progress):
        if pos % batch_size == 0:
            print(f"Processed {pos}/{n_rows} {split_name} images")

        # Layout: <files_path>/p<first 2 chars of subject>/p<subject>/s<study>/<dicom>.dcm
        dicom_path = os.path.join(
            files_path,
            f"p{str(subject_id)[:2]}",
            f"p{subject_id}",
            f"s{study_id}",
            f"{dicom_id}.dcm",
        )

        if os.path.exists(dicom_path):
            features = extract_features(dicom_path)
            if features is not None:
                vectors[dicom_id] = features
            else:
                n_failed += 1
        else:
            n_missing += 1

        # Save intermediate results
        if (pos + 1) % checkpoint_every == 0:
            print(f"Saving intermediate results: {len(vectors)} vectors")
            _save_vectors(vectors, output_path)

        # Clean up memory
        if pos % gc_every == 0:
            gc.collect()

    # Save the final vectors (written once, after the loop)
    _save_vectors(vectors, output_path)

    if n_missing or n_failed:
        print(
            f"Skipped {n_missing} missing files and {n_failed} failed extractions "
            f"in {split_name} split"
        )
    print(f"Saved DenseNet features for {len(vectors)} {split_name} images")
    return vectors


# Process train images
densenet_vecs = extract_split_features(
    train_df,
    'train',
    os.path.join(output_dir, 'densenet121_train.pkl'),
    checkpoint_every=500,
    gc_every=100,
)

# Process test images
test_densenet_vecs = extract_split_features(
    test_df,
    'test',
    os.path.join(output_dir, 'densenet121_test.pkl'),
    checkpoint_every=100,
    gc_every=50,
)

Processing train images:   0%|          | 0/4291 [00:00<?, ?it/s]

Processed 0/4291 train images


Processing train images:   1%|          | 51/4291 [00:07<09:02,  7.81it/s] 

Processed 50/4291 train images


Processing train images:   2%|▏         | 100/4291 [00:13<08:53,  7.85it/s]

Processed 100/4291 train images


Processing train images:   4%|▎         | 151/4291 [00:20<08:34,  8.05it/s]

Processed 150/4291 train images


Processing train images:   5%|▍         | 200/4291 [00:26<09:04,  7.51it/s]

Processed 200/4291 train images


Processing train images:   6%|▌         | 251/4291 [00:33<09:17,  7.24it/s]

Processed 250/4291 train images


Processing train images:   7%|▋         | 300/4291 [00:40<08:40,  7.67it/s]

Processed 300/4291 train images


Processing train images:   8%|▊         | 351/4291 [00:47<07:51,  8.36it/s]

Processed 350/4291 train images


Processing train images:   9%|▉         | 400/4291 [00:53<08:03,  8.04it/s]

Processed 400/4291 train images


Processing train images:  11%|█         | 451/4291 [00:59<08:07,  7.88it/s]

Processed 450/4291 train images


Processing train images:  12%|█▏        | 500/4291 [01:05<08:07,  7.78it/s]

Saving intermediate results: 500 vectors
Processed 500/4291 train images


Processing train images:  13%|█▎        | 551/4291 [01:12<11:39,  5.35it/s]

Processed 550/4291 train images


Processing train images:  14%|█▍        | 600/4291 [01:19<09:29,  6.48it/s]

Processed 600/4291 train images


Processing train images:  15%|█▌        | 651/4291 [01:26<07:40,  7.91it/s]

Processed 650/4291 train images


Processing train images:  16%|█▋        | 700/4291 [01:33<07:40,  7.80it/s]

Processed 700/4291 train images


Processing train images:  18%|█▊        | 751/4291 [01:40<07:22,  7.99it/s]

Processed 750/4291 train images


Processing train images:  19%|█▊        | 800/4291 [01:46<06:53,  8.44it/s]

Processed 800/4291 train images


Processing train images:  20%|█▉        | 851/4291 [01:53<07:36,  7.53it/s]

Processed 850/4291 train images


Processing train images:  21%|██        | 900/4291 [02:00<07:48,  7.24it/s]

Processed 900/4291 train images


Processing train images:  22%|██▏       | 951/4291 [02:07<07:30,  7.41it/s]

Processed 950/4291 train images


Processing train images:  23%|██▎       | 1000/4291 [02:14<07:16,  7.55it/s]

Saving intermediate results: 1000 vectors
Processed 1000/4291 train images


Processing train images:  24%|██▍       | 1051/4291 [02:21<06:53,  7.83it/s]

Processed 1050/4291 train images


Processing train images:  26%|██▌       | 1100/4291 [02:28<06:59,  7.60it/s]

Processed 1100/4291 train images


Processing train images:  27%|██▋       | 1151/4291 [02:35<06:27,  8.10it/s]

Processed 1150/4291 train images


Processing train images:  28%|██▊       | 1200/4291 [02:41<07:06,  7.25it/s]

Processed 1200/4291 train images


Processing train images:  29%|██▉       | 1251/4291 [02:48<06:25,  7.88it/s]

Processed 1250/4291 train images


Processing train images:  30%|███       | 1300/4291 [02:54<06:03,  8.22it/s]

Processed 1300/4291 train images


Processing train images:  31%|███▏      | 1351/4291 [03:00<06:18,  7.77it/s]

Processed 1350/4291 train images


Processing train images:  33%|███▎      | 1401/4291 [03:07<07:06,  6.78it/s]

Processed 1400/4291 train images


Processing train images:  34%|███▍      | 1451/4291 [03:14<06:14,  7.58it/s]

Processed 1450/4291 train images


Processing train images:  35%|███▍      | 1500/4291 [03:20<06:03,  7.67it/s]

Saving intermediate results: 1500 vectors
Processed 1500/4291 train images


Processing train images:  36%|███▌      | 1551/4291 [03:27<05:44,  7.95it/s]

Processed 1550/4291 train images


Processing train images:  37%|███▋      | 1601/4291 [03:33<06:52,  6.52it/s]

Processed 1600/4291 train images


Processing train images:  38%|███▊      | 1651/4291 [03:41<06:13,  7.06it/s]

Processed 1650/4291 train images


Processing train images:  40%|███▉      | 1700/4291 [03:47<05:28,  7.88it/s]

Processed 1700/4291 train images


Processing train images:  41%|████      | 1751/4291 [03:54<05:46,  7.33it/s]

Processed 1750/4291 train images


Processing train images:  42%|████▏     | 1800/4291 [04:01<05:52,  7.07it/s]

Processed 1800/4291 train images


Processing train images:  43%|████▎     | 1851/4291 [04:08<05:30,  7.37it/s]

Processed 1850/4291 train images


Processing train images:  44%|████▍     | 1901/4291 [04:15<06:24,  6.22it/s]

Processed 1900/4291 train images


Processing train images:  45%|████▌     | 1951/4291 [04:21<05:00,  7.78it/s]

Processed 1950/4291 train images


Processing train images:  47%|████▋     | 2000/4291 [04:27<05:07,  7.45it/s]

Saving intermediate results: 2000 vectors
Processed 2000/4291 train images


Processing train images:  48%|████▊     | 2051/4291 [04:34<04:50,  7.72it/s]

Processed 2050/4291 train images


Processing train images:  49%|████▉     | 2100/4291 [04:40<04:37,  7.88it/s]

Processed 2100/4291 train images


Processing train images:  50%|█████     | 2151/4291 [04:47<04:37,  7.70it/s]

Processed 2150/4291 train images


Processing train images:  51%|█████▏    | 2200/4291 [04:54<04:18,  8.08it/s]

Processed 2200/4291 train images


Processing train images:  52%|█████▏    | 2251/4291 [05:01<04:18,  7.89it/s]

Processed 2250/4291 train images


Processing train images:  54%|█████▎    | 2300/4291 [05:07<04:59,  6.66it/s]

Processed 2300/4291 train images


Processing train images:  55%|█████▍    | 2351/4291 [05:14<04:12,  7.68it/s]

Processed 2350/4291 train images


Processing train images:  56%|█████▌    | 2400/4291 [05:20<04:03,  7.77it/s]

Processed 2400/4291 train images


Processing train images:  57%|█████▋    | 2451/4291 [05:27<03:50,  7.98it/s]

Processed 2450/4291 train images


Processing train images:  58%|█████▊    | 2500/4291 [05:33<03:48,  7.83it/s]

Saving intermediate results: 2500 vectors
Processed 2500/4291 train images


Processing train images:  59%|█████▉    | 2551/4291 [05:40<03:40,  7.88it/s]

Processed 2550/4291 train images


Processing train images:  61%|██████    | 2600/4291 [05:46<03:36,  7.83it/s]

Processed 2600/4291 train images


Processing train images:  62%|██████▏   | 2651/4291 [05:53<03:26,  7.93it/s]

Processed 2650/4291 train images


Processing train images:  63%|██████▎   | 2700/4291 [05:59<03:46,  7.02it/s]

Processed 2700/4291 train images


Processing train images:  64%|██████▍   | 2751/4291 [06:06<03:15,  7.88it/s]

Processed 2750/4291 train images


Processing train images:  65%|██████▌   | 2801/4291 [06:12<03:43,  6.66it/s]

Processed 2800/4291 train images


Processing train images:  66%|██████▋   | 2851/4291 [06:19<03:04,  7.81it/s]

Processed 2850/4291 train images


Processing train images:  68%|██████▊   | 2901/4291 [06:25<03:20,  6.92it/s]

Processed 2900/4291 train images


Processing train images:  69%|██████▉   | 2951/4291 [06:31<02:54,  7.68it/s]

Processed 2950/4291 train images


Processing train images:  70%|██████▉   | 3000/4291 [06:38<03:01,  7.11it/s]

Saving intermediate results: 3000 vectors
Processed 3000/4291 train images


Processing train images:  71%|███████   | 3051/4291 [06:45<02:40,  7.73it/s]

Processed 3050/4291 train images


Processing train images:  72%|███████▏  | 3101/4291 [06:51<02:54,  6.80it/s]

Processed 3100/4291 train images


Processing train images:  73%|███████▎  | 3151/4291 [06:57<02:26,  7.76it/s]

Processed 3150/4291 train images


Processing train images:  75%|███████▍  | 3200/4291 [07:04<02:18,  7.87it/s]

Processed 3200/4291 train images


Processing train images:  76%|███████▌  | 3251/4291 [07:10<02:13,  7.77it/s]

Processed 3250/4291 train images


Processing train images:  77%|███████▋  | 3301/4291 [07:17<02:28,  6.66it/s]

Processed 3300/4291 train images


Processing train images:  78%|███████▊  | 3351/4291 [07:23<01:59,  7.87it/s]

Processed 3350/4291 train images


Processing train images:  79%|███████▉  | 3401/4291 [07:30<02:12,  6.74it/s]

Processed 3400/4291 train images


Processing train images:  80%|████████  | 3451/4291 [07:36<01:49,  7.67it/s]

Processed 3450/4291 train images


Processing train images:  82%|████████▏ | 3500/4291 [07:42<01:51,  7.10it/s]

Saving intermediate results: 3500 vectors
Processed 3500/4291 train images


Processing train images:  83%|████████▎ | 3551/4291 [07:49<01:35,  7.72it/s]

Processed 3550/4291 train images


Processing train images:  84%|████████▍ | 3601/4291 [07:56<01:43,  6.70it/s]

Processed 3600/4291 train images


Processing train images:  85%|████████▌ | 3651/4291 [08:02<01:21,  7.81it/s]

Processed 3650/4291 train images


Processing train images:  86%|████████▋ | 3701/4291 [08:08<01:28,  6.70it/s]

Processed 3700/4291 train images


Processing train images:  87%|████████▋ | 3751/4291 [08:15<01:09,  7.78it/s]

Processed 3750/4291 train images


Processing train images:  89%|████████▊ | 3800/4291 [08:21<01:02,  7.87it/s]

Processed 3800/4291 train images


Processing train images:  90%|████████▉ | 3851/4291 [08:28<00:55,  7.95it/s]

Processed 3850/4291 train images


Processing train images:  91%|█████████ | 3901/4291 [08:34<00:58,  6.70it/s]

Processed 3900/4291 train images


Processing train images:  92%|█████████▏| 3951/4291 [08:40<00:43,  7.79it/s]

Processed 3950/4291 train images


Processing train images:  93%|█████████▎| 4000/4291 [08:47<00:39,  7.41it/s]

Saving intermediate results: 4000 vectors
Processed 4000/4291 train images


Processing train images:  94%|█████████▍| 4051/4291 [08:53<00:30,  7.95it/s]

Processed 4050/4291 train images


Processing train images:  96%|█████████▌| 4101/4291 [09:00<00:27,  6.93it/s]

Processed 4100/4291 train images


Processing train images:  97%|█████████▋| 4151/4291 [09:06<00:17,  7.99it/s]

Processed 4150/4291 train images


Processing train images:  98%|█████████▊| 4201/4291 [09:12<00:13,  6.56it/s]

Processed 4200/4291 train images


Processing train images:  99%|█████████▉| 4251/4291 [09:19<00:05,  7.96it/s]

Processed 4250/4291 train images


Processing train images: 100%|██████████| 4291/4291 [09:24<00:00,  7.60it/s]


Saving intermediate results: 4291 vectors
Saved DenseNet features for 4291 train images


Processing test images:   0%|          | 0/1757 [00:00<?, ?it/s]

Processed 0/1757 test images


Processing test images:   3%|▎         | 51/1757 [00:06<04:07,  6.90it/s]

Processed 50/1757 test images


Processing test images:   6%|▌         | 100/1757 [00:12<03:31,  7.82it/s]

Saving intermediate results: 100 vectors
Processed 100/1757 test images


Processing test images:   9%|▊         | 151/1757 [00:19<03:50,  6.98it/s]

Processed 150/1757 test images


Processing test images:  11%|█▏        | 201/1757 [00:25<03:51,  6.73it/s]

Saving intermediate results: 200 vectors
Processed 200/1757 test images


Processing test images:  14%|█▍        | 251/1757 [00:32<03:43,  6.73it/s]

Processed 250/1757 test images


Processing test images:  17%|█▋        | 301/1757 [00:38<03:36,  6.71it/s]

Saving intermediate results: 300 vectors
Processed 300/1757 test images


Processing test images:  20%|█▉        | 350/1757 [00:44<02:58,  7.86it/s]

Processed 350/1757 test images


Processing test images:  23%|██▎       | 400/1757 [00:51<02:53,  7.83it/s]

Saving intermediate results: 400 vectors
Processed 400/1757 test images


Processing test images:  26%|██▌       | 451/1757 [00:57<03:14,  6.72it/s]

Processed 450/1757 test images


Processing test images:  28%|██▊       | 500/1757 [01:04<02:40,  7.83it/s]

Saving intermediate results: 500 vectors
Processed 500/1757 test images


Processing test images:  31%|███▏      | 551/1757 [01:10<02:58,  6.77it/s]

Processed 550/1757 test images


Processing test images:  34%|███▍      | 600/1757 [01:16<02:26,  7.88it/s]

Saving intermediate results: 600 vectors
Processed 600/1757 test images


Processing test images:  37%|███▋      | 651/1757 [01:23<02:44,  6.74it/s]

Processed 650/1757 test images


Processing test images:  40%|███▉      | 700/1757 [01:29<02:16,  7.76it/s]

Saving intermediate results: 700 vectors
Processed 700/1757 test images


Processing test images:  43%|████▎     | 750/1757 [01:36<02:09,  7.78it/s]

Processed 750/1757 test images


Processing test images:  46%|████▌     | 800/1757 [01:42<02:01,  7.88it/s]

Saving intermediate results: 800 vectors
Processed 800/1757 test images


Processing test images:  48%|████▊     | 851/1757 [01:49<02:15,  6.71it/s]

Processed 850/1757 test images


Processing test images:  51%|█████     | 900/1757 [01:55<01:49,  7.84it/s]

Saving intermediate results: 900 vectors
Processed 900/1757 test images


Processing test images:  54%|█████▍    | 951/1757 [02:02<02:00,  6.69it/s]

Processed 950/1757 test images


Processing test images:  57%|█████▋    | 1000/1757 [02:08<01:35,  7.94it/s]

Saving intermediate results: 1000 vectors
Processed 1000/1757 test images


Processing test images:  60%|█████▉    | 1050/1757 [02:14<01:29,  7.86it/s]

Processed 1050/1757 test images


Processing test images:  63%|██████▎   | 1100/1757 [02:21<01:25,  7.70it/s]

Saving intermediate results: 1100 vectors
Processed 1100/1757 test images


Processing test images:  66%|██████▌   | 1151/1757 [02:28<01:31,  6.64it/s]

Processed 1150/1757 test images


Processing test images:  68%|██████▊   | 1200/1757 [02:34<01:11,  7.76it/s]

Saving intermediate results: 1200 vectors
Processed 1200/1757 test images


Processing test images:  71%|███████   | 1251/1757 [02:41<01:15,  6.69it/s]

Processed 1250/1757 test images


Processing test images:  74%|███████▍  | 1301/1757 [02:47<01:07,  6.73it/s]

Saving intermediate results: 1300 vectors
Processed 1300/1757 test images


Processing test images:  77%|███████▋  | 1351/1757 [02:53<01:01,  6.58it/s]

Processed 1350/1757 test images


Processing test images:  80%|███████▉  | 1400/1757 [02:59<00:45,  7.88it/s]

Saving intermediate results: 1400 vectors
Processed 1400/1757 test images


Processing test images:  83%|████████▎ | 1451/1757 [03:05<00:43,  7.02it/s]

Processed 1450/1757 test images


Processing test images:  85%|████████▌ | 1500/1757 [03:11<00:32,  8.03it/s]

Saving intermediate results: 1500 vectors
Processed 1500/1757 test images


Processing test images:  88%|████████▊ | 1551/1757 [03:18<00:29,  7.09it/s]

Processed 1550/1757 test images


Processing test images:  91%|█████████ | 1601/1757 [03:24<00:22,  6.98it/s]

Saving intermediate results: 1600 vectors
Processed 1600/1757 test images


Processing test images:  94%|█████████▍| 1651/1757 [03:30<00:14,  7.18it/s]

Processed 1650/1757 test images


Processing test images:  97%|█████████▋| 1700/1757 [03:36<00:07,  7.76it/s]

Saving intermediate results: 1700 vectors
Processed 1700/1757 test images


Processing test images: 100%|█████████▉| 1751/1757 [03:42<00:00,  7.03it/s]

Processed 1750/1757 test images


Processing test images: 100%|██████████| 1757/1757 [03:43<00:00,  7.85it/s]

Saving intermediate results: 1757 vectors
Saved DenseNet features for 1757 test images





In [6]:
# Find nearest neighbors based on feature similarity
from sklearn.metrics.pairwise import cosine_similarity

# Number of training neighbors kept per test image.
TOP_K = 100
# Test images scored per similarity call; bounds the size of the
# (chunk x n_train) similarity matrix held in memory.
NEIGHBOR_CHUNK_SIZE = 256

# Load the feature vectors if you're running this in a separate session.
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# the feature extraction cell of this notebook.
train_features_path = os.path.join(output_dir, 'densenet121_train.pkl')
test_features_path = os.path.join(output_dir, 'densenet121_test.pkl')

with open(train_features_path, 'rb') as f:
    densenet_vecs = pickle.load(f)

with open(test_features_path, 'rb') as f:
    test_densenet_vecs = pickle.load(f)

print(f"Loaded {len(densenet_vecs)} train features and {len(test_densenet_vecs)} test features")

# Convert train features to a numpy array; row i belongs to train_dicom_ids[i]
train_dicom_ids = list(densenet_vecs.keys())
train_features = np.array([densenet_vecs[dicom_id] for dicom_id in train_dicom_ids])

test_dicom_ids = list(test_densenet_vecs.keys())

# Find the TOP_K nearest training images for each test image.
# Test images are scored a chunk at a time so the training matrix is not
# re-processed by cosine_similarity once per test image.
top100_neighbors = {}

with tqdm.tqdm(total=len(test_dicom_ids), desc="Finding neighbors") as progress:
    for start in range(0, len(test_dicom_ids), NEIGHBOR_CHUNK_SIZE):
        chunk_ids = test_dicom_ids[start:start + NEIGHBOR_CHUNK_SIZE]
        chunk_features = np.array([test_densenet_vecs[dicom_id] for dicom_id in chunk_ids])

        # similarities[r, c]: cosine similarity of chunk image r to train image c
        similarities = cosine_similarity(chunk_features, train_features)

        # Ascending argsort per row; keep the last TOP_K and reverse so the
        # most similar training image comes first.
        top_indices = np.argsort(similarities, axis=1)[:, -TOP_K:][:, ::-1]

        for test_dicom, neighbor_indices in zip(chunk_ids, top_indices):
            top100_neighbors[test_dicom] = [train_dicom_ids[i] for i in neighbor_indices]

        progress.update(len(chunk_ids))

# Save the neighbors
neighbors_path = os.path.join(output_dir, 'top100_neighbors.pkl')
with open(neighbors_path, 'wb') as f:
    pickle.dump(top100_neighbors, f)

print(f"Saved top 100 neighbors for {len(top100_neighbors)} test images")

Loaded 4291 train features and 1757 test features


Finding neighbors: 100%|██████████| 1757/1757 [00:20<00:00, 84.59it/s] 

Saved top 100 neighbors for 1757 test images



