In [7]:
import os
import pydicom
import numpy as np
from monai.transforms import Compose, LoadImage, EnsureChannelFirst, Spacing, ScaleIntensity

def load_dicom_series(dicom_dir):
    """Read every ``.dcm`` file in a folder and stack the slices into one array.

    Parameters
    ----------
    dicom_dir : str
        Folder containing the DICOM files of a single series. Only files
        directly inside the folder are considered (no recursion).

    Returns
    -------
    numpy.ndarray
        The slices' ``pixel_array`` values stacked along a new last axis, so
        2D slices give an array of shape (H, W, Z). Values are whatever
        ``pixel_array`` returns; this function applies no rescaling itself.

    Raises
    ------
    ValueError
        If the folder contains no ``.dcm`` files, or (from ``np.stack``) if the
        slices do not all share the same shape.
    OSError
        If ``dicom_dir`` cannot be listed (propagated from ``os.listdir``).
    """
    # Match the extension case-insensitively so exports named "IMG0001.DCM"
    # are not silently dropped from the volume.
    dicom_files = sorted(
        os.path.join(dicom_dir, name)
        for name in os.listdir(dicom_dir)
        if name.lower().endswith('.dcm')
    )
    if not dicom_files:
        # Fail here with the folder name instead of letting np.stack complain
        # about an empty list further down.
        raise ValueError(f"No .dcm files found in {dicom_dir!r}")

    slices = [pydicom.dcmread(path) for path in dicom_files]

    # Order by InstanceNumber when every slice carries one. list.sort is
    # stable, so equal numbers keep the filename order established above.
    # If any slice lacks the attribute, fall back to filename order rather
    # than crashing inside the sort key.
    if all(getattr(s, 'InstanceNumber', None) is not None for s in slices):
        slices.sort(key=lambda s: s.InstanceNumber)

    # Stack slices into one volume along a new trailing axis
    return np.stack([s.pixel_array for s in slices], axis=-1)

# Folders to read, one DICOM series per folder.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run where they exist.
dicom_dirs = [
    r"C:\Users\jaspe\OneDrive - TU Eindhoven\TUe\Master\Year 2\MASTER PROJECT\TEST CT SCANS\thx ax mip 10 8",
    r"C:\Users\jaspe\OneDrive - TU Eindhoven\TUe\Master\Year 2\MASTER PROJECT\TEST CT SCANS\thx bb cor 3 3",
    r"C:\Users\jaspe\OneDrive - TU Eindhoven\TUe\Master\Year 2\MASTER PROJECT\TEST CT SCANS\thx bb sag 3 3"
]

# One volume per folder, in the same order as dicom_dirs
volumes = list(map(load_dicom_series, dicom_dirs))

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [14]:
import os
import torch
from ct_clip.ct_clip import CTCLIP, VisionTransformer
from transformers import BertModel, BertTokenizer

# Model hyperparameters, grouped by the encoder they configure.

# Text side
dim_text = 768           # original note: matches BiomedVLP-CXR-BERT-specialized output dimension
num_text_tokens = 28897  # passed to CTCLIP as num_text_tokens
text_seq_len = 512       # passed to CTCLIP as text_seq_len

# Image side
channels = 3
visual_image_size = 256
visual_patch_size = 32   # 256 // 32 = 8 patches per image side
dim_image = 294912       # used both as the VisionTransformer `dim` and as CTCLIP's dim_image

# Shared latent space
dim_latent = 512

# Text encoder: pretrained weights fetched by name through transformers' BertModel
text_transformer = BertModel.from_pretrained('microsoft/BiomedVLP-CXR-BERT-specialized')

# Image encoder.
# Context from the original notes: the checkpoint's to_visual_latent.weight has
# shape [512, 294912], which is why dim_image is 294912.
# NOTE(review): dim_image is also passed here as the transformer's `dim`.
# The allocation failure recorded in this cell's output (3,623,878,656 bytes)
# equals 294912 * 3072 * 4, and 3072 == channels * visual_patch_size ** 2.
# That is consistent with a float32 projection from one flattened patch to
# `dim`, so `dim` is presumably too large to be the transformer width.
# TODO confirm the intended encoder configuration against the checkpoint's
# reference setup before changing values.
vision_transformer = VisionTransformer(
    # input geometry
    image_size=visual_image_size,
    patch_size=visual_patch_size,
    channels=channels,
    # transformer shape
    dim=dim_image,
    depth=6,
    heads=8,
    dim_head=64,
    # regularisation
    patch_dropout=0.5
)

# Assemble the CT-CLIP model from the two encoders built above
model = CTCLIP(
    # encoders
    text_encoder=text_transformer,
    image_encoder=vision_transformer,
    # embedding widths
    dim_text=dim_text,
    dim_image=dim_image,
    dim_latent=dim_latent,
    # text input description
    num_text_tokens=num_text_tokens,
    text_seq_len=text_seq_len,
    # image input description
    visual_image_size=visual_image_size,
    visual_patch_size=visual_patch_size,
    channels=channels,
    # auxiliary objectives: both switches are off; the type and weights below
    # are still passed through unchanged
    use_mlm=False,
    use_visual_ssl=False,
    visual_ssl_type='simsiam',
    text_ssl_loss_weight=0.05,
    image_ssl_loss_weight=0.05,
    multiview_loss_weight=0.1
)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load pretrained weights.
# SECURITY: unless weights-only loading is in effect, torch.load unpickles the
# file and can execute code embedded in it; only load checkpoints from a
# trusted source.
state_dict = torch.load(
    r"C:\Users\20203686\OneDrive - TU Eindhoven\TUe\Master\Year 2\MASTER PROJECT\CT-CLIP\CT_VocabFine_v2.pt",
    map_location=device
)

# strict=False tolerates missing/unexpected keys, so a checkpoint that shares
# few (or no) parameter names with the model still "loads" without error.
# Keep the returned report and summarise it so partial loads are visible.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys:
    print(
        f"{len(load_result.missing_keys)} model parameter(s) were not found in the "
        f"checkpoint and keep their initial values, e.g. {load_result.missing_keys[:5]}"
    )
if load_result.unexpected_keys:
    print(
        f"{len(load_result.unexpected_keys)} checkpoint entrie(s) have no matching model "
        f"parameter and were ignored, e.g. {load_result.unexpected_keys[:5]}"
    )
if not load_result.missing_keys and not load_result.unexpected_keys:
    print("Checkpoint keys match the model exactly.")

You are using a model of type cxr-bert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at microsoft/BiomedVLP-CXR-BERT-specialized and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 3623878656 bytes.