In [7]:
import pydicom
import numpy as np
import os
from pathlib import Path
import torch
from torchvision import transforms
import glob

def load_dicom_volume(dicom_folder, target_size=(256, 256, 256), channels=3):
    # Get all .dcm files in the folder
    dicom_files = sorted(glob.glob(os.path.join(dicom_folder, "*.dcm")))
    if not dicom_files:
        raise ValueError(f"No DICOM files found in {dicom_folder}")

    # Read DICOM files and stack slices
    slices = [pydicom.dcmread(f).pixel_array for f in dicom_files]
    volume = np.stack(slices, axis=-1)  # Shape: (height, width, depth)

    # Normalize to [0, 1]
    volume = (volume - volume.min()) / (volume.max() - volume.min() + 1e-6)

    # Convert single-channel to 3 channels if needed
    if channels == 3:
        volume = np.repeat(volume[..., np.newaxis], 3, axis=-1)  # Shape: (height, width, depth, channels)
    else:
        volume = volume[..., np.newaxis]  # Shape: (height, width, depth, 1)

    # Resize to target size (requires interpolation, e.g., using scipy or torchvision)
    from scipy.ndimage import zoom
    factors = [t / s for t, s in zip(target_size, volume.shape[:-1])]
    volume = zoom(volume, factors + [1], order=1)  # Linear interpolation, preserve channel dimension

    # Convert to PyTorch tensor and adjust dimensions
    volume = torch.tensor(volume, dtype=torch.float32).permute(3, 0, 1, 2)  # Shape: (channels, height, width, depth)
    return volume

# Example: Load all volumes from subfolders
data_dir = r"C:\Users\20203686\OneDrive - TU Eindhoven\TUe\Master\Year 2\MASTER PROJECT\TEST CT SCANS"
subfolders = [f for f in Path(data_dir).iterdir() if f.is_dir()]
image_tensors = []

for folder in subfolders:
    volume = load_dicom_volume(str(folder))
    image_tensors.append(volume)

# Stack into a batch
image_batch = torch.stack(image_tensors)  # Shape: (batch, channels, height, width, depth)
print(f"Loaded batch shape: {image_batch.shape}")

Loaded batch shape: torch.Size([3, 3, 256, 256, 256])


In [None]:
import torch
from ct_clip import CTCLIP

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize CT-CLIP model
model = CTCLIP(
    dim_text=512,              # Text embedding dimension
    dim_image=512,             # Image embedding dimension
    dim_latent=512,            # Latent space dimension
    num_text_tokens=28897,     # Vocabulary size for text
    text_seq_len=512,          # Max text sequence length
    visual_image_size=256,     # Image size (matches your 256x256x256 input)
    visual_patch_size=32,      # Patch size for VisionTransformer
    channels=3,                # Number of channels (matches your 3-channel input)
    use_mlm=False,             # Disable MLM since no text reports
    use_visual_ssl=False,       # Disable visual SSL for inference
    text_has_cls_token=False,  # Adjust based on model checkpoint
    visual_has_cls_token=False # Adjust based on model checkpoint
).to(device)

# Load pretrained weights
checkpoint_path = r"C:\Users\20203686\OneDrive - TU Eindhoven\TUe\Master\Year 2\MASTER PROJECT\CT-CLIP\CT_VocabFine_v2.pt"
model.load(checkpoint_path)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'CXRBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [9]:
from transformers import BertTokenizer

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('microsoft/BiomedVLP-CXR-BERT-specialized', do_lower_case=True)

# Create placeholder text for the batch (3 images)
texts = ["CT scan" for _ in range(3)]  # Match batch size of images
text_tokens = tokenizer(
    texts,
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=512
).to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'CXRBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [10]:
# Assume image_batch is already prepared with shape [3, 3, 256, 256, 256]
image_batch = image_batch.to(device)

# Set model to evaluation mode
model.eval()

# Extract features
with torch.no_grad():
    text_latents, image_latents, encoded_images = model(
        text=text_tokens,           # Placeholder text
        image=image_batch,         # Your CT scan batch
        device=device,
        return_loss=False,
        return_encodings=False,
        return_latents=True
    )

# Print shapes
print(f"Image latents shape: {image_latents.shape}")
print(f"Encoded images shape: {encoded_images.shape}")

TypeError: TextTransformer.forward() got an unexpected keyword argument 'attention_mask'