In [1]:
import os
import numpy as np
import cv2
from glob import glob
from tqdm import tqdm
import pydicom

# Set the directories for test and training data
TEST_DIR = "/home/fgj4kw/Downloads/stage_2_test_images"
TRAIN_DIR = "/home/fgj4kw/Downloads/stage_2_train_images"

# Get all the DICOM image paths from the directories
s2_test_images = glob(os.path.join(TEST_DIR, "**", "*.dcm"), recursive=True)
s2_train_images = glob(os.path.join(TRAIN_DIR, "**", "*.dcm"), recursive=True)

# Function to preprocess the dataset
def preprocess_full_dataset(image_paths, target_size=(224, 224)):
    images = []
    for path in tqdm(image_paths, desc="Preprocessing dataset"):
        try:
            ds = pydicom.dcmread(path)  # Read the DICOM file
            img = ds.pixel_array  # Get the pixel array (image)
            img_resized = cv2.resize(img, target_size)  # Resize the image to 224x224
            img_normalized = img_resized.astype(np.float32) / np.max(img_resized)  # Normalize to [0, 1]
            images.append(img_normalized)
        except Exception as e:
            print(f"Error processing {path}: {e}")
    return np.array(images)

# Preprocess the datasets
X_train_full = preprocess_full_dataset(s2_train_images)
X_test_full = preprocess_full_dataset(s2_test_images)

# Add a channel dimension for grayscale images (from (224, 224) to (224, 224, 1))
X_train_full = np.expand_dims(X_train_full, axis=-1)
X_test_full = np.expand_dims(X_test_full, axis=-1)

# Check the shape of the preprocessed data
print(f"Training data shape: {X_train_full.shape}")
print(f"Test data shape: {X_test_full.shape}")


Preprocessing dataset: 100%|██████████| 26684/26684 [11:49<00:00, 37.60it/s]
Preprocessing dataset: 100%|██████████| 3000/3000 [02:06<00:00, 23.79it/s]


Training data shape: (26684, 224, 224, 1)
Test data shape: (3000, 224, 224, 1)


In [16]:
import pandas as pd

# Load the CSV file
file_path = ''
labels_df = pd.read_csv('Downloads/stage_2_detailed_class_info.csv')

# Display the first few rows of the file
labels_df.head()


Unnamed: 0,patientId,class
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,No Lung Opacity / Not Normal
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,No Lung Opacity / Not Normal
2,00322d4d-1c29-4943-afc9-b6754be640eb,No Lung Opacity / Not Normal
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,Normal
4,00436515-870c-4b36-a041-de91049b9ab4,Lung Opacity


In [17]:
# Create a dictionary to map patientId to class labels
labels_dict = {}
label_mapping = {
    "No Lung Opacity / Not Normal": 0,
    "Normal": 1,
    "Lung Opacity": 2
}

# Populate the dictionary with patientId and corresponding label
for _, row in labels_df.iterrows():
    labels_dict[row['patientId']] = label_mapping.get(row['class'], -1)  # Use -1 for any unknown class

# Check the first few labels
print("Sample labels dictionary:", dict(list(labels_dict.items())[:5]))


Sample labels dictionary: {'0004cfab-14fd-4e49-80ba-63a80b6bddd6': 0, '00313ee0-9eaa-42f4-b0ab-c148ed3241cd': 0, '00322d4d-1c29-4943-afc9-b6754be640eb': 0, '003d8fa0-6bf1-40ed-b54c-ac657f8495c5': 1, '00436515-870c-4b36-a041-de91049b9ab4': 2}


In [9]:
# Step 1: Imports

import numpy as np
import cv2
import pydicom
from tensorflow.keras.utils import Sequence
import os
from sklearn.utils.class_weight import compute_class_weight  # For class weights
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau  # For early stopping and learning rate scheduler

# Check if the imports are working
print("Imports loaded successfully.")


Imports loaded successfully.


In [10]:
# Step 2: Define DICOMDataGenerator Class (Modified for Preprocessed Data)

class DICOMDataGenerator(Sequence):
    def __init__(self, image_paths, labels_dict, batch_size=32, target_size=(224, 224), shuffle=True):
        self.image_paths = image_paths
        self.labels_dict = labels_dict
        self.batch_size = batch_size
        self.target_size = target_size
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.image_paths))

    def __len__(self):
        return int(np.floor(len(self.image_paths) / self.batch_size))

    def __getitem__(self, index):
        batch_paths = self.image_paths[index * self.batch_size:(index + 1) * self.batch_size]
        images, labels = self.__data_generation(batch_paths)
        return np.array(images), np.array(labels)

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, batch_paths):
        images = []
        labels = []
        for path in batch_paths:
            filename = os.path.basename(path).split('.')[0]  # Extract filename without extension
            try:
                # Get the preprocessed image based on the filename
                idx = self.image_paths.index(path)  # Find the index of the current image path
                img = X_train_full[idx]  # Get the preprocessed image from X_train_full or X_test_full
                
                # Normalize (if necessary), although it's already normalized
                img_normalized = img.astype(np.float32)

                images.append(img_normalized)
                label = self.labels_dict.get(filename, -1)  # Get the label from the dictionary
                labels.append(label)
            except Exception as e:
                print(f"Error processing {path}: {e}")
                continue
        return images, labels

# Check if the data generator works as expected (without actual data)
print("DICOMDataGenerator class defined successfully.")


DICOMDataGenerator class defined successfully.


In [11]:
# Step 3: Compute Class Weights

# Example labels_dict (you would use your actual labels dictionary here)
labels_dict = {
    'image1': 0,  # Example label for image1
    'image2': 1,  # Example label for image2
    'image3': 0,  # Example label for image3
    # Add your actual labels here...
}

# Compute class weights
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(list(labels_dict.values())),
    y=list(labels_dict.values())
)

# Create a dictionary of class weights
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Display the class weights dictionary
print("Class weight dictionary:", class_weight_dict)

Class weight dictionary: {0: 0.75, 1: 1.5}


In [12]:
# Step 4: Image-Level Classification Model

def build_classification_model(input_shape=(224, 224, 1)):  # Grayscale input (1 channel)
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(1, activation='sigmoid')  # Binary output for classification
    ])
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build the new classification model
segmentation_model = build_classification_model(input_shape=(224, 224, 1))  # Use grayscale images

# Check the model summary
segmentation_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
# Step 5: Early Stopping and Learning Rate Scheduler

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)

# Check if the callbacks are defined
print("Callbacks defined successfully.")


Callbacks defined successfully.


In [22]:
# Step 6: Initialize Train and Test Generators (Using X_train_full and X_test_full)

# Initialize the DICOMDataGenerator class for both training and testing
class DICOMDataGenerator(Sequence):
    def __init__(self, images, labels_dict, batch_size=32, target_size=(224, 224), shuffle=True):
        self.images = images  # Pass in the actual preprocessed images (X_train_full or X_test_full)
        self.labels_dict = labels_dict
        self.batch_size = batch_size
        self.target_size = target_size
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.images))  # This now refers to the images array

    def __len__(self):
        return int(np.floor(len(self.images) / self.batch_size))  # Return the number of batches

    def __getitem__(self, index):
        # Get the batch of images and their corresponding labels
        batch_images = self.images[index * self.batch_size:(index + 1) * self.batch_size]
        batch_labels = self.__get_labels(batch_images)  # Fetch labels based on images
        return np.array(batch_images), np.array(batch_labels)

    def on_epoch_end(self):
        # Shuffle the dataset after every epoch if shuffle is enabled
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __get_labels(self, batch_images):
        # Generate the corresponding labels for the batch of images
        labels = []
        for img in batch_images:
            # Assuming filenames match the key in labels_dict
            filename = os.path.basename(img)  # This should match the patientId in your labels_dict
            label = self.labels_dict.get(filename, -1)  # Get label from the dictionary, defaulting to -1
            labels.append(label)
        return labels

# Initialize the train and test generators using the preprocessed images (X_train_full and X_test_full)
train_generator = DICOMDataGenerator(
    X_train_full, labels_dict, batch_size=32, target_size=(224, 224), shuffle=True
)

test_generator = DICOMDataGenerator(
    X_test_full, labels_dict, batch_size=32, target_size=(224, 224), shuffle=False
)

# Check if the generators are initialized correctly
print("Train and Test Generators have been initialized successfully.")


NameError: name 'X_train_full' is not defined