In [None]:
!unzip DNA_Damage_Project-20250901T154313Z-1-001.zip

Archive:  DNA_Damage_Project-20250901T154313Z-1-001.zip
replace DNA_Damage_Project/image/red/WellC4_Seq0011_WellC4_Seq0011_T01_XY7_RGB_TxRED.tif? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every entry of ``base_dir`` (sorted by name) as a grayscale image.

    Entries that OpenCV cannot decode are reported with a warning and left
    out of the result. The collected images are returned as a NumPy array.
    """
    grayscale_images = []
    for entry in sorted(os.listdir(base_dir)):
        full_path = os.path.join(base_dir, entry)
        bgr = cv2.imread(full_path)

        if bgr is not None:
            # cv2.imread returns BGR channel order, hence COLOR_BGR2GRAY.
            grayscale_images.append(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY))
        else:
            print(f"Warning: Could not load image {full_path}")
    return np.array(grayscale_images)

# Red-channel images for the training and testing splits, loaded in that order.
train_red_images, test_red_images = (
    load_and_preprocess_images(red_dir)
    for red_dir in (
        'DNA_Damage_Project/image/red',
        'DNA_Damage_Project/test images/red',
    )
)

print(f"Loaded {len(train_red_images)} training red images.")
print(f"Loaded {len(test_red_images)} testing red images.")

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Return the decodable images found in ``base_dir`` as grayscale arrays.

    Directory entries are visited in sorted filename order. Anything
    ``cv2.imread`` fails to load triggers a warning and is skipped. The
    grayscale images are gathered into a single ``np.array``.
    """
    images = []
    candidate_paths = (
        os.path.join(base_dir, name) for name in sorted(os.listdir(base_dir))
    )
    for img_path in candidate_paths:
        loaded = cv2.imread(img_path)
        if loaded is None:
            # Unreadable entry: report it and move on to the next file.
            print(f"Warning: Could not load image {img_path}")
            continue
        # Loaded pixel data is BGR-ordered; collapse it to one gray channel.
        images.append(cv2.cvtColor(loaded, cv2.COLOR_BGR2GRAY))
    return np.array(images)

# Green-channel counterparts of the red images, loaded with the same helper.
train_green_images = load_and_preprocess_images(
    'DNA_Damage_Project/image/green'
)
test_green_images = load_and_preprocess_images(
    'DNA_Damage_Project/test images/green'
)

print(f"Loaded {len(train_green_images)} training green images.")
print(f"Loaded {len(test_green_images)} testing green images.")

In [None]:
def pair_images(red_images, green_images):
    """Pairs corresponding red and green images by position.

    Pairing is purely positional: element ``i`` of ``red_images`` is matched
    with element ``i`` of ``green_images``. This is only meaningful when both
    sequences were loaded in the same (sorted) filename order and contain the
    same number of images, so a length mismatch is rejected rather than
    silently truncated.

    Args:
        red_images: Sequence of red-channel images.
        green_images: Sequence of green-channel images.

    Returns:
        A list of ``(red_image, green_image)`` tuples.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(red_images) != len(green_images):
        raise ValueError(
            f"Cannot pair images: got {len(red_images)} red and "
            f"{len(green_images)} green images."
        )
    return list(zip(red_images, green_images))

# Build the (red, green) pairs for each split; pairing is index-based.
train_paired_images, test_paired_images = (
    pair_images(train_red_images, train_green_images),
    pair_images(test_red_images, test_green_images),
)

print(f"Created {len(train_paired_images)} paired training images.")
print(f"Created {len(test_paired_images)} paired testing images.")

### Build the model

We will define a convolutional neural network (CNN) using TensorFlow.

# Task
Process the red and green channel images for nucleus segmentation, normalization, foci detection, and labeling, preparing the data for model training. The red channel images are located in "DNA_Damage_Project/image/red/" (training) and "DNA_Damage_Project/test images/red/" (testing), and the green channel images are located in "DNA_Damage_Project/image/green/" (training) and "DNA_Damage_Project/test images/green/" (testing).

## Nucleus segmentation (red channel)

### Subtask:
Implement the segmentation steps using Otsu's thresholding, watershed algorithm, and noise removal.


**Reasoning**:
Implement the image segmentation steps using Otsu's thresholding, morphological operations, watershed algorithm, and noise removal for both training and testing red channel images as described in the instructions.



In [None]:
def segment_nuclei(image):
    """Segments nuclei using Otsu's thresholding and watershed.

    Args:
        image: Single-channel grayscale image. It is not modified.

    Returns:
        A uint8 mask with the same shape as ``image`` in which pixels of
        retained nuclei are 255 and everything else is 0.
    """
    # Apply Otsu's thresholding
    _, thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Morphological operations to clean up the mask
    kernel = np.ones((3, 3), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)

    # Sure background area
    sure_bg = cv2.dilate(opening, kernel, iterations=3)

    # Sure foreground area: pixels far enough from the nearest background pixel
    dist_transform = cv2.distanceTransform(opening, cv2.DIST_L2, 5)
    _, sure_fg = cv2.threshold(dist_transform, 0.7 * dist_transform.max(), 255, 0)

    # Unknown region: neither sure background nor sure foreground
    sure_fg = np.uint8(sure_fg)
    unknown = cv2.subtract(sure_bg, sure_fg)

    # Marker labelling
    _, markers = cv2.connectedComponents(sure_fg)

    # Add one to all labels so that sure background is not 0, but 1
    markers = markers + 1

    # Now, mark the region of unknown with zero
    markers[unknown == 255] = 0

    # Apply watershed algorithm on a 3-channel copy of the image.
    # Boundary pixels (label -1) are intentionally not painted onto `image`:
    # it is single-channel, so assigning a 3-element colour raised a
    # ValueError, and doing so would also have modified the caller's data.
    markers = cv2.watershed(cv2.cvtColor(image, cv2.COLOR_GRAY2BGR), markers)

    # Create a binary mask from the watershed result
    segmented_mask = np.zeros_like(image, dtype=np.uint8)
    segmented_mask[markers > 1] = 255  # Nuclei are marked with labels > 1

    # Remove small objects (noise)
    min_size = 100  # Minimum size threshold for nuclei
    num_labels, labels_img, stats, centroids = cv2.connectedComponentsWithStats(segmented_mask, connectivity=8)

    cleaned_mask = np.zeros_like(segmented_mask)
    for i in range(1, num_labels): # Start from 1 to exclude background
        if stats[i, cv2.CC_STAT_AREA] >= min_size:
            cleaned_mask[labels_img == i] = 255

    return cleaned_mask

# Segment every red-channel image of both splits.
train_segmented_masks = list(map(segment_nuclei, train_red_images))
test_segmented_masks = list(map(segment_nuclei, test_red_images))

print(f"Generated {len(train_segmented_masks)} training segmented masks.")
print(f"Generated {len(test_segmented_masks)} testing segmented masks.")

**Reasoning**:
The first attempt raised a ValueError right after the watershed step: to mark the boundaries it assigned a 3-channel color (`[255, 0, 0]`) to the single-channel grayscale `image`. `cv2.watershed` itself needs a 3-channel input and writes its result into the `markers` array, so we create a BGR copy of the grayscale image, pass that copy to watershed, drop the boundary-coloring assignment, and build the mask from the returned `markers`. The cleaned mask remains a single-channel (grayscale) image.



In [None]:
def segment_nuclei(image):
    """Produce a cleaned binary nuclei mask via Otsu thresholding + watershed.

    The input is a single-channel grayscale image. The result is a uint8 mask
    of the same shape where retained nuclei are 255 and everything else is 0;
    connected regions smaller than 100 pixels are discarded as noise.
    """
    structuring = np.ones((3, 3), np.uint8)

    # Otsu picks the global threshold automatically; opening removes specks.
    otsu_binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    opened = cv2.morphologyEx(otsu_binary, cv2.MORPH_OPEN, structuring, iterations=2)

    # Dilating the foreground leaves only "certainly background" outside it.
    background = cv2.dilate(opened, structuring, iterations=3)

    # Pixels deep inside a blob (>= 70% of the max distance) are sure foreground.
    distances = cv2.distanceTransform(opened, cv2.DIST_L2, 5)
    foreground = cv2.threshold(distances, 0.7 * distances.max(), 255, cv2.THRESH_BINARY)[1]
    foreground = foreground.astype(np.uint8)

    # Whatever is neither sure background nor sure foreground is undecided.
    undecided = cv2.subtract(background, foreground)

    # Seeds: shift labels so background becomes 1, then zero the undecided band
    # so that watershed assigns it.
    seeds = cv2.connectedComponents(foreground)[1] + 1
    seeds[undecided == 255] = 0

    # Watershed expects a 3-channel image, so feed it a BGR copy.
    flooded = cv2.watershed(cv2.cvtColor(image, cv2.COLOR_GRAY2BGR), seeds)

    # Labels above 1 are nuclei (1 = background, -1 = boundaries).
    nuclei_mask = np.zeros_like(image, dtype=np.uint8)
    nuclei_mask[flooded > 1] = 255

    # Drop connected components below the minimum nucleus size.
    min_size = 100
    _, component_ids, component_stats, _ = cv2.connectedComponentsWithStats(nuclei_mask, connectivity=8)
    keep_component = component_stats[:, cv2.CC_STAT_AREA] >= min_size
    keep_component[0] = False  # label 0 is the background component
    return np.where(keep_component[component_ids], 255, 0).astype(np.uint8)

# Run nucleus segmentation over both splits (training first, then testing).
train_segmented_masks, test_segmented_masks = (
    [segment_nuclei(red_image) for red_image in red_stack]
    for red_stack in (train_red_images, test_red_images)
)

print(f"Generated {len(train_segmented_masks)} training segmented masks.")
print(f"Generated {len(test_segmented_masks)} testing segmented masks.")

## Normalization

### Subtask:
Implement the normalization steps, including cropping, rotating, and padding the segmented nuclei.


**Reasoning**:
Implement the `normalize_nucleus` function as described in the instructions to process each nucleus in a paired image and its mask, and then apply this function to the training and testing datasets.



In [None]:
def normalize_nucleus(paired_image, segmented_mask, target_size=(64, 64)):
    """
    Normalizes each detected nucleus in a paired image (red, green)
    based on the provided segmented mask.

    Each external contour of the mask is cropped to its bounding box in both
    channels, zero-padded to a square derived from the contour's minimum
    enclosing circle, and resized to ``target_size``.

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).
        segmented_mask: The binary segmented mask for the nuclei.
        target_size: The desired size for the normalized nucleus images.

    Returns:
        A list of tuples, where each tuple contains the normalized
        (red nucleus image, green nucleus image).
    """
    red_img, green_img = paired_image
    normalized_nuclei = []

    # Find contours in the segmented mask
    contours, _ = cv2.findContours(segmented_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for contour in contours:
        # Find bounding box
        x, y, w, h = cv2.boundingRect(contour)

        # Extract ROI for both channels
        red_roi = red_img[y:y+h, x:x+w]
        green_roi = green_img[y:y+h, x:x+w]

        # Only the radius of the minimum enclosing circle is needed.
        _, radius = cv2.minEnclosingCircle(contour)

        # Side of the square canvas. The bounding box counts pixels
        # inclusively, so w or h can exceed ceil(2 * radius); never let the
        # canvas be smaller than the crop, otherwise the padding below would
        # be negative and np.pad would raise.
        square_size = max(int(np.ceil(2 * radius)), w, h)

        # Split the padding as evenly as possible around the crop.
        pad_x_before = (square_size - w) // 2
        pad_x_after = square_size - w - pad_x_before
        pad_y_before = (square_size - h) // 2
        pad_y_after = square_size - h - pad_y_before
        pad_widths = ((pad_y_before, pad_y_after), (pad_x_before, pad_x_after))

        padded_red_roi = np.pad(red_roi, pad_widths, mode='constant')
        padded_green_roi = np.pad(green_roi, pad_widths, mode='constant')

        # Resize to target size
        resized_red_roi = cv2.resize(padded_red_roi, target_size, interpolation=cv2.INTER_AREA)
        resized_green_roi = cv2.resize(padded_green_roi, target_size, interpolation=cv2.INTER_AREA)

        normalized_nuclei.append((resized_red_roi, resized_green_roi))

    return normalized_nuclei

# Normalize every nucleus of every (image pair, mask) combination per split.
train_normalized_nuclei = [
    nucleus_pair
    for paired_img, mask in zip(train_paired_images, train_segmented_masks)
    for nucleus_pair in normalize_nucleus(paired_img, mask)
]

test_normalized_nuclei = [
    nucleus_pair
    for paired_img, mask in zip(test_paired_images, test_segmented_masks)
    for nucleus_pair in normalize_nucleus(paired_img, mask)
]

# Report how many nuclei each split produced.
print(f"Found {len(train_normalized_nuclei)} normalized nuclei in the training set.")
print(f"Found {len(test_normalized_nuclei)} normalized nuclei in the testing set.")

## Green channel loading and global thresholding

### Subtask:
Load the green channel images and apply global thresholding to reduce background haze.


**Reasoning**:
Define a function to apply global thresholding to grayscale images and then apply it to the training and testing green channel images.



In [None]:
def apply_global_thresholding(image, threshold_value=50):
    """Binarize a grayscale image with one fixed (global) cut-off.

    Uses ``cv2.THRESH_BINARY`` with a maximum value of 255 and returns the
    thresholded image.
    """
    # cv2.threshold returns (threshold used, output image); keep the image.
    return cv2.threshold(image, threshold_value, 255, cv2.THRESH_BINARY)[1]

# Threshold the green channel of both splits with the default cut-off.
train_green_thresholded = list(map(apply_global_thresholding, train_green_images))
test_green_thresholded = list(map(apply_global_thresholding, test_green_images))

print(f"Applied global thresholding to {len(train_green_thresholded)} training green images.")
print(f"Applied global thresholding to {len(test_green_thresholded)} testing green images.")

## Foci detection and nucleus labeling

### Subtask:
Implement a method to detect green foci within the segmented nuclei and label each nucleus as "damaged" or "normal" based on the presence of foci.


**Reasoning**:
Implement the `detect_foci_and_label_nucleus` function and apply it to the training and testing data to detect foci and label nuclei as damaged or normal.



In [None]:
def detect_foci_and_label_nucleus(paired_image, segmented_mask, thresholded_green_image, foci_threshold=10, target_size=(64, 64)):
    """
    Detects green foci within segmented nuclei and labels each nucleus
    as "damaged" or "normal".

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).
            Only the red image is used here; the green signal comes from
            ``thresholded_green_image``.
        segmented_mask: The binary segmented mask for the nuclei.
        thresholded_green_image: The thresholded green channel image.
        foci_threshold: A nucleus is labeled "damaged" when the number of
            non-zero thresholded green pixels inside it is strictly greater
            than this value. Note that pixels are counted, not individual foci.
        target_size: The size the normalized red nucleus image is resized to.

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
    """
    red_img = paired_image[0]
    labeled_nuclei = []

    # Find contours in the segmented mask to identify individual nuclei
    contours, _ = cv2.findContours(segmented_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for contour in contours:
        # Create a filled mask specifically for the current nucleus
        nucleus_mask = np.zeros_like(segmented_mask)
        cv2.drawContours(nucleus_mask, [contour], -1, 255, -1)

        # Keep only the thresholded green pixels that fall inside this nucleus
        masked_green = cv2.bitwise_and(thresholded_green_image, thresholded_green_image, mask=nucleus_mask)

        # Count the number of non-zero pixels (representing potential foci)
        foci_count = np.count_nonzero(masked_green)

        # Determine the label based on foci pixel count
        label = "damaged" if foci_count > foci_threshold else "normal"

        # Find bounding box and extract the red-channel ROI
        x, y, w, h = cv2.boundingRect(contour)
        red_roi = red_img[y:y+h, x:x+w]

        # Only the radius of the minimum enclosing circle is needed.
        _, radius = cv2.minEnclosingCircle(contour)

        # Side of the square canvas. The bounding box counts pixels
        # inclusively, so w or h can exceed ceil(2 * radius); never let the
        # canvas be smaller than the crop, otherwise the padding below would
        # be negative and np.pad would raise.
        square_size = max(int(np.ceil(2 * radius)), w, h)

        # Split the padding as evenly as possible around the crop.
        pad_x_before = (square_size - w) // 2
        pad_x_after = square_size - w - pad_x_before
        pad_y_before = (square_size - h) // 2
        pad_y_after = square_size - h - pad_y_before

        padded_red_roi = np.pad(red_roi, ((pad_y_before, pad_y_after), (pad_x_before, pad_x_after)), mode='constant')

        # Resize to target size
        normalized_red_nucleus = cv2.resize(padded_red_roi, target_size, interpolation=cv2.INTER_AREA)

        # Store the normalized red nucleus image and its label
        labeled_nuclei.append((normalized_red_nucleus, label))

    return labeled_nuclei

# Label the nuclei of every image, combining pair, mask and thresholded green
# image by shared index.
train_labeled_nuclei = [
    labeled_nucleus
    for i in range(len(train_paired_images))
    for labeled_nucleus in detect_foci_and_label_nucleus(
        train_paired_images[i],
        train_segmented_masks[i],
        train_green_thresholded[i]
    )
]

test_labeled_nuclei = [
    labeled_nucleus
    for i in range(len(test_paired_images))
    for labeled_nucleus in detect_foci_and_label_nucleus(
        test_paired_images[i],
        test_segmented_masks[i],
        test_green_thresholded[i]
    )
]

# Tally the two classes per split.
train_damaged_count = sum(label == "damaged" for _, label in train_labeled_nuclei)
train_normal_count = sum(label == "normal" for _, label in train_labeled_nuclei)
test_damaged_count = sum(label == "damaged" for _, label in test_labeled_nuclei)
test_normal_count = sum(label == "normal" for _, label in test_labeled_nuclei)

print(f"Training set: {train_damaged_count} damaged, {train_normal_count} normal.")
print(f"Testing set: {test_damaged_count} damaged, {test_normal_count} normal.")

## Integrate processing steps

### Subtask:
Combine all the processing steps into a cohesive pipeline.


**Reasoning**:
Define a function `process_image_pair` that integrates the previously defined functions for segmentation, thresholding, foci detection, and labeling, and then iterate through the paired training and testing images to process them using this function.



In [None]:
def process_image_pair(paired_image):
    """
    Runs one (red, green) image pair through the full labeling pipeline.

    The red channel is segmented into nuclei, the green channel is globally
    thresholded, and both results are handed to
    ``detect_foci_and_label_nucleus``.

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
    """
    red_channel, green_channel = paired_image

    # Step 1: nuclei mask from the red channel.
    nuclei_mask = segment_nuclei(red_channel)

    # Step 2: thresholded green channel (default threshold).
    green_binary = apply_global_thresholding(green_channel)

    # Step 3: per-nucleus foci counting and labeling.
    return detect_foci_and_label_nucleus(paired_image, nuclei_mask, green_binary)

# Run the integrated pipeline over every training pair and flatten the results.
all_train_labeled_nuclei = [
    labeled_nucleus
    for paired_img in train_paired_images
    for labeled_nucleus in process_image_pair(paired_img)
]

# Same for the testing pairs.
all_test_labeled_nuclei = [
    labeled_nucleus
    for paired_img in test_paired_images
    for labeled_nucleus in process_image_pair(paired_img)
]

# Report the totals per split.
print(f"Total labeled nuclei in training set: {len(all_train_labeled_nuclei)}")
print(f"Total labeled nuclei in testing set: {len(all_test_labeled_nuclei)}")

## Prepare data for model training

### Subtask:
Format the processed data (normalized red channel nuclei and their labels) for input into the model.


**Reasoning**:
Separate the normalized red nucleus images and their labels into separate lists for training and testing, then convert these lists to NumPy arrays and convert labels to numerical format, finally print the shapes.



In [None]:
# String label -> integer class id.
label_mapping = {"normal": 0, "damaged": 1}

# Images: stack the 64x64 nuclei and append a trailing channel axis.
train_images_for_model = np.array(
    [nucleus_img for nucleus_img, _ in all_train_labeled_nuclei]
).reshape(-1, 64, 64, 1)
test_images_for_model = np.array(
    [nucleus_img for nucleus_img, _ in all_test_labeled_nuclei]
).reshape(-1, 64, 64, 1)

# Labels: translate each string label to its numeric id.
train_labels_for_model = np.array(
    [label_mapping[label_name] for _, label_name in all_train_labeled_nuclei]
)
test_labels_for_model = np.array(
    [label_mapping[label_name] for _, label_name in all_test_labeled_nuclei]
)

# Print shapes
print(f"Shape of train_images_for_model: {train_images_for_model.shape}")
print(f"Shape of train_labels_for_model: {train_labels_for_model.shape}")
print(f"Shape of test_images_for_model: {test_images_for_model.shape}")
print(f"Shape of test_labels_for_model: {test_labels_for_model.shape}")

## Summary:

### Data Analysis Key Findings

*   Nucleus segmentation was successfully applied to the red channel images for both training (5 images) and testing (91 images) datasets, generating corresponding segmented masks.
*   Normalization steps, including cropping, padding, and resizing to 64x64 pixels, were applied to the detected nuclei from both red and green channels. This resulted in 30 normalized nucleus pairs in the training set and 914 in the testing set.
*   Global thresholding was applied to the green channel images, producing thresholded versions for both training (5 images) and testing (91 images) sets.
*   Green foci were detected within the segmented nuclei, and each nucleus was labeled as "damaged" or "normal" based on the foci count. This resulted in 6 damaged and 24 normal nuclei in the training set, and 94 damaged and 820 normal nuclei in the testing set.
*   The entire processing pipeline, integrating segmentation, thresholding, foci detection, and labeling, was successfully implemented and applied to both datasets.
*   The processed data was formatted for model training by separating normalized red channel nucleus images and their labels, converting them into NumPy arrays with a shape of (number\_of\_samples, 64, 64, 1), and mapping string labels ("normal", "damaged") to numerical values (0, 1).

### Insights or Next Steps

*   The significant imbalance between "normal" and "damaged" nuclei, especially in the testing set (820 normal vs. 94 damaged), should be considered when selecting and training the classification model. Data augmentation or choosing a model robust to class imbalance may be necessary.
*   Evaluate the sensitivity of the foci detection and nucleus labeling to the chosen `foci_threshold`. Further tuning of this parameter might improve the accuracy of the "damaged" versus "normal" classification.


## Feature Extraction (Shape Features)

### Subtask:
Extract shape features from the segmented and normalized red channel nucleus images.

**Reasoning**:
Implement a function to calculate the specified shape features (Major/Minor Axis Length, Axis Ratio, Eccentricity, Solidity, Roundness/Form Factor, Hu Moments, and HOG) for each normalized nucleus image and apply it to both training and testing datasets.

In [None]:
import mahotas
from skimage.feature import hog

def extract_shape_features(image):
    """Extracts shape features from a binary nucleus mask.

    The returned list is laid out as::

        [major_axis_length, minor_axis_length, axis_ratio, eccentricity,
         solidity, roundness] + 7 Hu moments + HOG descriptor

    When no contour with non-zero area is found, the 13 geometric/Hu entries
    are zero-filled but the HOG part is still included, so every call on
    same-sized images returns a vector of the same length and the results can
    be stacked into a rectangular array.

    Args:
        image: Single-channel image; non-zero pixels are treated as foreground
            by the contour search. Images whose maximum is <= 1 are scaled by 255.

    Returns:
        A flat list of numeric features.
    """
    # Ensure the image is binary (0 or 255)
    if image.max() <= 1:
        image = image * 255

    # HOG is computed on the image itself (not the contour) and first, so the
    # degenerate-contour fallbacks below can share its length.
    # The parameters (pixels_per_cell, cells_per_block, orientations) can be tuned.
    try:
        hog_features = list(hog(image, pixels_per_cell=(16, 16), cells_per_block=(1, 1), orientations=9, feature_vector=True))
    except ValueError:
        # NOTE(review): this placeholder length only matches a real HOG vector
        # for a single 16x16 cell; confirm if this branch is ever hit.
        hog_features = [0] * 9

    # 6 geometric features + 7 Hu moments
    empty_shape_part = [0] * 13

    # Find contours
    contours, _ = cv2.findContours(image.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if not contours:
        return empty_shape_part + hog_features

    # Assuming the largest contour is the nucleus
    contour = max(contours, key=cv2.contourArea)

    # Basic Shape Features
    area = cv2.contourArea(contour)
    perimeter = cv2.arcLength(contour, True)

    if area == 0:
        return empty_shape_part + hog_features

    # Fit ellipse (cv2.fitEllipse needs at least 5 contour points)
    if len(contour) >= 5:
        _, axes, _ = cv2.fitEllipse(contour)
        major_axis_length = max(axes)
        minor_axis_length = min(axes)
        axis_ratio = major_axis_length / minor_axis_length if minor_axis_length > 0 else 0
        eccentricity = np.sqrt(1 - (minor_axis_length / major_axis_length)**2) if major_axis_length > 0 else 0
    else:
        major_axis_length, minor_axis_length, axis_ratio, eccentricity = 0, 0, 0, 0

    # Solidity: contour area relative to its convex hull
    hull_area = cv2.contourArea(cv2.convexHull(contour))
    solidity = area / hull_area if hull_area > 0 else 0

    # Roundness/Form Factor
    roundness = 4 * np.pi * area / (perimeter**2) if perimeter > 0 else 0

    # Hu Moments
    hu_moments = cv2.HuMoments(cv2.moments(contour)).flatten()

    # Combine features
    features = [major_axis_length, minor_axis_length, axis_ratio, eccentricity, solidity, roundness] + list(hu_moments) + hog_features

    return features

# Compute the shape feature vector of every labeled nucleus and stack the
# vectors into one array per split.
train_shape_features = np.array(
    [extract_shape_features(sample[0]) for sample in all_train_labeled_nuclei]
)
test_shape_features = np.array(
    [extract_shape_features(sample[0]) for sample in all_test_labeled_nuclei]
)

print(f"Shape of train_shape_features: {train_shape_features.shape}")
print(f"Shape of test_shape_features: {test_shape_features.shape}")

In [None]:
!pip install mahotas

In [None]:
import mahotas
from skimage.feature import hog
import cv2
import numpy as np
from skimage import measure # Import measure for regionprops

def extract_shape_features(image):
    """Extracts shape features from a binary nucleus mask using skimage.measure.regionprops.

    The returned list is laid out as::

        [area, perimeter, major_axis_length, minor_axis_length, axis_ratio,
         eccentricity, solidity, roundness] + 7 Hu moments + HOG descriptor

    When no foreground is found, the 15 geometric/Hu entries are zero-filled
    but the HOG part is still included, so every call on same-sized images
    returns a vector of the same length.

    Args:
        image: Single-channel image; non-zero pixels are treated as foreground.
            Images whose maximum is <= 1 are scaled by 255.

    Returns:
        A flat list of numeric features.
    """
    # Ensure the image is binary (0 or 255) and has the correct dtype
    if image.max() <= 1:
        image = image * 255
    image = image.astype(np.uint8)

    # HOG is computed on the image itself and first, so the fallbacks below
    # can share its length.
    # The parameters (pixels_per_cell, cells_per_block, orientations) can be tuned.
    try:
        hog_features = list(hog(image, pixels_per_cell=(16, 16), cells_per_block=(1, 1), orientations=9, feature_vector=True))
    except ValueError:
        # NOTE(review): this placeholder length only matches a real HOG vector
        # for a single 16x16 cell; confirm if this branch is ever hit.
        hog_features = [0] * 9

    # 8 geometric features + 7 Hu moments
    empty_shape_part = [0] * 15

    # Find contours (needed for the Hu moments)
    contours, _ = cv2.findContours(image.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if not contours:
        return empty_shape_part + hog_features

    # Assuming the largest contour is the nucleus
    contour = max(contours, key=cv2.contourArea)

    # measure.label groups neighbouring pixels that share the SAME value, so
    # label the foreground mask (non-zero pixels) rather than raw intensities;
    # otherwise a non-binary input is split into many equal-intensity patches.
    labeled_image, num_labels = measure.label(image > 0, connectivity=2, return_num=True)

    if num_labels < 1:
        return empty_shape_part + hog_features

    # Use the largest region (assumed to be the nucleus), not merely the first.
    properties = max(measure.regionprops(labeled_image), key=lambda region: region.area)

    area = properties.area
    perimeter = properties.perimeter
    major_axis_length = properties.major_axis_length
    minor_axis_length = properties.minor_axis_length
    axis_ratio = major_axis_length / minor_axis_length if minor_axis_length > 0 else 0
    eccentricity = properties.eccentricity
    solidity = properties.solidity

    # Roundness/Form Factor (calculated using area and perimeter from regionprops)
    roundness = 4 * np.pi * area / (perimeter**2) if perimeter > 0 else 0

    # Hu Moments (using cv2 on the contour)
    hu_moments = cv2.HuMoments(cv2.moments(contour)).flatten()

    # Combine features
    features = [area, perimeter, major_axis_length, minor_axis_length, axis_ratio, eccentricity, solidity, roundness] + list(hu_moments) + hog_features

    return features

# Shape features for each labeled nucleus, one feature matrix per split.
train_shape_features = np.array(
    list(map(extract_shape_features, (nucleus_img for nucleus_img, _ in all_train_labeled_nuclei)))
)
test_shape_features = np.array(
    list(map(extract_shape_features, (nucleus_img for nucleus_img, _ in all_test_labeled_nuclei)))
)

print(f"Shape of train_shape_features: {train_shape_features.shape}")
print(f"Shape of test_shape_features: {test_shape_features.shape}")

## Feature Extraction (Texture Features)

### Subtask:
Extract texture features from the segmented and normalized red channel nucleus images.

**Reasoning**:
Implement a function to calculate texture features (e.g., Haralick texture features using `mahotas`) for each normalized nucleus image and apply it to both training and testing datasets.

In [None]:
import mahotas as mh
from skimage import feature, measure
import cv2
import numpy as np
from skimage.filters import gaussian#, laplacian # Comment out laplacian import
from skimage.feature import SIFT
import scipy.ndimage # Import scipy for laplacian

def extract_texture_features(image):
    """Extract a fixed-length texture descriptor from one grayscale nucleus image.

    The returned list is the concatenation, in this order, of:

    * 13 Haralick features (mahotas), averaged over the GLCM directions
    * 27 histogram bins of the uniform LBP codes (P=8, R=1)
    * 48 GLCM properties (scikit-image): contrast, correlation, energy and
      homogeneity, each for 3 distances x 4 angles
    * 2 Laplacian-of-Gaussian statistics (mean, std)
    * 16 Gabor statistics: (mean, std) for 4 orientations x 2 wavelengths
    * 25 Zernike moments (degree 8) of the Otsu-binarised image
    * 2 SIFT descriptor statistics (mean, std)

    Every group is best-effort: if it cannot be computed (or comes back with an
    unexpected size) it is replaced by zeros, so the vector length is the same
    for every image and columns stay aligned across samples.

    Args:
        image: 2-D array exposing ``astype`` and ``shape``. It is cast to
            ``uint8`` before any feature is computed.

    Returns:
        list[float]: 133 values; all zeros when either side of the image is
        shorter than 21 pixels (the Gabor kernel size).
    """
    # Group sizes; their sum is the length of every vector this function returns.
    n_haralick = 13
    n_lbp = 27
    glcm_props = ['contrast', 'correlation', 'energy', 'homogeneity']
    glcm_distances = [1, 3, 5]
    glcm_angles = [0, np.pi / 4, np.pi / 2, 3 * np.pi / 4]
    n_glcm = len(glcm_props) * len(glcm_distances) * len(glcm_angles)
    n_log = 2
    gabor_thetas = [k * np.pi / 4 for k in range(4)]  # 4 orientations
    gabor_wavelengths = [5, 10]  # passed as the `lambd` argument of getGaborKernel
    n_gabor = 2 * len(gabor_thetas) * len(gabor_wavelengths)
    n_zernike = 25
    n_sift = 2
    n_total = n_haralick + n_lbp + n_glcm + n_log + n_gabor + n_zernike + n_sift

    def _fixed(values, size):
        """Flatten ``values`` to ``size`` plain Python floats, or zeros if the size is off."""
        flat = [float(v) for v in np.asarray(values, dtype=float).ravel()]
        return flat if len(flat) == size else [0.0] * size

    # Several of the libraries below expect 8-bit input.
    image = image.astype(np.uint8)

    # Too small for the 21x21 Gabor kernel: return an all-zero vector of the full length
    # (including the GLCM group) so it lines up with vectors from larger images.
    if image.shape[0] < 21 or image.shape[1] < 21:
        return [0.0] * n_total

    # Haralick texture features (mahotas). Constant images are skipped because the
    # co-occurrence statistics are degenerate for them.
    haralick_features = [0.0] * n_haralick
    if not np.all(image == image[0, 0]):
        try:
            # mahotas returns one row of 13 features per direction; average the rows so
            # the group is a flat list of 13 numbers rather than nested lists.
            haralick_features = _fixed(mh.features.haralick(image).mean(axis=0), n_haralick)
        except ValueError:
            pass

    # Local Binary Patterns (scikit-image).
    try:
        lbp = feature.local_binary_pattern(image, P=8, R=1, method='uniform')
        # Uniform LBP with P=8 only produces codes 0..9; the 27-bin layout is kept so the
        # histogram width is unchanged, which leaves the upper bins at zero.
        lbp_hist, _ = np.histogram(lbp, bins=np.arange(0, n_lbp + 1), density=True)
        lbp_features = _fixed(lbp_hist, n_lbp)
    except Exception:
        lbp_features = [0.0] * n_lbp

    # GLCM properties (scikit-image). graycomatrix/graycoprops live in skimage.feature,
    # and graycoprops takes a single property name per call.
    try:
        glcm = feature.graycomatrix(
            image,
            distances=glcm_distances,
            angles=glcm_angles,
            symmetric=True,
            normed=True,
        )
        # Each call yields a (distances, angles) array; stack them property-major.
        glcm_values = [feature.graycoprops(glcm, prop) for prop in glcm_props]
        haralick_features_skimage = _fixed(glcm_values, n_glcm)
    except Exception:
        haralick_features_skimage = [0.0] * n_glcm

    # Laplacian of Gaussian (OpenCV): smooth with sigma=1, then summarise the Laplacian response.
    try:
        blurred_image = cv2.GaussianBlur(image, (0, 0), 1)
        log_image = cv2.Laplacian(blurred_image, cv2.CV_64F)
        log_features = _fixed([np.mean(log_image), np.std(log_image)], n_log)
    except Exception:
        log_features = [0.0] * n_log

    # Gabor filter bank (OpenCV): mean and std of each filter response.
    try:
        gabor_features = []
        for theta in gabor_thetas:
            for wavelength in gabor_wavelengths:
                kernel = cv2.getGaborKernel((21, 21), 5.0, theta, wavelength, 0.5, 0, ktype=cv2.CV_32F)
                # Filter into float32 so negative responses are kept instead of being
                # clipped to the 0..255 range of an 8-bit destination.
                response = cv2.filter2D(image, cv2.CV_32F, kernel)
                gabor_features.extend([np.mean(response), np.std(response)])
        gabor_features = _fixed(gabor_features, n_gabor)
    except Exception:
        gabor_features = [0.0] * n_gabor

    # Zernike moments (mahotas) of the Otsu-thresholded image.
    try:
        _, binary_nucleus = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        zernike_moments = _fixed(
            mh.features.zernike_moments(binary_nucleus, radius=image.shape[0] // 2, degree=8),
            n_zernike,
        )
    except Exception:
        zernike_moments = [0.0] * n_zernike

    # SIFT (scikit-image): summarise all descriptors by their overall mean and std.
    try:
        detector_extractor = SIFT()
        detector_extractor.detect_and_extract(image)
        descriptors = detector_extractor.descriptors
        if descriptors is not None:
            sift_features = _fixed([np.mean(descriptors), np.std(descriptors)], n_sift)
        else:
            sift_features = [0.0] * n_sift
    except Exception:
        sift_features = [0.0] * n_sift

    # Concatenate the groups in the documented order.
    return (
        haralick_features
        + lbp_features
        + haralick_features_skimage
        + log_features
        + gabor_features
        + zernike_moments
        + sift_features
    )


# Run the texture extractor over every nucleus image of both splits
# (each list entry is an (image, label) pair; the label is not used here).
train_texture_features = [
    extract_texture_features(img) for img, _ in all_train_labeled_nuclei
]
test_texture_features = [
    extract_texture_features(img) for img, _ in all_test_labeled_nuclei
]

# Convert to numpy arrays
# Ensure all feature lists have the same length before converting to numpy array
def pad_features(features_list):
    """Stack variable-length feature lists into one 2-D float array.

    Shorter rows are right-padded with zeros up to the longest row. Numeric
    entries (Python ints/floats and NumPy integer/floating/bool scalars) are
    converted to ``float``; any other entry (e.g. a nested list or a string)
    is stored as ``0.0``.

    Args:
        features_list: sequence of per-sample feature sequences.

    Returns:
        np.ndarray of shape ``(len(features_list), max_row_length)`` and dtype
        float; an empty ``(0, 0)`` array when ``features_list`` is empty.
    """
    # max() over an empty sequence would raise, so handle "no samples" explicitly.
    if len(features_list) == 0:
        return np.empty((0, 0), dtype=float)

    numeric_types = (int, float, np.integer, np.floating, np.bool_)
    max_len = max(len(f) for f in features_list)

    # Start from zeros so both padding and non-numeric entries default to 0.0.
    padded_features = np.zeros((len(features_list), max_len), dtype=float)
    for row, feats in enumerate(features_list):
        for col, value in enumerate(feats):
            # NumPy scalars such as np.float32 are not instances of Python float,
            # so they must be listed explicitly or they would be zeroed.
            if isinstance(value, numeric_types):
                padded_features[row, col] = float(value)
    return padded_features

# Turn both per-sample feature lists into rectangular float arrays
train_texture_features, test_texture_features = [
    pad_features(split_features)
    for split_features in (train_texture_features, test_texture_features)
]

# Report the dimensions of the padded arrays
print(f"Shape of train_texture_features: {train_texture_features.shape}")
print(f"Shape of test_texture_features: {test_texture_features.shape}")

## Combine Features

### Subtask:
Combine the extracted shape and texture features into a single feature vector for each nucleus.

**Reasoning**:
Concatenate the shape and texture feature arrays for both the training and testing datasets to create the final feature vectors for model input.

In [None]:
# --- Training split ---
# The two matrices are joined column-wise, so they must describe the same number of samples.
if train_shape_features.shape[0] != train_texture_features.shape[0]:
    print("Mismatch in the number of training samples for shape and texture features.")
    train_combined_features = None  # signals the mismatch to later cells
else:
    # Collapse any trailing dimensions so every sample is a single row of texture values.
    train_texture_features_flat = train_texture_features.reshape(len(train_texture_features), -1)
    train_combined_features = np.concatenate(
        [train_shape_features, train_texture_features_flat], axis=1
    )
    print(f"Shape of train_combined_features: {train_combined_features.shape}")


# --- Testing split ---
# Same procedure as above, applied to the test matrices.
if test_shape_features.shape[0] != test_texture_features.shape[0]:
    print("Mismatch in the number of testing samples for shape and texture features.")
    test_combined_features = None  # signals the mismatch to later cells
else:
    test_texture_features_flat = test_texture_features.reshape(len(test_texture_features), -1)
    test_combined_features = np.concatenate(
        [test_shape_features, test_texture_features_flat], axis=1
    )
    print(f"Shape of test_combined_features: {test_combined_features.shape}")

# Task
Summarize the work done so far in the nucleus segmentation and classification project, covering data loading, preprocessing, segmentation, normalization, green channel labeling, and feature extraction.

## Train random forest classifier

### Subtask:
Train a Random Forest classifier on the extracted shape and texture features, addressing class imbalance and experimenting with hyperparameters.


**Reasoning**:
Import necessary libraries for Random Forest and SMOTE, then apply SMOTE to the training data and train the Random Forest classifier.



In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

# Oversample the training set with SMOTE so the classes are balanced before fitting
smote = SMOTE(random_state=42)
train_features_smote, train_labels_smote = smote.fit_resample(
    train_combined_features,
    train_labels_for_model,
)

# Build a 100-tree Random Forest and fit it on the resampled data (fit returns the estimator)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    train_features_smote,
    train_labels_smote,
)

print("Random Forest classifier trained successfully with SMOTE.")

## Evaluate random forest classifier

### Subtask:
Evaluate the performance of the trained Random Forest model and analyze feature importance.


**Reasoning**:
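Predict on the test set with the trained Random Forest, print the classification report, accuracy, and confusion matrix, and then rank the input features by the model's `feature_importances_`.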



In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

# Make predictions on the test set
predictions = rf_classifier.predict(test_combined_features)

# Generate and print classification report
print("Classification Report:")
print(classification_report(test_labels_for_model, predictions))

# Compute and print accuracy score
accuracy = accuracy_score(test_labels_for_model, predictions)
print(f"Accuracy Score: {accuracy}")

# Generate and print confusion matrix
conf_matrix = confusion_matrix(test_labels_for_model, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Extract feature importances (one value per column the model was trained on)
feature_importances = rf_classifier.feature_importances_

# Feature names follow the column order used when the matrices were combined:
# shape features first, then the flattened texture features.
num_shape_features = train_shape_features.shape[1]
# Width of the texture block after flattening: the product of every dimension
# except the sample axis (equals shape[1] when the array is already 2-D).
num_texture_features = int(np.prod(train_texture_features.shape[1:]))

# Generic positional names; they only identify the column index within each group.
shape_feature_names = [f"shape_{i}" for i in range(num_shape_features)]
texture_feature_names = [f"texture_{i}" for i in range(num_texture_features)]
all_feature_names = shape_feature_names + texture_feature_names

# pd.Series raises if index and data lengths differ (e.g. when the classifier was
# trained on an older feature matrix), so check before building it.
if len(feature_importances) == len(all_feature_names):
    feature_importance_series = pd.Series(feature_importances, index=all_feature_names)

    # Sort feature importances in descending order
    sorted_feature_importances = feature_importance_series.sort_values(ascending=False)

    # Print the top 20 most important features
    print("\nTop 20 Most Important Features:")
    print(sorted_feature_importances.head(20))
else:
    print(f"Mismatch in feature counts: Model has {len(feature_importances)}, generated names have {len(all_feature_names)}")

**Reasoning**:
The traceback indicates a mismatch between the number of feature importances from the model (209) and the number of feature names generated (233). This is likely due to an incorrect assumption about the size of the flattened texture features. I need to correctly calculate the number of texture features after flattening to create the correct list of feature names.



In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np # Ensure numpy is imported

# Make predictions on the test set
predictions = rf_classifier.predict(test_combined_features)

# Generate and print classification report
print("Classification Report:")
print(classification_report(test_labels_for_model, predictions))

# Compute and print accuracy score
accuracy = accuracy_score(test_labels_for_model, predictions)
print(f"Accuracy Score: {accuracy}")

# Generate and print confusion matrix
conf_matrix = confusion_matrix(test_labels_for_model, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Extract feature importances (one value per column the model was trained on)
feature_importances = rf_classifier.feature_importances_

# Feature names follow the column order used when the matrices were combined:
# shape features first, then the flattened texture features.
num_shape_features = train_shape_features.shape[1]

# Prefer the width of the flattened texture matrix built in the combine step.
try:
    num_texture_features_flat = train_texture_features_flat.shape[1]
except NameError:
    # The flattened array does not exist in this kernel: derive the same width from the
    # texture array itself (product of all non-sample dimensions) instead of relying on
    # a number copied from an earlier run's output.
    num_texture_features_flat = int(np.prod(train_texture_features.shape[1:]))

# Generic positional names; they only identify the column index within each group.
shape_feature_names = [f"shape_{i}" for i in range(num_shape_features)]
texture_feature_names = [f"texture_{i}" for i in range(num_texture_features_flat)]
all_feature_names = shape_feature_names + texture_feature_names

# pd.Series raises if index and data lengths differ, so check before building it.
if len(feature_importances) == len(all_feature_names):
    # Create a pandas Series for feature importances
    feature_importance_series = pd.Series(feature_importances, index=all_feature_names)

    # Sort feature importances in descending order
    sorted_feature_importances = feature_importance_series.sort_values(ascending=False)

    # Print the top 20 most important features
    print("\nTop 20 Most Important Features:")
    print(sorted_feature_importances.head(20))
else:
    print(f"Mismatch in feature counts: Model has {len(feature_importances)}, generated names have {len(all_feature_names)}")



## Train convolutional neural network (cnn)

### Subtask:
Build and train a CNN directly on the normalized nucleus images, incorporating data augmentation, regularization, and potentially transfer learning.


**Reasoning**:
Implement the CNN model architecture, compile it, set up data augmentation, and train the model.



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Build the network layer by layer: three conv/max-pool stages on a 64x64
# single-channel input, then a dense head with dropout.
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit: binary output

# Binary cross-entropy with Adam; accuracy is tracked during training
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Random geometric augmentations applied to the training images
datagen = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=20,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
)

# Let the generator compute any data-dependent statistics from the training images
datagen.fit(train_images_for_model)

# Train for 50 epochs on augmented batches of 32; the test split is passed as validation data
history = model.fit(
    x=datagen.flow(train_images_for_model, train_labels_for_model, batch_size=32),
    epochs=50,
    validation_data=(test_images_for_model, test_labels_for_model),
)

print("CNN model training completed.")

**Reasoning**:
The CNN model has been built and trained. To evaluate its performance, calculate and print the classification report and confusion matrix on the test set.



In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Sigmoid outputs for the test images, thresholded at 0.5 to obtain 0/1 labels
predictions_prob = model.predict(test_images_for_model)
predictions = np.greater(predictions_prob, 0.5).astype("int32")

# Per-class precision / recall / F1
print("CNN Classification Report:")
print(classification_report(test_labels_for_model, predictions))

# Confusion matrix of true labels versus thresholded predictions
conf_matrix_cnn = confusion_matrix(test_labels_for_model, predictions)
print("CNN Confusion Matrix:")
print(conf_matrix_cnn)

## Evaluate convolutional neural network (cnn)

### Subtask:
Evaluate the performance of the trained CNN.


**Reasoning**:
Utilize the trained CNN model to generate predictions on the test set, convert probabilities to class labels, and then generate and print the classification report and confusion matrix to evaluate the model's performance as requested by the subtask.



In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Sigmoid outputs for the test images, thresholded at 0.5 to obtain 0/1 labels
predictions_prob = model.predict(test_images_for_model)
predictions = np.greater(predictions_prob, 0.5).astype("int32")

# Per-class precision / recall / F1
print("CNN Classification Report:")
print(classification_report(test_labels_for_model, predictions))

# Confusion matrix of true labels versus thresholded predictions
conf_matrix_cnn = confusion_matrix(test_labels_for_model, predictions)
print("CNN Confusion Matrix:")
print(conf_matrix_cnn)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

# Oversample the training set with SMOTE so the classes are balanced before fitting
smote = SMOTE(random_state=42)
train_features_smote, train_labels_smote = smote.fit_resample(
    train_combined_features,
    train_labels_for_model,
)

# Build a 100-tree Random Forest and fit it on the resampled data (fit returns the estimator)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    train_features_smote,
    train_labels_smote,
)

print("Random Forest classifier trained successfully with SMOTE.")

## Evaluate random forest classifier

### Subtask:
Evaluate the performance of the trained Random Forest model and analyze feature importance.

**Reasoning**:
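Predict on the test set with the trained Random Forest, print the classification report, accuracy, and confusion matrix, and then rank the input features by the model's `feature_importances_`.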

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np # Ensure numpy is imported

# Make predictions on the test set
predictions = rf_classifier.predict(test_combined_features)

# Generate and print classification report
print("Classification Report:")
print(classification_report(test_labels_for_model, predictions))

# Compute and print accuracy score
accuracy = accuracy_score(test_labels_for_model, predictions)
print(f"Accuracy Score: {accuracy}")

# Generate and print confusion matrix
conf_matrix = confusion_matrix(test_labels_for_model, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Extract feature importances (one value per column the model was trained on)
feature_importances = rf_classifier.feature_importances_

# Feature names follow the column order used when the matrices were combined:
# shape features first, then the flattened texture features.
num_shape_features = train_shape_features.shape[1]

# Prefer the width of the flattened texture matrix built in the combine step.
try:
    num_texture_features_flat = train_texture_features_flat.shape[1]
except NameError:
    # The flattened array does not exist in this kernel: derive the same width from the
    # texture array itself (product of all non-sample dimensions) instead of relying on
    # a number copied from an earlier run's output.
    num_texture_features_flat = int(np.prod(train_texture_features.shape[1:]))

# Generic positional names; they only identify the column index within each group.
shape_feature_names = [f"shape_{i}" for i in range(num_shape_features)]
texture_feature_names = [f"texture_{i}" for i in range(num_texture_features_flat)]
all_feature_names = shape_feature_names + texture_feature_names

# pd.Series raises if index and data lengths differ, so check before building it.
if len(feature_importances) == len(all_feature_names):
    # Create a pandas Series for feature importances
    feature_importance_series = pd.Series(feature_importances, index=all_feature_names)

    # Sort feature importances in descending order
    sorted_feature_importances = feature_importance_series.sort_values(ascending=False)

    # Print the top 20 most important features
    print("\nTop 20 Most Important Features:")
    print(sorted_feature_importances.head(20))
else:
    print(f"Mismatch in feature counts: Model has {len(feature_importances)}, generated names have {len(all_feature_names)}")

## Train convolutional neural network (cnn)

### Subtask:
Build and train a CNN directly on the normalized nucleus images, incorporating data augmentation, regularization, and potentially transfer learning.

**Reasoning**:
Implement the CNN model architecture, compile it, set up data augmentation, and train the model.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Build the network layer by layer: three conv/max-pool stages on a 64x64
# single-channel input, then a dense head with dropout.
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit: binary output

# Binary cross-entropy with Adam; accuracy is tracked during training
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Random geometric augmentations applied to the training images
datagen = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=20,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
)

# Let the generator compute any data-dependent statistics from the training images
datagen.fit(train_images_for_model)

# Train for 50 epochs on augmented batches of 32; the test split is passed as validation data
history = model.fit(
    x=datagen.flow(train_images_for_model, train_labels_for_model, batch_size=32),
    epochs=50,
    validation_data=(test_images_for_model, test_labels_for_model),
)

print("CNN model training completed.")

## Evaluate convolutional neural network (cnn)

### Subtask:
Evaluate the performance of the trained CNN.

**Reasoning**:
Utilize the trained CNN model to generate predictions on the test set, convert probabilities to class labels, and then generate and print the classification report and confusion matrix to evaluate the model's performance as requested by the subtask.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Sigmoid outputs for the test images, thresholded at 0.5 to obtain 0/1 labels
predictions_prob = model.predict(test_images_for_model)
predictions = np.greater(predictions_prob, 0.5).astype("int32")

# Per-class precision / recall / F1
print("CNN Classification Report:")
print(classification_report(test_labels_for_model, predictions))

# Confusion matrix of true labels versus thresholded predictions
conf_matrix_cnn = confusion_matrix(test_labels_for_model, predictions)
print("CNN Confusion Matrix:")
print(conf_matrix_cnn)

## Build and Train Ensemble Model

### Subtask:
Combine the CNN probabilities with the handcrafted features and train a second-level Random Forest model, including feature selection and optimization.

**Reasoning**:
Combine the predictions (probabilities) from the trained CNN with the extracted shape and texture features, and then train a second-level Random Forest classifier on this combined feature set.

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

# Get predictions (probabilities) from the trained CNN on the training and testing data,
# reshaped to column vectors so they can be appended as one extra feature.
# NOTE(review): the training-set probabilities come from a CNN that was fitted on those
# same samples, so they are likely over-confident compared with the test-set ones;
# out-of-fold CNN predictions would be the usual remedy — confirm whether that matters here.
train_cnn_predictions_prob = model.predict(train_images_for_model).reshape(-1, 1)
test_cnn_predictions_prob = model.predict(test_images_for_model).reshape(-1, 1)


# Combine CNN probabilities with handcrafted features for training.
# The combine step sets the feature matrix to None on a sample-count mismatch, so guard for that too.
if train_combined_features is not None and train_combined_features.shape[0] == train_cnn_predictions_prob.shape[0]:
    train_ensemble_features = np.concatenate((train_combined_features, train_cnn_predictions_prob), axis=1)
    print(f"Shape of train_ensemble_features: {train_ensemble_features.shape}")
else:
    print("Mismatch in the number of training samples for combined features and CNN predictions.")
    train_ensemble_features = None


# Combine CNN probabilities with handcrafted features for testing (same guards as above)
if test_combined_features is not None and test_combined_features.shape[0] == test_cnn_predictions_prob.shape[0]:
    test_ensemble_features = np.concatenate((test_combined_features, test_cnn_predictions_prob), axis=1)
    print(f"Shape of test_ensemble_features: {test_ensemble_features.shape}")
else:
    print("Mismatch in the number of testing samples for combined features and CNN predictions.")
    test_ensemble_features = None


# --- Feature Selection ---
# Keep the top-k columns ranked by the ANOVA F-value (SelectKBest + f_classif).
# Both splits are required: the selector is fitted on train and then applied to test.
if (
    train_ensemble_features is not None
    and test_ensemble_features is not None
    and train_labels_for_model is not None
):
    # Replace NaN/Inf values before scoring the features
    train_ensemble_features_clean = np.nan_to_num(train_ensemble_features)
    test_ensemble_features_clean = np.nan_to_num(test_ensemble_features)

    # Selection needs more than one sample and at least one non-constant column
    if train_ensemble_features_clean.shape[0] > 1 and np.var(train_ensemble_features_clean, axis=0).sum() > 0:
        k = min(100, train_ensemble_features_clean.shape[1])  # top 100 features, or all if fewer exist
        try:
            selector = SelectKBest(score_func=f_classif, k=k)
            train_ensemble_features_selected = selector.fit_transform(train_ensemble_features_clean, train_labels_for_model)
            # Apply the selection learned on the training data to the test data
            test_ensemble_features_selected = selector.transform(test_ensemble_features_clean)
            print(f"Shape of train_ensemble_features_selected: {train_ensemble_features_selected.shape}")
            print(f"Shape of test_ensemble_features_selected: {test_ensemble_features_selected.shape}")
        except ValueError as e:
            print(f"Could not perform feature selection: {e}")
            # Fall back to using all features if selection fails
            train_ensemble_features_selected = train_ensemble_features_clean
            test_ensemble_features_selected = test_ensemble_features_clean
            print("Using all features for ensemble training.")
    else:
        print("Not enough samples or variance for feature selection. Using all features.")
        train_ensemble_features_selected = train_ensemble_features_clean
        test_ensemble_features_selected = test_ensemble_features_clean
else:
    print("Ensemble features not available for selection.")
    train_ensemble_features_selected = None
    test_ensemble_features_selected = None


# --- Train Second-Level Random Forest Classifier ---
# Start from None so every path below leaves `ensemble_classifier` defined (and never
# leaves a model from an earlier run in place) for the evaluation cells.
ensemble_classifier = None
if train_ensemble_features_selected is not None and train_labels_for_model is not None:
    try:
        # Balance the classes with SMOTE before fitting the second-level model
        smote_ensemble = SMOTE(random_state=42)
        train_ensemble_features_smote, train_labels_ensemble_smote = smote_ensemble.fit_resample(train_ensemble_features_selected, train_labels_for_model)

        # Fit into a temporary name so a failure inside fit() cannot leave a half-trained model behind
        fitted_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        fitted_classifier.fit(train_ensemble_features_smote, train_labels_ensemble_smote)
        ensemble_classifier = fitted_classifier

        print("\nEnsemble Random Forest classifier trained successfully with SMOTE.")
    except ValueError as e:
        print(f"Could not train ensemble classifier: {e}")
        ensemble_classifier = None
else:
    print("Could not train ensemble classifier due to missing data.")

## Evaluate Ensemble Model

### Subtask:
Evaluate the performance of the ensemble model.

**Reasoning**:
Utilize the trained ensemble model to generate predictions on the test set and then generate and print the classification report and confusion matrix to evaluate its performance as requested by the subtask.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the second-level model only when both the model and its test inputs exist
if test_ensemble_features_selected is None or ensemble_classifier is None:
    print("Ensemble model or test features not available for evaluation.")
else:
    ensemble_predictions = ensemble_classifier.predict(test_ensemble_features_selected)

    # Per-class precision / recall / F1
    print("Ensemble Model Classification Report:")
    print(classification_report(test_labels_for_model, ensemble_predictions))

    # Overall accuracy
    ensemble_accuracy = accuracy_score(test_labels_for_model, ensemble_predictions)
    print(f"Ensemble Model Accuracy Score: {ensemble_accuracy}")

    # Confusion matrix of true labels versus ensemble predictions
    conf_matrix_ensemble = confusion_matrix(test_labels_for_model, ensemble_predictions)
    print("Ensemble Model Confusion Matrix:")
    print(conf_matrix_ensemble)

## Summary of Model Performance

We have trained and evaluated three different models for classifying damaged nuclei: a Random Forest classifier using handcrafted features, a Convolutional Neural Network (CNN) trained on normalized images, and an ensemble model combining CNN predictions and handcrafted features.

Here's a summary of the key performance metrics for each model on the test set:

### Random Forest Classifier (using handcrafted features with SMOTE)

In [None]:
# Print the classification report and confusion matrix for the Random Forest model again for easy comparison
from sklearn.metrics import classification_report, confusion_matrix

print("Random Forest Classifier Evaluation:")
# The CNN evaluation cells reassign the global `predictions`, so re-predict with the
# Random Forest here; otherwise this report would describe the CNN's outputs instead.
rf_predictions = rf_classifier.predict(test_combined_features)
print(classification_report(test_labels_for_model, rf_predictions))
print("Confusion Matrix:")
print(confusion_matrix(test_labels_for_model, rf_predictions))

### Convolutional Neural Network (CNN)

In [None]:
# Re-run the CNN on the test images and print its metrics for side-by-side comparison
from sklearn.metrics import classification_report, confusion_matrix

print("CNN Evaluation:")
# Both the trained network and the test images must already exist in this kernel
if 'model' not in locals() or 'test_images_for_model' not in locals():
    print("CNN model or test data not available for evaluation.")
else:
    predictions_prob_cnn = model.predict(test_images_for_model)
    # Threshold the sigmoid outputs at 0.5 to obtain 0/1 labels
    predictions_cnn = np.greater(predictions_prob_cnn, 0.5).astype("int32")
    print(classification_report(test_labels_for_model, predictions_cnn))
    print("Confusion Matrix:")
    print(confusion_matrix(test_labels_for_model, predictions_cnn))

### Ensemble Model (CNN probabilities + Handcrafted Features with SMOTE and Feature Selection)

In [None]:
# Print the classification report and confusion matrix for the Ensemble model again for easy comparison
from sklearn.metrics import classification_report, confusion_matrix

print("Ensemble Model Evaluation:")
# Evaluate only when both the fitted ensemble and its selected test features exist.
# The classifier is also compared against None because the ensemble training cell
# assigns None to it when fitting fails; calling .predict on None would raise.
if (
    'ensemble_classifier' in locals()
    and ensemble_classifier is not None
    and 'test_ensemble_features_selected' in locals()
    and test_ensemble_features_selected is not None
):
    ensemble_predictions = ensemble_classifier.predict(test_ensemble_features_selected)
    print(classification_report(test_labels_for_model, ensemble_predictions))
    print("Confusion Matrix:")
    print(confusion_matrix(test_labels_for_model, ensemble_predictions))
else:
    print("Ensemble model or test features not available for evaluation.")

### Comparative Analysis

Based on the classification reports and confusion matrices, we can compare the performance of the three models, paying close attention to metrics like precision, recall, and F1-score for the "damaged" class, given the class imbalance.

*   **Random Forest Classifier**: **TODO** — summarize key findings from the RF evaluation.
*   **Convolutional Neural Network (CNN)**: **TODO** — summarize key findings from the CNN evaluation.
*   **Ensemble Model**: **TODO** — summarize key findings from the Ensemble model evaluation.

Consider which model achieved the best balance of precision and recall for the damaged nuclei, and discuss any insights gained from the feature importance analysis of the Random Forest model.

This concludes the analysis of the trained models.

In [None]:
# Assemble the training feature matrix: shape features followed by texture features.
if train_shape_features.shape[0] != train_texture_features.shape[0]:
    # The sample counts disagree, so the two feature sets cannot be joined row by row.
    print("Mismatch in the number of training samples for shape and texture features.")
    train_combined_features = None # Or handle the error appropriately
else:
    # Keep the sample axis and collapse every remaining axis of the texture
    # features into one, so the result can be concatenated column-wise.
    train_texture_features_flat = train_texture_features.reshape(len(train_texture_features), -1)
    train_combined_features = np.concatenate([train_shape_features, train_texture_features_flat], axis=1)
    print(f"Shape of train_combined_features: {train_combined_features.shape}")


# Assemble the testing feature matrix in exactly the same way.
if test_shape_features.shape[0] != test_texture_features.shape[0]:
    print("Mismatch in the number of testing samples for shape and texture features.")
    test_combined_features = None # Or handle the error appropriately
else:
    test_texture_features_flat = test_texture_features.reshape(len(test_texture_features), -1)
    test_combined_features = np.concatenate([test_shape_features, test_texture_features_flat], axis=1)
    print(f"Shape of test_combined_features: {test_combined_features.shape}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

# Oversample the training set with SMOTE so the classes are balanced before fitting
smote = SMOTE(random_state=42)
train_features_smote, train_labels_smote = smote.fit_resample(
    train_combined_features,
    train_labels_for_model,
)

# Build the Random Forest and fit it on the resampled data (fit returns the estimator itself)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    train_features_smote,
    train_labels_smote,
)

print("Random Forest classifier trained successfully with SMOTE.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np # Ensure numpy is imported

# Make predictions on the test set
predictions = rf_classifier.predict(test_combined_features)

# Generate and print classification report
print("Classification Report:")
print(classification_report(test_labels_for_model, predictions))

# Compute and print accuracy score
accuracy = accuracy_score(test_labels_for_model, predictions)
print(f"Accuracy Score: {accuracy}")

# Generate and print confusion matrix
conf_matrix = confusion_matrix(test_labels_for_model, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Extract feature importances (one value per column the model was trained on)
feature_importances = rf_classifier.feature_importances_

# Feature names follow the column order used when the features were combined:
# shape features first, flattened texture features second.
num_shape_features = train_shape_features.shape[1]

# Prefer the width of the flattened texture array when it exists. Otherwise derive
# the count from the fitted model itself instead of relying on a hard-coded number,
# which silently goes stale whenever the feature extraction changes.
if 'train_texture_features_flat' in globals():
    num_texture_features_flat = train_texture_features_flat.shape[1]
else:
    num_texture_features_flat = len(feature_importances) - num_shape_features

# Simple positional naming convention
shape_feature_names = [f"shape_{i}" for i in range(num_shape_features)]
texture_feature_names = [f"texture_{i}" for i in range(num_texture_features_flat)]
all_feature_names = shape_feature_names + texture_feature_names

# Only label the importances when the generated names line up with the model's columns
if len(feature_importances) == len(all_feature_names):
    # Create a pandas Series for feature importances
    feature_importance_series = pd.Series(feature_importances, index=all_feature_names)

    # Sort feature importances in descending order
    sorted_feature_importances = feature_importance_series.sort_values(ascending=False)

    # Print the top 20 most important features
    print("\nTop 20 Most Important Features:")
    print(sorted_feature_importances.head(20))
else:
    print(f"Mismatch in feature counts: Model has {len(feature_importances)}, generated names have {len(all_feature_names)}")

## Train Convolutional Neural Network (CNN)

### Subtask:
Build and train a CNN directly on the normalized nucleus images, incorporating data augmentation, regularization, and potentially transfer learning.

**Reasoning**:
Implement the CNN model architecture, compile it, set up data augmentation, and train the model.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define the CNN model architecture layer by layer:
# three convolution + max-pooling stages, then a dense classification head.
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for binary classification

# Compile with Adam and binary cross-entropy, tracking accuracy
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Random geometric augmentation applied on the fly to the training images
datagen = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
)

# Fit the data generator on the training data
datagen.fit(train_images_for_model)

# Train on augmented batches for 50 epochs.
# NOTE(review): the test set is passed as validation_data, so the per-epoch
# validation metrics are computed on the same data used for the final evaluation.
history = model.fit(
    datagen.flow(train_images_for_model, train_labels_for_model, batch_size=32),
    epochs=50,
    validation_data=(test_images_for_model, test_labels_for_model),
)

print("CNN model training completed.")

## Evaluate Convolutional Neural Network (CNN)

### Subtask:
Evaluate the performance of the trained CNN.

**Reasoning**:
Utilize the trained CNN model to generate predictions on the test set, convert probabilities to class labels, and then generate and print the classification report and confusion matrix to evaluate the model's performance as requested by the subtask.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Make predictions on the test set (sigmoid probabilities, one per image)
predictions_prob = model.predict(test_images_for_model)

# Threshold at 0.5 to obtain hard 0/1 labels and flatten them to 1-D.
# A CNN-specific name is used on purpose: the Random Forest evaluation stores its
# results in the global `predictions`, and reusing that name here would make the
# later "Random Forest" summary print CNN results instead.
predictions_cnn = (predictions_prob > 0.5).astype("int32").ravel()

# Generate and print classification report
print("CNN Classification Report:")
print(classification_report(test_labels_for_model, predictions_cnn))

# Generate and print confusion matrix
conf_matrix_cnn = confusion_matrix(test_labels_for_model, predictions_cnn)
print("CNN Confusion Matrix:")
print(conf_matrix_cnn)

## Build and Train Ensemble Model

### Subtask:
Combine the CNN probabilities with the handcrafted features and train a second-level Random Forest model, including feature selection and optimization.

**Reasoning**:
Combine the predictions (probabilities) from the trained CNN with the extracted shape and texture features, and then train a second-level Random Forest classifier on this combined feature set.

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

# Get predictions (probabilities) from the trained CNN on the training and testing
# data, reshaped to (n_samples, 1) columns so they can be appended as one feature.
# NOTE(review): the training-set probabilities are produced for train_images_for_model,
# the same array the CNN training cell fits on, so they are in-sample estimates and may
# look more reliable to the second-level model than they really are — consider
# out-of-fold CNN predictions if this matters.
train_cnn_predictions_prob = model.predict(train_images_for_model).reshape(-1, 1)
test_cnn_predictions_prob = model.predict(test_images_for_model).reshape(-1, 1)

# Define the classifier name up-front so that every later cell can safely test
# `ensemble_classifier is not None`, whichever branch below is taken.
ensemble_classifier = None


# Combine CNN probabilities with handcrafted features for training
# (both arrays must exist and have the same number of samples)
if train_combined_features is not None and train_combined_features.shape[0] == train_cnn_predictions_prob.shape[0]:
    train_ensemble_features = np.concatenate((train_combined_features, train_cnn_predictions_prob), axis=1)
    print(f"Shape of train_ensemble_features: {train_ensemble_features.shape}")
else:
    print("Mismatch in the number of training samples for combined features and CNN predictions.")
    train_ensemble_features = None


# Combine CNN probabilities with handcrafted features for testing
if test_combined_features is not None and test_combined_features.shape[0] == test_cnn_predictions_prob.shape[0]:
    test_ensemble_features = np.concatenate((test_combined_features, test_cnn_predictions_prob), axis=1)
    print(f"Shape of test_ensemble_features: {test_ensemble_features.shape}")
else:
    print("Mismatch in the number of testing samples for combined features and CNN predictions.")
    test_ensemble_features = None


# --- Feature Selection (Optional but Recommended for Ensemble) ---
# Use SelectKBest to keep the top K features ranked by ANOVA F-value.
train_ensemble_features_selected = None
test_ensemble_features_selected = None

if train_ensemble_features is not None and train_labels_for_model is not None:
    # Replace NaN/Inf values before scoring features; clean the test matrix only
    # when it actually exists.
    train_ensemble_features_clean = np.nan_to_num(train_ensemble_features)
    test_ensemble_features_clean = (
        np.nan_to_num(test_ensemble_features) if test_ensemble_features is not None else None
    )

    # Default: use every feature. Overridden below when selection succeeds.
    train_ensemble_features_selected = train_ensemble_features_clean
    test_ensemble_features_selected = test_ensemble_features_clean

    # Selection needs more than one sample and at least some variance in the features
    if train_ensemble_features_clean.shape[0] > 1 and np.var(train_ensemble_features_clean, axis=0).sum() > 0:
        k = min(100, train_ensemble_features_clean.shape[1]) # top 100 features, or fewer if less are available
        try:
            selector = SelectKBest(score_func=f_classif, k=k)
            selected_train = selector.fit_transform(train_ensemble_features_clean, train_labels_for_model)
            # Apply the selection fitted on the training data to the test data
            selected_test = (
                selector.transform(test_ensemble_features_clean)
                if test_ensemble_features_clean is not None else None
            )
        except ValueError as e:
            # Keep the "all features" default assigned above
            print(f"Could not perform feature selection: {e}")
            print("Using all features for ensemble training.")
        else:
            train_ensemble_features_selected = selected_train
            test_ensemble_features_selected = selected_test
            print(f"Shape of train_ensemble_features_selected: {train_ensemble_features_selected.shape}")
            if test_ensemble_features_selected is not None:
                print(f"Shape of test_ensemble_features_selected: {test_ensemble_features_selected.shape}")
    else:
        print("Not enough samples or variance for feature selection. Using all features.")
else:
    print("Ensemble features not available for selection.")


# --- Train Second-Level Random Forest Classifier ---
# SMOTE is applied to the selected training features to handle class imbalance.
if train_ensemble_features_selected is not None and train_labels_for_model is not None:
    try:
        smote_ensemble = SMOTE(random_state=42)
        train_ensemble_features_smote, train_labels_ensemble_smote = smote_ensemble.fit_resample(train_ensemble_features_selected, train_labels_for_model)

        # Instantiate and train the second-level Random Forest Classifier
        ensemble_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        ensemble_classifier.fit(train_ensemble_features_smote, train_labels_ensemble_smote)

        print("\nEnsemble Random Forest classifier trained successfully with SMOTE.")
    except ValueError as e:
        print(f"Could not train ensemble classifier: {e}")
        ensemble_classifier = None
else:
    print("Could not train ensemble classifier due to missing data.")

## Evaluate Ensemble Model

### Subtask:
Evaluate the performance of the ensemble model.

**Reasoning**:
Utilize the trained ensemble model to generate predictions on the test set and then generate and print the classification report and confusion matrix to evaluate its performance as requested by the subtask.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the second-level (ensemble) classifier on the selected test features.
# Nothing can be scored unless both the fitted model and the test features exist.
if test_ensemble_features_selected is None or ensemble_classifier is None:
    print("Ensemble model or test features not available for evaluation.")
else:
    ensemble_predictions = ensemble_classifier.predict(test_ensemble_features_selected)

    # Per-class precision, recall and F1
    print("Ensemble Model Classification Report:")
    print(classification_report(test_labels_for_model, ensemble_predictions))

    # Overall fraction of correctly classified nuclei
    ensemble_accuracy = accuracy_score(test_labels_for_model, ensemble_predictions)
    print(f"Ensemble Model Accuracy Score: {ensemble_accuracy}")

    # Counts of true vs. predicted classes
    conf_matrix_ensemble = confusion_matrix(test_labels_for_model, ensemble_predictions)
    print("Ensemble Model Confusion Matrix:")
    print(conf_matrix_ensemble)

## Summary of Model Performance

We have trained and evaluated three different models for classifying damaged nuclei: a Random Forest classifier using handcrafted features, a Convolutional Neural Network (CNN) trained on normalized images, and an ensemble model combining CNN probabilities and handcrafted features.

Here's a summary of the key performance metrics for each model on the test set:

### Random Forest Classifier (using handcrafted features with SMOTE)

In [None]:
# Print the classification report and confusion matrix for the Random Forest model again for easy comparison
from sklearn.metrics import classification_report, confusion_matrix

print("Random Forest Classifier Evaluation:")
# Recompute the predictions from the fitted Random Forest instead of reusing the
# global `predictions`: that name is reassigned by other evaluation cells in this
# notebook, so printing it here could show another model's results under this heading.
if 'rf_classifier' in globals() and 'test_combined_features' in globals() and test_combined_features is not None:
    rf_summary_predictions = rf_classifier.predict(test_combined_features)
    print(classification_report(test_labels_for_model, rf_summary_predictions))
    print("Confusion Matrix:")
    print(confusion_matrix(test_labels_for_model, rf_summary_predictions))
else:
    print("Random Forest model or test features not available for evaluation.")

### Convolutional Neural Network (CNN)

In [None]:
# Re-print the CNN evaluation so it can be compared side by side with the other models
from sklearn.metrics import classification_report, confusion_matrix

print("CNN Evaluation:")
# The trained network and the test images must both exist in the current namespace
if not ('model' in locals() and 'test_images_for_model' in locals()):
    print("CNN model or test data not available for evaluation.")
else:
    predictions_prob_cnn = model.predict(test_images_for_model)
    # Turn the sigmoid outputs into hard 0/1 labels with a 0.5 cut-off
    predictions_cnn = (predictions_prob_cnn > 0.5).astype("int32")
    print(classification_report(test_labels_for_model, predictions_cnn))
    print("Confusion Matrix:")
    print(confusion_matrix(test_labels_for_model, predictions_cnn))

### Ensemble Model (CNN probabilities + Handcrafted Features with SMOTE and Feature Selection)

In [None]:
# Print the classification report and confusion matrix for the Ensemble model again for easy comparison
from sklearn.metrics import classification_report, confusion_matrix

print("Ensemble Model Evaluation:")
# Evaluate only when both the fitted ensemble and its selected test features exist.
# The classifier is also compared against None because the ensemble training cell
# assigns None to it when fitting fails; calling .predict on None would raise.
if (
    'ensemble_classifier' in locals()
    and ensemble_classifier is not None
    and 'test_ensemble_features_selected' in locals()
    and test_ensemble_features_selected is not None
):
    ensemble_predictions = ensemble_classifier.predict(test_ensemble_features_selected)
    print(classification_report(test_labels_for_model, ensemble_predictions))
    print("Confusion Matrix:")
    print(confusion_matrix(test_labels_for_model, ensemble_predictions))
else:
    print("Ensemble model or test features not available for evaluation.")

### Comparative Analysis

Based on the classification reports and confusion matrices, we can compare the performance of the three models, paying close attention to metrics like precision, recall, and F1-score for the "damaged" class, given the class imbalance.

*   **Random Forest Classifier**: **TODO** — summarize key findings from the RF evaluation.
*   **Convolutional Neural Network (CNN)**: **TODO** — summarize key findings from the CNN evaluation.
*   **Ensemble Model**: **TODO** — summarize key findings from the Ensemble model evaluation.

Consider which model achieved the best balance of precision and recall for the damaged nuclei, and discuss any insights gained from the feature importance analysis of the Random Forest model.

This concludes the analysis of the trained models.

### Refine Green Channel Thresholding

Let's experiment with adjusting the global threshold value for the green channel to see if it improves foci detection.

In [None]:
def apply_global_thresholding(image, threshold_value):
    """Binarise a grayscale image with one global cut-off.

    Args:
        image: Grayscale image to threshold.
        threshold_value: Cut-off passed to ``cv2.threshold``.

    Returns:
        The binary image produced by ``cv2.threshold`` in ``THRESH_BINARY`` mode
        with a maximum value of 255.
    """
    # cv2.threshold returns (used_threshold, image); only the image is needed
    return cv2.threshold(image, threshold_value, 255, cv2.THRESH_BINARY)[1]

# Visual check: threshold one training image and show it next to the original.
example_image_index = 0 # change this index to inspect another image
threshold_value = 30   # candidate threshold; also read by later cells

example_green_image = train_green_images[example_image_index]
thresholded_example_green = apply_global_thresholding(example_green_image, threshold_value)

print(f"Applied global thresholding to training green image at index {example_image_index} with threshold value {threshold_value}.")

# Colab-specific replacement for cv2.imshow
from google.colab.patches import cv2_imshow

print("Original Green Image:")
cv2_imshow(example_green_image)

print(f"\nThresholded Green Image (Threshold = {threshold_value}):")
cv2_imshow(thresholded_example_green)

# Note: the same thresholding is applied to all images once a suitable value is found.

In [None]:
# Binarise every green-channel image with the threshold_value chosen in the previous cell
train_green_thresholded = [apply_global_thresholding(green, threshold_value) for green in train_green_images]
test_green_thresholded = [apply_global_thresholding(green, threshold_value) for green in test_green_images]

print(f"Applied global thresholding to {len(train_green_thresholded)} training green images.")
print(f"Applied global thresholding to {len(test_green_thresholded)} testing green images.")

# Re-run foci detection and nucleus labeling with the newly thresholded images.
# The paired image, its segmentation mask and its thresholded green image share one index.
train_labeled_nuclei = []
for i, paired_img in enumerate(train_paired_images):
    labeled_nuclei_in_image = detect_foci_and_label_nucleus(
        paired_img,
        train_segmented_masks[i],
        train_green_thresholded[i],
    )
    train_labeled_nuclei += labeled_nuclei_in_image

test_labeled_nuclei = []
for i, paired_img in enumerate(test_paired_images):
    labeled_nuclei_in_image = detect_foci_and_label_nucleus(
        paired_img,
        test_segmented_masks[i],
        test_green_thresholded[i],
    )
    test_labeled_nuclei += labeled_nuclei_in_image

# Tally the labels produced by the new thresholding (each entry is an (image, label) pair)
train_damaged_count_new = sum(label == "damaged" for _, label in train_labeled_nuclei)
train_normal_count_new = sum(label == "normal" for _, label in train_labeled_nuclei)
test_damaged_count_new = sum(label == "damaged" for _, label in test_labeled_nuclei)
test_normal_count_new = sum(label == "normal" for _, label in test_labeled_nuclei)

print("\nNuclei counts with updated green channel thresholding:")
print(f"Training set: {train_damaged_count_new} damaged, {train_normal_count_new} normal.")
print(f"Testing set: {test_damaged_count_new} damaged, {test_normal_count_new} normal.")

# Make the re-labelled nuclei the ones used by the following cells
all_train_labeled_nuclei = train_labeled_nuclei
all_test_labeled_nuclei = test_labeled_nuclei

## Optimize Random Forest Classifier

### Subtask:
Optimize the Random Forest model by experimenting with hyperparameters and class weighting to improve performance on the imbalanced dataset.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Rebuild the model inputs from the re-labelled nuclei.
# Each entry of all_train_labeled_nuclei / all_test_labeled_nuclei is an (image, label) pair.
train_images_for_model = [nucleus[0] for nucleus in all_train_labeled_nuclei]
train_labels_for_model = [nucleus[1] for nucleus in all_train_labeled_nuclei]
test_images_for_model = [nucleus[0] for nucleus in all_test_labeled_nuclei]
test_labels_for_model = [nucleus[1] for nucleus in all_test_labeled_nuclei]

# Stack the images into arrays and add a trailing channel dimension: (n, 64, 64, 1)
train_images_for_model = np.array(train_images_for_model).reshape(-1, 64, 64, 1)
test_images_for_model = np.array(test_images_for_model).reshape(-1, 64, 64, 1)

# Encode the string labels numerically
label_mapping = {"normal": 0, "damaged": 1}
train_labels_for_model = np.array([label_mapping[label] for label in train_labels_for_model])
test_labels_for_model = np.array([label_mapping[label] for label in test_labels_for_model])

# Re-extract the handcrafted features so they line up with the new labeling
train_shape_features = [extract_shape_features(nucleus_img) for nucleus_img, label in all_train_labeled_nuclei]
test_shape_features = [extract_shape_features(nucleus_img) for nucleus_img, label in all_test_labeled_nuclei]

train_texture_features = [extract_texture_features(nucleus_img) for nucleus_img, label in all_train_labeled_nuclei]
# Fixed: this previously iterated over the undefined name
# `all_test_labeled_labeled_nuclei`, which raised a NameError.
test_texture_features = [extract_texture_features(nucleus_img) for nucleus_img, label in all_test_labeled_nuclei]


# Convert to numpy arrays
train_shape_features = np.array(train_shape_features)
test_shape_features = np.array(test_shape_features)

# Ensure all texture feature lists have the same length before converting to numpy array
def pad_features(features_list):
    """Right-pad variable-length feature vectors with zeros into one 2-D float array.

    Args:
        features_list: Sequence of 1-D sequences of feature values.

    Returns:
        np.ndarray of shape (len(features_list), longest_length). Entries that are
        not real numbers are replaced by 0.0. An empty input yields shape (0, 0).
    """
    import numbers  # local import keeps this cell self-contained

    # max() below would raise on an empty sequence; return an empty matrix instead
    if len(features_list) == 0:
        return np.empty((0, 0), dtype=float)

    max_len = max(len(f) for f in features_list)
    padded_features = []
    for f in features_list:
        # numbers.Real also matches NumPy scalars such as np.float32 and np.int64,
        # which a plain isinstance(x, (int, float)) check silently turns into 0.0.
        row = [float(x) if isinstance(x, numbers.Real) else 0.0 for x in f]
        row.extend([0.0] * (max_len - len(row)))
        padded_features.append(row)
    return np.array(padded_features)

# Pad the ragged texture feature lists into rectangular arrays
train_texture_features = pad_features(train_texture_features)
test_texture_features = pad_features(test_texture_features)

# Re-combine shape and texture features for training
if train_shape_features.shape[0] != train_texture_features.shape[0]:
    print("Mismatch in training samples for shape and texture features after re-extraction.")
    train_combined_features = None
else:
    train_texture_features_flat = train_texture_features.reshape(len(train_texture_features), -1)
    train_combined_features = np.concatenate([train_shape_features, train_texture_features_flat], axis=1)

# Re-combine shape and texture features for testing
if test_shape_features.shape[0] != test_texture_features.shape[0]:
    print("Mismatch in testing samples for shape and texture features after re-extraction.")
    test_combined_features = None
else:
    test_texture_features_flat = test_texture_features.reshape(len(test_texture_features), -1)
    test_combined_features = np.concatenate([test_shape_features, test_texture_features_flat], axis=1)

# Hyperparameter search for the Random Forest on SMOTE-balanced training data
if train_combined_features is None or train_labels_for_model is None:
    print("Combined training features or labels not available for Random Forest optimization.")
else:
    # Balance the classes first
    smote = SMOTE(random_state=42)
    train_features_smote, train_labels_smote = smote.fit_resample(train_combined_features, train_labels_for_model)

    # Search space; class_weight is included to experiment with class weighting
    param_grid = dict(
        n_estimators=[100, 200, 300],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        class_weight=[None, 'balanced', 'balanced_subsample'],
    )

    rf = RandomForestClassifier(random_state=42)

    # 5-fold cross-validated grid search, optimizing for recall.
    # NOTE(review): the folds are drawn from data that was oversampled beforehand, so
    # validation folds can contain synthetic samples derived from training-fold points;
    # the cross-validated recall may therefore be optimistic.
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='recall', n_jobs=-1)
    grid_search.fit(train_features_smote, train_labels_smote)

    # Keep the winning configuration and the estimator refit with it
    best_params = grid_search.best_params_
    best_rf_classifier = grid_search.best_estimator_

    print("Best parameters found by GridSearchCV:")
    print(best_params)

    print("\nOptimized Random Forest classifier trained successfully.")

    # Score the best model on the original (non-SMOTE) test set
    if test_combined_features is None or test_labels_for_model is None:
        print("\nTest data not available for evaluating the optimized Random Forest classifier.")
    else:
        predictions = best_rf_classifier.predict(test_combined_features)
        print("\nOptimized Random Forest Classifier Evaluation on Test Set:")
        print(classification_report(test_labels_for_model, predictions))

In [None]:
def process_image_pair(paired_image, threshold=None):
    """
    Processes a paired image (red, green) through segmentation,
    thresholding, foci detection, and labeling.

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).
        threshold: Optional global threshold for the green channel. When omitted,
            the notebook-level ``threshold_value`` is used if it is defined,
            otherwise 50. Passing it explicitly makes the result independent of
            which cells happened to run earlier.

    Returns:
        The list returned by ``detect_foci_and_label_nucleus`` for this pair
        (one entry per labeled nucleus).
    """
    red_img, green_img = paired_image

    # 1. Nucleus segmentation (red channel)
    segmented_mask = segment_nuclei(red_img)

    # 2. Green channel global thresholding
    if threshold is None:
        # Backward-compatible fallback: notebook global first, then the default of 50
        threshold = globals().get("threshold_value", 50)
    thresholded_green_image = apply_global_thresholding(green_img, threshold)

    # 3. Foci detection and nucleus labeling
    labeled_nuclei_in_image = detect_foci_and_label_nucleus(
        paired_image,
        segmented_mask,
        thresholded_green_image
    )

    return labeled_nuclei_in_image

# Label every nucleus found in the training image pairs
all_train_labeled_nuclei = []
for paired_img in train_paired_images:
    labeled_nuclei = process_image_pair(paired_img)
    all_train_labeled_nuclei += labeled_nuclei

# Label every nucleus found in the testing image pairs
all_test_labeled_nuclei = []
for paired_img in test_paired_images:
    labeled_nuclei = process_image_pair(paired_img)
    all_test_labeled_nuclei += labeled_nuclei

# Report how many nuclei were labeled in each split
print(f"Total labeled nuclei in training set: {len(all_train_labeled_nuclei)}")
print(f"Total labeled nuclei in testing set: {len(all_test_labeled_nuclei)}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np # Ensure numpy is imported
from sklearn.feature_selection import SelectKBest, f_classif # Import feature selection

# Rebuild the model inputs from the re-processed nuclei.
# Each entry of all_*_labeled_nuclei is an (image, label) pair.

# Images: stack into arrays with a trailing channel dimension, (n, 64, 64, 1)
train_images_for_model_re = np.array(
    [nucleus_img for nucleus_img, _ in all_train_labeled_nuclei]
).reshape(-1, 64, 64, 1)
test_images_for_model_re = np.array(
    [nucleus_img for nucleus_img, _ in all_test_labeled_nuclei]
).reshape(-1, 64, 64, 1)

# Labels: map the string labels to integers
label_mapping = {"normal": 0, "damaged": 1}
train_labels_for_model_re = np.array(
    [label_mapping[label] for _, label in all_train_labeled_nuclei]
)
test_labels_for_model_re = np.array(
    [label_mapping[label] for _, label in all_test_labeled_nuclei]
)

# Handcrafted features, re-extracted so they correspond to the new labeling.
# Shape features are converted to arrays immediately; texture features stay as
# lists here because they are padded to a common length afterwards.
train_shape_features_re = np.array(
    [extract_shape_features(nucleus_img) for nucleus_img, _ in all_train_labeled_nuclei]
)
test_shape_features_re = np.array(
    [extract_shape_features(nucleus_img) for nucleus_img, _ in all_test_labeled_nuclei]
)

train_texture_features_re = [
    extract_texture_features(nucleus_img) for nucleus_img, _ in all_train_labeled_nuclei
]
test_texture_features_re = [
    extract_texture_features(nucleus_img) for nucleus_img, _ in all_test_labeled_nuclei
]

# Ensure all texture feature lists have the same length before converting to numpy array
def pad_features(features_list):
    """Stack variable-length feature vectors into one zero-padded 2-D float array.

    Args:
        features_list: Sequence of per-sample feature sequences (lists or 1-D arrays).

    Returns:
        np.ndarray of shape (n_samples, max_len) with dtype float. Rows shorter
        than the longest one are right-padded with 0.0, and entries that are not
        real numbers are replaced by 0.0. An empty `features_list` gives an
        empty (0, 0) array instead of raising.
    """
    if len(features_list) == 0:
        return np.empty((0, 0), dtype=float)

    # NumPy scalars must be accepted explicitly: np.float32 and np.int64 are not
    # subclasses of Python's float/int, so a plain (int, float) check would
    # silently turn real feature values into 0.0.
    numeric_types = (int, float, np.integer, np.floating)

    max_len = max(len(f) for f in features_list)
    padded_features = []
    for f in features_list:
        row = [float(x) if isinstance(x, numeric_types) else 0.0 for x in f]
        row.extend([0.0] * (max_len - len(row)))
        padded_features.append(row)
    return np.array(padded_features, dtype=float)

# Local import: imblearn's Pipeline applies samplers only while fitting, which
# lets SMOTE run inside each cross-validation split instead of before the split.
from imblearn.pipeline import Pipeline as ImbPipeline

train_texture_features_re = pad_features(train_texture_features_re)
test_texture_features_re = pad_features(test_texture_features_re)

# Re-combine shape and texture features
if train_shape_features_re.shape[0] == train_texture_features_re.shape[0]:
    train_texture_features_flat_re = train_texture_features_re.reshape(train_texture_features_re.shape[0], -1)
    train_combined_features_re = np.concatenate((train_shape_features_re, train_texture_features_flat_re), axis=1)
else:
    print("Mismatch in training samples for shape and texture features after re-extraction.")
    train_combined_features_re = None

if test_shape_features_re.shape[0] == test_texture_features_re.shape[0]:
    test_texture_features_flat_re = test_texture_features_re.reshape(test_texture_features_re.shape[0], -1)
    test_combined_features_re = np.concatenate((test_shape_features_re, test_texture_features_flat_re), axis=1)
else:
    print("Mismatch in testing samples for shape and texture features after re-extraction.")
    test_combined_features_re = None


# Tune a Random Forest with SMOTE handling the class imbalance
if train_combined_features_re is not None and train_labels_for_model_re is not None:
    smote = SMOTE(random_state=42)
    # Resampled copy of the full training set, kept so these names stay available
    # to other cells. It is deliberately NOT what the grid search is fitted on.
    train_features_smote, train_labels_smote = smote.fit_resample(train_combined_features_re, train_labels_for_model_re)

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': [None, 'balanced', 'balanced_subsample'] # Experiment with class weighting
    }

    # Instantiate the Random Forest Classifier
    rf = RandomForestClassifier(random_state=42)

    # Oversampling before cross-validation leaks synthetic neighbours of the
    # validation samples into the training folds and scores recall on synthetic
    # data. Putting SMOTE inside the pipeline resamples only each training fold,
    # so every validation fold contains real, untouched samples.
    smote_rf_pipeline = ImbPipeline(steps=[('smote', SMOTE(random_state=42)), ('rf', rf)])
    pipeline_param_grid = {f"rf__{name}": values for name, values in param_grid.items()}

    # Instantiate GridSearchCV
    grid_search = GridSearchCV(estimator=smote_rf_pipeline, param_grid=pipeline_param_grid, cv=5, scoring='recall', n_jobs=-1) # Optimize for recall

    # Fit on the ORIGINAL training data; SMOTE is applied per fold by the pipeline
    grid_search.fit(train_combined_features_re, train_labels_for_model_re)

    # Report parameters under their plain Random Forest names and expose the
    # fitted forest itself. The refit step trains it on the SMOTE-resampled full
    # training set, so `best_rf_classifier` is still a RandomForestClassifier.
    best_params = {name.split("__", 1)[1]: value for name, value in grid_search.best_params_.items()}
    best_rf_classifier = grid_search.best_estimator_.named_steps['rf']

    print("Best parameters found by GridSearchCV:")
    print(best_params)

    print("\nOptimized Random Forest classifier trained successfully.")

    # Evaluate the best model on the original (non-SMOTE) test set
    if test_combined_features_re is not None and test_labels_for_model_re is not None:
        predictions = best_rf_classifier.predict(test_combined_features_re)
        print("\nOptimized Random Forest Classifier Evaluation on Test Set:")
        print(classification_report(test_labels_for_model_re, predictions))
    else:
        print("\nTest data not available for evaluating the optimized Random Forest classifier.")

else:
    print("Combined training features or labels not available for Random Forest optimization.")

In [None]:
def process_image_pair(paired_image):
    """Run segmentation, green-channel thresholding and foci labelling on one image pair.

    Args:
        paired_image: A (red, green) tuple of grayscale channel images.

    Returns:
        The result of `detect_foci_and_label_nucleus` for this pair: a list of
        (normalized red nucleus image, label) tuples, where label is
        "damaged" or "normal".
    """
    red_channel, green_channel = paired_image

    # Green-channel cutoff: use the module-level `threshold_value` when an
    # earlier cell defined one, otherwise fall back to 50.
    green_cutoff = globals().get("threshold_value", 50)

    # 1. Nucleus segmentation on the red channel.
    nuclei_mask = segment_nuclei(red_channel)

    # 2. Global thresholding of the green channel.
    binary_green = apply_global_thresholding(green_channel, green_cutoff)

    # 3. Foci detection and per-nucleus labelling.
    return detect_foci_and_label_nucleus(paired_image, nuclei_mask, binary_green)

# Run the full per-image pipeline over every training pair and flatten the
# per-image results into one list of (nucleus image, label) tuples.
all_train_labeled_nuclei = [
    nucleus
    for paired_img in train_paired_images
    for nucleus in process_image_pair(paired_img)
]

# Same for the test pairs.
all_test_labeled_nuclei = [
    nucleus
    for paired_img in test_paired_images
    for nucleus in process_image_pair(paired_img)
]

# Print the total number of labeled nuclei
print(f"Total labeled nuclei in training set: {len(all_train_labeled_nuclei)}")
print(f"Total labeled nuclei in testing set: {len(all_test_labeled_nuclei)}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np # Ensure numpy is imported
from sklearn.feature_selection import SelectKBest, f_classif # Import feature selection

# Split the (nucleus image, label) tuples from the labelling step into model inputs.
# Images are stacked and given a trailing channel axis: (N, 64, 64, 1).
train_images_for_model_re = np.array(
    [nucleus_img for nucleus_img, _ in all_train_labeled_nuclei]
).reshape(-1, 64, 64, 1)
test_images_for_model_re = np.array(
    [nucleus_img for nucleus_img, _ in all_test_labeled_nuclei]
).reshape(-1, 64, 64, 1)

# Encode the string labels as integers: "normal" -> 0, "damaged" -> 1.
label_mapping = {"normal": 0, "damaged": 1}
train_labels_for_model_re = np.array(
    [label_mapping[label] for _, label in all_train_labeled_nuclei]
)
test_labels_for_model_re = np.array(
    [label_mapping[label] for _, label in all_test_labeled_nuclei]
)

# Recompute the hand-crafted features from the same tuples, in the same order,
# so that every feature row lines up with its label.
train_shape_features_re = np.array(
    [extract_shape_features(entry[0]) for entry in all_train_labeled_nuclei]
)
test_shape_features_re = np.array(
    [extract_shape_features(entry[0]) for entry in all_test_labeled_nuclei]
)

# Texture features stay as plain Python lists at this point.
train_texture_features_re = [
    extract_texture_features(entry[0]) for entry in all_train_labeled_nuclei
]
test_texture_features_re = [
    extract_texture_features(entry[0]) for entry in all_test_labeled_nuclei
]

# Ensure all texture feature lists have the same length before converting to numpy array
def pad_features(features_list):
    """Stack variable-length feature vectors into one zero-padded 2-D float array.

    Args:
        features_list: Sequence of per-sample feature sequences (lists or 1-D arrays).

    Returns:
        np.ndarray of shape (n_samples, max_len) with dtype float. Rows shorter
        than the longest one are right-padded with 0.0, and entries that are not
        real numbers are replaced by 0.0. An empty `features_list` gives an
        empty (0, 0) array instead of raising.
    """
    if len(features_list) == 0:
        return np.empty((0, 0), dtype=float)

    # NumPy scalars must be accepted explicitly: np.float32 and np.int64 are not
    # subclasses of Python's float/int, so a plain (int, float) check would
    # silently turn real feature values into 0.0.
    numeric_types = (int, float, np.integer, np.floating)

    max_len = max(len(f) for f in features_list)
    padded_features = []
    for f in features_list:
        row = [float(x) if isinstance(x, numeric_types) else 0.0 for x in f]
        row.extend([0.0] * (max_len - len(row)))
        padded_features.append(row)
    return np.array(padded_features, dtype=float)

# Local import: imblearn's Pipeline applies samplers only while fitting, which
# lets SMOTE run inside each cross-validation split instead of before the split.
from imblearn.pipeline import Pipeline as ImbPipeline

train_texture_features_re = pad_features(train_texture_features_re)
test_texture_features_re = pad_features(test_texture_features_re)

# Re-combine shape and texture features
if train_shape_features_re.shape[0] == train_texture_features_re.shape[0]:
    train_texture_features_flat_re = train_texture_features_re.reshape(train_texture_features_re.shape[0], -1)
    train_combined_features_re = np.concatenate((train_shape_features_re, train_texture_features_flat_re), axis=1)
else:
    print("Mismatch in training samples for shape and texture features after re-extraction.")
    train_combined_features_re = None

if test_shape_features_re.shape[0] == test_texture_features_re.shape[0]:
    test_texture_features_flat_re = test_texture_features_re.reshape(test_texture_features_re.shape[0], -1)
    test_combined_features_re = np.concatenate((test_shape_features_re, test_texture_features_flat_re), axis=1)
else:
    print("Mismatch in testing samples for shape and texture features after re-extraction.")
    test_combined_features_re = None


# Tune a Random Forest with SMOTE handling the class imbalance
if train_combined_features_re is not None and train_labels_for_model_re is not None:
    smote = SMOTE(random_state=42)
    # Resampled copy of the full training set, kept so these names stay available
    # to other cells. It is deliberately NOT what the grid search is fitted on.
    train_features_smote, train_labels_smote = smote.fit_resample(train_combined_features_re, train_labels_for_model_re)

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': [None, 'balanced', 'balanced_subsample'] # Experiment with class weighting
    }

    # Instantiate the Random Forest Classifier
    rf = RandomForestClassifier(random_state=42)

    # Oversampling before cross-validation leaks synthetic neighbours of the
    # validation samples into the training folds and scores recall on synthetic
    # data. Putting SMOTE inside the pipeline resamples only each training fold,
    # so every validation fold contains real, untouched samples.
    smote_rf_pipeline = ImbPipeline(steps=[('smote', SMOTE(random_state=42)), ('rf', rf)])
    pipeline_param_grid = {f"rf__{name}": values for name, values in param_grid.items()}

    # Instantiate GridSearchCV
    grid_search = GridSearchCV(estimator=smote_rf_pipeline, param_grid=pipeline_param_grid, cv=5, scoring='recall', n_jobs=-1) # Optimize for recall

    # Fit on the ORIGINAL training data; SMOTE is applied per fold by the pipeline
    grid_search.fit(train_combined_features_re, train_labels_for_model_re)

    # Report parameters under their plain Random Forest names and expose the
    # fitted forest itself. The refit step trains it on the SMOTE-resampled full
    # training set, so `best_rf_classifier` is still a RandomForestClassifier.
    best_params = {name.split("__", 1)[1]: value for name, value in grid_search.best_params_.items()}
    best_rf_classifier = grid_search.best_estimator_.named_steps['rf']

    print("Best parameters found by GridSearchCV:")
    print(best_params)

    print("\nOptimized Random Forest classifier trained successfully.")

    # Evaluate the best model on the original (non-SMOTE) test set
    if test_combined_features_re is not None and test_labels_for_model_re is not None:
        predictions = best_rf_classifier.predict(test_combined_features_re)
        print("\nOptimized Random Forest Classifier Evaluation on Test Set:")
        print(classification_report(test_labels_for_model_re, predictions))
    else:
        print("\nTest data not available for evaluating the optimized Random Forest classifier.")

else:
    print("Combined training features or labels not available for Random Forest optimization.")

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every entry of `base_dir`, in sorted filename order, as a grayscale image.

    Args:
        base_dir: Directory whose entries are passed to `cv2.imread`.

    Returns:
        np.ndarray built from the list of grayscale images. Entries OpenCV cannot
        read are reported with a warning and skipped, so the result can be
        shorter than the directory listing.
    """
    grayscale_images = []
    for entry in sorted(os.listdir(base_dir)):
        full_path = os.path.join(base_dir, entry)
        bgr_image = cv2.imread(full_path)

        if bgr_image is not None:
            # cv2.imread returns BGR channel order; collapse it to one channel.
            grayscale_images.append(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY))
        else:
            # imread signals an unreadable file by returning None.
            print(f"Warning: Could not load image {full_path}")
    return np.array(grayscale_images)

# Red-channel images for the training and test splits.
train_red_images, test_red_images = (
    load_and_preprocess_images('DNA_Damage_Project/image/red'),
    load_and_preprocess_images('DNA_Damage_Project/test images/red'),
)

print(f"Loaded {len(train_red_images)} training red images.")
print(f"Loaded {len(test_red_images)} testing red images.")

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every entry of `base_dir`, in sorted filename order, as a grayscale image.

    Args:
        base_dir: Directory whose entries are passed to `cv2.imread`.

    Returns:
        np.ndarray built from the list of grayscale images. Entries OpenCV cannot
        read are reported with a warning and skipped, so the result can be
        shorter than the directory listing.
    """
    grayscale_images = []
    for entry in sorted(os.listdir(base_dir)):
        full_path = os.path.join(base_dir, entry)
        bgr_image = cv2.imread(full_path)

        if bgr_image is not None:
            # cv2.imread returns BGR channel order; collapse it to one channel.
            grayscale_images.append(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY))
        else:
            # imread signals an unreadable file by returning None.
            print(f"Warning: Could not load image {full_path}")
    return np.array(grayscale_images)

# Green-channel images for the training and test splits.
train_green_images, test_green_images = (
    load_and_preprocess_images('DNA_Damage_Project/image/green'),
    load_and_preprocess_images('DNA_Damage_Project/test images/green'),
)

print(f"Loaded {len(train_green_images)} training green images.")
print(f"Loaded {len(test_green_images)} testing green images.")

In [None]:
def pair_images(red_images, green_images):
    """Pair each red-channel image with the green-channel image at the same index.

    Pairing is purely positional, so both sequences must already be in matching
    order (the loaders in this notebook sort by filename).

    Args:
        red_images: Sequence of red-channel images.
        green_images: Sequence of green-channel images, same length and order.

    Returns:
        A list of (red_image, green_image) tuples.

    Raises:
        ValueError: If the two sequences differ in length. A count mismatch
            means at least one image has no partner, and positional pairing
            would silently combine unrelated fields of view.
    """
    if len(red_images) != len(green_images):
        raise ValueError(
            f"Cannot pair images: {len(red_images)} red vs {len(green_images)} green."
        )
    return list(zip(red_images, green_images))

# Build the (red, green) pairs for the training and test splits.
train_paired_images, test_paired_images = (
    pair_images(train_red_images, train_green_images),
    pair_images(test_red_images, test_green_images),
)

print(f"Created {len(train_paired_images)} paired training images.")
print(f"Created {len(test_paired_images)} paired testing images.")

In [None]:
def process_image_pair(paired_image):
    """Run segmentation, green-channel thresholding and foci labelling on one image pair.

    Args:
        paired_image: A (red, green) tuple of grayscale channel images.

    Returns:
        The result of `detect_foci_and_label_nucleus` for this pair: a list of
        (normalized red nucleus image, label) tuples, where label is
        "damaged" or "normal".
    """
    red_channel, green_channel = paired_image

    # Green-channel cutoff: use the module-level `threshold_value` when an
    # earlier cell defined one, otherwise fall back to 50.
    green_cutoff = globals().get("threshold_value", 50)

    # 1. Nucleus segmentation on the red channel.
    nuclei_mask = segment_nuclei(red_channel)

    # 2. Global thresholding of the green channel.
    binary_green = apply_global_thresholding(green_channel, green_cutoff)

    # 3. Foci detection and per-nucleus labelling.
    return detect_foci_and_label_nucleus(paired_image, nuclei_mask, binary_green)

# Run the full per-image pipeline over every training pair and flatten the
# per-image results into one list of (nucleus image, label) tuples.
all_train_labeled_nuclei = [
    nucleus
    for paired_img in train_paired_images
    for nucleus in process_image_pair(paired_img)
]

# Same for the test pairs.
all_test_labeled_nuclei = [
    nucleus
    for paired_img in test_paired_images
    for nucleus in process_image_pair(paired_img)
]

# Print the total number of labeled nuclei
print(f"Total labeled nuclei in training set: {len(all_train_labeled_nuclei)}")
print(f"Total labeled nuclei in testing set: {len(all_test_labeled_nuclei)}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np # Ensure numpy is imported
from sklearn.feature_selection import SelectKBest, f_classif # Import feature selection

# Split the (nucleus image, label) tuples from the labelling step into model inputs.
# Images are stacked and given a trailing channel axis: (N, 64, 64, 1).
train_images_for_model_re = np.array(
    [nucleus_img for nucleus_img, _ in all_train_labeled_nuclei]
).reshape(-1, 64, 64, 1)
test_images_for_model_re = np.array(
    [nucleus_img for nucleus_img, _ in all_test_labeled_nuclei]
).reshape(-1, 64, 64, 1)

# Encode the string labels as integers: "normal" -> 0, "damaged" -> 1.
label_mapping = {"normal": 0, "damaged": 1}
train_labels_for_model_re = np.array(
    [label_mapping[label] for _, label in all_train_labeled_nuclei]
)
test_labels_for_model_re = np.array(
    [label_mapping[label] for _, label in all_test_labeled_nuclei]
)

# Recompute the hand-crafted features from the same tuples, in the same order,
# so that every feature row lines up with its label.
train_shape_features_re = np.array(
    [extract_shape_features(entry[0]) for entry in all_train_labeled_nuclei]
)
test_shape_features_re = np.array(
    [extract_shape_features(entry[0]) for entry in all_test_labeled_nuclei]
)

# Texture features stay as plain Python lists at this point.
train_texture_features_re = [
    extract_texture_features(entry[0]) for entry in all_train_labeled_nuclei
]
test_texture_features_re = [
    extract_texture_features(entry[0]) for entry in all_test_labeled_nuclei
]

# Ensure all texture feature lists have the same length before converting to numpy array
def pad_features(features_list):
    """Stack variable-length feature vectors into one zero-padded 2-D float array.

    Args:
        features_list: Sequence of per-sample feature sequences (lists or 1-D arrays).

    Returns:
        np.ndarray of shape (n_samples, max_len) with dtype float. Rows shorter
        than the longest one are right-padded with 0.0, and entries that are not
        real numbers are replaced by 0.0. An empty `features_list` gives an
        empty (0, 0) array instead of raising.
    """
    if len(features_list) == 0:
        return np.empty((0, 0), dtype=float)

    # NumPy scalars must be accepted explicitly: np.float32 and np.int64 are not
    # subclasses of Python's float/int, so a plain (int, float) check would
    # silently turn real feature values into 0.0.
    numeric_types = (int, float, np.integer, np.floating)

    max_len = max(len(f) for f in features_list)
    padded_features = []
    for f in features_list:
        row = [float(x) if isinstance(x, numeric_types) else 0.0 for x in f]
        row.extend([0.0] * (max_len - len(row)))
        padded_features.append(row)
    return np.array(padded_features, dtype=float)

# Local import: imblearn's Pipeline applies samplers only while fitting, which
# lets SMOTE run inside each cross-validation split instead of before the split.
from imblearn.pipeline import Pipeline as ImbPipeline

train_texture_features_re = pad_features(train_texture_features_re)
test_texture_features_re = pad_features(test_texture_features_re)

# Re-combine shape and texture features
if train_shape_features_re.shape[0] == train_texture_features_re.shape[0]:
    train_texture_features_flat_re = train_texture_features_re.reshape(train_texture_features_re.shape[0], -1)
    train_combined_features_re = np.concatenate((train_shape_features_re, train_texture_features_flat_re), axis=1)
else:
    print("Mismatch in training samples for shape and texture features after re-extraction.")
    train_combined_features_re = None

if test_shape_features_re.shape[0] == test_texture_features_re.shape[0]:
    test_texture_features_flat_re = test_texture_features_re.reshape(test_texture_features_re.shape[0], -1)
    test_combined_features_re = np.concatenate((test_shape_features_re, test_texture_features_flat_re), axis=1)
else:
    print("Mismatch in testing samples for shape and texture features after re-extraction.")
    test_combined_features_re = None


# Tune a Random Forest with SMOTE handling the class imbalance
if train_combined_features_re is not None and train_labels_for_model_re is not None:
    smote = SMOTE(random_state=42)
    # Resampled copy of the full training set, kept so these names stay available
    # to other cells. It is deliberately NOT what the grid search is fitted on.
    train_features_smote, train_labels_smote = smote.fit_resample(train_combined_features_re, train_labels_for_model_re)

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': [None, 'balanced', 'balanced_subsample'] # Experiment with class weighting
    }

    # Instantiate the Random Forest Classifier
    rf = RandomForestClassifier(random_state=42)

    # Oversampling before cross-validation leaks synthetic neighbours of the
    # validation samples into the training folds and scores recall on synthetic
    # data. Putting SMOTE inside the pipeline resamples only each training fold,
    # so every validation fold contains real, untouched samples.
    smote_rf_pipeline = ImbPipeline(steps=[('smote', SMOTE(random_state=42)), ('rf', rf)])
    pipeline_param_grid = {f"rf__{name}": values for name, values in param_grid.items()}

    # Instantiate GridSearchCV
    grid_search = GridSearchCV(estimator=smote_rf_pipeline, param_grid=pipeline_param_grid, cv=5, scoring='recall', n_jobs=-1) # Optimize for recall

    # Fit on the ORIGINAL training data; SMOTE is applied per fold by the pipeline
    grid_search.fit(train_combined_features_re, train_labels_for_model_re)

    # Report parameters under their plain Random Forest names and expose the
    # fitted forest itself. The refit step trains it on the SMOTE-resampled full
    # training set, so `best_rf_classifier` is still a RandomForestClassifier.
    best_params = {name.split("__", 1)[1]: value for name, value in grid_search.best_params_.items()}
    best_rf_classifier = grid_search.best_estimator_.named_steps['rf']

    print("Best parameters found by GridSearchCV:")
    print(best_params)

    print("\nOptimized Random Forest classifier trained successfully.")

    # Evaluate the best model on the original (non-SMOTE) test set
    if test_combined_features_re is not None and test_labels_for_model_re is not None:
        predictions = best_rf_classifier.predict(test_combined_features_re)
        print("\nOptimized Random Forest Classifier Evaluation on Test Set:")
        print(classification_report(test_labels_for_model_re, predictions))
    else:
        print("\nTest data not available for evaluating the optimized Random Forest classifier.")

else:
    print("Combined training features or labels not available for Random Forest optimization.")

In [None]:
# -o overwrites already-extracted files without asking, so re-running this cell
# no longer stops at unzip's interactive "replace ...? [y]es, [n]o" prompt.
!unzip -o DNA_Damage_Project-20250901T154313Z-1-001.zip

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every entry of `base_dir`, in sorted filename order, as a grayscale image.

    Args:
        base_dir: Directory whose entries are passed to `cv2.imread`.

    Returns:
        np.ndarray built from the list of grayscale images. Entries OpenCV cannot
        read are reported with a warning and skipped, so the result can be
        shorter than the directory listing.
    """
    grayscale_images = []
    for entry in sorted(os.listdir(base_dir)):
        full_path = os.path.join(base_dir, entry)
        bgr_image = cv2.imread(full_path)

        if bgr_image is not None:
            # cv2.imread returns BGR channel order; collapse it to one channel.
            grayscale_images.append(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY))
        else:
            # imread signals an unreadable file by returning None.
            print(f"Warning: Could not load image {full_path}")
    return np.array(grayscale_images)

# Red-channel images for the training and test splits.
train_red_images, test_red_images = (
    load_and_preprocess_images('DNA_Damage_Project/image/red'),
    load_and_preprocess_images('DNA_Damage_Project/test images/red'),
)

print(f"Loaded {len(train_red_images)} training red images.")
print(f"Loaded {len(test_red_images)} testing red images.")

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every entry of `base_dir`, in sorted filename order, as a grayscale image.

    Args:
        base_dir: Directory whose entries are passed to `cv2.imread`.

    Returns:
        np.ndarray built from the list of grayscale images. Entries OpenCV cannot
        read are reported with a warning and skipped, so the result can be
        shorter than the directory listing.
    """
    grayscale_images = []
    for entry in sorted(os.listdir(base_dir)):
        full_path = os.path.join(base_dir, entry)
        bgr_image = cv2.imread(full_path)

        if bgr_image is not None:
            # cv2.imread returns BGR channel order; collapse it to one channel.
            grayscale_images.append(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY))
        else:
            # imread signals an unreadable file by returning None.
            print(f"Warning: Could not load image {full_path}")
    return np.array(grayscale_images)

# Green-channel images for the training and test splits.
train_green_images, test_green_images = (
    load_and_preprocess_images('DNA_Damage_Project/image/green'),
    load_and_preprocess_images('DNA_Damage_Project/test images/green'),
)

print(f"Loaded {len(train_green_images)} training green images.")
print(f"Loaded {len(test_green_images)} testing green images.")

In [None]:
def pair_images(red_images, green_images):
    """Pair each red-channel image with the green-channel image at the same index.

    Pairing is purely positional, so both sequences must already be in matching
    order (the loaders in this notebook sort by filename).

    Args:
        red_images: Sequence of red-channel images.
        green_images: Sequence of green-channel images, same length and order.

    Returns:
        A list of (red_image, green_image) tuples.

    Raises:
        ValueError: If the two sequences differ in length. A count mismatch
            means at least one image has no partner, and positional pairing
            would silently combine unrelated fields of view.
    """
    if len(red_images) != len(green_images):
        raise ValueError(
            f"Cannot pair images: {len(red_images)} red vs {len(green_images)} green."
        )
    return list(zip(red_images, green_images))

# Build the (red, green) pairs for the training and test splits.
train_paired_images, test_paired_images = (
    pair_images(train_red_images, train_green_images),
    pair_images(test_red_images, test_green_images),
)

print(f"Created {len(train_paired_images)} paired training images.")
print(f"Created {len(test_paired_images)} paired testing images.")

In [None]:
def segment_nuclei(image, min_size=100, fg_ratio=0.7):
    """Segment nuclei with Otsu thresholding followed by marker-based watershed.

    Args:
        image: Single-channel grayscale image. It is converted with
            cv2.COLOR_GRAY2BGR for the watershed step; the loaders in this
            notebook produce 8-bit images — TODO confirm for other callers.
        min_size: Minimum connected-component area, in pixels, for a region to
            be kept as a nucleus. Defaults to the previously hard-coded 100.
        fg_ratio: Fraction of the maximum distance-transform value above which a
            pixel counts as "sure foreground" (a watershed seed). Defaults to the
            previously hard-coded 0.7.

    Returns:
        uint8 mask with the same shape as `image`: 255 on kept nuclei, 0 elsewhere.
    """
    # Otsu picks the foreground/background cutoff automatically.
    _, thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Morphological opening removes small specks from the binary mask.
    kernel = np.ones((3, 3), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)

    # Dilating the mask gives the outer limit of where nuclei may extend.
    sure_bg = cv2.dilate(opening, kernel, iterations=3)

    # Pixels far from any background pixel are certainly inside a nucleus.
    dist_transform = cv2.distanceTransform(opening, cv2.DIST_L2, 5)
    _, sure_fg = cv2.threshold(dist_transform, fg_ratio * dist_transform.max(), 255, 0)

    # The band between sure foreground and the dilated mask is undecided.
    sure_fg = np.uint8(sure_fg)
    unknown = cv2.subtract(sure_bg, sure_fg)

    # One marker per connected seed; shift by one so background is 1, not 0,
    # because watershed treats 0 as "unknown, to be decided".
    _, markers = cv2.connectedComponents(sure_fg)
    markers = markers + 1
    markers[unknown == 255] = 0

    # cv2.watershed expects a 3-channel image.
    img_bgr = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    markers = cv2.watershed(img_bgr, markers)

    # Labels > 1 are nuclei; 1 is background and -1 marks watershed boundaries.
    segmented_mask = np.zeros_like(image, dtype=np.uint8)
    segmented_mask[markers > 1] = 255

    # Drop components smaller than min_size. A per-label lookup table replaces
    # the per-label full-image scan of a Python loop.
    _, labels_img, stats, _ = cv2.connectedComponentsWithStats(segmented_mask, connectivity=8)
    keep_label = stats[:, cv2.CC_STAT_AREA] >= min_size
    keep_label[0] = False  # label 0 is the background component
    cleaned_mask = keep_label[labels_img].astype(np.uint8) * 255

    return cleaned_mask

# One nucleus mask per red-channel image, for both splits.
train_segmented_masks = list(map(segment_nuclei, train_red_images))
test_segmented_masks = list(map(segment_nuclei, test_red_images))

print(f"Generated {len(train_segmented_masks)} training segmented masks.")
print(f"Generated {len(test_segmented_masks)} testing segmented masks.")

In [None]:
def apply_global_thresholding(image, threshold_value=50):
    """Binarize a grayscale image with one fixed cutoff.

    Args:
        image: Grayscale image to threshold.
        threshold_value: Cutoff handed to `cv2.threshold` with THRESH_BINARY
            and a maximum value of 255.

    Returns:
        The thresholded image (the second item returned by `cv2.threshold`).
    """
    binary_image = cv2.threshold(image, threshold_value, 255, cv2.THRESH_BINARY)[1]
    return binary_image

# Binarize every green-channel image with one shared cutoff: the notebook-level
# `threshold_value` when an earlier cell defined it, otherwise 50.
threshold_value_to_use = globals().get("threshold_value", 50)

train_green_thresholded = [
    apply_global_thresholding(green, threshold_value_to_use) for green in train_green_images
]
test_green_thresholded = [
    apply_global_thresholding(green, threshold_value_to_use) for green in test_green_images
]

print(f"Applied global thresholding to {len(train_green_thresholded)} training green images.")
print(f"Applied global thresholding to {len(test_green_thresholded)} testing green images.")

In [None]:
def detect_foci_and_label_nucleus(paired_image, segmented_mask, thresholded_green_image,
                                  foci_threshold=10, target_size=(64, 64)):
    """Label every segmented nucleus as "damaged" or "normal" from its green signal.

    Args:
        paired_image: A (red, green) tuple of grayscale channel images. Only the
            red image is read here; the green signal comes from
            `thresholded_green_image`.
        segmented_mask: Binary nucleus mask; each external contour is treated as
            one nucleus.
        thresholded_green_image: Thresholded green-channel image.
        foci_threshold: A nucleus is "damaged" when MORE than this many non-zero
            thresholded green pixels fall inside it. Note this counts pixels,
            not distinct foci.
        target_size: (width, height) of the normalized nucleus crops. Defaults to
            the previously hard-coded (64, 64).

    Returns:
        A list of (normalized_red_nucleus, label) tuples. The crop is the red
        channel's bounding box of the nucleus, zero-padded to a square and
        resized to `target_size`. It is not masked, so neighbouring nuclei
        inside the bounding box remain visible.
    """
    red_img, _ = paired_image
    labeled_nuclei = []

    # Each external contour of the mask is one nucleus.
    contours, _ = cv2.findContours(segmented_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for contour in contours:
        # Filled mask of just this nucleus.
        nucleus_mask = np.zeros_like(segmented_mask)
        cv2.drawContours(nucleus_mask, [contour], -1, 255, -1)

        # Thresholded green signal restricted to this nucleus.
        masked_green = cv2.bitwise_and(thresholded_green_image, thresholded_green_image, mask=nucleus_mask)
        foci_count = np.count_nonzero(masked_green)
        label = "damaged" if foci_count > foci_threshold else "normal"

        # Red-channel crop of the nucleus bounding box.
        x, y, w, h = cv2.boundingRect(contour)
        red_roi = red_img[y:y+h, x:x+w]

        # Side of the square canvas: the enclosing circle's diameter, but never
        # smaller than the bounding box itself. Without the clamp, a rounded
        # diameter below w or h would give negative pad widths and np.pad
        # would raise.
        _, radius = cv2.minEnclosingCircle(contour)
        square_size = max(int(np.ceil(2 * radius)), w, h)

        # Centre the crop on the square canvas with zero padding.
        pad_x_before = (square_size - w) // 2
        pad_x_after = square_size - w - pad_x_before
        pad_y_before = (square_size - h) // 2
        pad_y_after = square_size - h - pad_y_before
        padded_red_roi = np.pad(red_roi, ((pad_y_before, pad_y_after), (pad_x_before, pad_x_after)), mode='constant')

        # Resize to the common model input size.
        normalized_red_nucleus = cv2.resize(padded_red_roi, target_size, interpolation=cv2.INTER_AREA)

        labeled_nuclei.append((normalized_red_nucleus, label))

    return labeled_nuclei

# Label the nuclei of every image from the precomputed masks and thresholded
# green images, flattening the per-image results into one list per split.
train_labeled_nuclei = [
    nucleus
    for idx in range(len(train_paired_images))
    for nucleus in detect_foci_and_label_nucleus(
        train_paired_images[idx],
        train_segmented_masks[idx],
        train_green_thresholded[idx]
    )
]

test_labeled_nuclei = [
    nucleus
    for idx in range(len(test_paired_images))
    for nucleus in detect_foci_and_label_nucleus(
        test_paired_images[idx],
        test_segmented_masks[idx],
        test_green_thresholded[idx]
    )
]

# Class balance per split (summing booleans counts the matches).
train_damaged_count = sum(label == "damaged" for _, label in train_labeled_nuclei)
train_normal_count = sum(label == "normal" for _, label in train_labeled_nuclei)
test_damaged_count = sum(label == "damaged" for _, label in test_labeled_nuclei)
test_normal_count = sum(label == "normal" for _, label in test_labeled_nuclei)

print(f"Training set: {train_damaged_count} damaged, {train_normal_count} normal.")
print(f"Testing set: {test_damaged_count} damaged, {test_normal_count} normal.")

In [None]:
def process_image_pair(paired_image):
    """Run segmentation, green-channel thresholding and foci labelling on one image pair.

    Args:
        paired_image: A (red, green) tuple of grayscale channel images.

    Returns:
        The result of `detect_foci_and_label_nucleus` for this pair: a list of
        (normalized red nucleus image, label) tuples, where label is
        "damaged" or "normal".
    """
    red_channel, green_channel = paired_image

    # Green-channel cutoff: use the module-level `threshold_value` when an
    # earlier cell defined one, otherwise fall back to 50.
    green_cutoff = globals().get("threshold_value", 50)

    # 1. Nucleus segmentation on the red channel.
    nuclei_mask = segment_nuclei(red_channel)

    # 2. Global thresholding of the green channel.
    binary_green = apply_global_thresholding(green_channel, green_cutoff)

    # 3. Foci detection and per-nucleus labelling.
    return detect_foci_and_label_nucleus(paired_image, nuclei_mask, binary_green)

# Run the full per-image pipeline over every training pair and flatten the
# per-image results into one list of (nucleus image, label) tuples.
all_train_labeled_nuclei = [
    nucleus
    for paired_img in train_paired_images
    for nucleus in process_image_pair(paired_img)
]

# Same for the test pairs.
all_test_labeled_nuclei = [
    nucleus
    for paired_img in test_paired_images
    for nucleus in process_image_pair(paired_img)
]

# Print the total number of labeled nuclei
print(f"Total labeled nuclei in training set: {len(all_train_labeled_nuclei)}")
print(f"Total labeled nuclei in testing set: {len(all_test_labeled_nuclei)}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np # Ensure numpy is imported
from sklearn.feature_selection import SelectKBest, f_classif # Import feature selection

# Split the (nucleus image, label) pairs collected in all_*_labeled_nuclei
# into separate image and label lists.
train_images_for_model_re = [nucleus[0] for nucleus in all_train_labeled_nuclei]
train_labels_for_model_re = [nucleus[1] for nucleus in all_train_labeled_nuclei]
test_images_for_model_re = [nucleus[0] for nucleus in all_test_labeled_nuclei]
test_labels_for_model_re = [nucleus[1] for nucleus in all_test_labeled_nuclei]

# Stack the crops and append a channel axis -> (n_samples, 64, 64, 1).
train_images_for_model_re = np.array(train_images_for_model_re).reshape(-1, 64, 64, 1)
test_images_for_model_re = np.array(test_images_for_model_re).reshape(-1, 64, 64, 1)

# Encode the string labels as integers (normal -> 0, damaged -> 1).
label_mapping = {"normal": 0, "damaged": 1}
train_labels_for_model_re = np.array([label_mapping[label] for label in train_labels_for_model_re])
test_labels_for_model_re = np.array([label_mapping[label] for label in test_labels_for_model_re])

# Recompute the handcrafted features from the same nuclei so that feature rows
# stay aligned with the labels built above.
train_shape_features_re = [extract_shape_features(nucleus_img) for nucleus_img, label in all_train_labeled_nuclei]
test_shape_features_re = [extract_shape_features(nucleus_img) for nucleus_img, label in all_test_labeled_nuclei]

train_texture_features_re = [extract_texture_features(nucleus_img) for nucleus_img, label in all_train_labeled_nuclei]
test_texture_features_re = [extract_texture_features(nucleus_img) for nucleus_img, label in all_test_labeled_nuclei]


# Convert the shape features to arrays (texture features are padded separately below).
train_shape_features_re = np.array(train_shape_features_re)
# Fix: this previously wrapped test_texture_features_re, which replaced the test
# shape features with texture features and made the train/test columns disagree.
test_shape_features_re = np.array(test_shape_features_re)

# Ensure all texture feature lists have the same length before converting to numpy array
def pad_features(features_list):
    """Zero-pad variable-length feature vectors into one 2-D float array.

    Every numeric entry (Python ``int``/``float`` or a NumPy integer/floating
    scalar) is converted to ``float``; any other entry is replaced by ``0.0``.
    Shorter vectors are right-padded with ``0.0`` to the longest length.

    Args:
        features_list: Sequence of per-sample feature sequences.

    Returns:
        ``np.ndarray`` of shape ``(len(features_list), max_len)``; an empty
        ``(0, 0)`` array when ``features_list`` is empty.
    """
    if len(features_list) == 0:
        # max() over an empty sequence would raise; return an empty matrix instead.
        return np.empty((0, 0))

    # NumPy scalars such as np.float32 / np.int64 are not instances of the
    # built-in int/float, so they must be listed explicitly or they would be
    # silently replaced by 0.0.
    numeric_types = (int, float, np.integer, np.floating)

    max_len = max(len(f) for f in features_list)
    padded_features = []
    for f in features_list:
        row = [float(x) if isinstance(x, numeric_types) else 0.0 for x in f]
        padded_features.append(row + [0.0] * (max_len - len(f)))
    return np.array(padded_features)

# Bring every texture vector to the same length so each split is a 2-D array.
train_texture_features_re = pad_features(train_texture_features_re)
test_texture_features_re = pad_features(test_texture_features_re)

# Join shape and texture columns per sample. If the two feature sets disagree
# on the number of rows, the combined matrix for that split is set to None.
if train_shape_features_re.shape[0] != train_texture_features_re.shape[0]:
    print("Mismatch in training samples for shape and texture features after re-extraction.")
    train_combined_features_re = None
else:
    train_texture_features_flat_re = train_texture_features_re.reshape(len(train_texture_features_re), -1)
    train_combined_features_re = np.concatenate(
        [train_shape_features_re, train_texture_features_flat_re], axis=1
    )

if test_shape_features_re.shape[0] != test_texture_features_re.shape[0]:
    print("Mismatch in testing samples for shape and texture features after re-extraction.")
    test_combined_features_re = None
else:
    test_texture_features_flat_re = test_texture_features_re.reshape(len(test_texture_features_re), -1)
    test_combined_features_re = np.concatenate(
        [test_shape_features_re, test_texture_features_flat_re], axis=1
    )


if train_combined_features_re is None or train_labels_for_model_re is None:
    print("Combined training features or labels not available for Random Forest optimization.")
else:
    # Oversample the minority class on the training features only.
    # NOTE(review): resampling happens before cross-validation, so synthetic
    # samples can end up in a different fold than the samples they were
    # generated from; CV recall may therefore be optimistic.
    smote = SMOTE(random_state=42)
    train_features_smote, train_labels_smote = smote.fit_resample(
        train_combined_features_re, train_labels_for_model_re
    )

    # Hyper-parameter grid explored by the search.
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': [None, 'balanced', 'balanced_subsample'],
    }

    # 5-fold grid search over a seeded random forest, ranked by recall.
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(rf, param_grid, scoring='recall', cv=5, n_jobs=-1)
    grid_search.fit(train_features_smote, train_labels_smote)

    best_params = grid_search.best_params_
    best_rf_classifier = grid_search.best_estimator_

    print("Best parameters found by GridSearchCV:")
    print(best_params)

    print("\nOptimized Random Forest classifier trained successfully.")

    # Score the refit best model on the untouched (non-resampled) test split.
    if test_combined_features_re is None or test_labels_for_model_re is None:
        print("\nTest data not available for evaluating the optimized Random Forest classifier.")
    else:
        predictions = best_rf_classifier.predict(test_combined_features_re)
        print("\nOptimized Random Forest Classifier Evaluation on Test Set:")
        print(classification_report(test_labels_for_model_re, predictions))

In [None]:
# List the contents of the DNA_Damage_Project directory.
!ls DNA_Damage_Project

In [None]:
!unzip -l DNA_Damage_Project-20250901T154313Z-1-001.zip
!unzip DNA_Damage_Project-20250901T154313Z-1-001.zip -d .

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every entry of ``base_dir`` as a grayscale image.

    Entries are visited in sorted filename order. An entry for which
    ``cv2.imread`` returns ``None`` is reported with a warning and skipped,
    so the result can hold fewer images than the directory has entries.

    Args:
        base_dir: Directory whose entries are read.

    Returns:
        ``np.array`` built from the list of grayscale images.
    """
    grayscale_images = []
    for filename in sorted(os.listdir(base_dir)):
        img_path = os.path.join(base_dir, filename)
        bgr_image = cv2.imread(img_path)

        if bgr_image is not None:
            # imread delivers BGR channel order, hence the BGR -> gray conversion.
            grayscale_images.append(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY))
        else:
            print(f"Warning: Could not load image {img_path}")

    return np.array(grayscale_images)

# Load the red-channel images for both splits (training first, then testing).
train_red_images, test_red_images = (
    load_and_preprocess_images('DNA_Damage_Project/image/red'),
    load_and_preprocess_images('DNA_Damage_Project/test images/red'),
)

print(f"Loaded {len(train_red_images)} training red images.")
print(f"Loaded {len(test_red_images)} testing red images.")

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every entry of ``base_dir`` as a grayscale image.

    Entries are visited in sorted filename order. An entry for which
    ``cv2.imread`` returns ``None`` is reported with a warning and skipped,
    so the result can hold fewer images than the directory has entries.

    Args:
        base_dir: Directory whose entries are read.

    Returns:
        ``np.array`` built from the list of grayscale images.
    """
    grayscale_images = []
    for filename in sorted(os.listdir(base_dir)):
        img_path = os.path.join(base_dir, filename)
        bgr_image = cv2.imread(img_path)

        if bgr_image is not None:
            # imread delivers BGR channel order, hence the BGR -> gray conversion.
            grayscale_images.append(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY))
        else:
            print(f"Warning: Could not load image {img_path}")

    return np.array(grayscale_images)

# Load the green-channel images for both splits (training first, then testing).
train_green_images, test_green_images = (
    load_and_preprocess_images('DNA_Damage_Project/image/green'),
    load_and_preprocess_images('DNA_Damage_Project/test images/green'),
)

print(f"Loaded {len(train_green_images)} training green images.")
print(f"Loaded {len(test_green_images)} testing green images.")

In [None]:
def pair_images(red_images, green_images):
    """Pair red and green images by position.

    One ``(red, green)`` tuple is produced for every index of ``red_images``.
    Nothing checks that the two sequences describe the same fields of view;
    the pairing relies on both having been loaded in the same order.
    Surplus entries in ``green_images`` are ignored, while a shorter
    ``green_images`` raises ``IndexError``.

    Args:
        red_images: Indexable sequence of red-channel images.
        green_images: Indexable sequence of green-channel images.

    Returns:
        List of ``(red_image, green_image)`` tuples.
    """
    return [
        (red_images[position], green_images[position])
        for position in range(len(red_images))
    ]

# Build the (red, green) pairs for both splits.
train_paired_images, test_paired_images = (
    pair_images(train_red_images, train_green_images),
    pair_images(test_red_images, test_green_images),
)

print(f"Created {len(train_paired_images)} paired training images.")
print(f"Created {len(test_paired_images)} paired testing images.")

In [None]:
def segment_nuclei(image):
    """Produce a binary nuclei mask using Otsu thresholding plus watershed.

    Args:
        image: Single-channel image; it is passed to ``cv2.threshold`` with
            the Otsu flag and converted with ``COLOR_GRAY2BGR`` for watershed.

    Returns:
        ``uint8`` mask with 255 on pixels belonging to retained nuclei and 0
        elsewhere. Connected components smaller than 100 pixels are dropped.
    """
    structuring_element = np.ones((3, 3), np.uint8)

    # Otsu picks the cut-off; two rounds of opening remove small specks.
    binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    opened = cv2.morphologyEx(binary, cv2.MORPH_OPEN, structuring_element, iterations=2)

    # Dilation gives the "sure background" side; pixels far from the edge
    # (>= 70% of the largest distance) form the "sure foreground" seeds.
    background = cv2.dilate(opened, structuring_element, iterations=3)
    distances = cv2.distanceTransform(opened, cv2.DIST_L2, 5)
    foreground = cv2.threshold(distances, 0.7 * distances.max(), 255, cv2.THRESH_BINARY)[1]
    foreground = foreground.astype(np.uint8)

    # Everything between the two is left for watershed to decide.
    undecided = cv2.subtract(background, foreground)

    # Seed labels are shifted by one so background is 1 and the undecided band is 0.
    seeds = cv2.connectedComponents(foreground)[1] + 1
    seeds[undecided == 255] = 0

    # Watershed needs a 3-channel image.
    seeds = cv2.watershed(cv2.cvtColor(image, cv2.COLOR_GRAY2BGR), seeds)

    # Labels above 1 are nuclei.
    nuclei = (seeds > 1).astype(np.uint8) * 255

    # Drop connected components below the minimum nucleus area.
    min_size = 100
    component_count, component_map, stats, _ = cv2.connectedComponentsWithStats(nuclei, connectivity=8)

    cleaned_mask = np.zeros_like(nuclei)
    for component in range(1, component_count):  # component 0 is the background
        if stats[component, cv2.CC_STAT_AREA] < min_size:
            continue
        cleaned_mask[component_map == component] = 255

    return cleaned_mask

# One nuclei mask per red-channel image, in the same order as the images.
train_segmented_masks = list(map(segment_nuclei, train_red_images))
test_segmented_masks = list(map(segment_nuclei, test_red_images))

print(f"Generated {len(train_segmented_masks)} training segmented masks.")
print(f"Generated {len(test_segmented_masks)} testing segmented masks.")

In [None]:
def apply_global_thresholding(image, threshold_value=50):
    """Binarise ``image`` with one fixed threshold.

    Args:
        image: Image passed to ``cv2.threshold``.
        threshold_value: Cut-off handed to ``cv2.threshold`` (default 50).

    Returns:
        The thresholded image from ``cv2.threshold`` in ``THRESH_BINARY``
        mode with a maximum value of 255.
    """
    # cv2.threshold returns (used_threshold, image); only the image is needed.
    return cv2.threshold(image, threshold_value, 255, cv2.THRESH_BINARY)[1]

# Binarise every green image with one shared cut-off: a notebook-level
# ``threshold_value`` when one is defined, otherwise 50.
threshold_value_to_use = globals().get("threshold_value", 50)

train_green_thresholded = [
    apply_global_thresholding(green, threshold_value_to_use) for green in train_green_images
]
test_green_thresholded = [
    apply_global_thresholding(green, threshold_value_to_use) for green in test_green_images
]

print(f"Applied global thresholding to {len(train_green_thresholded)} training green images.")
print(f"Applied global thresholding to {len(test_green_thresholded)} testing green images.")

In [None]:
def detect_foci_and_label_nucleus(paired_image, segmented_mask, thresholded_green_image, foci_threshold=10):
    """
    Label every segmented nucleus as "damaged" or "normal" and return a
    64x64 red-channel crop for each one.

    For each external contour of ``segmented_mask`` the function counts the
    non-zero pixels of ``thresholded_green_image`` that lie inside the filled
    contour. The nucleus is "damaged" when that count is strictly greater
    than ``foci_threshold``, otherwise "normal". The red-channel bounding box
    is zero-padded to a square and resized to 64x64.

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).
            Only the red image is read here.
        segmented_mask: The binary segmented mask for the nuclei.
        thresholded_green_image: The thresholded green channel image.
        foci_threshold: Number of non-zero green pixels that must be exceeded
            for a nucleus to be labeled "damaged".

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
    """
    red_img, _ = paired_image
    labeled_nuclei = []
    target_size = (64, 64)

    # Each external contour is treated as one nucleus.
    contours, _ = cv2.findContours(segmented_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for contour in contours:
        # Filled mask covering only the current nucleus.
        nucleus_mask = np.zeros_like(segmented_mask)
        cv2.drawContours(nucleus_mask, [contour], -1, 255, -1)

        # Green signal restricted to this nucleus; count its non-zero pixels.
        masked_green = cv2.bitwise_and(thresholded_green_image, thresholded_green_image, mask=nucleus_mask)
        foci_count = np.count_nonzero(masked_green)
        label = "damaged" if foci_count > foci_threshold else "normal"

        # Red-channel crop of the nucleus bounding box.
        x, y, w, h = cv2.boundingRect(contour)
        red_roi = red_img[y:y + h, x:x + w]

        # Side of the square canvas: the enclosing-circle diameter, but never
        # smaller than the bounding box itself. Without the lower bound a
        # rounded-down diameter would yield negative padding and np.pad would raise.
        _, radius = cv2.minEnclosingCircle(contour)
        square_size = max(int(np.ceil(2 * radius)), w, h)

        # Split the padding so the crop stays centred on the canvas.
        pad_x_before = (square_size - w) // 2
        pad_x_after = square_size - w - pad_x_before
        pad_y_before = (square_size - h) // 2
        pad_y_after = square_size - h - pad_y_before

        padded_red_roi = np.pad(
            red_roi,
            ((pad_y_before, pad_y_after), (pad_x_before, pad_x_after)),
            mode='constant',
        )

        # Bring every nucleus to the same fixed size.
        normalized_red_nucleus = cv2.resize(padded_red_roi, target_size, interpolation=cv2.INTER_AREA)

        labeled_nuclei.append((normalized_red_nucleus, label))

    return labeled_nuclei

# Label the nuclei of every image, reusing the masks and thresholded green
# images computed in the earlier cells (matched to the pairs by index).
train_labeled_nuclei = []
for idx, pair in enumerate(train_paired_images):
    train_labeled_nuclei += detect_foci_and_label_nucleus(
        pair,
        train_segmented_masks[idx],
        train_green_thresholded[idx],
    )

test_labeled_nuclei = []
for idx, pair in enumerate(test_paired_images):
    test_labeled_nuclei += detect_foci_and_label_nucleus(
        pair,
        test_segmented_masks[idx],
        test_green_thresholded[idx],
    )

# Class counts per split (summing booleans counts the matches).
train_damaged_count = sum(label == "damaged" for _, label in train_labeled_nuclei)
train_normal_count = sum(label == "normal" for _, label in train_labeled_nuclei)
test_damaged_count = sum(label == "damaged" for _, label in test_labeled_nuclei)
test_normal_count = sum(label == "normal" for _, label in test_labeled_nuclei)

print(f"Training set: {train_damaged_count} damaged, {train_normal_count} normal.")
print(f"Testing set: {test_damaged_count} damaged, {test_normal_count} normal.")

In [None]:
def process_image_pair(paired_image):
    """
    Run the per-image labeling pipeline on one (red, green) pair.

    The red channel is segmented into a nuclei mask, the green channel is
    binarised with a global threshold, and both results are passed to
    ``detect_foci_and_label_nucleus``.

    Args:
        paired_image: A 2-tuple ``(red_img, green_img)``.

    Returns:
        The value returned by ``detect_foci_and_label_nucleus`` for this pair.
    """
    red_img, green_img = paired_image

    # Step 1: nuclei mask from the red channel.
    nuclei_mask = segment_nuclei(red_img)

    # Step 2: binarise the green channel. A notebook-level ``threshold_value``
    # is used when one is defined; otherwise the cut-off falls back to 50.
    green_cutoff = globals().get("threshold_value", 50)
    green_binary = apply_global_thresholding(green_img, green_cutoff)

    # Step 3: per-nucleus foci counting and labeling.
    return detect_foci_and_label_nucleus(paired_image, nuclei_mask, green_binary)

# Flatten the per-image results into one list of labeled nuclei per split.
all_train_labeled_nuclei = [
    nucleus
    for paired_img in train_paired_images
    for nucleus in process_image_pair(paired_img)
]
all_test_labeled_nuclei = [
    nucleus
    for paired_img in test_paired_images
    for nucleus in process_image_pair(paired_img)
]

# Report how many nuclei each split produced.
print(f"Total labeled nuclei in training set: {len(all_train_labeled_nuclei)}")
print(f"Total labeled nuclei in testing set: {len(all_test_labeled_nuclei)}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np # Ensure numpy is imported
from sklearn.feature_selection import SelectKBest, f_classif # Import feature selection

# Integer encoding for the string labels.
label_mapping = {"normal": 0, "damaged": 1}

# Nucleus crops stacked with a trailing channel axis -> (n_samples, 64, 64, 1).
train_images_for_model_re = np.array(
    [crop for crop, _ in all_train_labeled_nuclei]
).reshape(-1, 64, 64, 1)
test_images_for_model_re = np.array(
    [crop for crop, _ in all_test_labeled_nuclei]
).reshape(-1, 64, 64, 1)

# Labels as integer arrays, in the same order as the crops.
train_labels_for_model_re = np.array([label_mapping[tag] for _, tag in all_train_labeled_nuclei])
test_labels_for_model_re = np.array([label_mapping[tag] for _, tag in all_test_labeled_nuclei])

# Handcrafted features recomputed from the same nuclei so rows match the labels.
# Shape features are converted to arrays straight away; texture features stay
# as lists because they are padded to a common length afterwards.
train_shape_features_re = np.array(
    [extract_shape_features(crop) for crop, _ in all_train_labeled_nuclei]
)
test_shape_features_re = np.array(
    [extract_shape_features(crop) for crop, _ in all_test_labeled_nuclei]
)

train_texture_features_re = [extract_texture_features(crop) for crop, _ in all_train_labeled_nuclei]
test_texture_features_re = [extract_texture_features(crop) for crop, _ in all_test_labeled_nuclei]

# Ensure all texture feature lists have the same length before converting to numpy array
def pad_features(features_list):
    """Zero-pad variable-length feature vectors into one 2-D float array.

    Every numeric entry (Python ``int``/``float`` or a NumPy integer/floating
    scalar) is converted to ``float``; any other entry is replaced by ``0.0``.
    Shorter vectors are right-padded with ``0.0`` to the longest length.

    Args:
        features_list: Sequence of per-sample feature sequences.

    Returns:
        ``np.ndarray`` of shape ``(len(features_list), max_len)``; an empty
        ``(0, 0)`` array when ``features_list`` is empty.
    """
    if len(features_list) == 0:
        # max() over an empty sequence would raise; return an empty matrix instead.
        return np.empty((0, 0))

    # NumPy scalars such as np.float32 / np.int64 are not instances of the
    # built-in int/float, so they must be listed explicitly or they would be
    # silently replaced by 0.0.
    numeric_types = (int, float, np.integer, np.floating)

    max_len = max(len(f) for f in features_list)
    padded_features = []
    for f in features_list:
        row = [float(x) if isinstance(x, numeric_types) else 0.0 for x in f]
        padded_features.append(row + [0.0] * (max_len - len(f)))
    return np.array(padded_features)

# Bring every texture vector to the same length so each split is a 2-D array.
train_texture_features_re = pad_features(train_texture_features_re)
test_texture_features_re = pad_features(test_texture_features_re)

# Join shape and texture columns per sample. If the two feature sets disagree
# on the number of rows, the combined matrix for that split is set to None.
if train_shape_features_re.shape[0] == train_texture_features_re.shape[0]:
    train_texture_features_flat_re = train_texture_features_re.reshape(train_texture_features_re.shape[0], -1)
    train_combined_features_re = np.concatenate((train_shape_features_re, train_texture_features_flat_re), axis=1)
else:
    print("Mismatch in training samples for shape and texture features after re-extraction.")
    train_combined_features_re = None

if test_shape_features_re.shape[0] == test_texture_features_re.shape[0]:
    test_texture_features_flat_re = test_texture_features_re.reshape(test_texture_features_re.shape[0], -1)
    test_combined_features_re = np.concatenate((test_shape_features_re, test_texture_features_flat_re), axis=1)
else:
    print("Mismatch in testing samples for shape and texture features after re-extraction.")
    test_combined_features_re = None


# Oversample the minority class on the training features only, then tune a
# random forest for recall.
if train_combined_features_re is not None and train_labels_for_model_re is not None:
    # NOTE(review): resampling happens before cross-validation, so synthetic
    # samples can end up in a different fold than the samples they were
    # generated from; CV recall may therefore be optimistic.
    smote = SMOTE(random_state=42)
    train_features_smote, train_labels_smote = smote.fit_resample(train_combined_features_re, train_labels_for_model_re)

    # Hyper-parameter grid explored by the search.
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        # Fix: was [1, 1, 4], which fitted the value 1 twice and never tried 2.
        'min_samples_leaf': [1, 2, 4],
        'class_weight': [None, 'balanced', 'balanced_subsample']  # Experiment with class weighting
    }

    # Seeded base estimator so the search is repeatable.
    rf = RandomForestClassifier(random_state=42)

    # 5-fold grid search ranked by recall, using all available cores.
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='recall', n_jobs=-1)

    # Fit on the SMOTE-resampled training data.
    grid_search.fit(train_features_smote, train_labels_smote)

    best_params = grid_search.best_params_
    best_rf_classifier = grid_search.best_estimator_

    print("Best parameters found by GridSearchCV:")
    print(best_params)

    print("\nOptimized Random Forest classifier trained successfully.")

    # Score the refit best model on the untouched (non-resampled) test split.
    if test_combined_features_re is not None and test_labels_for_model_re is not None:
        predictions = best_rf_classifier.predict(test_combined_features_re)
        print("\nOptimized Random Forest Classifier Evaluation on Test Set:")
        print(classification_report(test_labels_for_model_re, predictions))
    else:
        print("\nTest data not available for evaluating the optimized Random Forest classifier.")

else:
    print("Combined training features or labels not available for Random Forest optimization.")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# CNN: three Conv(3x3)+MaxPool(2x2) stages with 32/64/128 filters, followed by
# a 128-unit dense layer, dropout and a single sigmoid output for the binary label.
cnn_layers = [
    Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 1)),
    MaxPooling2D((2, 2)),
]
for n_filters in (64, 128):
    cnn_layers.append(Conv2D(n_filters, (3, 3), activation='relu'))
    cnn_layers.append(MaxPooling2D((2, 2)))
cnn_layers.extend([
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model = Sequential(cnn_layers)

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Random rotations, zooms, shifts and flips applied on the fly during training.
datagen = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=20,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
)
datagen.fit(train_images_for_model_re)

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics are computed on the same images that the
# final evaluation uses.
train_batches = datagen.flow(train_images_for_model_re, train_labels_for_model_re, batch_size=32)
history = model.fit(
    train_batches,
    validation_data=(test_images_for_model_re, test_labels_for_model_re),
    epochs=50,
)

print("CNN model training completed.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Sigmoid outputs for the test crops, cut at 0.5 to obtain hard 0/1 predictions.
predictions_prob = model.predict(test_images_for_model_re)
predictions = (predictions_prob > 0.5).astype("int32")

# Per-class precision/recall/F1 and the confusion matrix against the true labels.
cnn_report = classification_report(test_labels_for_model_re, predictions)
conf_matrix_cnn = confusion_matrix(test_labels_for_model_re, predictions)

print("CNN Classification Report:")
print(cnn_report)
print("CNN Confusion Matrix:")
print(conf_matrix_cnn)

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

# CNN sigmoid outputs for every nucleus, shaped as an (n_samples, 1) column.
# NOTE(review): the training-split probabilities are produced for the same
# images the CNN was fitted on, so they are in-sample; the second-level model
# may learn to over-trust this column. Out-of-fold predictions would avoid that.
train_cnn_predictions_prob = model.predict(train_images_for_model_re).reshape(-1, 1)
test_cnn_predictions_prob = model.predict(test_images_for_model_re).reshape(-1, 1)


# Append the CNN column to the handcrafted features. The combined feature
# matrices are None when the feature cell found a row-count mismatch, so that
# case is checked before reading .shape.
if train_combined_features_re is None:
    print("Combined training features not available; cannot build ensemble training features.")
    train_ensemble_features_re = None
elif train_combined_features_re.shape[0] == train_cnn_predictions_prob.shape[0]:
    train_ensemble_features_re = np.concatenate((train_combined_features_re, train_cnn_predictions_prob), axis=1)
    print(f"Shape of train_ensemble_features_re: {train_ensemble_features_re.shape}")
else:
    print("Mismatch in the number of training samples for combined features and CNN predictions.")
    train_ensemble_features_re = None

if test_combined_features_re is None:
    print("Combined testing features not available; cannot build ensemble testing features.")
    test_ensemble_features_re = None
elif test_combined_features_re.shape[0] == test_cnn_predictions_prob.shape[0]:
    test_ensemble_features_re = np.concatenate((test_combined_features_re, test_cnn_predictions_prob), axis=1)
    print(f"Shape of test_ensemble_features_re: {test_ensemble_features_re.shape}")
else:
    print("Mismatch in the number of testing samples for combined features and CNN predictions.")
    test_ensemble_features_re = None


# --- Feature selection: keep the top-k columns by ANOVA F-value ---
# Both results start as None so later cells can always test them.
train_ensemble_features_selected_re = None
test_ensemble_features_selected_re = None

if train_ensemble_features_re is None or train_labels_for_model_re is None:
    print("Ensemble features not available for selection.")
else:
    # Replace NaN/Inf so the F-test and the forest receive finite values.
    train_ensemble_features_clean_re = np.nan_to_num(train_ensemble_features_re)
    test_ensemble_features_clean_re = (
        np.nan_to_num(test_ensemble_features_re) if test_ensemble_features_re is not None else None
    )

    # Default (also the fallback when selection is skipped or fails): all features.
    train_ensemble_features_selected_re = train_ensemble_features_clean_re
    test_ensemble_features_selected_re = test_ensemble_features_clean_re

    has_samples = train_ensemble_features_clean_re.shape[0] > 1
    if has_samples and np.var(train_ensemble_features_clean_re, axis=0).sum() > 0:
        # At most 100 features, fewer when the matrix has fewer columns.
        k = min(100, train_ensemble_features_clean_re.shape[1])
        try:
            selector = SelectKBest(score_func=f_classif, k=k)
            selected_train = selector.fit_transform(train_ensemble_features_clean_re, train_labels_for_model_re)
            # The selector fitted on the training split is reused for the test split.
            selected_test = (
                selector.transform(test_ensemble_features_clean_re)
                if test_ensemble_features_clean_re is not None
                else None
            )
        except ValueError as e:
            print(f"Could not perform feature selection: {e}")
            print("Using all features for ensemble training.")
        else:
            train_ensemble_features_selected_re = selected_train
            test_ensemble_features_selected_re = selected_test
            print(f"Shape of train_ensemble_features_selected_re: {train_ensemble_features_selected_re.shape}")
            if test_ensemble_features_selected_re is not None:
                print(f"Shape of test_ensemble_features_selected_re: {test_ensemble_features_selected_re.shape}")
    else:
        print("Not enough samples or variance for feature selection. Using all features.")


# --- Second-level random forest on the (SMOTE-balanced) selected features ---
# Initialised to None on every path so the evaluation cell never hits a NameError.
ensemble_classifier_re = None

if train_ensemble_features_selected_re is None or train_labels_for_model_re is None:
    print("Could not train ensemble classifier due to missing data.")
else:
    try:
        smote_ensemble = SMOTE(random_state=42)
        train_ensemble_features_smote_re, train_labels_ensemble_smote_re = smote_ensemble.fit_resample(
            train_ensemble_features_selected_re, train_labels_for_model_re
        )

        fitted_forest = RandomForestClassifier(n_estimators=100, random_state=42)
        fitted_forest.fit(train_ensemble_features_smote_re, train_labels_ensemble_smote_re)
    except ValueError as e:
        print(f"Could not train ensemble classifier: {e}")
    else:
        ensemble_classifier_re = fitted_forest
        print("\nEnsemble Random Forest classifier trained successfully with SMOTE.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the second-level model only when both it and the selected test
# features exist; otherwise say so and stop.
if test_ensemble_features_selected_re is None or ensemble_classifier_re is None:
    print("Ensemble model or test features not available for evaluation.")
else:
    ensemble_predictions_re = ensemble_classifier_re.predict(test_ensemble_features_selected_re)

    # Per-class precision / recall / F1.
    print("Ensemble Model Classification Report:")
    print(classification_report(test_labels_for_model_re, ensemble_predictions_re))

    # Overall accuracy.
    ensemble_accuracy_re = accuracy_score(test_labels_for_model_re, ensemble_predictions_re)
    print(f"Ensemble Model Accuracy Score: {ensemble_accuracy_re}")

    # Confusion matrix of true labels against predictions.
    conf_matrix_ensemble_re = confusion_matrix(test_labels_for_model_re, ensemble_predictions_re)
    print("Ensemble Model Confusion Matrix:")
    print(conf_matrix_ensemble_re)

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every entry of ``base_dir`` as a grayscale image.

    Entries are visited in sorted filename order. An entry for which
    ``cv2.imread`` returns ``None`` is reported with a warning and skipped,
    so the result can hold fewer images than the directory has entries.

    Args:
        base_dir: Directory whose entries are read.

    Returns:
        ``np.array`` built from the list of grayscale images.
    """
    grayscale_images = []
    for filename in sorted(os.listdir(base_dir)):
        img_path = os.path.join(base_dir, filename)
        bgr_image = cv2.imread(img_path)

        if bgr_image is not None:
            # imread delivers BGR channel order, hence the BGR -> gray conversion.
            grayscale_images.append(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY))
        else:
            print(f"Warning: Could not load image {img_path}")

    return np.array(grayscale_images)

# Load the red-channel images for both splits (training first, then testing).
train_red_images, test_red_images = (
    load_and_preprocess_images('DNA_Damage_Project/image/red'),
    load_and_preprocess_images('DNA_Damage_Project/test images/red'),
)

print(f"Loaded {len(train_red_images)} training red images.")
print(f"Loaded {len(test_red_images)} testing red images.")

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every file in ``base_dir`` (in sorted filename order) as a grayscale image.

    This re-declares the loader with the same behaviour as the earlier
    definition of the same name in this notebook.

    Args:
        base_dir: Directory whose entries are passed to ``cv2.imread``.

    Returns:
        A NumPy array built from the list of grayscale images. Entries that
        ``cv2.imread`` cannot decode are reported with a warning and skipped,
        so the result may hold fewer images than the directory has entries.
    """
    grayscale_stack = []
    for entry in sorted(os.listdir(base_dir)):
        full_path = os.path.join(base_dir, entry)
        bgr = cv2.imread(full_path)

        if bgr is not None:
            # cv2.imread yields BGR channel order; collapse it to one channel.
            grayscale_stack.append(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY))
        else:
            print(f"Warning: Could not load image {full_path}")
    return np.array(grayscale_stack)

# Read both green-channel splits (training first, then testing).
train_green_images, test_green_images = (
    load_and_preprocess_images(green_dir)
    for green_dir in ('DNA_Damage_Project/image/green', 'DNA_Damage_Project/test images/green')
)

print(f"Loaded {len(train_green_images)} training green images.")
print(f"Loaded {len(test_green_images)} testing green images.")

In [None]:
def pair_images(red_images, green_images):
    """Pair each red-channel image with the green-channel image at the same index.

    Pairing is purely positional: element ``i`` of ``red_images`` is matched
    with element ``i`` of ``green_images``. The caller is responsible for both
    sequences being in the same order.

    Args:
        red_images: Sequence of red-channel images.
        green_images: Sequence of green-channel images, same length and order.

    Returns:
        A list of ``(red_image, green_image)`` tuples.

    Raises:
        ValueError: If the two sequences differ in length. A length mismatch
            means at least one image has no partner, so every positional pair
            after the gap would combine unrelated fields of view.
    """
    if len(red_images) != len(green_images):
        raise ValueError(
            f"Cannot pair images by index: got {len(red_images)} red and "
            f"{len(green_images)} green images."
        )
    return list(zip(red_images, green_images))

# Build (red, green) tuples for each split; pairing is by position.
train_paired_images, test_paired_images = (
    pair_images(red_stack, green_stack)
    for red_stack, green_stack in (
        (train_red_images, train_green_images),
        (test_red_images, test_green_images),
    )
)

print(f"Created {len(train_paired_images)} paired training images.")
print(f"Created {len(test_paired_images)} paired testing images.")

In [None]:
def segment_nuclei(image, min_size=100, fg_distance_ratio=0.7):
    """Segment nuclei with Otsu thresholding followed by marker-based watershed.

    Args:
        image: Single-channel grayscale image (it is passed to
            ``cv2.threshold`` with ``THRESH_OTSU`` and converted with
            ``COLOR_GRAY2BGR``).
        min_size: Minimum connected-component area, in pixels, for a region to
            be kept in the final mask. Defaults to 100.
        fg_distance_ratio: Fraction of the maximum distance-transform value
            used as the cut-off for "sure foreground" seeds. Defaults to 0.7.

    Returns:
        A ``uint8`` mask with the same shape as ``image``: 255 inside kept
        nuclei, 0 elsewhere.
    """
    # Global Otsu threshold, then a morphological opening to drop speckle.
    _, thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    kernel = np.ones((3, 3), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)

    # Dilating the opened mask gives a region guaranteed to contain all
    # foreground; everything outside it is sure background.
    sure_bg = cv2.dilate(opening, kernel, iterations=3)

    # Pixels far from any background pixel are sure foreground (the seeds).
    dist_transform = cv2.distanceTransform(opening, cv2.DIST_L2, 5)
    _, sure_fg = cv2.threshold(dist_transform, fg_distance_ratio * dist_transform.max(), 255, 0)
    sure_fg = np.uint8(sure_fg)

    # The band between sure background and sure foreground is left for
    # watershed to decide.
    unknown = cv2.subtract(sure_bg, sure_fg)

    # One marker per seed; shift labels by one so background is 1 and the
    # undecided band can use 0.
    _, markers = cv2.connectedComponents(sure_fg)
    markers = markers + 1
    markers[unknown == 255] = 0

    # cv2.watershed requires a 3-channel image.
    img_bgr = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    markers = cv2.watershed(img_bgr, markers)

    # Labels > 1 are nuclei (1 is background, -1 marks watershed boundaries).
    segmented_mask = np.zeros_like(image, dtype=np.uint8)
    segmented_mask[markers > 1] = 255

    # Drop components smaller than min_size. Label 0 is the background
    # component, hence the [1:] slice and the +1 to restore label ids.
    _, labels_img, stats, _ = cv2.connectedComponentsWithStats(segmented_mask, connectivity=8)
    kept_labels = np.flatnonzero(stats[1:, cv2.CC_STAT_AREA] >= min_size) + 1

    cleaned_mask = np.zeros_like(segmented_mask)
    cleaned_mask[np.isin(labels_img, kept_labels)] = 255

    return cleaned_mask

# Nuclei are segmented on the red channel only.
train_segmented_masks = list(map(segment_nuclei, train_red_images))
test_segmented_masks = list(map(segment_nuclei, test_red_images))

print(f"Generated {len(train_segmented_masks)} training segmented masks.")
print(f"Generated {len(test_segmented_masks)} testing segmented masks.")

In [None]:
def apply_global_thresholding(image, threshold_value=50):
    """Binarise a grayscale image with a single fixed cut-off.

    Args:
        image: Grayscale image passed to ``cv2.threshold``.
        threshold_value: Cut-off; pixels above it become 255, the rest 0.

    Returns:
        The thresholded image returned by ``cv2.threshold``.
    """
    # cv2.threshold returns (used_threshold, image); only the image is needed.
    return cv2.threshold(image, threshold_value, 255, cv2.THRESH_BINARY)[1]

# Binarise every green-channel image with one shared cut-off. A
# `threshold_value` already present in the notebook namespace takes
# precedence; otherwise the default of 50 is used.
threshold_value_to_use = globals().get("threshold_value", 50)

train_green_thresholded = [
    apply_global_thresholding(green, threshold_value_to_use) for green in train_green_images
]
test_green_thresholded = [
    apply_global_thresholding(green, threshold_value_to_use) for green in test_green_images
]

print(f"Applied global thresholding to {len(train_green_thresholded)} training green images.")
print(f"Applied global thresholding to {len(test_green_thresholded)} testing green images.")

In [None]:
def detect_foci_and_label_nucleus(paired_image, segmented_mask, thresholded_green_image, foci_threshold=10, target_size=(64, 64)):
    """
    Labels each segmented nucleus as "damaged" or "normal" from the green
    signal inside it and returns a size-normalized red-channel crop per nucleus.

    Args:
        paired_image: A tuple containing the red and green channel images
            (grayscale). Only the red image is read here; the green channel
            enters through ``thresholded_green_image``.
        segmented_mask: The binary segmented mask for the nuclei.
        thresholded_green_image: The thresholded green channel image.
        foci_threshold: A nucleus is "damaged" when the number of non-zero
            thresholded green pixels inside it is strictly greater than this
            value. Note this is a pixel count, not a count of discrete foci.
        target_size: ``(width, height)`` passed to ``cv2.resize`` for the
            output crop. Defaults to ``(64, 64)``.

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
    """
    red_img, _ = paired_image
    labeled_nuclei = []

    # Each external contour of the mask is treated as one nucleus.
    contours, _ = cv2.findContours(segmented_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for contour in contours:
        # Filled mask covering only the current nucleus.
        nucleus_mask = np.zeros_like(segmented_mask)
        cv2.drawContours(nucleus_mask, [contour], -1, 255, -1)

        # Thresholded green pixels that fall inside this nucleus.
        masked_green = cv2.bitwise_and(thresholded_green_image, thresholded_green_image, mask=nucleus_mask)
        foci_count = np.count_nonzero(masked_green)

        label = "damaged" if foci_count > foci_threshold else "normal"

        # Red-channel crop of the nucleus bounding box.
        x, y, w, h = cv2.boundingRect(contour)
        red_roi = red_img[y:y+h, x:x+w]

        # Side of the square canvas: the enclosing-circle diameter, but never
        # smaller than the bounding box. boundingRect counts pixels inclusively
        # (w = xmax - xmin + 1), so ceil(2 * radius) can fall short of w or h,
        # which would make the pad widths negative and np.pad raise.
        _, radius = cv2.minEnclosingCircle(contour)
        square_size = max(int(np.ceil(2 * radius)), w, h)

        # Zero-pad the crop to the square, keeping the bounding box centred.
        pad_x_before = (square_size - w) // 2
        pad_x_after = square_size - w - pad_x_before
        pad_y_before = (square_size - h) // 2
        pad_y_after = square_size - h - pad_y_before

        padded_red_roi = np.pad(red_roi, ((pad_y_before, pad_y_after), (pad_x_before, pad_x_after)), mode='constant')

        # Bring every nucleus to the same pixel size.
        normalized_red_nucleus = cv2.resize(padded_red_roi, target_size, interpolation=cv2.INTER_AREA)

        labeled_nuclei.append((normalized_red_nucleus, label))

    return labeled_nuclei

# Label every nucleus of every image, split by split. The paired image, its
# nucleus mask and its thresholded green image are matched by index.
train_labeled_nuclei = []
for i, paired in enumerate(train_paired_images):
    train_labeled_nuclei += detect_foci_and_label_nucleus(
        paired, train_segmented_masks[i], train_green_thresholded[i]
    )

test_labeled_nuclei = []
for i, paired in enumerate(test_paired_images):
    test_labeled_nuclei += detect_foci_and_label_nucleus(
        paired, test_segmented_masks[i], test_green_thresholded[i]
    )

# Class balance per split
train_damaged_count = sum(label == "damaged" for _, label in train_labeled_nuclei)
train_normal_count = sum(label == "normal" for _, label in train_labeled_nuclei)
test_damaged_count = sum(label == "damaged" for _, label in test_labeled_nuclei)
test_normal_count = sum(label == "normal" for _, label in test_labeled_nuclei)

print(f"Training set: {train_damaged_count} damaged, {train_normal_count} normal.")
print(f"Testing set: {test_damaged_count} damaged, {test_normal_count} normal.")

In [None]:
def process_image_pair(paired_image):
    """
    Runs one (red, green) image pair through the whole labeling pipeline:
    nucleus segmentation, green thresholding, then per-nucleus labeling.

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
    """
    red_channel, green_channel = paired_image

    # Step 1: nuclei are segmented on the red channel.
    nuclei_mask = segment_nuclei(red_channel)

    # Step 2: binarise the green channel. A notebook-level `threshold_value`
    # wins when it exists; otherwise the cut-off defaults to 50.
    green_cutoff = globals().get("threshold_value", 50)
    green_binary = apply_global_thresholding(green_channel, green_cutoff)

    # Step 3: label each nucleus from the green signal inside it.
    return detect_foci_and_label_nucleus(paired_image, nuclei_mask, green_binary)

# Flatten the per-image nucleus lists into one list per split.
all_train_labeled_nuclei = [
    nucleus
    for paired_img in train_paired_images
    for nucleus in process_image_pair(paired_img)
]

all_test_labeled_nuclei = [
    nucleus
    for paired_img in test_paired_images
    for nucleus in process_image_pair(paired_img)
]

# Report dataset sizes
print(f"Total labeled nuclei in training set: {len(all_train_labeled_nuclei)}")
print(f"Total labeled nuclei in testing set: {len(all_test_labeled_nuclei)}")

In [None]:
# String labels -> integer classes
label_mapping = {"normal": 0, "damaged": 1}

# Training split: stack the crops and append a trailing channel axis.
train_images_for_model = np.array(
    [crop for crop, _ in all_train_labeled_nuclei]
).reshape(-1, 64, 64, 1)
train_labels_for_model = np.array(
    [label_mapping[name] for _, name in all_train_labeled_nuclei]
)

# Testing split: same treatment.
test_images_for_model = np.array(
    [crop for crop, _ in all_test_labeled_nuclei]
).reshape(-1, 64, 64, 1)
test_labels_for_model = np.array(
    [label_mapping[name] for _, name in all_test_labeled_nuclei]
)

# Sanity-check the resulting shapes
print(f"Shape of train_images_for_model: {train_images_for_model.shape}")
print(f"Shape of train_labels_for_model: {train_labels_for_model.shape}")
print(f"Shape of test_images_for_model: {test_images_for_model.shape}")
print(f"Shape of test_labels_for_model: {test_labels_for_model.shape}")

In [None]:
import mahotas
from skimage.feature import hog
import cv2
import numpy as np
from skimage import measure # Import measure for regionprops

def extract_shape_features(image):
    """Extracts shape features (region properties, Hu moments, HOG) for one nucleus image.

    The input may be a binary mask or a grayscale crop. Region properties and
    Hu moments need a binary foreground, so a grayscale input is binarised
    with Otsu's threshold first. Without this, ``measure.label`` would label
    patches of equal intensity rather than the nucleus, and
    ``cv2.findContours`` would treat every non-zero pixel as foreground.
    HOG is still computed on the (uint8) input itself.

    Args:
        image: 2-D array; either a binary mask (0/1 or 0/255) or a grayscale image.

    Returns:
        A flat list of length ``8 + 7 + n_hog``: area, perimeter, major axis,
        minor axis, axis ratio, eccentricity, solidity, roundness, the 7 Hu
        moments, then the HOG vector. ``n_hog`` is
        ``(rows // 16) * (cols // 16) * 9``. When no foreground is found the
        list has the same length and is all zeros, so rows from equally sized
        inputs always stack into a rectangular array.
    """
    # Masks stored as 0/1 are rescaled to 0/255 before the uint8 cast.
    if image.max() <= 1:
        image = image * 255
    image = image.astype(np.uint8)

    # HOG layout; with one cell per block the vector length is cells * orientations.
    hog_orientations = 9
    hog_cell = 16
    n_hog = (image.shape[0] // hog_cell) * (image.shape[1] // hog_cell) * hog_orientations
    n_features = 8 + 7 + n_hog

    # Binary foreground used for contours and region properties.
    if np.unique(image).size > 2:
        _, mask = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    else:
        mask = np.where(image > 0, 255, 0).astype(np.uint8)

    contours, _ = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return [0] * n_features

    # The largest contour is taken to be the nucleus.
    contour = max(contours, key=cv2.contourArea)

    labeled_image, num_labels = measure.label(mask, connectivity=2, return_num=True)
    if num_labels < 1:
        return [0] * n_features

    # Use the largest labelled region, matching the largest-contour choice above.
    properties = max(measure.regionprops(labeled_image), key=lambda region: region.area)

    area = properties.area
    perimeter = properties.perimeter
    major_axis_length = properties.major_axis_length if properties.major_axis_length is not None else 0
    minor_axis_length = properties.minor_axis_length if properties.minor_axis_length is not None else 0
    axis_ratio = major_axis_length / minor_axis_length if minor_axis_length > 0 else 0
    eccentricity = properties.eccentricity if properties.eccentricity is not None else 0
    solidity = properties.solidity if properties.solidity is not None else 0

    # Roundness / form factor: 1.0 for a perfect circle.
    roundness = 4 * np.pi * area / (perimeter**2) if perimeter > 0 else 0

    # Hu moments of the nucleus contour.
    hu_moments = cv2.HuMoments(cv2.moments(contour)).flatten()

    # HOG on the input image; the fallback keeps the vector length unchanged.
    try:
        hog_features = hog(
            image,
            pixels_per_cell=(hog_cell, hog_cell),
            cells_per_block=(1, 1),
            orientations=hog_orientations,
            feature_vector=True,
        )
    except ValueError:
        hog_features = [0] * n_hog

    features = [area, perimeter, major_axis_length, minor_axis_length, axis_ratio, eccentricity, solidity, roundness] + list(hu_moments) + list(hog_features)

    return features

# One shape-feature row per labeled nucleus, stacked into an array per split.
train_shape_features = np.array(
    [extract_shape_features(sample[0]) for sample in all_train_labeled_nuclei]
)
test_shape_features = np.array(
    [extract_shape_features(sample[0]) for sample in all_test_labeled_nuclei]
)

print(f"Shape of train_shape_features: {train_shape_features.shape}")
print(f"Shape of test_shape_features: {test_shape_features.shape}")

In [None]:
import mahotas as mh
from skimage import feature, measure
import cv2
import numpy as np
from skimage.filters import gaussian#, laplacian # Comment out laplacian import
from skimage.feature import SIFT
import scipy.ndimage # Import scipy for laplacian

def extract_texture_features(image):
    """Extracts texture features from a grayscale image as one flat list of numbers.

    Feature groups, in output order:
        1. mahotas Haralick, averaged over the 4 directions (13 values)
        2. LBP histogram, P=8, R=1, 'uniform' (27 bins)
        3. scikit-image GLCM contrast / correlation / energy / homogeneity for
           3 distances x 4 angles (48 values)
        4. Laplacian-of-Gaussian mean and std (2 values)
        5. Gabor response mean and std for 4 orientations x 2 wavelengths (16 values)
        6. Zernike moments of the Otsu-binarised image, degree 8 (25 values)
        7. SIFT descriptor mean and std (2 values)

    Any group that fails is replaced by zeros of that group's length, so the
    output length does not depend on which groups succeeded.

    Args:
        image: 2-D grayscale image; it is cast to ``uint8``.

    Returns:
        A flat list of numbers. Images smaller than 21 pixels on either side
        yield an all-zero list of the same length.
    """
    # Group sizes, shared by the normal path and every zero fallback.
    n_haralick = 13
    n_lbp = 27
    glcm_distances = [1, 3, 5]
    glcm_angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]
    glcm_props = ['contrast', 'correlation', 'energy', 'homogeneity']
    n_glcm = len(glcm_distances) * len(glcm_angles) * len(glcm_props)
    n_log = 2
    n_gabor = 4 * 2 * 2  # 4 orientations * 2 wavelengths * (mean, std)
    n_zernike = 25
    n_sift = 2

    image = image.astype(np.uint8)

    # The Gabor kernels below are 21x21; smaller images are not processed.
    if image.shape[0] < 21 or image.shape[1] < 21:
        return [0] * (n_haralick + n_lbp + n_glcm + n_log + n_gabor + n_zernike + n_sift)

    # --- Haralick (mahotas) ---
    # haralick() returns one row of 13 features per direction. Average the
    # rows so the result is a flat list of 13 numbers; a nested list here
    # would not survive the later numeric conversion.
    if np.all(image == image[0, 0]):
        # Constant images are skipped rather than passed to haralick().
        haralick_features = [0] * n_haralick
    else:
        try:
            haralick_features = mh.features.haralick(image).mean(axis=0).tolist()
        except ValueError:
            haralick_features = [0] * n_haralick

    # --- Local Binary Patterns (skimage) ---
    # Histogram over 27 unit-width bins; the bin count is kept fixed so the
    # feature length stays stable.
    try:
        lbp = feature.local_binary_pattern(image, P=8, R=1, method='uniform')
        lbp_hist, _ = np.histogram(lbp, bins=np.arange(0, 28), density=True)
        lbp_features = lbp_hist.tolist()
    except Exception:
        lbp_features = [0] * n_lbp

    # --- GLCM properties (skimage) ---
    # graycomatrix / graycoprops live in skimage.feature, and graycoprops
    # takes one property name per call.
    try:
        glcm = feature.graycomatrix(image, distances=glcm_distances, angles=glcm_angles, symmetric=True, normed=True)
        haralick_features_skimage = []
        for prop_name in glcm_props:
            haralick_features_skimage.extend(feature.graycoprops(glcm, prop_name).ravel().tolist())
    except Exception:
        haralick_features_skimage = [0] * n_glcm

    # --- Laplacian of Gaussian (cv2) ---
    try:
        blurred_image = cv2.GaussianBlur(image, (0, 0), 1)
        log_image = cv2.Laplacian(blurred_image, cv2.CV_64F)
        log_features = [np.mean(log_image), np.std(log_image)]
    except Exception:
        log_features = [0] * n_log

    # --- Gabor filter bank (cv2) ---
    try:
        gabor_features = []
        kernels = []
        for theta in np.arange(0, np.pi, np.pi / 4):  # 4 orientations
            for freq in [5, 10]:  # passed as the kernel's wavelength argument
                kern = cv2.getGaborKernel((21, 21), 5.0, theta, freq, 0.5, 0, ktype=cv2.CV_32F)
                kernels.append(kern)

        for kernel in kernels:
            fimg = cv2.filter2D(image, cv2.CV_8UC3, kernel)
            # Collapse to one channel if the filter output came back as BGR.
            if len(fimg.shape) == 3:
                fimg = cv2.cvtColor(fimg, cv2.COLOR_BGR2GRAY)
            gabor_features.extend([np.mean(fimg), np.std(fimg)])
    except Exception:
        gabor_features = [0] * n_gabor

    # --- Zernike moments (mahotas) on the Otsu-binarised image ---
    try:
        _, binary_nucleus = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        zernike_moments = mh.features.zernike_moments(binary_nucleus, radius=image.shape[0]//2, degree=8).tolist()
    except Exception:
        zernike_moments = [0] * n_zernike

    # --- SIFT (skimage), summarised as mean and std over all descriptors ---
    try:
        detector_extractor = SIFT()
        detector_extractor.detect_and_extract(image)
        descriptors = detector_extractor.descriptors

        if descriptors is not None:
            sift_features = [np.mean(descriptors), np.std(descriptors)]
        else:
            sift_features = [0] * n_sift
    except Exception:
        sift_features = [0] * n_sift

    features = haralick_features + lbp_features + haralick_features_skimage + log_features + gabor_features + zernike_moments + sift_features

    return features


# One texture-feature list per labeled nucleus, for each split.
train_texture_features = [extract_texture_features(sample[0]) for sample in all_train_labeled_nuclei]
test_texture_features = [extract_texture_features(sample[0]) for sample in all_test_labeled_nuclei]

# Convert to numpy arrays
# Ensure all feature lists have the same length before converting to numpy array
def pad_features(features_list):
    """Convert ragged per-sample feature lists into one rectangular float array.

    Rows shorter than the longest one are right-padded with 0.0.

    Args:
        features_list: Sequence of per-sample feature sequences.

    Returns:
        A 2-D float array of shape ``(n_samples, longest_row)``. An empty
        input yields an array of shape ``(0, 0)``.

    Notes:
        Python and NumPy real scalars are kept. NumPy scalars such as
        ``np.float32`` or ``np.int64`` are not subclasses of the built-in
        ``float`` / ``int``, so they must be accepted explicitly or they would
        be replaced by 0.0. Anything else (e.g. a nested list) becomes 0.0.
    """
    if len(features_list) == 0:
        return np.empty((0, 0))

    max_len = max(len(f) for f in features_list)
    padded_features = np.zeros((len(features_list), max_len), dtype=float)
    for row, sample_features in enumerate(features_list):
        for col, value in enumerate(sample_features):
            if isinstance(value, (int, float, np.integer, np.floating)):
                padded_features[row, col] = float(value)
    return padded_features

# Turn the per-sample lists into rectangular arrays.
train_texture_features, test_texture_features = map(
    pad_features, (train_texture_features, test_texture_features)
)


print(f"Shape of train_texture_features: {train_texture_features.shape}")
print(f"Shape of test_texture_features: {test_texture_features.shape}")

In [None]:
# Training split: shape and texture rows are joined column-wise, which only
# makes sense when both arrays describe the same number of samples.
if train_shape_features.shape[0] != train_texture_features.shape[0]:
    print("Mismatch in the number of training samples for shape and texture features.")
    train_combined_features = None  # downstream cells check for None
else:
    train_texture_features_flat = train_texture_features.reshape(train_texture_features.shape[0], -1)
    train_combined_features = np.concatenate([train_shape_features, train_texture_features_flat], axis=1)
    print(f"Shape of train_combined_features: {train_combined_features.shape}")


# Testing split: same procedure.
if test_shape_features.shape[0] != test_texture_features.shape[0]:
    print("Mismatch in the number of testing samples for shape and texture features.")
    test_combined_features = None  # downstream cells check for None
else:
    test_texture_features_flat = test_texture_features.reshape(test_texture_features.shape[0], -1)
    test_combined_features = np.concatenate([test_shape_features, test_texture_features_flat], axis=1)
    print(f"Shape of test_combined_features: {test_combined_features.shape}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np # Ensure numpy is imported
from sklearn.feature_selection import SelectKBest, f_classif # Import feature selection

# Tune a Random Forest with SMOTE oversampling.
#
# SMOTE runs *inside* the cross-validation pipeline, so synthetic minority
# samples are generated only from each fold's training part. Resampling the
# whole training set before GridSearchCV would put synthetic neighbours of
# validation samples into the training folds and inflate the CV recall.
from imblearn.pipeline import Pipeline as ImbPipeline

if train_combined_features is not None and train_labels_for_model is not None:
    # Resampled copy of the full training set, kept under the original names
    # in case other cells read them. It is not used for the search itself.
    smote = SMOTE(random_state=42)
    train_features_smote, train_labels_smote = smote.fit_resample(train_combined_features, train_labels_for_model)

    # Hyper-parameter grid for the forest
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': [None, 'balanced', 'balanced_subsample'] # Experiment with class weighting
    }

    rf = RandomForestClassifier(random_state=42)

    # Oversample, then classify; the sampler is only active during fit.
    smote_rf_pipeline = ImbPipeline([
        ("smote", SMOTE(random_state=42)),
        ("rf", rf),
    ])
    pipeline_param_grid = {f"rf__{name}": values for name, values in param_grid.items()}

    # Optimize for recall
    grid_search = GridSearchCV(estimator=smote_rf_pipeline, param_grid=pipeline_param_grid, cv=5, scoring='recall', n_jobs=-1)

    # Fit on the ORIGINAL training data; the pipeline resamples per fold and
    # once more on the full training set for the final refit.
    grid_search.fit(train_combined_features, train_labels_for_model)

    # Expose the results under the same names and shapes as before: plain
    # forest parameter names, and the fitted forest itself (so `.predict` and
    # `.feature_importances_` keep working downstream).
    best_params = {name.split("__", 1)[1]: value for name, value in grid_search.best_params_.items()}
    best_rf_classifier = grid_search.best_estimator_.named_steps["rf"]

    print("Best parameters found by GridSearchCV:")
    print(best_params)

    print("\nOptimized Random Forest classifier trained successfully.")

    # Evaluate the best model on the original (non-SMOTE) test set
    if test_combined_features is not None and test_labels_for_model is not None:
        predictions = best_rf_classifier.predict(test_combined_features)
        print("\nOptimized Random Forest Classifier Evaluation on Test Set:")
        print(classification_report(test_labels_for_model, predictions))
    else:
        print("\nTest data not available for evaluating the optimized Random Forest classifier.")

else:
    print("Combined training features or labels not available for Random Forest optimization.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np # Ensure numpy is imported

# Score the tuned forest on the test split.
predictions = best_rf_classifier.predict(test_combined_features)

print("Optimized Random Forest Classifier Evaluation:")
print(classification_report(test_labels_for_model, predictions))

accuracy = accuracy_score(test_labels_for_model, predictions)
print(f"Accuracy Score: {accuracy}")

conf_matrix = confusion_matrix(test_labels_for_model, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Per-feature importances of the fitted forest
feature_importances = best_rf_classifier.feature_importances_

# Feature names follow the column order used when the features were combined:
# all shape columns first, then all (flattened) texture columns.
num_shape_features = train_shape_features.shape[1]

try:
    num_texture_features_flat = train_texture_features_flat.shape[1]
except NameError:
    # The flattened texture array was never created; derive its width from
    # the combined matrix when possible, otherwise fall back to 0.
    num_texture_features_flat = (
        train_combined_features.shape[1] - train_shape_features.shape[1]
        if train_combined_features is not None and train_shape_features is not None
        else 0
    )

# Placeholder names: shape_0..., texture_0...
shape_feature_names = [f"shape_{i}" for i in range(num_shape_features)]
texture_feature_names = [f"texture_{i}" for i in range(num_texture_features_flat)]
all_feature_names = [*shape_feature_names, *texture_feature_names]

# Only build the ranking when every importance has a name.
if len(feature_importances) != len(all_feature_names):
    print(f"Mismatch in feature counts: Model has {len(feature_importances)}, generated names have {len(all_feature_names)}")
else:
    feature_importance_series = pd.Series(feature_importances, index=all_feature_names)
    sorted_feature_importances = feature_importance_series.sort_values(ascending=False)

    print("\nTop 20 Most Important Features:")
    print(sorted_feature_importances.head(20))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# CNN: three conv/pool stages, then a dense head with dropout and a single
# sigmoid unit for the binary normal/damaged decision.
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Augmentation: small rotations, zooms and shifts plus flips on both axes.
datagen = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=20,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
)

# Fit the generator on the (un-augmented) training crops.
datagen.fit(train_images_for_model)

# Train on augmented batches; the test split is passed as validation data.
augmented_batches = datagen.flow(train_images_for_model, train_labels_for_model, batch_size=32)
history = model.fit(
    augmented_batches,
    epochs=50,
    validation_data=(test_images_for_model, test_labels_for_model),
)

print("CNN model training completed.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Sigmoid outputs for the test crops, cut at 0.5 to obtain class labels.
predictions_prob = model.predict(test_images_for_model)
predictions = (predictions_prob > 0.5).astype("int32")

# Per-class precision / recall / F1
print("CNN Classification Report:")
print(classification_report(test_labels_for_model, predictions))

# Confusion matrix
conf_matrix_cnn = confusion_matrix(test_labels_for_model, predictions)
print("CNN Confusion Matrix:")
print(conf_matrix_cnn)

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

# Second-level ("stacked") model: the handcrafted shape/texture features with the CNN's
# predicted probability appended as one extra column, fed to a Random Forest.

# Bound up front so the evaluation cell can always test `ensemble_classifier is not None`,
# even when training is skipped or fails below.
ensemble_classifier = None

# Get predictions (probabilities) from the trained CNN on the training and testing data,
# reshaped to 2D column vectors so they can be concatenated as a feature column.
# NOTE(review): the training-set probabilities come from the same model that was fit on
# those images, so this column is optimistic on the training rows; out-of-fold
# predictions would avoid that.
train_cnn_predictions_prob = model.predict(train_images_for_model).reshape(-1, 1)
test_cnn_predictions_prob = model.predict(test_images_for_model).reshape(-1, 1)


# Combine CNN probabilities with handcrafted features for training.
# The cell that builds the combined features sets them to None on a sample-count
# mismatch, so check for None before touching `.shape`.
if train_combined_features is not None and train_combined_features.shape[0] == train_cnn_predictions_prob.shape[0]:
    train_ensemble_features = np.concatenate((train_combined_features, train_cnn_predictions_prob), axis=1)
    print(f"Shape of train_ensemble_features: {train_ensemble_features.shape}")
else:
    print("Mismatch in the number of training samples for combined features and CNN predictions.")
    train_ensemble_features = None


# Combine CNN probabilities with handcrafted features for testing (same None guard).
if test_combined_features is not None and test_combined_features.shape[0] == test_cnn_predictions_prob.shape[0]:
    test_ensemble_features = np.concatenate((test_combined_features, test_cnn_predictions_prob), axis=1)
    print(f"Shape of test_ensemble_features: {test_ensemble_features.shape}")
else:
    print("Mismatch in the number of testing samples for combined features and CNN predictions.")
    test_ensemble_features = None

# NaN/Inf-free copy of the test features; stays None when the test features are unavailable
# so that nothing downstream calls NumPy on None.
test_ensemble_features_clean = (
    np.nan_to_num(test_ensemble_features) if test_ensemble_features is not None else None
)


# --- Feature Selection (Optional but Recommended for Ensemble) ---
# Use SelectKBest to select the top K features based on ANOVA F-value.
# You can adjust the value of k (number of features to select).
if train_ensemble_features is not None and train_labels_for_model is not None:
    # Address potential NaNs or Infs in features before selection
    train_ensemble_features_clean = np.nan_to_num(train_ensemble_features)

    # Default: use every feature; replaced below when selection succeeds.
    train_ensemble_features_selected = train_ensemble_features_clean
    test_ensemble_features_selected = test_ensemble_features_clean

    # Selection needs more than one sample and at least one non-constant column.
    if train_ensemble_features_clean.shape[0] > 1 and np.var(train_ensemble_features_clean, axis=0).sum() > 0:
        k = min(100, train_ensemble_features_clean.shape[1]) # Select top 100 features or fewer if less are available
        try:
            selector = SelectKBest(score_func=f_classif, k=k)
            train_ensemble_features_selected = selector.fit_transform(train_ensemble_features_clean, train_labels_for_model)
            print(f"Shape of train_ensemble_features_selected: {train_ensemble_features_selected.shape}")
            if test_ensemble_features_clean is not None:
                # Apply the selection fitted on the training data to the test data
                test_ensemble_features_selected = selector.transform(test_ensemble_features_clean)
                print(f"Shape of test_ensemble_features_selected: {test_ensemble_features_selected.shape}")
        except ValueError as e:
            print(f"Could not perform feature selection: {e}")
            # Fallback to using all features if selection fails
            train_ensemble_features_selected = train_ensemble_features_clean
            test_ensemble_features_selected = test_ensemble_features_clean
            print("Using all features for ensemble training.")
    else:
        print("Not enough samples or variance for feature selection. Using all features.")
else:
    print("Ensemble features not available for selection.")
    train_ensemble_features_selected = None
    test_ensemble_features_selected = None


# --- Train Second-Level Random Forest Classifier ---
# Apply SMOTE to the selected training features to handle class imbalance for the ensemble model
if train_ensemble_features_selected is not None and train_labels_for_model is not None:
    try:
        smote_ensemble = SMOTE(random_state=42)
        train_ensemble_features_smote, train_labels_ensemble_smote = smote_ensemble.fit_resample(train_ensemble_features_selected, train_labels_for_model)

        # Instantiate and train the second-level Random Forest Classifier
        fitted_forest = RandomForestClassifier(n_estimators=100, random_state=42)
        fitted_forest.fit(train_ensemble_features_smote, train_labels_ensemble_smote)

        # Publish the classifier only once fitting has succeeded.
        ensemble_classifier = fitted_forest
        print("\nEnsemble Random Forest classifier trained successfully with SMOTE.")
    except ValueError as e:
        print(f"Could not train ensemble classifier: {e}")
        ensemble_classifier = None
else:
    print("Could not train ensemble classifier due to missing data.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Make predictions on the test set using the ensemble model.
# Both names must already exist in the notebook namespace; a None value for either one
# routes to the "not available" message in the else branch.
if test_ensemble_features_selected is not None and ensemble_classifier is not None:
    ensemble_predictions = ensemble_classifier.predict(test_ensemble_features_selected)

    # Generate and print classification report for the ensemble model
    print("Ensemble Model Classification Report:")
    print(classification_report(test_labels_for_model, ensemble_predictions))

    # Compute and print accuracy score for the ensemble model
    ensemble_accuracy = accuracy_score(test_labels_for_model, ensemble_predictions)
    print(f"Ensemble Model Accuracy Score: {ensemble_accuracy}")

    # Generate and print confusion matrix for the ensemble model
    conf_matrix_ensemble = confusion_matrix(test_labels_for_model, ensemble_predictions)
    print("Ensemble Model Confusion Matrix:")
    print(conf_matrix_ensemble)
else:
    print("Ensemble model or test features not available for evaluation.")

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every entry of ``base_dir`` as an image and return the grayscale versions.

    Entries are visited in sorted filename order. An entry that OpenCV cannot
    decode is reported with a warning and left out of the result.

    Args:
        base_dir: Directory containing the image files.

    Returns:
        A NumPy array built from the list of grayscale images, in filename order.
    """
    grayscale_stack = []
    for entry_name in sorted(os.listdir(base_dir)):
        full_path = os.path.join(base_dir, entry_name)
        bgr_image = cv2.imread(full_path)

        if bgr_image is not None:
            # imread delivers BGR channel order; collapse it to a single gray channel
            grayscale_stack.append(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY))
        else:
            # Unreadable entry: warn and move on to the next file
            print(f"Warning: Could not load image {full_path}")
    return np.array(grayscale_stack)

# Load the red-channel images for the training and test sets; these are the images
# the nucleus segmentation later runs on.
train_red_images = load_and_preprocess_images('DNA_Damage_Project/image/red')
test_red_images = load_and_preprocess_images('DNA_Damage_Project/test images/red')

# Report how many images were loaded from each directory
print(f"Loaded {len(train_red_images)} training red images.")
print(f"Loaded {len(test_red_images)} testing red images.")

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every entry of ``base_dir`` as an image and return the grayscale versions.

    Entries are visited in sorted filename order. An entry that OpenCV cannot
    decode is reported with a warning and left out of the result.

    Args:
        base_dir: Directory containing the image files.

    Returns:
        A NumPy array built from the list of grayscale images, in filename order.
    """
    grayscale_stack = []
    for entry_name in sorted(os.listdir(base_dir)):
        full_path = os.path.join(base_dir, entry_name)
        bgr_image = cv2.imread(full_path)

        if bgr_image is not None:
            # imread delivers BGR channel order; collapse it to a single gray channel
            grayscale_stack.append(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY))
        else:
            # Unreadable entry: warn and move on to the next file
            print(f"Warning: Could not load image {full_path}")
    return np.array(grayscale_stack)

# Load the green-channel images for the training and test sets; these are the images
# that are later thresholded and counted inside each nucleus.
train_green_images = load_and_preprocess_images('DNA_Damage_Project/image/green')
test_green_images = load_and_preprocess_images('DNA_Damage_Project/test images/green')

# Report how many images were loaded from each directory
print(f"Loaded {len(train_green_images)} training green images.")
print(f"Loaded {len(test_green_images)} testing green images.")

In [None]:
def pair_images(red_images, green_images):
    """Pair each red-channel image with the green-channel image at the same position.

    Pairing is purely positional: it relies on both sequences having been loaded
    in the same (sorted filename) order with matching filenames.

    Args:
        red_images: Sequence of red-channel images.
        green_images: Sequence of green-channel images, same length and order.

    Returns:
        A list of ``(red_image, green_image)`` tuples.

    Raises:
        ValueError: If the two sequences differ in length. A length mismatch means
            at least one image has no partner (for example, a file failed to load),
            so positional pairing can no longer be trusted.
    """
    if len(red_images) != len(green_images):
        raise ValueError(
            f"Cannot pair images: {len(red_images)} red vs {len(green_images)} green."
        )
    return list(zip(red_images, green_images))

# Build the (red, green) pairs for the training and test sets
train_paired_images = pair_images(train_red_images, train_green_images)
test_paired_images = pair_images(test_red_images, test_green_images)

# Report how many pairs were created
print(f"Created {len(train_paired_images)} paired training images.")
print(f"Created {len(test_paired_images)} paired testing images.")

In [None]:
def segment_nuclei(image):
    """Segments nuclei using Otsu's thresholding and watershed.

    Args:
        image: Single-channel image to segment (it is passed to cv2.threshold with
            THRESH_OTSU and converted with COLOR_GRAY2BGR, so presumably 8-bit
            grayscale -- confirm against the loader).

    Returns:
        A uint8 mask with the same shape as ``image``: 255 inside retained nuclei
        (connected regions of at least 100 pixels), 0 elsewhere.
    """
    # Apply Otsu's thresholding
    _, thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Morphological operations to clean up the mask (two passes of a 3x3 opening)
    kernel = np.ones((3, 3), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)

    # Sure background area: everything outside the dilated foreground
    sure_bg = cv2.dilate(opening, kernel, iterations=3)

    # Sure foreground area: pixels whose distance to the background is at least
    # 70% of the largest distance found in this image
    dist_transform = cv2.distanceTransform(opening, cv2.DIST_L2, 5)
    _, sure_fg = cv2.threshold(dist_transform, 0.7 * dist_transform.max(), 255, 0)

    # Unknown region: inside the dilated foreground but not in the sure foreground
    sure_fg = np.uint8(sure_fg)
    unknown = cv2.subtract(sure_bg, sure_fg)

    # Marker labelling: one integer label per connected sure-foreground component
    _, markers = cv2.connectedComponents(sure_fg)

    # Add one to all labels so that sure background is not 0, but 1
    markers = markers + 1

    # Now, mark the region of unknown with zero
    markers[unknown == 255] = 0

    # Apply watershed algorithm
    # Create a BGR version of the grayscale image for watershed visualization (optional, but watershed expects 3 channels)
    img_bgr = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    markers = cv2.watershed(img_bgr, markers)

    # Create a binary mask from the watershed result
    segmented_mask = np.zeros_like(image, dtype=np.uint8)
    segmented_mask[markers > 1] = 255  # Nuclei are marked with labels > 1

    # Remove small objects (noise)
    min_size = 100  # Minimum size threshold for nuclei
    num_labels, labels_img, stats, centroids = cv2.connectedComponentsWithStats(segmented_mask, connectivity=8)

    # Copy only the components that reach min_size into the final mask
    cleaned_mask = np.zeros_like(segmented_mask)
    for i in range(1, num_labels): # Start from 1 to exclude background
        if stats[i, cv2.CC_STAT_AREA] >= min_size:
            cleaned_mask[labels_img == i] = 255

    return cleaned_mask

# Segment nuclei on every red-channel image; mask i corresponds to red image i
train_segmented_masks = [segment_nuclei(img) for img in train_red_images]
test_segmented_masks = [segment_nuclei(img) for img in test_red_images]

# Report how many masks were generated
print(f"Generated {len(train_segmented_masks)} training segmented masks.")
print(f"Generated {len(test_segmented_masks)} testing segmented masks.")

In [None]:
def apply_global_thresholding(image, threshold_value=50):
    """Binarize a grayscale image with a single fixed cutoff.

    Args:
        image: Grayscale image to threshold.
        threshold_value: Cutoff handed to ``cv2.threshold`` with ``THRESH_BINARY``
            and a maximum value of 255.

    Returns:
        The thresholded image produced by ``cv2.threshold``.
    """
    # cv2.threshold returns (value_used, image); only the image is needed here
    return cv2.threshold(image, threshold_value, 255, cv2.THRESH_BINARY)[1]

# Threshold every green-channel image with one shared cutoff.
# A notebook-level `threshold_value`, when one has been defined, takes precedence;
# otherwise the default of 50 is used.
threshold_value_to_use = globals().get("threshold_value", 50)

train_green_thresholded = [
    apply_global_thresholding(green, threshold_value_to_use) for green in train_green_images
]
test_green_thresholded = [
    apply_global_thresholding(green, threshold_value_to_use) for green in test_green_images
]

print(f"Applied global thresholding to {len(train_green_thresholded)} training green images.")
print(f"Applied global thresholding to {len(test_green_thresholded)} testing green images.")

In [None]:
def detect_foci_and_label_nucleus(paired_image, segmented_mask, thresholded_green_image, foci_threshold=10, target_size=(64, 64)):
    """
    Labels each segmented nucleus as "damaged" or "normal" from its green signal
    and returns a size-normalized red-channel crop of it.

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).
            Only the red image is read here; the green signal comes from
            ``thresholded_green_image``.
        segmented_mask: The binary segmented mask for the nuclei.
        thresholded_green_image: The thresholded green channel image.
        foci_threshold: A nucleus is "damaged" when the number of non-zero
            thresholded green pixels inside it is strictly greater than this value.
        target_size: Size passed to ``cv2.resize`` for every nucleus crop.
            Defaults to (64, 64).

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
    """
    red_img = paired_image[0]
    labeled_nuclei = []

    # Find contours in the segmented mask to identify individual nuclei
    contours, _ = cv2.findContours(segmented_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for contour in contours:
        # Create a filled mask specifically for the current nucleus
        nucleus_mask = np.zeros_like(segmented_mask)
        cv2.drawContours(nucleus_mask, [contour], -1, 255, -1)

        # Keep only the thresholded green pixels that fall inside this nucleus
        masked_green = cv2.bitwise_and(thresholded_green_image, thresholded_green_image, mask=nucleus_mask)

        # Count the number of non-zero pixels (this is a pixel count, not a count of
        # separate foci)
        foci_count = np.count_nonzero(masked_green)

        # Determine the label based on the pixel count
        label = "damaged" if foci_count > foci_threshold else "normal"

        # Bounding box of the nucleus and the matching red-channel crop
        x, y, w, h = cv2.boundingRect(contour)
        red_roi = red_img[y:y+h, x:x+w]

        # Side of the square canvas: the enclosing-circle diameter, but never smaller
        # than the bounding box. The circle is measured between contour points while
        # w/h count pixels, so the rounded diameter can fall short of w or h; without
        # the clamp the padding below would go negative and np.pad would raise.
        _, radius = cv2.minEnclosingCircle(contour)
        square_size = max(int(np.ceil(2 * radius)), w, h)

        # Split the padding so the crop sits centered in the square
        pad_x_before = (square_size - w) // 2
        pad_x_after = square_size - w - pad_x_before
        pad_y_before = (square_size - h) // 2
        pad_y_after = square_size - h - pad_y_before

        padded_red_roi = np.pad(red_roi, ((pad_y_before, pad_y_after), (pad_x_before, pad_x_after)), mode='constant')

        # Resize the square crop to the common target size
        normalized_red_nucleus = cv2.resize(padded_red_roi, target_size, interpolation=cv2.INTER_AREA)

        # Store the normalized red nucleus image and its label
        labeled_nuclei.append((normalized_red_nucleus, label))

    return labeled_nuclei

# Apply foci detection and labeling to training and testing sets.
# Index i ties together the image pair, its nucleus mask and its thresholded green
# image, so the three lists must be aligned.
train_labeled_nuclei = []
for i in range(len(train_paired_images)):
    labeled_nuclei_in_image = detect_foci_and_label_nucleus(
        train_paired_images[i],
        train_segmented_masks[i],
        train_green_thresholded[i]
    )
    train_labeled_nuclei.extend(labeled_nuclei_in_image)

test_labeled_nuclei = []
for i in range(len(test_paired_images)):
    labeled_nuclei_in_image = detect_foci_and_label_nucleus(
        test_paired_images[i],
        test_segmented_masks[i],
        test_green_thresholded[i]
    )
    test_labeled_nuclei.extend(labeled_nuclei_in_image)

# Count damaged and normal nuclei (class balance of each set)
train_damaged_count = sum(1 for img, label in train_labeled_nuclei if label == "damaged")
train_normal_count = sum(1 for img, label in train_labeled_nuclei if label == "normal")
test_damaged_count = sum(1 for img, label in test_labeled_nuclei if label == "damaged")
test_normal_count = sum(1 for img, label in test_labeled_nuclei if label == "normal")

print(f"Training set: {train_damaged_count} damaged, {train_normal_count} normal.")
print(f"Testing set: {test_damaged_count} damaged, {test_normal_count} normal.")

In [None]:
def process_image_pair(paired_image):
    """
    Runs one (red, green) image pair through the full labeling pipeline:
    nucleus segmentation, green-channel thresholding, then per-nucleus labeling.

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
    """
    red_channel, green_channel = paired_image

    # Prefer a notebook-level `threshold_value` when one has been defined; 50 otherwise
    cutoff = globals().get("threshold_value", 50)

    # Nuclei come from the red channel, the damage signal from the green channel
    nuclei_mask = segment_nuclei(red_channel)
    green_binary = apply_global_thresholding(green_channel, cutoff)

    return detect_foci_and_label_nucleus(paired_image, nuclei_mask, green_binary)

# Process training images: each pair is segmented, thresholded and labeled again
# inside process_image_pair, and the per-image results are concatenated.
all_train_labeled_nuclei = []
for paired_img in train_paired_images:
    labeled_nuclei = process_image_pair(paired_img)
    all_train_labeled_nuclei.extend(labeled_nuclei)

# Process testing images the same way
all_test_labeled_nuclei = []
for paired_img in test_paired_images:
    labeled_nuclei = process_image_pair(paired_img)
    all_test_labeled_nuclei.extend(labeled_nuclei)

# Print the total number of labeled nuclei
print(f"Total labeled nuclei in training set: {len(all_train_labeled_nuclei)}")
print(f"Total labeled nuclei in testing set: {len(all_test_labeled_nuclei)}")

In [None]:
# Separate images and labels for training (each entry is an (image, label) tuple)
train_images_for_model = [nucleus[0] for nucleus in all_train_labeled_nuclei]
train_labels_for_model = [nucleus[1] for nucleus in all_train_labeled_nuclei]

# Separate images and labels for testing
test_images_for_model = [nucleus[0] for nucleus in all_test_labeled_nuclei]
test_labels_for_model = [nucleus[1] for nucleus in all_test_labeled_nuclei]

# Convert images to NumPy arrays and add channel dimension -> (n_samples, 64, 64, 1)
train_images_for_model = np.array(train_images_for_model).reshape(-1, 64, 64, 1)
test_images_for_model = np.array(test_images_for_model).reshape(-1, 64, 64, 1)

# Convert labels to NumPy arrays and to numerical format (0 = normal, 1 = damaged)
label_mapping = {"normal": 0, "damaged": 1}
train_labels_for_model = np.array([label_mapping[label] for label in train_labels_for_model])
test_labels_for_model = np.array([label_mapping[label] for label in test_labels_for_model])

# Print shapes
print(f"Shape of train_images_for_model: {train_images_for_model.shape}")
print(f"Shape of train_labels_for_model: {train_labels_for_model.shape}")
print(f"Shape of test_images_for_model: {test_images_for_model.shape}")
print(f"Shape of test_labels_for_model: {test_labels_for_model.shape}")

In [None]:
import mahotas
from skimage.feature import hog
import cv2
import numpy as np
from skimage import measure # Import measure for regionprops

def extract_shape_features(image):
    """Extracts shape features from a nucleus image.

    Every non-zero pixel is treated as foreground, both for the OpenCV contour
    (Hu moments) and for the region labelled for ``skimage.measure.regionprops``,
    so the two sets of measurements describe the same object. For a binary mask
    this is the same labelling as before.

    Args:
        image: 2D nucleus image or mask. Values with a maximum of 1 or less are
            scaled by 255 before conversion to uint8.

    Returns:
        A flat list: 8 region properties (area, perimeter, major axis, minor axis,
        axis ratio, eccentricity, solidity, roundness), 7 Hu moments, then the HOG
        vector. The list has the same length on every path for a given image
        shape; it is all zeros when no foreground is found.
    """
    # Ensure the image has the value range and dtype the OpenCV calls expect
    if image.max() <= 1:
        image = image * 255
    image = image.astype(np.uint8)

    # HOG layout used below. With one cell per block the descriptor holds
    # `orientations` values per cell, so its length is known from the image shape.
    hog_orientations = 9
    hog_cell = 16
    hog_length = (image.shape[0] // hog_cell) * (image.shape[1] // hog_cell) * hog_orientations
    # 8 region properties + 7 Hu moments + HOG: used for every zero fallback so all
    # rows stack into a rectangular array.
    n_features = 8 + 7 + hog_length

    # Find contours (needed for the Hu moments)
    contours, _ = cv2.findContours(image.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if not contours:
        return [0] * n_features

    # Assuming the largest contour is the nucleus
    contour = max(contours, key=cv2.contourArea)

    # Label the foreground (non-zero pixels). Labelling raw intensities would split
    # the nucleus into one region per equal-valued patch.
    labeled_image, num_labels = measure.label(image > 0, connectivity=2, return_num=True)

    if num_labels < 1:
        return [0] * n_features

    # Use the largest region, not simply the first label, as the nucleus
    properties = max(measure.regionprops(labeled_image), key=lambda region: region.area)

    area = properties.area
    perimeter = properties.perimeter
    major_axis_length = properties.major_axis_length if properties.major_axis_length is not None else 0
    minor_axis_length = properties.minor_axis_length if properties.minor_axis_length is not None else 0
    axis_ratio = major_axis_length / minor_axis_length if minor_axis_length > 0 else 0
    eccentricity = properties.eccentricity if properties.eccentricity is not None else 0
    solidity = properties.solidity if properties.solidity is not None else 0

    # Roundness/Form Factor: 4*pi*area / perimeter^2 (1 for a perfect disc)
    roundness = 4 * np.pi * area / (perimeter**2) if perimeter > 0 else 0

    # Hu Moments (using cv2 on the contour)
    hu_moments = cv2.HuMoments(cv2.moments(contour)).flatten()

    # Histogram of Oriented Gradients (HOG) on the image itself
    try:
        hog_features = hog(
            image,
            pixels_per_cell=(hog_cell, hog_cell),
            cells_per_block=(1, 1),
            orientations=hog_orientations,
            feature_vector=True,
        )
    except ValueError:
        # Same length as a successful HOG so the feature vector stays rectangular
        hog_features = [0] * hog_length

    # Combine features
    features = [area, perimeter, major_axis_length, minor_axis_length, axis_ratio, eccentricity, solidity, roundness] + list(hu_moments) + list(hog_features)

    return features

# Extract features for training and testing data.
# NOTE(review): the function's docstring speaks of a binary nucleus mask, but the
# first element of each labeled-nucleus tuple is the red-channel crop -- confirm
# which input is intended.
train_shape_features = [extract_shape_features(nucleus_img) for nucleus_img, label in all_train_labeled_nuclei]
test_shape_features = [extract_shape_features(nucleus_img) for nucleus_img, label in all_test_labeled_nuclei]

# Convert to numpy arrays (one row per nucleus)
train_shape_features = np.array(train_shape_features)
test_shape_features = np.array(test_shape_features)

print(f"Shape of train_shape_features: {train_shape_features.shape}")
print(f"Shape of test_shape_features: {test_shape_features.shape}")

In [None]:
import mahotas as mh
from skimage import feature, measure
import cv2
import numpy as np
from skimage.filters import gaussian#, laplacian # Comment out laplacian import
from skimage.feature import SIFT
import scipy.ndimage # Import scipy for laplacian

def extract_texture_features(image):
    """Extracts a fixed-length texture descriptor from a grayscale image.

    The returned list always has 101 entries, in this order:
    13 Haralick (mahotas, averaged over directions), 27 LBP histogram bins,
    16 GLCM properties (scikit-image), 2 Laplacian-of-Gaussian statistics,
    16 Gabor statistics, 25 Zernike moments and 2 SIFT descriptor statistics.
    Each group falls back to zeros of the same length when its extractor fails,
    and an image smaller than 21 pixels on either side yields all zeros.

    Args:
        image: 2D grayscale image; it is cast to uint8 before processing.

    Returns:
        A flat list of 101 numeric texture features.
    """

    # Ensure the image is in the correct format (uint8) for the libraries used below
    image = image.astype(np.uint8)

    # Too small for the 21x21 Gabor kernels: return an all-zero vector of full length
    if image.shape[0] < 21 or image.shape[1] < 21:
        return [0] * (13 + 27 + 2 + 16 + 25 + 2 + 4*4) # Haralick + LBP + LoG + Gabor + Zernike + SIFT + GLCM

    # Haralick texture features (mahotas).
    # return_mean=True averages the per-direction rows into one flat 13-value vector;
    # without it .tolist() is a nested list that cannot be stored as flat features.
    if np.all(image == image[0, 0]):
        # A constant image is skipped rather than handed to haralick
        haralick_features = [0] * 13
    else:
        try:
            haralick_features = mh.features.haralick(image, return_mean=True).tolist()
        except ValueError:
            haralick_features = [0] * 13

    # Local Binary Patterns (scikit-image)
    try:
        lbp = feature.local_binary_pattern(image, P=8, R=1, method='uniform')
        # Fixed 27-bin histogram (edges 0..27); kept at this width so the overall
        # feature-vector length does not change.
        lbp_hist, _ = np.histogram(lbp, bins=np.arange(0, 28), density=True)
        lbp_features = lbp_hist.tolist()
    except Exception:
        lbp_features = [0] * 27

    # Grey-level co-occurrence properties (scikit-image).
    # graycomatrix/graycoprops live in skimage.feature, and graycoprops takes one
    # property name per call. Each call returns (n_distances, n_angles); averaging
    # over the distances keeps 4 values per property -> 16 values in total.
    try:
        glcm = feature.graycomatrix(image, distances=[1, 3, 5], angles=[0, np.pi/4, np.pi/2, 3*np.pi/4], symmetric=True, normed=True)
        haralick_features_skimage = []
        for prop_name in ('contrast', 'correlation', 'energy', 'homogeneity'):
            per_angle = feature.graycoprops(glcm, prop_name).mean(axis=0)
            haralick_features_skimage.extend(per_angle.tolist())
    except Exception:
        haralick_features_skimage = [0] * (4 * 4) # 4 properties * 4 angles

    # Laplacian of Gaussian (cv2): Gaussian blur (sigma 1) followed by a Laplacian
    try:
        blurred_image = cv2.GaussianBlur(image, (0, 0), 1)
        log_image = cv2.Laplacian(blurred_image, cv2.CV_64F)
        # Mean and standard deviation of the LoG response
        log_features = [np.mean(log_image), np.std(log_image)]
    except Exception:
        log_features = [0] * 2

    # Gabor filter bank (cv2): 4 orientations x 2 wavelengths, mean and std of each response
    try:
        gabor_features = []
        kernels = []
        for theta in np.arange(0, np.pi, np.pi / 4): # 4 orientations
            for freq in [5, 10]: # passed as the fourth (wavelength) argument of getGaborKernel
                kern = cv2.getGaborKernel((21, 21), 5.0, theta, freq, 0.5, 0, ktype=cv2.CV_32F)
                kernels.append(kern)

        for kernel in kernels:
            # The second argument of filter2D is an output depth, so a depth constant
            # (CV_8U) is passed rather than a multi-channel type.
            fimg = cv2.filter2D(image, cv2.CV_8U, kernel)
            gabor_features.extend([np.mean(fimg), np.std(fimg)])
    except Exception:
        gabor_features = [0] * (4 * 2 * 2) # 4 orientations * 2 wavelengths * 2 statistics

    # Zernike moments (mahotas) on an Otsu-binarized copy of the image
    try:
        _, binary_nucleus = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # Degree 8 gives 25 moments
        zernike_moments = mh.features.zernike_moments(binary_nucleus, radius=image.shape[0]//2, degree=8).tolist()
    except Exception:
        zernike_moments = [0] * 25

    # SIFT (scikit-image): summarized as mean and std over all descriptor values
    try:
        detector_extractor = SIFT()
        detector_extractor.detect_and_extract(image)
        descriptors = detector_extractor.descriptors

        if descriptors is not None:
            sift_features = [np.mean(descriptors), np.std(descriptors)]
        else:
            sift_features = [0] * 2
    except Exception:
        # Covers detection failures as well; the two statistics are zero-filled
        sift_features = [0] * 2

    # Combine all texture features in a fixed order
    features = haralick_features + lbp_features + haralick_features_skimage + log_features + gabor_features + zernike_moments + sift_features

    return features


# Extract texture features for training and testing data (one feature list per
# labeled nucleus; the label element of each tuple is ignored here)
train_texture_features = [extract_texture_features(nucleus_img) for nucleus_img, label in all_train_labeled_nuclei]
test_texture_features = [extract_texture_features(nucleus_img) for nucleus_img, label in all_test_labeled_nuclei]

# Convert to numpy arrays
# Ensure all feature lists have the same length before converting to numpy array
def pad_features(features_list):
    """Stack variable-length feature lists into one zero-padded float array.

    Args:
        features_list: Sequence of per-sample feature sequences, possibly of
            different lengths.

    Returns:
        A float array of shape ``(len(features_list), longest_row)``. Shorter rows
        are right-padded with 0.0. Python and NumPy real scalars are kept as
        floats; any other entry (None, strings, nested lists, ...) becomes 0.0.
        An empty ``features_list`` gives an array of shape ``(0, 0)``.
    """
    if len(features_list) == 0:
        # Nothing to stack; max() below would fail on an empty sequence
        return np.empty((0, 0), dtype=float)

    max_len = max(len(f) for f in features_list)
    padded_features = np.zeros((len(features_list), max_len), dtype=float)

    # NumPy scalar types are listed explicitly: np.float32 and the NumPy integer types
    # are not subclasses of Python's float/int and would otherwise be zeroed.
    numeric_types = (int, float, np.integer, np.floating)
    for row_index, row in enumerate(features_list):
        for col_index, value in enumerate(row):
            if isinstance(value, numeric_types):
                padded_features[row_index, col_index] = float(value)
    return padded_features

# Turn the per-nucleus feature lists into rectangular arrays.
# Each set is padded to its own longest row.
train_texture_features = pad_features(train_texture_features)
test_texture_features = pad_features(test_texture_features)


print(f"Shape of train_texture_features: {train_texture_features.shape}")
print(f"Shape of test_texture_features: {test_texture_features.shape}")

In [None]:
# Combine shape and texture features for training
# Ensure both arrays have the same number of samples
if train_shape_features.shape[0] == train_texture_features.shape[0]:
    # Flatten any extra dimensions so each sample is one row, then join column-wise
    train_texture_features_flat = train_texture_features.reshape(train_texture_features.shape[0], -1)
    train_combined_features = np.concatenate((train_shape_features, train_texture_features_flat), axis=1)
    print(f"Shape of train_combined_features: {train_combined_features.shape}")
else:
    print("Mismatch in the number of training samples for shape and texture features.")
    # Code that reads this variable must handle None
    train_combined_features = None # Or handle the error appropriately


# Combine shape and texture features for testing
# Ensure both arrays have the same number of samples
if test_shape_features.shape[0] == test_texture_features.shape[0]:
    # Same flatten-and-join as for the training set
    test_texture_features_flat = test_texture_features.reshape(test_texture_features.shape[0], -1)
    test_combined_features = np.concatenate((test_shape_features, test_texture_features_flat), axis=1)
    print(f"Shape of test_combined_features: {test_combined_features.shape}")
else:
    print("Mismatch in the number of testing samples for shape and texture features.")
    # Code that reads this variable must handle None
    test_combined_features = None # Or handle the error appropriately

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np # Ensure numpy is imported
from sklearn.feature_selection import SelectKBest, f_classif # Import feature selection

# Apply SMOTE to handle class imbalance on the combined training features
if train_combined_features is not None and train_labels_for_model is not None:
    # NOTE(review): resampling happens before GridSearchCV splits the data, so the
    # cross-validation folds below contain synthetic samples; the CV recall is
    # therefore presumably optimistic -- consider resampling inside each fold.
    smote = SMOTE(random_state=42)
    train_features_smote, train_labels_smote = smote.fit_resample(train_combined_features, train_labels_for_model)

    # Define the parameter grid for GridSearchCV
    # (3 * 4 * 3 * 3 * 3 = 324 combinations, each fitted on 5 folds)
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': [None, 'balanced', 'balanced_subsample'] # Experiment with class weighting
    }

    # Instantiate the Random Forest Classifier
    rf = RandomForestClassifier(random_state=42)

    # Instantiate GridSearchCV
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='recall', n_jobs=-1) # Optimize for recall

    # Fit GridSearchCV to the SMOTE-resampled training data
    grid_search.fit(train_features_smote, train_labels_smote)

    # Get the best parameters and the best estimator
    best_params = grid_search.best_params_
    best_rf_classifier = grid_search.best_estimator_

    print("Best parameters found by GridSearchCV:")
    print(best_params)

    print("\nOptimized Random Forest classifier trained successfully.")

    # Evaluate the best model on the original (non-SMOTE) test set
    if test_combined_features is not None and test_labels_for_model is not None:
        predictions = best_rf_classifier.predict(test_combined_features)
        print("\nOptimized Random Forest Classifier Evaluation on Test Set:")
        print(classification_report(test_labels_for_model, predictions))
    else:
        print("\nTest data not available for evaluating the optimized Random Forest classifier.")

else:
    # In this branch best_rf_classifier is not assigned by this cell
    print("Combined training features or labels not available for Random Forest optimization.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np # Ensure numpy is imported

# Score the tuned Random Forest on the test features.
predictions = best_rf_classifier.predict(test_combined_features) # Use the best classifier

print("Optimized Random Forest Classifier Evaluation:")
print(classification_report(test_labels_for_model, predictions))

accuracy = accuracy_score(test_labels_for_model, predictions)
print(f"Accuracy Score: {accuracy}")

conf_matrix = confusion_matrix(test_labels_for_model, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Per-feature importances of the fitted forest, in training-column order.
feature_importances = best_rf_classifier.feature_importances_

# Columns were combined as [shape features | flattened texture features], so the
# names are generated from the width of each group.
num_shape_features = train_shape_features.shape[1]

if "train_texture_features_flat" in globals():
    # Preferred source: the flattened texture matrix built when combining features.
    num_texture_features_flat = train_texture_features_flat.shape[1]
elif train_combined_features is not None and train_shape_features is not None:
    # Otherwise derive the width from the combined matrix.
    num_texture_features_flat = train_combined_features.shape[1] - train_shape_features.shape[1]
else:
    # Nothing available to derive the width from.
    num_texture_features_flat = 0 # Default to 0 if cannot determine

# Simple positional naming scheme.
shape_feature_names = [f"shape_{i}" for i in range(num_shape_features)]
texture_feature_names = [f"texture_{i}" for i in range(num_texture_features_flat)]
all_feature_names = shape_feature_names + texture_feature_names

# Only build the ranking when the generated names line up with the model.
if len(feature_importances) != len(all_feature_names):
    print(f"Mismatch in feature counts: Model has {len(feature_importances)}, generated names have {len(all_feature_names)}")
else:
    feature_importance_series = pd.Series(feature_importances, index=all_feature_names)

    # Most important first.
    sorted_feature_importances = feature_importance_series.sort_values(ascending=False)

    print("\nTop 20 Most Important Features:")
    print(sorted_feature_importances.head(20))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Build the CNN layer by layer: three conv/pool stages followed by a dense head.
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid')) # Sigmoid for binary classification

# Binary cross-entropy matches the single sigmoid output.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Random geometric augmentation applied to training batches only.
# NOTE(review): no pixel rescaling is configured in this cell - confirm that
# train_images_for_model / test_images_for_model are scaled as intended.
datagen = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=20,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1
)

# Fit the data generator on the training data
datagen.fit(train_images_for_model) # Use the original train_images_for_model

# Train on augmented batches; the test arrays are passed as validation data.
history = model.fit(
    x=datagen.flow(train_images_for_model, train_labels_for_model, batch_size=32),
    epochs=50, # Number of training epochs
    validation_data=(test_images_for_model, test_labels_for_model), # Use the original test_images_for_model and labels
)

print("CNN model training completed.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predicted probabilities from the CNN, thresholded at 0.5 into class labels.
predictions_prob = model.predict(test_images_for_model)
predictions = (predictions_prob > 0.5).astype("int32")

# Confusion matrix is computed up front and printed after the report.
conf_matrix_cnn = confusion_matrix(test_labels_for_model, predictions)

print("CNN Classification Report:")
print(classification_report(test_labels_for_model, predictions))

print("CNN Confusion Matrix:")
print(conf_matrix_cnn)

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

# Stack the CNN's predicted probability onto the handcrafted features and train
# a second-level Random Forest on the result.

# Defined up front so the evaluation cell can always compare it against None,
# even when training is skipped or fails below.
ensemble_classifier = None

# Get predictions (probabilities) from the trained CNN on the training and testing data
# NOTE(review): if `model` was fitted on train_images_for_model, the training
# probabilities are in-sample and leak label information into the second-level
# model; out-of-fold CNN predictions would avoid that - confirm intent.
train_cnn_predictions_prob = model.predict(train_images_for_model) # Use the original train_images_for_model
test_cnn_predictions_prob = model.predict(test_images_for_model) # Use the original test_images_for_model

# Reshape CNN predictions to be 2D arrays (one column) so they can be concatenated
train_cnn_predictions_prob = train_cnn_predictions_prob.reshape(-1, 1)
test_cnn_predictions_prob = test_cnn_predictions_prob.reshape(-1, 1)


# Combine CNN probabilities with handcrafted features for training.
# The combined features are None when the combining cell detected a mismatch,
# so that case is checked before touching .shape.
if train_combined_features is not None and train_combined_features.shape[0] == train_cnn_predictions_prob.shape[0]:
    train_ensemble_features = np.concatenate((train_combined_features, train_cnn_predictions_prob), axis=1)
    print(f"Shape of train_ensemble_features: {train_ensemble_features.shape}")
else:
    print("Mismatch in the number of training samples for combined features and CNN predictions.")
    train_ensemble_features = None


# Combine CNN probabilities with handcrafted features for testing (same guard).
if test_combined_features is not None and test_combined_features.shape[0] == test_cnn_predictions_prob.shape[0]:
    test_ensemble_features = np.concatenate((test_combined_features, test_cnn_predictions_prob), axis=1)
    print(f"Shape of test_ensemble_features: {test_ensemble_features.shape}")
else:
    print("Mismatch in the number of testing samples for combined features and CNN predictions.")
    test_ensemble_features = None


# --- Feature Selection (Optional but Recommended for Ensemble) ---
# Use SelectKBest to select the top K features based on ANOVA F-value
# You can adjust the value of k (number of features to select)
# Both matrices are required: the selector fitted on train is applied to test.
if train_ensemble_features is not None and test_ensemble_features is not None and train_labels_for_model is not None:
    # Replace NaNs/Infs before selection; the same cleaning is applied to test.
    train_ensemble_features_clean = np.nan_to_num(train_ensemble_features)
    test_ensemble_features_clean = np.nan_to_num(test_ensemble_features)

    # Selection needs more than one sample and at least one non-constant column
    if train_ensemble_features_clean.shape[0] > 1 and np.var(train_ensemble_features_clean, axis=0).sum() > 0:
        k = min(100, train_ensemble_features_clean.shape[1]) # Select top 100 features or fewer if less are available
        try:
            selector = SelectKBest(score_func=f_classif, k=k)
            train_ensemble_features_selected = selector.fit_transform(train_ensemble_features_clean, train_labels_for_model)
            test_ensemble_features_selected = selector.transform(test_ensemble_features_clean) # Apply the same selection to test data
            print(f"Shape of train_ensemble_features_selected: {train_ensemble_features_selected.shape}")
            print(f"Shape of test_ensemble_features_selected: {test_ensemble_features_selected.shape}")
        except ValueError as e:
            print(f"Could not perform feature selection: {e}")
            # Fallback to using all features if selection fails
            train_ensemble_features_selected = train_ensemble_features_clean
            test_ensemble_features_selected = test_ensemble_features_clean
            print("Using all features for ensemble training.")
    else:
        print("Not enough samples or variance for feature selection. Using all features.")
        train_ensemble_features_selected = train_ensemble_features_clean
        test_ensemble_features_selected = test_ensemble_features_clean
else:
    print("Ensemble features not available for selection.")
    train_ensemble_features_selected = None
    test_ensemble_features_selected = None


# --- Train Second-Level Random Forest Classifier ---
# Apply SMOTE to the selected training features to handle class imbalance for the ensemble model
if train_ensemble_features_selected is not None and train_labels_for_model is not None:
    try:
        smote_ensemble = SMOTE(random_state=42)
        train_ensemble_features_smote, train_labels_ensemble_smote = smote_ensemble.fit_resample(train_ensemble_features_selected, train_labels_for_model)

        # Instantiate and train the second-level Random Forest Classifier
        ensemble_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        ensemble_classifier.fit(train_ensemble_features_smote, train_labels_ensemble_smote)

        print("\nEnsemble Random Forest classifier trained successfully with SMOTE.")
    except ValueError as e:
        print(f"Could not train ensemble classifier: {e}")
        ensemble_classifier = None
else:
    print("Could not train ensemble classifier due to missing data.")
    # Keep the name defined so the evaluation cell reports "not available"
    # instead of raising NameError.
    ensemble_classifier = None

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the stacked (ensemble) model on the test set when both the selected
# test features and a trained classifier exist.
if test_ensemble_features_selected is None or ensemble_classifier is None:
    print("Ensemble model or test features not available for evaluation.")
else:
    ensemble_predictions = ensemble_classifier.predict(test_ensemble_features_selected)

    # Per-class precision / recall / F1
    print("Ensemble Model Classification Report:")
    print(classification_report(test_labels_for_model, ensemble_predictions))

    # Overall accuracy
    ensemble_accuracy = accuracy_score(test_labels_for_model, ensemble_predictions)
    print(f"Ensemble Model Accuracy Score: {ensemble_accuracy}")

    # Confusion matrix
    conf_matrix_ensemble = confusion_matrix(test_labels_for_model, ensemble_predictions)
    print("Ensemble Model Confusion Matrix:")
    print(conf_matrix_ensemble)

In [None]:
!pip install mahotas

In [None]:
# Debug aid: list the working directory to check which archives/folders are present.
!ls -la

In [None]:
!unzip '/content/DNA_Damage_Project-20250901T154313Z-1-001 (1).zip' -d .

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every entry of ``base_dir`` (in sorted filename order) as grayscale.

    Args:
        base_dir: Directory whose entries are passed to ``cv2.imread``.

    Returns:
        ``np.array`` built from the list of grayscale images. Entries that
        OpenCV cannot read are reported with a warning and skipped.
    """
    images = []
    for filename in sorted(os.listdir(base_dir)):
        img_path = os.path.join(base_dir, filename)
        img = cv2.imread(img_path)

        if img is not None:
            # cv2.imread returns BGR data; collapse it to one gray channel.
            images.append(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
        else:
            # Unreadable entry: warn and move on.
            print(f"Warning: Could not load image {img_path}")
    return np.array(images)

# Train/test assignment is swapped (per user feedback): the 'test images'
# folder feeds the training set and the 'image' folder feeds the test set.
train_red_images, test_red_images = (
    load_and_preprocess_images(red_dir)
    for red_dir in ('DNA_Damage_Project/test images/red', 'DNA_Damage_Project/image/red')
)


print(f"Loaded {len(train_red_images)} training red images.")
print(f"Loaded {len(test_red_images)} testing red images.")

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every entry of ``base_dir`` (in sorted filename order) as grayscale.

    Args:
        base_dir: Directory whose entries are passed to ``cv2.imread``.

    Returns:
        ``np.array`` built from the list of grayscale images. Entries that
        OpenCV cannot read are reported with a warning and skipped.
    """
    images = []
    for filename in sorted(os.listdir(base_dir)):
        img_path = os.path.join(base_dir, filename)
        img = cv2.imread(img_path)

        if img is not None:
            # cv2.imread returns BGR data; collapse it to one gray channel.
            images.append(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
        else:
            # Unreadable entry: warn and move on.
            print(f"Warning: Could not load image {img_path}")
    return np.array(images)

# Train/test assignment is swapped (per user feedback): the 'test images'
# folder feeds the training set and the 'image' folder feeds the test set.
train_green_images, test_green_images = (
    load_and_preprocess_images(green_dir)
    for green_dir in ('DNA_Damage_Project/test images/green', 'DNA_Damage_Project/image/green')
)


print(f"Loaded {len(train_green_images)} training green images.")
print(f"Loaded {len(test_green_images)} testing green images.")

In [None]:
def pair_images(red_images, green_images):
    """Pair each red-channel image with the green-channel image at the same index.

    Pairing is purely positional: it relies on both sequences having been loaded
    in the same (sorted filename) order with matching filenames.

    Args:
        red_images: Sequence of red-channel images.
        green_images: Sequence of green-channel images.

    Returns:
        A list of ``(red_image, green_image)`` tuples.

    Raises:
        ValueError: If the two sequences differ in length. Positional pairing
            of unequal sequences would either fail part-way or silently pair
            images that do not belong together.
    """
    if len(red_images) != len(green_images):
        raise ValueError(
            f"Cannot pair images: {len(red_images)} red vs {len(green_images)} green."
        )
    return [(red_images[i], green_images[i]) for i in range(len(red_images))]

# Pair the (already swapped) train and test channel stacks position by position.
train_paired_images, test_paired_images = (
    pair_images(train_red_images, train_green_images),
    pair_images(test_red_images, test_green_images),
)


print(f"Created {len(train_paired_images)} paired training images.")
print(f"Created {len(test_paired_images)} paired testing images.")

In [None]:
def segment_nuclei(image, min_size=100, fg_ratio=0.7):
    """Segment nuclei with Otsu thresholding followed by marker-based watershed.

    Args:
        image: Single-channel grayscale image (it is passed to Otsu thresholding
            and converted with ``COLOR_GRAY2BGR``).
        min_size: Minimum connected-component area, in pixels, for a region to
            be kept in the returned mask.
        fg_ratio: Fraction of the maximum distance-transform value above which a
            pixel is treated as certain foreground (watershed seed).

    Returns:
        ``uint8`` mask of the same shape as ``image``: 255 inside kept nuclei,
        0 elsewhere.
    """
    # Apply Otsu's thresholding
    _, thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Morphological opening to remove small specks from the mask
    kernel = np.ones((3, 3), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)

    # Sure background area
    sure_bg = cv2.dilate(opening, kernel, iterations=3)

    # Sure foreground area: pixels far enough from the mask boundary
    dist_transform = cv2.distanceTransform(opening, cv2.DIST_L2, 5)
    _, sure_fg = cv2.threshold(dist_transform, fg_ratio * dist_transform.max(), 255, 0)

    # Unknown region: inside the dilated mask but not certain foreground
    sure_fg = np.uint8(sure_fg)
    unknown = cv2.subtract(sure_bg, sure_fg)

    # Marker labelling
    _, markers = cv2.connectedComponents(sure_fg)

    # Add one to all labels so that sure background is not 0, but 1
    markers = markers + 1

    # Now, mark the region of unknown with zero
    markers[unknown == 255] = 0

    # Watershed expects a 3-channel image, so expand the grayscale input
    img_bgr = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    markers = cv2.watershed(img_bgr, markers)

    # Binary mask from the watershed result: labels > 1 are nuclei
    segmented_mask = np.zeros_like(image, dtype=np.uint8)
    segmented_mask[markers > 1] = 255

    # Remove small objects (noise). One boolean lookup over the label image
    # replaces a full-image comparison per component.
    num_labels, labels_img, stats, centroids = cv2.connectedComponentsWithStats(segmented_mask, connectivity=8)
    keep_label = stats[:, cv2.CC_STAT_AREA] >= min_size
    keep_label[0] = False  # label 0 is the background

    cleaned_mask = np.zeros_like(segmented_mask)
    cleaned_mask[keep_label[labels_img]] = 255

    return cleaned_mask

# One nucleus mask per red-channel image, for both splits.
train_segmented_masks = list(map(segment_nuclei, train_red_images))
test_segmented_masks = list(map(segment_nuclei, test_red_images))

print(f"Generated {len(train_segmented_masks)} training segmented masks.")
print(f"Generated {len(test_segmented_masks)} testing segmented masks.")

In [None]:
def apply_global_thresholding(image, threshold_value=50):
    """Binarize a grayscale image with a single global threshold.

    Args:
        image: Grayscale image passed to ``cv2.threshold``.
        threshold_value: Cut-off handed to ``cv2.threshold`` with
            ``THRESH_BINARY`` and a maximum value of 255.

    Returns:
        The thresholded image returned by ``cv2.threshold``.
    """
    # cv2.threshold returns (threshold_used, image); only the image is needed.
    return cv2.threshold(image, threshold_value, 255, cv2.THRESH_BINARY)[1]

# Threshold the training and testing green images.
# A notebook-level `threshold_value` is used when one exists; otherwise 50.
threshold_value_to_use = globals().get("threshold_value", 50)

train_green_thresholded = [
    apply_global_thresholding(green, threshold_value_to_use) for green in train_green_images
]
test_green_thresholded = [
    apply_global_thresholding(green, threshold_value_to_use) for green in test_green_images
]

print(f"Applied global thresholding to {len(train_green_thresholded)} training green images.")
print(f"Applied global thresholding to {len(test_green_thresholded)} testing green images.")

In [None]:
def detect_foci_and_label_nucleus(paired_image, segmented_mask, thresholded_green_image, foci_threshold=10, target_size=(64, 64)):
    """
    Labels every segmented nucleus as "damaged" or "normal" from the green
    signal inside it and returns a size-normalized red-channel crop per nucleus.

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).
            Only the red image is used here; the green signal comes from
            ``thresholded_green_image``.
        segmented_mask: The binary segmented mask for the nuclei.
        thresholded_green_image: The thresholded green channel image.
        foci_threshold: A nucleus is "damaged" when the number of non-zero
            thresholded green pixels inside it is strictly greater than this.
        target_size: ``(width, height)`` passed to ``cv2.resize`` for each crop.

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
        The crop is the nucleus' bounding box, zero-padded to a square; pixels
        of the box that lie outside the contour are not masked out.
    """
    red_img, _ = paired_image
    labeled_nuclei = []

    # Each external contour of the mask is treated as one nucleus
    contours, _ = cv2.findContours(segmented_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for contour in contours:
        # Filled mask of the current nucleus only
        nucleus_mask = np.zeros_like(segmented_mask)
        cv2.drawContours(nucleus_mask, [contour], -1, 255, -1)

        # Thresholded green signal restricted to this nucleus
        masked_green = cv2.bitwise_and(thresholded_green_image, thresholded_green_image, mask=nucleus_mask)

        # Count of non-zero pixels (a pixel count, not a count of distinct foci)
        foci_count = np.count_nonzero(masked_green)

        label = "damaged" if foci_count > foci_threshold else "normal"

        # Bounding box and red-channel crop
        x, y, w, h = cv2.boundingRect(contour)
        red_roi = red_img[y:y+h, x:x+w]

        # Side of the square canvas: the enclosing circle's diameter, but never
        # smaller than the bounding box. Without the lower bound a diameter that
        # rounds below w or h yields negative pad widths, which np.pad rejects.
        _, radius = cv2.minEnclosingCircle(contour)
        square_size = max(int(np.ceil(2 * radius)), w, h)

        # Split the padding evenly so the bounding box sits in the middle
        pad_x_before = (square_size - w) // 2
        pad_x_after = square_size - w - pad_x_before
        pad_y_before = (square_size - h) // 2
        pad_y_after = square_size - h - pad_y_before

        padded_red_roi = np.pad(red_roi, ((pad_y_before, pad_y_after), (pad_x_before, pad_x_after)), mode='constant')

        # Resize the square canvas to the common model input size
        normalized_red_nucleus = cv2.resize(padded_red_roi, target_size, interpolation=cv2.INTER_AREA)

        labeled_nuclei.append((normalized_red_nucleus, label))

    return labeled_nuclei

# Label every nucleus of every image, using the masks and thresholded green
# images computed earlier (matched to the image pairs by index).
train_labeled_nuclei = []
for i, paired in enumerate(train_paired_images):
    train_labeled_nuclei += detect_foci_and_label_nucleus(
        paired, train_segmented_masks[i], train_green_thresholded[i]
    )

test_labeled_nuclei = []
for i, paired in enumerate(test_paired_images):
    test_labeled_nuclei += detect_foci_and_label_nucleus(
        paired, test_segmented_masks[i], test_green_thresholded[i]
    )

# Class counts per split (summing booleans counts the matches).
train_damaged_count = sum(label == "damaged" for _, label in train_labeled_nuclei)
train_normal_count = sum(label == "normal" for _, label in train_labeled_nuclei)
test_damaged_count = sum(label == "damaged" for _, label in test_labeled_nuclei)
test_normal_count = sum(label == "normal" for _, label in test_labeled_nuclei)

print(f"Training set: {train_damaged_count} damaged, {train_normal_count} normal.")
print(f"Testing set: {test_damaged_count} damaged, {test_normal_count} normal.")

In [None]:
def process_image_pair(paired_image):
    """
    Runs one (red, green) image pair through the whole labeling pipeline:
    nucleus segmentation, green thresholding, then per-nucleus labeling.

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
    """
    red_img, green_img = paired_image

    # Threshold for the green channel: a notebook-level `threshold_value` is
    # used when one exists, otherwise 50.
    threshold_value_to_use = globals().get("threshold_value", 50)

    # Segment nuclei on red, binarize green, then label each nucleus.
    return detect_foci_and_label_nucleus(
        paired_image,
        segment_nuclei(red_img),
        apply_global_thresholding(green_img, threshold_value_to_use),
    )

# Run the full per-image pipeline over both splits and flatten the per-image
# results into one list of (nucleus image, label) tuples per split.
all_train_labeled_nuclei = [
    nucleus
    for paired_img in train_paired_images
    for nucleus in process_image_pair(paired_img)
]

all_test_labeled_nuclei = [
    nucleus
    for paired_img in test_paired_images
    for nucleus in process_image_pair(paired_img)
]

# Print the total number of labeled nuclei
print(f"Total labeled nuclei in training set: {len(all_train_labeled_nuclei)}")
print(f"Total labeled nuclei in testing set: {len(all_test_labeled_nuclei)}")

In [None]:
# Numeric encoding of the two string labels.
label_mapping = {"normal": 0, "damaged": 1}

# Images: stack the nucleus crops and append a trailing channel axis.
train_images_for_model = np.array(
    [image for image, _ in all_train_labeled_nuclei]
).reshape(-1, 64, 64, 1)
test_images_for_model = np.array(
    [image for image, _ in all_test_labeled_nuclei]
).reshape(-1, 64, 64, 1)

# Labels: map each string label to its integer code.
train_labels_for_model = np.array(
    [label_mapping[label] for _, label in all_train_labeled_nuclei]
)
test_labels_for_model = np.array(
    [label_mapping[label] for _, label in all_test_labeled_nuclei]
)

# Print shapes
print(f"Shape of train_images_for_model: {train_images_for_model.shape}")
print(f"Shape of train_labels_for_model: {train_labels_for_model.shape}")
print(f"Shape of test_images_for_model: {test_images_for_model.shape}")
print(f"Shape of test_labels_for_model: {test_labels_for_model.shape}")

In [None]:
import mahotas
from skimage.feature import hog
import cv2
import numpy as np
from skimage import measure # Import measure for regionprops

def extract_shape_features(image):
    """Extracts shape descriptors for the largest object in a nucleus image.

    Non-zero pixels are treated as foreground, both for contour finding and for
    region labeling, so the function accepts a binary mask as well as a
    grayscale crop with a zero background.

    Args:
        image: 2-D nucleus image or mask. Inputs whose maximum is <= 1 are
            scaled by 255 before being cast to ``uint8``.

    Returns:
        A flat list, always of the same length for a given image size:
        ``[area, perimeter, major_axis_length, minor_axis_length, axis_ratio,
        eccentricity, solidity, roundness]`` + 7 Hu moments + HOG features
        (16x16 cells, 1x1 blocks, 9 orientations). When no object is found the
        list has the same length and contains only zeros.
    """
    n_scalar_features = 8   # area ... roundness, see the return list below
    n_hu_moments = 7
    hog_cell = 16
    hog_orientations = 9

    # Ensure the image is 0..255 uint8
    if image.max() <= 1:
        image = image * 255
    image = image.astype(np.uint8)

    # HOG is computed first so every return path, including the all-zero
    # fallbacks, can produce a vector of the same length.
    try:
        hog_features = list(hog(image, pixels_per_cell=(hog_cell, hog_cell), cells_per_block=(1, 1), orientations=hog_orientations, feature_vector=True))
    except ValueError:
        # With 1x1 blocks there is one block per cell and 9 values per block.
        n_cells = (image.shape[0] // hog_cell) * (image.shape[1] // hog_cell)
        hog_features = [0] * (n_cells * hog_orientations)

    zero_features = [0] * (n_scalar_features + n_hu_moments + len(hog_features))

    # Contours are needed for the Hu moments
    contours, _ = cv2.findContours(image.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return zero_features

    # Assuming the largest contour is the nucleus
    contour = max(contours, key=cv2.contourArea)

    # Label the FOREGROUND mask. Labeling the raw gray values would split the
    # nucleus into one region per intensity level.
    labeled_image, num_labels = measure.label(image > 0, connectivity=2, return_num=True)
    if num_labels < 1:
        return zero_features

    # Largest region by area (not simply the first label)
    properties = max(measure.regionprops(labeled_image), key=lambda region: region.area)

    area = properties.area
    perimeter = properties.perimeter
    major_axis_length = properties.major_axis_length if properties.major_axis_length is not None else 0
    minor_axis_length = properties.minor_axis_length if properties.minor_axis_length is not None else 0
    axis_ratio = major_axis_length / minor_axis_length if minor_axis_length > 0 else 0
    eccentricity = properties.eccentricity if properties.eccentricity is not None else 0
    solidity = properties.solidity if properties.solidity is not None else 0

    # Roundness/Form Factor: 1.0 for a perfect circle
    roundness = 4 * np.pi * area / (perimeter**2) if perimeter > 0 else 0

    # Hu Moments (using cv2 on the contour)
    hu_moments = cv2.HuMoments(cv2.moments(contour)).flatten()

    # Combine features
    features = [area, perimeter, major_axis_length, minor_axis_length, axis_ratio, eccentricity, solidity, roundness] + list(hu_moments) + hog_features

    return features

# Shape features for every labeled nucleus, stacked into one array per split.
train_shape_features = np.array(
    [extract_shape_features(nucleus_img) for nucleus_img, _ in all_train_labeled_nuclei]
)
test_shape_features = np.array(
    [extract_shape_features(nucleus_img) for nucleus_img, _ in all_test_labeled_nuclei]
)

print(f"Shape of train_shape_features: {train_shape_features.shape}")
print(f"Shape of test_shape_features: {test_shape_features.shape}")

In [None]:
import mahotas as mh
from skimage import feature, measure
import cv2
import numpy as np
from skimage.filters import gaussian#, laplacian # Comment out laplacian import
from skimage.feature import SIFT
import scipy.ndimage # Import scipy for laplacian

def extract_texture_features(image):
    """Extracts a fixed-length texture feature vector from a grayscale image.

    The vector is the concatenation, in this order, of:
      * 13 Haralick features (mahotas), averaged over the computed directions
      * 27 LBP histogram bins (skimage, P=8, R=1, 'uniform')
      * 16 GLCM properties (skimage): contrast, correlation, energy and
        homogeneity for 4 angles, averaged over distances 1, 3 and 5
      * 2 Laplacian-of-Gaussian statistics (mean, std)
      * 16 Gabor statistics (4 orientations x 2 wavelengths x mean/std)
      * 25 Zernike moments (mahotas, degree 8) of the Otsu-binarized image
      * 2 SIFT descriptor statistics (mean, std)

    Every family falls back to zeros of the same length when it cannot be
    computed, so all returned vectors have the same length (101).

    Args:
        image: 2-D grayscale nucleus image; it is cast to ``uint8``.

    Returns:
        A flat list of 101 numbers (all zeros if either side is under 21 px).
    """
    # Length of each feature family; the fallbacks below must agree with the
    # real computations so that vectors never need padding.
    n_haralick, n_lbp, n_glcm, n_log, n_gabor, n_zernike, n_sift = 13, 27, 16, 2, 16, 25, 2

    # Several of the libraries below expect 8-bit input
    image = image.astype(np.uint8)

    # The Gabor kernels are 21x21, so smaller images are not processed
    if image.shape[0] < 21 or image.shape[1] < 21:
        return [0] * (n_haralick + n_lbp + n_glcm + n_log + n_gabor + n_zernike + n_sift)

    # Haralick texture features (mahotas). A constant image is skipped because
    # it can cause issues with haralick.
    if np.all(image == image[0, 0]):
        haralick_features = [0] * n_haralick
    else:
        try:
            # haralick() returns one row of 13 features per direction; average
            # the rows so the result is a flat list of 13 numbers rather than a
            # list of lists.
            haralick_features = mh.features.haralick(image).mean(axis=0).tolist()
        except ValueError:
            haralick_features = [0] * n_haralick

    # Local Binary Patterns (skimage)
    try:
        lbp = feature.local_binary_pattern(image, P=8, R=1, method='uniform')
        # 27 fixed bins are kept so the vector layout stays stable.
        # NOTE(review): with P=8 'uniform' only the first P+2 bins should be
        # populated - confirm whether the extra bins are intended.
        lbp_hist, _ = np.histogram(lbp, bins=np.arange(0, 28), density=True)
        lbp_features = lbp_hist.tolist()
    except Exception:
        lbp_features = [0] * n_lbp

    # Gray-level co-occurrence properties (skimage). graycomatrix/graycoprops
    # live in skimage.feature, and graycoprops takes ONE property name per call.
    try:
        glcm = feature.graycomatrix(image, distances=[1, 3, 5], angles=[0, np.pi/4, np.pi/2, 3*np.pi/4], symmetric=True, normed=True)
        haralick_features_skimage = []
        for prop_name in ('contrast', 'correlation', 'energy', 'homogeneity'):
            # graycoprops returns a (distance, angle) table; averaging over
            # distances leaves one value per angle, i.e. 4 props x 4 angles.
            haralick_features_skimage.extend(feature.graycoprops(glcm, prop_name).mean(axis=0).tolist())
    except Exception:
        haralick_features_skimage = [0] * n_glcm

    # Laplacian of Gaussian (cv2): blur, then Laplacian, then summary statistics
    try:
        blurred_image = cv2.GaussianBlur(image, (0, 0), 1)
        log_image = cv2.Laplacian(blurred_image, cv2.CV_64F)
        log_features = [np.mean(log_image), np.std(log_image)]
    except Exception:
        log_features = [0] * n_log

    # Gabor filter bank (cv2): 4 orientations x 2 wavelengths
    try:
        gabor_features = []
        for theta in np.arange(0, np.pi, np.pi / 4):
            for wavelength in [5, 10]:
                # 4th positional argument of getGaborKernel is lambd (wavelength)
                kern = cv2.getGaborKernel((21, 21), 5.0, theta, wavelength, 0.5, 0, ktype=cv2.CV_32F)
                # A floating-point output depth keeps the negative part of the
                # response; an 8-bit depth would saturate it to 0..255.
                response = cv2.filter2D(image, cv2.CV_64F, kern)
                # Plain Python floats, so downstream numeric checks accept them
                gabor_features.extend([float(np.mean(response)), float(np.std(response))])
    except Exception:
        gabor_features = [0] * n_gabor

    # Zernike moments (mahotas) of the Otsu-binarized nucleus, degree 8
    try:
        _, binary_nucleus = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        zernike_moments = mh.features.zernike_moments(binary_nucleus, radius=image.shape[0]//2, degree=8).tolist()
    except Exception:
        zernike_moments = [0] * n_zernike

    # SIFT (skimage): summarised by the mean and std of all descriptors
    try:
        detector_extractor = SIFT()
        detector_extractor.detect_and_extract(image)
        descriptors = detector_extractor.descriptors

        # If no keypoints are found, descriptors will be None
        if descriptors is not None:
            sift_features = [np.mean(descriptors), np.std(descriptors)]
        else:
            sift_features = [0] * n_sift
    except Exception:
        sift_features = [0] * n_sift

    # Combine all texture features in the documented order
    features = haralick_features + lbp_features + haralick_features_skimage + log_features + gabor_features + zernike_moments + sift_features

    return features


# Texture feature lists for every labeled nucleus (labels are not needed here).
train_texture_features = [extract_texture_features(entry[0]) for entry in all_train_labeled_nuclei]
test_texture_features = [extract_texture_features(entry[0]) for entry in all_test_labeled_nuclei]

# Convert to numpy arrays
# Ensure all feature lists have the same length before converting to numpy array
def pad_features(features_list):
    """Right-pad ragged feature vectors with zeros and stack them into a 2-D array.

    Args:
        features_list: Sequence of per-sample feature sequences, possibly of
            different lengths.

    Returns:
        A float64 ``np.ndarray`` of shape ``(len(features_list), max_len)``.
        Entries that are not real numbers are replaced by 0.0. An empty
        ``features_list`` yields an empty ``(0, 0)`` array instead of raising.
    """
    if len(features_list) == 0:
        return np.empty((0, 0), dtype=float)

    # NumPy scalars such as np.float32 / np.int64 are not subclasses of the
    # builtin float / int, so they must be listed explicitly; otherwise real
    # feature values would be silently replaced by 0.0.
    numeric_types = (int, float, np.integer, np.floating)

    max_len = max(len(f) for f in features_list)
    padded_features = np.zeros((len(features_list), max_len), dtype=float)
    for row, f in enumerate(features_list):
        padded_features[row, :len(f)] = [
            float(x) if isinstance(x, numeric_types) else 0.0 for x in f
        ]
    return padded_features

# Pad train and test together so both matrices end up with the same number of
# columns. Padding each split on its own would give them different widths
# whenever the longest feature vector occurs in only one of the splits, and the
# downstream classifier needs identical feature layouts.
n_train_texture_samples = len(train_texture_features)
all_texture_features_padded = pad_features(
    list(train_texture_features) + list(test_texture_features)
)
train_texture_features = all_texture_features_padded[:n_train_texture_samples]
test_texture_features = all_texture_features_padded[n_train_texture_samples:]


print(f"Shape of train_texture_features: {train_texture_features.shape}")
print(f"Shape of test_texture_features: {test_texture_features.shape}")

In [None]:
# Join shape and texture descriptors column-wise for the training split.
# The join is only meaningful when both matrices have one row per nucleus, so
# the row counts are compared first; on a mismatch the result is set to None.
if train_shape_features.shape[0] != train_texture_features.shape[0]:
    print("Mismatch in the number of training samples for shape and texture features.")
    train_combined_features = None # Or handle the error appropriately
else:
    # Collapse any trailing dimensions so each sample is a single row vector.
    train_texture_features_flat = train_texture_features.reshape(len(train_texture_features), -1)
    train_combined_features = np.concatenate(
        (train_shape_features, train_texture_features_flat), axis=1
    )
    print(f"Shape of train_combined_features: {train_combined_features.shape}")


# Same procedure for the testing split.
if test_shape_features.shape[0] != test_texture_features.shape[0]:
    print("Mismatch in the number of testing samples for shape and texture features.")
    test_combined_features = None # Or handle the error appropriately
else:
    test_texture_features_flat = test_texture_features.reshape(len(test_texture_features), -1)
    test_combined_features = np.concatenate(
        (test_shape_features, test_texture_features_flat), axis=1
    )
    print(f"Shape of test_combined_features: {test_combined_features.shape}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np # Ensure numpy is imported
from sklearn.feature_selection import SelectKBest, f_classif # Import feature selection

# Tune a Random Forest on the combined features, using SMOTE against class imbalance.
#
# SMOTE is placed inside an imbalanced-learn Pipeline so that, during
# cross-validation, synthetic samples are generated from the training folds
# only. Resampling the whole training set before GridSearchCV would put
# synthetic neighbours of validation samples into the training folds and
# inflate the cross-validated recall.
from imblearn.pipeline import Pipeline as ImbPipeline

if train_combined_features is not None and train_labels_for_model is not None:
    # Resampled copy of the full training set, kept under its original names
    # for any later cell that inspects it. It is not used for the search.
    smote = SMOTE(random_state=42)
    train_features_smote, train_labels_smote = smote.fit_resample(train_combined_features, train_labels_for_model)

    # Instantiate the Random Forest Classifier and wrap it with SMOTE
    rf = RandomForestClassifier(random_state=42)
    rf_pipeline = ImbPipeline(steps=[
        ('smote', SMOTE(random_state=42)),
        ('rf', rf),
    ])

    # Parameter grid; the 'rf__' prefix routes each parameter to the forest step
    param_grid = {
        'rf__n_estimators': [100, 200, 300],
        'rf__max_depth': [None, 10, 20, 30],
        'rf__min_samples_split': [2, 5, 10],
        'rf__min_samples_leaf': [1, 2, 4],
        'rf__class_weight': [None, 'balanced', 'balanced_subsample'] # Experiment with class weighting
    }

    # Optimize for recall; SMOTE is re-fitted on the training part of every fold
    grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=param_grid, cv=5, scoring='recall', n_jobs=-1)

    # Fit on the ORIGINAL training data; the pipeline does the resampling
    grid_search.fit(train_combined_features, train_labels_for_model)

    # Report parameters under their plain Random Forest names
    best_params = {
        name.split('__', 1)[1]: value
        for name, value in grid_search.best_params_.items()
    }
    # The refitted forest (trained on the SMOTE-resampled full training set) is
    # exposed directly so later cells can call .predict / .feature_importances_.
    best_rf_classifier = grid_search.best_estimator_.named_steps['rf']

    print("Best parameters found by GridSearchCV:")
    print(best_params)

    print("\nOptimized Random Forest classifier trained successfully.")

    # Evaluate the best model on the original (non-SMOTE) test set
    if test_combined_features is not None and test_labels_for_model is not None:
        predictions = best_rf_classifier.predict(test_combined_features)
        print("\nOptimized Random Forest Classifier Evaluation on Test Set:")
        print(classification_report(test_labels_for_model, predictions))
    else:
        print("\nTest data not available for evaluating the optimized Random Forest classifier.")

else:
    print("Combined training features or labels not available for Random Forest optimization.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np # Ensure numpy is imported

# Score the tuned forest on the held-out test features.
predictions = best_rf_classifier.predict(test_combined_features) # Use the best classifier

# Per-class precision / recall / F1
print("Optimized Random Forest Classifier Evaluation:")
print(classification_report(test_labels_for_model, predictions))

# Overall accuracy
accuracy = accuracy_score(test_labels_for_model, predictions)
print(f"Accuracy Score: {accuracy}")

# Confusion matrix (rows: true labels, columns: predicted labels)
conf_matrix = confusion_matrix(test_labels_for_model, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Importance scores follow the column order of the combined feature matrix:
# shape features first, then the flattened texture features.
feature_importances = best_rf_classifier.feature_importances_
num_shape_features = train_shape_features.shape[1]

# The texture width normally comes from the flattened array built when the
# features were combined. If that variable does not exist in this kernel, fall
# back to (combined width - shape width), or 0 when that cannot be computed.
try:
    num_texture_features_flat = train_texture_features_flat.shape[1]
except NameError:
    num_texture_features_flat = (
        train_combined_features.shape[1] - train_shape_features.shape[1]
        if train_combined_features is not None and train_shape_features is not None
        else 0 # Default to 0 if cannot determine
    )

# Placeholder names: "shape_<i>" followed by "texture_<i>"
shape_feature_names = [f"shape_{i}" for i in range(num_shape_features)]
texture_feature_names = [f"texture_{i}" for i in range(num_texture_features_flat)]
all_feature_names = shape_feature_names + texture_feature_names

# Only build the ranking when every importance has a name
if len(feature_importances) != len(all_feature_names):
    print(f"Mismatch in feature counts: Model has {len(feature_importances)}, generated names have {len(all_feature_names)}")
else:
    feature_importance_series = pd.Series(feature_importances, index=all_feature_names)

    # Most important first
    sorted_feature_importances = feature_importance_series.sort_values(ascending=False)

    print("\nTop 20 Most Important Features:")
    print(sorted_feature_importances.head(20))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# CNN for binary classification of 64x64 single-channel nucleus crops:
# three Conv2D + MaxPooling2D stages with 32, 64 and 128 filters, followed by
# a 128-unit dense layer, dropout and one sigmoid output unit.
# NOTE(review): no pixel rescaling is applied in this cell. If
# train_images_for_model holds raw 0-255 values, consider scaling to [0, 1].
# Confirm the value range before changing anything.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid') # Sigmoid for binary classification
])

# Adam optimiser with binary cross-entropy, tracking accuracy
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Random rotations, zooms, shifts and flips applied on the fly during training
datagen = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=20,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
)

# Fit the data generator on the training data
datagen.fit(train_images_for_model) # Use the original train_images_for_model

# Train on augmented batches of 32 for 50 epochs.
# NOTE(review): the test split is passed as validation data here, so the
# per-epoch validation metrics are computed on the same data used for the
# final evaluation.
history = model.fit(
    datagen.flow(train_images_for_model, train_labels_for_model, batch_size=32),
    epochs=50, # Number of training epochs
    validation_data=(test_images_for_model, test_labels_for_model), # Use the original test_images_for_model and labels
)

print("CNN model training completed.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Sigmoid outputs for the test crops, turned into hard 0/1 labels at 0.5
predictions_prob = model.predict(test_images_for_model)
predictions = np.greater(predictions_prob, 0.5).astype("int32")

# Confusion matrix is computed up front; it is printed after the report
conf_matrix_cnn = confusion_matrix(test_labels_for_model, predictions)

# Per-class precision / recall / F1
print("CNN Classification Report:")
print(classification_report(test_labels_for_model, predictions))

print("CNN Confusion Matrix:")
print(conf_matrix_cnn)

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

# Build a stacked ensemble: handcrafted features plus the CNN probability feed
# a second-level Random Forest.

# CNN probabilities for both splits, reshaped to column vectors
train_cnn_predictions_prob = model.predict(train_images_for_model).reshape(-1, 1) # Use the original train_images_for_model
test_cnn_predictions_prob = model.predict(test_images_for_model).reshape(-1, 1) # Use the original test_images_for_model

# NOTE(review): the training-set probabilities come from `model` evaluated on
# train_images_for_model, which is the data passed to model.fit in the CNN cell,
# so the second-level model may over-trust this column. Consider out-of-fold
# CNN predictions for the training split.

# Defined up front so the evaluation cell can always test it against None,
# even when training below is skipped.
ensemble_classifier = None


# Combine CNN probabilities with handcrafted features for training
# (requires the handcrafted matrix to exist and to have matching row count)
if (train_combined_features is not None
        and train_combined_features.shape[0] == train_cnn_predictions_prob.shape[0]):
    train_ensemble_features = np.concatenate((train_combined_features, train_cnn_predictions_prob), axis=1)
    print(f"Shape of train_ensemble_features: {train_ensemble_features.shape}")
else:
    print("Mismatch in the number of training samples for combined features and CNN predictions.")
    train_ensemble_features = None # Or handle the error appropriately


# Combine CNN probabilities with handcrafted features for testing
if (test_combined_features is not None
        and test_combined_features.shape[0] == test_cnn_predictions_prob.shape[0]):
    test_ensemble_features = np.concatenate((test_combined_features, test_cnn_predictions_prob), axis=1)
    print(f"Shape of test_ensemble_features: {test_ensemble_features.shape}")
else:
    print("Mismatch in the number of testing samples for combined features and CNN predictions.")
    test_ensemble_features = None # Or handle the error appropriately


# --- Feature Selection (Optional but Recommended for Ensemble) ---
# SelectKBest keeps the top-k features ranked by ANOVA F-value.
train_ensemble_features_selected = None
test_ensemble_features_selected = None

if train_ensemble_features is not None and train_labels_for_model is not None:
    # Replace NaN / Inf so that the selector and SMOTE do not fail on them
    train_ensemble_features_clean = np.nan_to_num(train_ensemble_features)
    test_ensemble_features_clean = (
        np.nan_to_num(test_ensemble_features) if test_ensemble_features is not None else None
    )

    # Default: use every feature. Overwritten below if selection succeeds.
    train_ensemble_features_selected = train_ensemble_features_clean
    test_ensemble_features_selected = test_ensemble_features_clean

    # Selection needs more than one sample and some variance in the features
    if train_ensemble_features_clean.shape[0] > 1 and np.var(train_ensemble_features_clean, axis=0).sum() > 0:
        k = min(100, train_ensemble_features_clean.shape[1]) # Select top 100 features or fewer if less are available
        try:
            selector = SelectKBest(score_func=f_classif, k=k)
            train_ensemble_features_selected = selector.fit_transform(train_ensemble_features_clean, train_labels_for_model)
            print(f"Shape of train_ensemble_features_selected: {train_ensemble_features_selected.shape}")
            if test_ensemble_features_clean is not None:
                # Apply the same column selection to the test data
                test_ensemble_features_selected = selector.transform(test_ensemble_features_clean)
                print(f"Shape of test_ensemble_features_selected: {test_ensemble_features_selected.shape}")
        except ValueError as e:
            print(f"Could not perform feature selection: {e}")
            # Fall back to all features for BOTH splits so their columns agree
            train_ensemble_features_selected = train_ensemble_features_clean
            test_ensemble_features_selected = test_ensemble_features_clean
            print("Using all features for ensemble training.")
    else:
        print("Not enough samples or variance for feature selection. Using all features.")
else:
    print("Ensemble features not available for selection.")


# --- Train Second-Level Random Forest Classifier ---
# SMOTE balances the classes of the selected training features before fitting
if train_ensemble_features_selected is not None and train_labels_for_model is not None:
    try:
        smote_ensemble = SMOTE(random_state=42)
        train_ensemble_features_smote, train_labels_ensemble_smote = smote_ensemble.fit_resample(train_ensemble_features_selected, train_labels_for_model)

        # Instantiate and train the second-level Random Forest Classifier
        ensemble_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        ensemble_classifier.fit(train_ensemble_features_smote, train_labels_ensemble_smote)

        print("\nEnsemble Random Forest classifier trained successfully with SMOTE.")
    except ValueError as e:
        print(f"Could not train ensemble classifier: {e}")
        ensemble_classifier = None
else:
    print("Could not train ensemble classifier due to missing data.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the stacked ensemble on the test split; skipped when either the
# classifier or the selected test features are missing.
if test_ensemble_features_selected is None or ensemble_classifier is None:
    print("Ensemble model or test features not available for evaluation.")
else:
    ensemble_predictions = ensemble_classifier.predict(test_ensemble_features_selected)

    # Per-class precision / recall / F1
    print("Ensemble Model Classification Report:")
    print(classification_report(test_labels_for_model, ensemble_predictions))

    # Overall accuracy
    ensemble_accuracy = accuracy_score(test_labels_for_model, ensemble_predictions)
    print(f"Ensemble Model Accuracy Score: {ensemble_accuracy}")

    # Confusion matrix
    conf_matrix_ensemble = confusion_matrix(test_labels_for_model, ensemble_predictions)
    print("Ensemble Model Confusion Matrix:")
    print(conf_matrix_ensemble)

In [None]:
def segment_nuclei(image, min_size=100, fg_ratio=0.7):
    """Segment nuclei with Otsu thresholding followed by marker-based watershed.

    Args:
        image: Single-channel 8-bit grayscale image (required by Otsu
            thresholding and by the GRAY2BGR conversion used for watershed).
        min_size: Minimum connected-component area, in pixels, for a segmented
            object to be kept. Smaller objects are treated as noise.
        fg_ratio: Fraction of the maximum distance-transform value above which
            a pixel is taken as "sure foreground" (a watershed seed).

    Returns:
        A uint8 mask with the same shape as ``image``: 255 inside kept nuclei,
        0 elsewhere.
    """
    # Otsu picks the global threshold automatically (the 0 passed here is ignored)
    _, thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Morphological opening removes small specks from the mask
    kernel = np.ones((3, 3), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)

    # Sure background: everything outside the dilated foreground
    sure_bg = cv2.dilate(opening, kernel, iterations=3)

    # Sure foreground: pixels far enough from the mask boundary.
    # The cut-off is relative to the global maximum distance in this image.
    dist_transform = cv2.distanceTransform(opening, cv2.DIST_L2, 5)
    _, sure_fg = cv2.threshold(dist_transform, fg_ratio * dist_transform.max(), 255, 0)
    sure_fg = np.uint8(sure_fg)

    # Unknown region: between sure background and sure foreground
    unknown = cv2.subtract(sure_bg, sure_fg)

    # One marker per sure-foreground component; shift labels by one so that
    # sure background becomes 1 and 0 is free to mean "unknown"
    _, markers = cv2.connectedComponents(sure_fg)
    markers = markers + 1
    markers[unknown == 255] = 0

    # cv2.watershed expects a 3-channel image; boundaries come back as -1
    img_bgr = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    markers = cv2.watershed(img_bgr, markers)

    # Labels > 1 are nuclei (1 is background, -1 is a watershed boundary)
    segmented_mask = np.where(markers > 1, 255, 0).astype(np.uint8)

    # Drop components smaller than min_size. Label 0 is the background
    # component and is never kept.
    _, labels_img, stats, _ = cv2.connectedComponentsWithStats(segmented_mask, connectivity=8)
    keep_label = stats[:, cv2.CC_STAT_AREA] >= min_size
    keep_label[0] = False
    cleaned_mask = np.where(keep_label[labels_img], 255, 0).astype(np.uint8)

    return cleaned_mask

# One nucleus mask per red-channel image, for each split
train_segmented_masks = list(map(segment_nuclei, train_red_images))
test_segmented_masks = list(map(segment_nuclei, test_red_images))

print(f"Generated {len(train_segmented_masks)} training segmented masks.")
print(f"Generated {len(test_segmented_masks)} testing segmented masks.")

In [None]:
def apply_global_thresholding(image, threshold_value=50):
    """Binarise a grayscale image with one fixed cut-off.

    Pixels above ``threshold_value`` become 255 and all others become 0
    (``cv2.THRESH_BINARY``). Returns the binarised image.
    """
    # cv2.threshold returns (threshold_used, image); only the image is needed
    return cv2.threshold(image, threshold_value, 255, cv2.THRESH_BINARY)[1]

# Threshold the green channel of both splits.
# A notebook-level `threshold_value` (set by an earlier cell) takes precedence;
# when no such variable exists the default of 50 is used.
threshold_value_to_use = globals().get("threshold_value", 50)

train_green_thresholded = [
    apply_global_thresholding(green, threshold_value_to_use) for green in train_green_images
]
test_green_thresholded = [
    apply_global_thresholding(green, threshold_value_to_use) for green in test_green_images
]

print(f"Applied global thresholding to {len(train_green_thresholded)} training green images.")
print(f"Applied global thresholding to {len(test_green_thresholded)} testing green images.")

In [None]:
def detect_foci_and_label_nucleus(paired_image, segmented_mask, thresholded_green_image, foci_threshold=10, target_size=(64, 64)):
    """
    Labels each segmented nucleus as "damaged" or "normal" from its green
    signal and returns a size-normalized red-channel crop per nucleus.

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).
            Only the red image is used here; the green signal is taken from
            ``thresholded_green_image``.
        segmented_mask: The binary segmented mask for the nuclei.
        thresholded_green_image: The thresholded green channel image.
        foci_threshold: A nucleus is "damaged" when the number of non-zero
            thresholded green PIXELS inside it is strictly greater than this
            value (pixels are counted, not individual foci).
        target_size: (width, height) passed to ``cv2.resize`` for the output
            crops. Defaults to 64x64.

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
    """
    red_img, _green_img = paired_image
    labeled_nuclei = []

    # Each external contour of the mask is treated as one nucleus
    contours, _ = cv2.findContours(segmented_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for contour in contours:
        # Filled mask covering only the current nucleus
        nucleus_mask = np.zeros_like(segmented_mask)
        cv2.drawContours(nucleus_mask, [contour], -1, 255, -1)

        # Thresholded green signal restricted to this nucleus
        masked_green = cv2.bitwise_and(thresholded_green_image, thresholded_green_image, mask=nucleus_mask)
        foci_count = np.count_nonzero(masked_green)

        label = "damaged" if foci_count > foci_threshold else "normal"

        # Red-channel crop of the nucleus bounding box
        x, y, w, h = cv2.boundingRect(contour)
        red_roi = red_img[y:y+h, x:x+w]

        # Side of the square canvas: diameter of the minimum enclosing circle,
        # but never smaller than the bounding box. Rounding can otherwise leave
        # the side one pixel short, which would yield a negative pad width and
        # make np.pad raise.
        _, radius = cv2.minEnclosingCircle(contour)
        square_size = max(int(np.ceil(2 * radius)), w, h)

        # Zero-pad so the bounding box sits centered in the square canvas
        pad_x_before = (square_size - w) // 2
        pad_x_after = square_size - w - pad_x_before
        pad_y_before = (square_size - h) // 2
        pad_y_after = square_size - h - pad_y_before

        padded_red_roi = np.pad(red_roi, ((pad_y_before, pad_y_after), (pad_x_before, pad_x_after)), mode='constant')

        # Bring every nucleus to the same size
        normalized_red_nucleus = cv2.resize(padded_red_roi, target_size, interpolation=cv2.INTER_AREA)

        labeled_nuclei.append((normalized_red_nucleus, label))

    return labeled_nuclei

# Label every nucleus of every image pair. The paired images, segmentation
# masks and thresholded green images are index-aligned lists.
train_labeled_nuclei = []
for i, image_pair in enumerate(train_paired_images):
    train_labeled_nuclei.extend(
        detect_foci_and_label_nucleus(
            image_pair,
            train_segmented_masks[i],
            train_green_thresholded[i],
        )
    )

test_labeled_nuclei = []
for i, image_pair in enumerate(test_paired_images):
    test_labeled_nuclei.extend(
        detect_foci_and_label_nucleus(
            image_pair,
            test_segmented_masks[i],
            test_green_thresholded[i],
        )
    )

# Class counts per split (summing booleans counts the matching labels)
train_damaged_count = sum(label == "damaged" for _, label in train_labeled_nuclei)
train_normal_count = sum(label == "normal" for _, label in train_labeled_nuclei)
test_damaged_count = sum(label == "damaged" for _, label in test_labeled_nuclei)
test_normal_count = sum(label == "normal" for _, label in test_labeled_nuclei)

print(f"Training set: {train_damaged_count} damaged, {train_normal_count} normal.")
print(f"Testing set: {test_damaged_count} damaged, {test_normal_count} normal.")

In [None]:
def process_image_pair(paired_image):
    """
    Runs the full per-image pipeline on one (red, green) pair: nucleus
    segmentation, green-channel thresholding, then foci detection and labeling.

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
    """
    red_channel, green_channel = paired_image

    # Step 1: nuclei are segmented on the red channel
    nuclei_mask = segment_nuclei(red_channel)

    # Step 2: binarise the green channel. A notebook-level `threshold_value`
    # is used when one has been defined; otherwise the cut-off defaults to 50.
    cutoff = globals().get("threshold_value", 50)
    green_binary = apply_global_thresholding(green_channel, cutoff)

    # Step 3: label each nucleus from the green signal inside it
    return detect_foci_and_label_nucleus(paired_image, nuclei_mask, green_binary)

# Run the per-pair pipeline over the training images and pool the nuclei
all_train_labeled_nuclei = []
for paired_img in train_paired_images:
    all_train_labeled_nuclei += process_image_pair(paired_img)

# Same for the testing images
all_test_labeled_nuclei = []
for paired_img in test_paired_images:
    all_test_labeled_nuclei += process_image_pair(paired_img)

# Report how many nuclei were labeled in each split
print(f"Total labeled nuclei in training set: {len(all_train_labeled_nuclei)}")
print(f"Total labeled nuclei in testing set: {len(all_test_labeled_nuclei)}")

In [None]:
# Text label -> integer class id
label_mapping = {"normal": 0, "damaged": 1}

# Training split: stack the crops into (N, 64, 64, 1) and encode the labels
train_images_for_model = np.array(
    [crop for crop, _ in all_train_labeled_nuclei]
).reshape(-1, 64, 64, 1)
train_labels_for_model = np.array(
    [label_mapping[name] for _, name in all_train_labeled_nuclei]
)

# Testing split: same layout
test_images_for_model = np.array(
    [crop for crop, _ in all_test_labeled_nuclei]
).reshape(-1, 64, 64, 1)
test_labels_for_model = np.array(
    [label_mapping[name] for _, name in all_test_labeled_nuclei]
)

# Print shapes
print(f"Shape of train_images_for_model: {train_images_for_model.shape}")
print(f"Shape of train_labels_for_model: {train_labels_for_model.shape}")
print(f"Shape of test_images_for_model: {test_images_for_model.shape}")
print(f"Shape of test_labels_for_model: {test_labels_for_model.shape}")

In [None]:
import mahotas
from skimage.feature import hog
import cv2
import numpy as np
from skimage import measure # Import measure for regionprops

def extract_shape_features(image):
    """Extracts a fixed-length shape descriptor for one nucleus image.

    The input may be a binary mask (0/1 or 0/255) or a grayscale crop. A
    grayscale crop is binarised with Otsu's threshold first, because the
    region and contour measurements need a binary foreground.
    ``skimage.measure.label`` on a raw grayscale image would instead label
    every constant-intensity patch as its own region.

    Args:
        image: 2-D array holding a single nucleus.

    Returns:
        A list of ``8 + 7 + len(HOG)`` values: area, perimeter, major axis
        length, minor axis length, axis ratio, eccentricity, solidity,
        roundness; the 7 Hu moments of the largest contour; and the HOG
        vector of the image (16x16 cells, 1x1 blocks, 9 orientations, i.e.
        144 values for a 64x64 input). When no foreground is found, a list of
        zeros of the SAME length is returned so that feature vectors stack
        into a rectangular array.
    """
    # Masks given as 0/1 are scaled to 0/255; everything is handled as uint8
    if image.max() <= 1:
        image = image * 255
    image = image.astype(np.uint8)

    # HOG is computed on the image itself, and first, so that the all-zero
    # fallback below can match its length.
    try:
        hog_features = list(hog(image, pixels_per_cell=(16, 16), cells_per_block=(1, 1), orientations=9, feature_vector=True))
    except ValueError:
        hog_features = [0] * 9 # Example size, adjust based on HOG parameters

    num_scalar_features = 8  # area ... roundness, see the feature list below
    num_hu_moments = 7
    empty_features = [0] * (num_scalar_features + num_hu_moments + len(hog_features))

    # Binary foreground: keep true masks as they are, Otsu-threshold the rest
    if np.isin(image, (0, 255)).all():
        binary = image
    else:
        _, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Contours are needed for the Hu moments
    contours, _ = cv2.findContours(binary.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return empty_features

    # The largest contour is assumed to be the nucleus
    contour = max(contours, key=cv2.contourArea)

    # regionprops works on a labeled image
    labeled_image, num_labels = measure.label(binary, connectivity=2, return_num=True)
    if num_labels < 1:
        return empty_features

    # Pick the LARGEST region explicitly; the first label is merely the first
    # region met in scan order and may be a small fragment.
    properties = max(measure.regionprops(labeled_image), key=lambda region: region.area)

    area = properties.area
    perimeter = properties.perimeter
    major_axis_length = properties.major_axis_length if properties.major_axis_length is not None else 0
    minor_axis_length = properties.minor_axis_length if properties.minor_axis_length is not None else 0
    axis_ratio = major_axis_length / minor_axis_length if minor_axis_length > 0 else 0
    eccentricity = properties.eccentricity if properties.eccentricity is not None else 0
    solidity = properties.solidity if properties.solidity is not None else 0

    # Roundness / form factor: 1.0 for a perfect disc, smaller otherwise
    roundness = 4 * np.pi * area / (perimeter**2) if perimeter > 0 else 0

    # Hu moments of the largest contour
    hu_moments = cv2.HuMoments(cv2.moments(contour)).flatten()

    # Combine features
    features = [area, perimeter, major_axis_length, minor_axis_length, axis_ratio, eccentricity, solidity, roundness] + list(hu_moments) + list(hog_features)

    return features

# Shape descriptors for every labeled nucleus, stacked into one array per
# split. The label half of each (image, label) pair is not needed here.
train_shape_features = np.array(
    [extract_shape_features(crop) for crop, _ in all_train_labeled_nuclei]
)
test_shape_features = np.array(
    [extract_shape_features(crop) for crop, _ in all_test_labeled_nuclei]
)

print(f"Shape of train_shape_features: {train_shape_features.shape}")
print(f"Shape of test_shape_features: {test_shape_features.shape}")

In [None]:
import mahotas as mh
from skimage import feature, measure
import cv2
import numpy as np
from skimage.filters import gaussian#, laplacian # Comment out laplacian import
from skimage.feature import SIFT
import scipy.ndimage # Import scipy for laplacian

def extract_texture_features(image):
    """Extract a fixed-length texture descriptor from a grayscale image.

    The result always has 101 entries, concatenated in this order:

    * 13 Haralick features (mahotas), averaged over the GLCM directions
    * 27 histogram bins of the uniform LBP codes (P=8, R=1)
    * 16 GLCM properties (scikit-image): contrast, correlation, energy and
      homogeneity for 4 angles, each averaged over the distances 1, 3 and 5
    * 2 Laplacian-of-Gaussian statistics (mean, std)
    * 16 Gabor statistics (mean, std for 4 orientations x 2 wavelengths)
    * 25 Zernike moments (degree 8) of the Otsu-binarized image
    * 2 SIFT descriptor statistics (mean, std over all descriptors)

    Each group is computed on a best-effort basis: when an extractor raises,
    that group is filled with zeros so the vector length never changes.

    Args:
        image: 2-D array holding a grayscale image; it is cast to uint8.

    Returns:
        list[float]: the 101 texture features. Images smaller than 21 pixels
        in either dimension yield an all-zero vector.
    """
    # Sizes of the individual feature groups; their sum is the output length.
    n_haralick = 13
    n_lbp = 27
    n_glcm = 4 * 4            # 4 properties x 4 angles
    n_log = 2
    n_gabor = 4 * 2 * 2       # 4 orientations x 2 wavelengths x (mean, std)
    n_zernike = 25
    n_sift = 2
    n_total = n_haralick + n_lbp + n_glcm + n_log + n_gabor + n_zernike + n_sift

    # Several of the libraries used below expect 8-bit input
    image = image.astype(np.uint8)

    # The Gabor kernels are 21x21, so smaller images are not analysed at all
    if image.shape[0] < 21 or image.shape[1] < 21:
        return [0.0] * n_total

    # --- Haralick features (mahotas) ---
    # A constant image carries no texture and can make haralick() fail.
    if np.all(image == image[0, 0]):
        haralick_features = [0] * n_haralick
    else:
        try:
            # haralick() returns one row of 13 features per direction; average
            # the rows so the group is a flat list of 13 numbers instead of a
            # nested list (which would later be discarded as non-numeric).
            haralick_features = mh.features.haralick(image).mean(axis=0).tolist()
        except ValueError:
            haralick_features = [0] * n_haralick

    # --- Local Binary Patterns (scikit-image) ---
    try:
        lbp = feature.local_binary_pattern(image, P=8, R=1, method='uniform')
        # 27 unit-width bins over [0, 27]; the bin count is kept fixed so the
        # histogram length does not depend on the codes actually present.
        lbp_hist, _ = np.histogram(lbp, bins=np.arange(0, 28), density=True)
        lbp_features = lbp_hist.tolist()
    except Exception:
        lbp_features = [0] * n_lbp

    # --- GLCM properties (scikit-image) ---
    try:
        # graycomatrix/graycoprops live in skimage.feature (not skimage.measure),
        # and graycoprops accepts a single property name per call.
        glcm = feature.graycomatrix(
            image,
            distances=[1, 3, 5],
            angles=[0, np.pi / 4, np.pi / 2, 3 * np.pi / 4],
            symmetric=True,
            normed=True,
        )
        haralick_features_skimage = []
        for prop_name in ('contrast', 'correlation', 'energy', 'homogeneity'):
            # Result is (n_distances, n_angles); average over distances so
            # each property contributes exactly one value per angle.
            per_angle = feature.graycoprops(glcm, prop_name).mean(axis=0)
            haralick_features_skimage.extend(per_angle.tolist())
    except Exception:
        haralick_features_skimage = [0] * n_glcm

    # --- Laplacian of Gaussian (OpenCV) ---
    try:
        blurred_image = cv2.GaussianBlur(image, (0, 0), 1)
        log_image = cv2.Laplacian(blurred_image, cv2.CV_64F)
        log_features = [np.mean(log_image), np.std(log_image)]
    except Exception:
        log_features = [0] * n_log

    # --- Gabor filter bank (OpenCV) ---
    try:
        gabor_features = []
        kernels = []
        for theta in np.arange(0, np.pi, np.pi / 4):  # 4 orientations
            # The fourth getGaborKernel argument is the sinusoid wavelength
            for wavelength in [5, 10]:
                kern = cv2.getGaborKernel((21, 21), 5.0, theta, wavelength, 0.5, 0, ktype=cv2.CV_32F)
                kernels.append(kern)

        for kernel in kernels:
            fimg = cv2.filter2D(image, cv2.CV_8UC3, kernel)
            # Defensive: collapse to one channel should a 3-channel result come back
            if len(fimg.shape) == 3:
                fimg = cv2.cvtColor(fimg, cv2.COLOR_BGR2GRAY)
            gabor_features.extend([np.mean(fimg), np.std(fimg)])
    except Exception:
        gabor_features = [0] * n_gabor

    # --- Zernike moments (mahotas) on the Otsu-binarized image ---
    try:
        _, binary_nucleus = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        zernike_moments = mh.features.zernike_moments(
            binary_nucleus, radius=image.shape[0] // 2, degree=8
        ).tolist()
    except Exception:
        zernike_moments = [0] * n_zernike

    # --- SIFT descriptor statistics (scikit-image) ---
    try:
        detector_extractor = SIFT()
        detector_extractor.detect_and_extract(image)
        descriptors = detector_extractor.descriptors

        if descriptors is not None:
            # Summarise all descriptors with two numbers
            sift_features = [np.mean(descriptors), np.std(descriptors)]
        else:
            sift_features = [0] * n_sift
    except Exception:
        sift_features = [0] * n_sift

    features = (
        haralick_features
        + lbp_features
        + haralick_features_skimage
        + log_features
        + gabor_features
        + zernike_moments
        + sift_features
    )

    # Plain Python floats keep downstream numeric-type checks simple
    return [float(value) for value in features]


# One texture-feature list per labeled nucleus; only the image part of each
# (image, label) entry is used.
train_texture_features = [extract_texture_features(entry[0]) for entry in all_train_labeled_nuclei]
test_texture_features = [extract_texture_features(entry[0]) for entry in all_test_labeled_nuclei]

# Convert to numpy arrays
# Ensure all feature lists have the same length before converting to numpy array
def pad_features(features_list):
    """Right-pad per-sample feature lists with zeros into a 2-D float array.

    Args:
        features_list: Sequence of per-sample feature sequences, possibly of
            different lengths.

    Returns:
        np.ndarray of shape (n_samples, max_len) and dtype float. Python and
        NumPy integer/floating scalars are kept; any other element (for
        example a nested list) becomes 0.0. An empty ``features_list`` yields
        an array of shape (0, 0).
    """
    if len(features_list) == 0:
        return np.empty((0, 0), dtype=float)

    # NumPy scalars such as np.float32 / np.int64 are not subclasses of the
    # built-in float / int, so they must be listed explicitly.
    numeric_types = (int, float, np.integer, np.floating)

    max_len = max(len(f) for f in features_list)
    padded_features = np.zeros((len(features_list), max_len), dtype=float)
    for row, sample in enumerate(features_list):
        for col, value in enumerate(sample):
            if isinstance(value, numeric_types):
                padded_features[row, col] = float(value)
    return padded_features

# Turn both ragged feature lists into rectangular float arrays (same names are rebound)
train_texture_features, test_texture_features = (
    pad_features(train_texture_features),
    pad_features(test_texture_features),
)


print(f"Shape of train_texture_features: {train_texture_features.shape}")
print(f"Shape of test_texture_features: {test_texture_features.shape}")

In [None]:
# --- Training split: join shape and texture features column-wise ---
# Both matrices must have one row per nucleus, so their row counts must agree.
if train_shape_features.shape[0] != train_texture_features.shape[0]:
    print("Mismatch in the number of training samples for shape and texture features.")
    train_combined_features = None  # the model-training cell checks for None
else:
    # Collapse any trailing dimensions so every nucleus is a single row
    train_texture_features_flat = train_texture_features.reshape(train_texture_features.shape[0], -1)
    train_combined_features = np.concatenate([train_shape_features, train_texture_features_flat], axis=1)
    print(f"Shape of train_combined_features: {train_combined_features.shape}")


# --- Testing split: same procedure ---
if test_shape_features.shape[0] != test_texture_features.shape[0]:
    print("Mismatch in the number of testing samples for shape and texture features.")
    test_combined_features = None  # the model-training cell checks for None
else:
    test_texture_features_flat = test_texture_features.reshape(test_texture_features.shape[0], -1)
    test_combined_features = np.concatenate([test_shape_features, test_texture_features_flat], axis=1)
    print(f"Shape of test_combined_features: {test_combined_features.shape}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np # Ensure numpy is imported
from sklearn.feature_selection import SelectKBest, f_classif # Import feature selection

# Tune a Random Forest on the combined training features.
# SMOTE runs as a pipeline step so that, during cross-validation, only the
# training part of each fold is oversampled. Oversampling before the split
# would put synthetic neighbours of validation samples into the training
# folds and inflate the cross-validated recall used to pick hyper-parameters.
if train_combined_features is not None and train_labels_for_model is not None:
    # imbalanced-learn's Pipeline applies samplers during fit only
    from imblearn.pipeline import Pipeline as ImbPipeline

    # Resampled copy of the full training set, kept available under its
    # established names for any cell that refers to it.
    smote = SMOTE(random_state=42)
    train_features_smote, train_labels_smote = smote.fit_resample(train_combined_features, train_labels_for_model)

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': [None, 'balanced', 'balanced_subsample'] # Experiment with class weighting
    }

    # Random Forest wrapped together with its own SMOTE step
    rf = RandomForestClassifier(random_state=42)
    rf_pipeline = ImbPipeline(steps=[('smote', SMOTE(random_state=42)), ('rf', rf)])

    # Pipeline parameters are addressed as "<step>__<parameter>"
    pipeline_param_grid = {f"rf__{name}": values for name, values in param_grid.items()}

    # Optimize for recall; fit on the ORIGINAL data, resampling happens per fold
    grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=pipeline_param_grid, cv=5, scoring='recall', n_jobs=-1)
    grid_search.fit(train_combined_features, train_labels_for_model)

    # Report parameters without the step prefix, and expose the fitted forest
    # itself (refit on the SMOTE-resampled full training set) so that
    # attributes such as feature_importances_ stay directly accessible.
    best_params = {name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()}
    best_rf_classifier = grid_search.best_estimator_.named_steps['rf']

    print("Best parameters found by GridSearchCV:")
    print(best_params)

    print("\nOptimized Random Forest classifier trained successfully.")

    # Evaluate the best model on the original (non-SMOTE) test set
    if test_combined_features is not None and test_labels_for_model is not None:
        predictions = best_rf_classifier.predict(test_combined_features)
        print("\nOptimized Random Forest Classifier Evaluation on Test Set:")
        print(classification_report(test_labels_for_model, predictions))
    else:
        print("\nTest data not available for evaluating the optimized Random Forest classifier.")

else:
    print("Combined training features or labels not available for Random Forest optimization.")
    # Make the absence explicit instead of leaving a stale or undefined name
    best_rf_classifier = None

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np # Ensure numpy is imported

# Predict the test labels with the tuned Random Forest
predictions = best_rf_classifier.predict(test_combined_features)

# Per-class precision / recall / F1
print("Optimized Random Forest Classifier Evaluation:")
print(classification_report(test_labels_for_model, predictions))

# Overall accuracy
accuracy = accuracy_score(test_labels_for_model, predictions)
print(f"Accuracy Score: {accuracy}")

# Confusion matrix
conf_matrix = confusion_matrix(test_labels_for_model, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Impurity-based importances reported by the fitted forest
feature_importances = best_rf_classifier.feature_importances_

# Feature names follow the column order of the combined matrix:
# shape features first, flattened texture features second.
num_shape_features = train_shape_features.shape[1]

try:
    # Preferred source: the flattened texture matrix from the combination cell
    num_texture_features_flat = train_texture_features_flat.shape[1]
except NameError:
    # That matrix does not exist; derive the count from the combined matrix
    # when possible, otherwise fall back to zero texture columns.
    num_texture_features_flat = (
        train_combined_features.shape[1] - train_shape_features.shape[1]
        if train_combined_features is not None and train_shape_features is not None
        else 0
    )

# Generic positional names
shape_feature_names = [f"shape_{i}" for i in range(num_shape_features)]
texture_feature_names = [f"texture_{i}" for i in range(num_texture_features_flat)]
all_feature_names = shape_feature_names + texture_feature_names

# Only build the ranking when every importance can be given a name
if len(feature_importances) != len(all_feature_names):
    print(f"Mismatch in feature counts: Model has {len(feature_importances)}, generated names have {len(all_feature_names)}")
else:
    feature_importance_series = pd.Series(feature_importances, index=all_feature_names)

    # Highest importance first
    sorted_feature_importances = feature_importance_series.sort_values(ascending=False)

    print("\nTop 20 Most Important Features:")
    print(sorted_feature_importances.head(20))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# CNN for 64x64 single-channel inputs: three Conv/MaxPool stages followed by
# a dense head with dropout and one sigmoid unit for binary classification.
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Random geometric augmentation applied to the training images
datagen = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=20,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
)

# Fit the generator on the original training images
datagen.fit(train_images_for_model)

# Train on augmented batches; the original test images and labels are passed
# as validation data for per-epoch monitoring.
history = model.fit(
    datagen.flow(train_images_for_model, train_labels_for_model, batch_size=32),
    epochs=50,
    validation_data=(test_images_for_model, test_labels_for_model),
)

print("CNN model training completed.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Sigmoid outputs for the test images, turned into hard 0/1 labels by
# thresholding at 0.5.
predictions_prob = model.predict(test_images_for_model)
above_half = predictions_prob > 0.5
predictions = above_half.astype("int32")

# Per-class precision / recall / F1
print("CNN Classification Report:")
print(classification_report(test_labels_for_model, predictions))

# Confusion matrix
conf_matrix_cnn = confusion_matrix(test_labels_for_model, predictions)
print("CNN Confusion Matrix:")
print(conf_matrix_cnn)

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

# --- Stacked features: handcrafted features + CNN probability ---
# NOTE(review): `model` was fitted on train_images_for_model, so the
# training-set probabilities below are in-sample and probably more confident
# than the test-set ones. Out-of-fold CNN predictions would be a more faithful
# stacking feature, but they require retraining the CNN per fold.

# CNN probabilities as single-column 2-D arrays
train_cnn_predictions_prob = model.predict(train_images_for_model).reshape(-1, 1)
test_cnn_predictions_prob = model.predict(test_images_for_model).reshape(-1, 1)


# Training split: append the CNN probability as an extra column.
# The combined matrices may be None when the combination cell found a mismatch.
if train_combined_features is None:
    print("Combined training features not available for the ensemble.")
    train_ensemble_features = None
elif train_combined_features.shape[0] == train_cnn_predictions_prob.shape[0]:
    train_ensemble_features = np.concatenate((train_combined_features, train_cnn_predictions_prob), axis=1)
    print(f"Shape of train_ensemble_features: {train_ensemble_features.shape}")
else:
    print("Mismatch in the number of training samples for combined features and CNN predictions.")
    train_ensemble_features = None


# Testing split: same procedure
if test_combined_features is None:
    print("Combined testing features not available for the ensemble.")
    test_ensemble_features = None
elif test_combined_features.shape[0] == test_cnn_predictions_prob.shape[0]:
    test_ensemble_features = np.concatenate((test_combined_features, test_cnn_predictions_prob), axis=1)
    print(f"Shape of test_ensemble_features: {test_ensemble_features.shape}")
else:
    print("Mismatch in the number of testing samples for combined features and CNN predictions.")
    test_ensemble_features = None


# --- Feature selection: keep the top-k features by ANOVA F-value ---
if train_ensemble_features is not None and train_labels_for_model is not None:
    # Replace NaN/Inf so that SelectKBest and SMOTE accept the matrices
    train_ensemble_features_clean = np.nan_to_num(train_ensemble_features)
    test_ensemble_features_clean = (
        np.nan_to_num(test_ensemble_features) if test_ensemble_features is not None else None
    )

    # Default: use every feature; replaced below when selection succeeds
    train_ensemble_features_selected = train_ensemble_features_clean
    test_ensemble_features_selected = test_ensemble_features_clean

    # Selection needs more than one sample and at least one non-constant column
    if train_ensemble_features_clean.shape[0] > 1 and np.var(train_ensemble_features_clean, axis=0).sum() > 0:
        k = min(100, train_ensemble_features_clean.shape[1])  # top 100, or all if fewer exist
        try:
            selector = SelectKBest(score_func=f_classif, k=k)
            train_ensemble_features_selected = selector.fit_transform(train_ensemble_features_clean, train_labels_for_model)
            print(f"Shape of train_ensemble_features_selected: {train_ensemble_features_selected.shape}")
            if test_ensemble_features_clean is not None:
                # Apply the columns chosen on the training data to the test data
                test_ensemble_features_selected = selector.transform(test_ensemble_features_clean)
                print(f"Shape of test_ensemble_features_selected: {test_ensemble_features_selected.shape}")
        except ValueError as e:
            print(f"Could not perform feature selection: {e}")
            # Fall back to all features for BOTH splits so their columns stay aligned
            train_ensemble_features_selected = train_ensemble_features_clean
            test_ensemble_features_selected = test_ensemble_features_clean
            print("Using all features for ensemble training.")
    else:
        print("Not enough samples or variance for feature selection. Using all features.")
else:
    print("Ensemble features not available for selection.")
    train_ensemble_features_selected = None
    test_ensemble_features_selected = None


# --- Second-level Random Forest on the (selected) stacked features ---
if train_ensemble_features_selected is not None and train_labels_for_model is not None:
    try:
        # Oversample the minority class of the training data only
        smote_ensemble = SMOTE(random_state=42)
        train_ensemble_features_smote, train_labels_ensemble_smote = smote_ensemble.fit_resample(train_ensemble_features_selected, train_labels_for_model)

        ensemble_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        ensemble_classifier.fit(train_ensemble_features_smote, train_labels_ensemble_smote)

        print("\nEnsemble Random Forest classifier trained successfully with SMOTE.")
    except ValueError as e:
        print(f"Could not train ensemble classifier: {e}")
        ensemble_classifier = None
else:
    print("Could not train ensemble classifier due to missing data.")
    # The evaluation cell tests this name, so it must exist in every branch
    ensemble_classifier = None

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the second-level (ensemble) model, provided both the classifier and
# the selected test features exist.
if test_ensemble_features_selected is None or ensemble_classifier is None:
    print("Ensemble model or test features not available for evaluation.")
else:
    ensemble_predictions = ensemble_classifier.predict(test_ensemble_features_selected)

    # Per-class precision / recall / F1
    print("Ensemble Model Classification Report:")
    print(classification_report(test_labels_for_model, ensemble_predictions))

    # Overall accuracy
    ensemble_accuracy = accuracy_score(test_labels_for_model, ensemble_predictions)
    print(f"Ensemble Model Accuracy Score: {ensemble_accuracy}")

    # Confusion matrix
    conf_matrix_ensemble = confusion_matrix(test_labels_for_model, ensemble_predictions)
    print("Ensemble Model Confusion Matrix:")
    print(conf_matrix_ensemble)

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Read every entry of ``base_dir`` (in sorted name order) as grayscale.

    Entries that OpenCV cannot decode are reported with a warning and skipped.
    Returns the grayscale images packed into one ``np.array``.
    """
    grayscale_images = []
    for entry_name in sorted(os.listdir(base_dir)):
        img_path = os.path.join(base_dir, entry_name)
        bgr_image = cv2.imread(img_path)

        if bgr_image is not None:
            # cv2.imread delivers BGR channel order, hence COLOR_BGR2GRAY
            grayscale_images.append(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY))
        else:
            # imread signals an unreadable file by returning None
            print(f"Warning: Could not load image {img_path}")

    return np.array(grayscale_images)

# Folders are deliberately swapped (user's feedback): 'test images/red' feeds
# the training set and 'image/red' feeds the testing set.
train_red_images, test_red_images = (
    load_and_preprocess_images('DNA_Damage_Project/test images/red'),
    load_and_preprocess_images('DNA_Damage_Project/image/red'),
)


print(f"Loaded {len(train_red_images)} training red images.")
print(f"Loaded {len(test_red_images)} testing red images.")

In [None]:
import cv2
import os
import numpy as np

def load_and_preprocess_images(base_dir):
    """Load all files found in ``base_dir`` as grayscale images.

    File names are processed in sorted order. A file that OpenCV fails to
    read triggers a warning and is left out. The grayscale images are
    returned wrapped in ``np.array``.
    """
    collected = []
    for name in sorted(os.listdir(base_dir)):
        img_path = os.path.join(base_dir, name)
        loaded = cv2.imread(img_path)

        if loaded is None:
            # Unreadable entry: warn and move on to the next file
            print(f"Warning: Could not load image {img_path}")
        else:
            # OpenCV images are BGR; reduce them to a single gray channel
            collected.append(cv2.cvtColor(loaded, cv2.COLOR_BGR2GRAY))

    return np.array(collected)

# Folders are deliberately swapped (user's feedback): 'test images/green'
# feeds the training set and 'image/green' feeds the testing set.
train_green_images, test_green_images = (
    load_and_preprocess_images('DNA_Damage_Project/test images/green'),
    load_and_preprocess_images('DNA_Damage_Project/image/green'),
)


print(f"Loaded {len(train_green_images)} training green images.")
print(f"Loaded {len(test_green_images)} testing green images.")

In [None]:
def pair_images(red_images, green_images):
    """Pair each red-channel image with the green-channel image at the same index.

    Pairing is purely positional: both sequences are expected to be ordered
    identically (e.g. loaded from directories with matching, sorted file names).

    Args:
        red_images: Sequence of red-channel images.
        green_images: Sequence of green-channel images, same length and order.

    Returns:
        list of ``(red_image, green_image)`` tuples.

    Raises:
        ValueError: If the two sequences differ in length. A length mismatch
            means at least one image has no partner, so positional pairing
            could silently combine unrelated channels.
    """
    if len(red_images) != len(green_images):
        raise ValueError(
            f"Cannot pair images: got {len(red_images)} red and "
            f"{len(green_images)} green images."
        )
    return list(zip(red_images, green_images))

# Pair red and green channels per split; the splits already reflect the
# swapped train/test assignment requested by the user.
train_paired_images, test_paired_images = (
    pair_images(train_red_images, train_green_images),
    pair_images(test_red_images, test_green_images),
)


print(f"Created {len(train_paired_images)} paired training images.")
print(f"Created {len(test_paired_images)} paired testing images.")

In [None]:
def segment_nuclei(image, min_size=100, fg_ratio=0.7):
    """Segment nuclei with Otsu thresholding followed by marker-based watershed.

    Args:
        image: Single-channel grayscale image (assumed 8-bit, as the Otsu
            threshold call is applied to it directly).
        min_size: Minimum area, in pixels, a connected component of the final
            mask must have to be kept. Defaults to 100.
        fg_ratio: Fraction of the maximum distance-transform value above which
            a pixel is treated as sure foreground (a watershed seed).
            Defaults to 0.7.

    Returns:
        uint8 mask with the same shape as ``image``: 255 inside kept nuclei,
        0 elsewhere.
    """
    # Otsu picks the foreground/background threshold automatically
    _, thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Opening removes small specks from the thresholded mask
    kernel = np.ones((3, 3), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)

    # Sure background: everything outside the dilated foreground
    sure_bg = cv2.dilate(opening, kernel, iterations=3)

    # Sure foreground: pixels far enough from the foreground border
    dist_transform = cv2.distanceTransform(opening, cv2.DIST_L2, 5)
    _, sure_fg = cv2.threshold(dist_transform, fg_ratio * dist_transform.max(), 255, 0)
    sure_fg = np.uint8(sure_fg)

    # Pixels that are neither sure foreground nor sure background
    unknown = cv2.subtract(sure_bg, sure_fg)

    # One marker per sure-foreground component; shift labels by one so the
    # sure background becomes 1 and 0 is free to mean "unknown".
    _, markers = cv2.connectedComponents(sure_fg)
    markers = markers + 1
    markers[unknown == 255] = 0

    # cv2.watershed expects a 3-channel image
    img_bgr = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    markers = cv2.watershed(img_bgr, markers)

    # Labels > 1 are nuclei (1 is background, -1 marks watershed boundaries)
    segmented_mask = np.zeros_like(image, dtype=np.uint8)
    segmented_mask[markers > 1] = 255

    # Drop components smaller than min_size. A per-label lookup table is
    # indexed by the label image, which avoids one full-image pass per label.
    _, labels_img, stats, _ = cv2.connectedComponentsWithStats(segmented_mask, connectivity=8)
    keep_label = stats[:, cv2.CC_STAT_AREA] >= min_size
    keep_label[0] = False  # label 0 is the background component

    cleaned_mask = np.zeros_like(segmented_mask)
    cleaned_mask[keep_label[labels_img]] = 255

    return cleaned_mask

# One nucleus mask per red-channel image, for each split
train_segmented_masks = list(map(segment_nuclei, train_red_images))
test_segmented_masks = list(map(segment_nuclei, test_red_images))

print(f"Generated {len(train_segmented_masks)} training segmented masks.")
print(f"Generated {len(test_segmented_masks)} testing segmented masks.")

In [None]:
def apply_global_thresholding(image, threshold_value=50):
    """Binarize a grayscale image with a single fixed threshold.

    Pixels above ``threshold_value`` become 255, all others 0
    (``cv2.THRESH_BINARY``). Returns the binarized image.
    """
    # cv2.threshold returns (threshold_used, image); only the image is needed
    return cv2.threshold(image, threshold_value, 255, cv2.THRESH_BINARY)[1]

# Choose the global threshold: reuse a notebook-level `threshold_value` when
# an earlier cell defined one, otherwise fall back to the default of 50.
threshold_value_to_use = globals().get('threshold_value', 50)

# Binarize every green-channel image of both splits with that threshold
train_green_thresholded = [apply_global_thresholding(green, threshold_value_to_use) for green in train_green_images]
test_green_thresholded = [apply_global_thresholding(green, threshold_value_to_use) for green in test_green_images]

print(f"Applied global thresholding to {len(train_green_thresholded)} training green images.")
print(f"Applied global thresholding to {len(test_green_thresholded)} testing green images.")

In [None]:
def detect_foci_and_label_nucleus(paired_image, segmented_mask, thresholded_green_image, foci_threshold=10, target_size=(64, 64)):
    """
    Labels every segmented nucleus as "damaged" or "normal" from its green
    signal and returns a size-normalized red-channel crop for each nucleus.

    Args:
        paired_image: A tuple containing the red and green channel images
            (grayscale). Only the red image is used here.
        segmented_mask: The binary segmented mask for the nuclei.
        thresholded_green_image: The thresholded green channel image.
        foci_threshold: A nucleus is "damaged" when the number of non-zero
            thresholded-green PIXELS inside it exceeds this value (pixels are
            counted, not distinct foci).
        target_size: (width, height) passed to cv2.resize for the normalized
            crop. Defaults to (64, 64).

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
    """
    red_img, _ = paired_image
    labeled_nuclei = []

    # Each external contour of the mask is treated as one nucleus
    contours, _ = cv2.findContours(segmented_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)

        # Filled mask of this nucleus, drawn only inside its bounding box
        # (the offset shifts contour coordinates into ROI coordinates). This
        # avoids allocating and scanning a full-size mask for every nucleus.
        roi_mask = np.zeros((h, w), dtype=np.uint8)
        cv2.drawContours(roi_mask, [contour], -1, 255, thickness=cv2.FILLED, offset=(-x, -y))

        # Count thresholded-green pixels that fall inside the nucleus
        green_roi = thresholded_green_image[y:y+h, x:x+w]
        foci_count = np.count_nonzero(green_roi[roi_mask > 0])

        label = "damaged" if foci_count > foci_threshold else "normal"

        # Red-channel crop of the nucleus bounding box
        red_roi = red_img[y:y+h, x:x+w]

        # Side of the square canvas: the diameter of the minimum enclosing
        # circle, but never smaller than the bounding box so that the pad
        # widths below cannot become negative.
        _, radius = cv2.minEnclosingCircle(contour)
        square_size = max(int(np.ceil(2 * radius)), w, h)

        # Zero-pad the crop to the square, keeping the nucleus centred
        pad_x_before = (square_size - w) // 2
        pad_x_after = square_size - w - pad_x_before
        pad_y_before = (square_size - h) // 2
        pad_y_after = square_size - h - pad_y_before

        padded_red_roi = np.pad(red_roi, ((pad_y_before, pad_y_after), (pad_x_before, pad_x_after)), mode='constant')

        # Bring every nucleus to the same size for the downstream models
        normalized_red_nucleus = cv2.resize(padded_red_roi, target_size, interpolation=cv2.INTER_AREA)

        labeled_nuclei.append((normalized_red_nucleus, label))

    return labeled_nuclei

# Label the nuclei of every training image; mask and thresholded green image
# are looked up by the same index as the image pair.
train_labeled_nuclei = []
for i, paired in enumerate(train_paired_images):
    train_labeled_nuclei.extend(
        detect_foci_and_label_nucleus(paired, train_segmented_masks[i], train_green_thresholded[i])
    )

# Same for the testing images
test_labeled_nuclei = []
for i, paired in enumerate(test_paired_images):
    test_labeled_nuclei.extend(
        detect_foci_and_label_nucleus(paired, test_segmented_masks[i], test_green_thresholded[i])
    )

# Class counts per split (summing booleans counts the matches)
train_damaged_count = sum(label == "damaged" for _, label in train_labeled_nuclei)
train_normal_count = sum(label == "normal" for _, label in train_labeled_nuclei)
test_damaged_count = sum(label == "damaged" for _, label in test_labeled_nuclei)
test_normal_count = sum(label == "normal" for _, label in test_labeled_nuclei)

print(f"Training set: {train_damaged_count} damaged, {train_normal_count} normal.")
print(f"Testing set: {test_damaged_count} damaged, {test_normal_count} normal.")

In [None]:
def process_image_pair(paired_image):
    """
    Run the whole per-image labeling pipeline on one (red, green) pair.

    The red channel is segmented into nuclei, the green channel is binarised
    with a global threshold, and both are handed to
    detect_foci_and_label_nucleus together with the original pair.

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).

    Returns:
        Whatever detect_foci_and_label_nucleus returns for this pair: a list of
        (normalized red nucleus image, label) tuples, the label being
        "damaged" or "normal".
    """
    red_channel, green_channel = paired_image

    # Honour a notebook-level `threshold_value` when some cell has defined one;
    # otherwise fall back to 50.
    green_cutoff = globals().get("threshold_value", 50)

    nuclei_mask = segment_nuclei(red_channel)
    green_binary = apply_global_thresholding(green_channel, green_cutoff)

    return detect_foci_and_label_nucleus(paired_image, nuclei_mask, green_binary)

# Run the end-to-end pipeline over both splits, pooling the per-image results
# of each split into one flat list of (nucleus image, label) tuples.
all_train_labeled_nuclei = []
all_test_labeled_nuclei = []
for split_pairs, split_pool in (
    (train_paired_images, all_train_labeled_nuclei),
    (test_paired_images, all_test_labeled_nuclei),
):
    for paired_img in split_pairs:
        labeled_nuclei = process_image_pair(paired_img)
        split_pool.extend(labeled_nuclei)

# Report how many nuclei each split produced
print(f"Total labeled nuclei in training set: {len(all_train_labeled_nuclei)}")
print(f"Total labeled nuclei in testing set: {len(all_test_labeled_nuclei)}")

In [None]:
# String label -> integer class used by the models
label_mapping = {"normal": 0, "damaged": 1}

# Nucleus crops stacked into (N, 64, 64, 1) arrays (trailing axis = channel)
train_images_for_model = np.array(
    [nucleus[0] for nucleus in all_train_labeled_nuclei]
).reshape(-1, 64, 64, 1)
test_images_for_model = np.array(
    [nucleus[0] for nucleus in all_test_labeled_nuclei]
).reshape(-1, 64, 64, 1)

# Matching integer label vectors, in the same nucleus order as the images
train_labels_for_model = np.array(
    [label_mapping[nucleus[1]] for nucleus in all_train_labeled_nuclei]
)
test_labels_for_model = np.array(
    [label_mapping[nucleus[1]] for nucleus in all_test_labeled_nuclei]
)

# Sanity-check the resulting shapes
print(f"Shape of train_images_for_model: {train_images_for_model.shape}")
print(f"Shape of train_labels_for_model: {train_labels_for_model.shape}")
print(f"Shape of test_images_for_model: {test_images_for_model.shape}")
print(f"Shape of test_labels_for_model: {test_labels_for_model.shape}")

In [None]:
import mahotas
from skimage.feature import hog
import cv2
import numpy as np
from skimage import measure # Import measure for regionprops

def extract_shape_features(image):
    """Extract shape descriptors for the largest foreground region of a nucleus image.

    Every non-zero pixel is treated as foreground, both for the OpenCV contour
    search and for the scikit-image region labeling, so the two views of the
    nucleus agree even when a grayscale crop (rather than a strict 0/255 mask)
    is passed in.

    Args:
        image: 2-D array. Values in {0, 1} are scaled to {0, 255}; the array is
            then cast to uint8.

    Returns:
        A flat list laid out as
        [area, perimeter, major_axis_length, minor_axis_length, axis_ratio,
         eccentricity, solidity, roundness] + 7 Hu moments + HOG vector.
        The length depends only on the image shape; when no foreground is
        found a list of zeros of that same length is returned, so the results
        for equally sized images can always be stacked with np.array.
    """
    hog_orientations = 9
    hog_cell = (16, 16)

    # Ensure the image is 0/255 uint8 for the OpenCV / skimage calls below
    if image.max() <= 1:
        image = image * 255
    image = image.astype(np.uint8)

    # With 1x1-cell blocks HOG emits one `orientations`-bin histogram per whole
    # cell, so the vector length follows directly from the image shape.
    hog_length = (image.shape[0] // hog_cell[0]) * (image.shape[1] // hog_cell[1]) * hog_orientations
    # 8 regionprops-derived scalars + 7 Hu moments + HOG
    empty_features = [0] * (8 + 7 + hog_length)

    # Contours are needed for the Hu moments
    contours, _ = cv2.findContours(image.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return empty_features

    # Assume the largest contour is the nucleus
    contour = max(contours, key=cv2.contourArea)

    # Label the binarised foreground. Labeling the raw grayscale image would
    # split the nucleus into one region per distinct intensity value.
    labeled_image, num_labels = measure.label(image > 0, connectivity=2, return_num=True)
    if num_labels < 1:
        return empty_features

    # Use the largest region, not merely the first label in raster order
    properties = max(measure.regionprops(labeled_image), key=lambda region: region.area)

    area = properties.area
    perimeter = properties.perimeter
    major_axis_length = properties.major_axis_length if properties.major_axis_length is not None else 0
    minor_axis_length = properties.minor_axis_length if properties.minor_axis_length is not None else 0
    axis_ratio = major_axis_length / minor_axis_length if minor_axis_length > 0 else 0
    eccentricity = properties.eccentricity if properties.eccentricity is not None else 0
    solidity = properties.solidity if properties.solidity is not None else 0

    # Roundness / form factor: 1.0 for a perfect disc, smaller for irregular outlines
    roundness = 4 * np.pi * area / (perimeter**2) if perimeter > 0 else 0

    # Hu moments of the largest contour
    hu_moments = cv2.HuMoments(cv2.moments(contour)).flatten()

    # Histogram of Oriented Gradients computed on the uint8 image itself
    try:
        hog_features = hog(image, pixels_per_cell=hog_cell, cells_per_block=(1, 1),
                           orientations=hog_orientations, feature_vector=True)
    except ValueError:
        # Keep the vector length identical to the successful case
        hog_features = [0] * hog_length

    features = [area, perimeter, major_axis_length, minor_axis_length, axis_ratio,
                eccentricity, solidity, roundness] + list(hu_moments) + list(hog_features)

    return features

# Shape descriptors for every labeled nucleus, stacked one row per nucleus
train_shape_features = np.array(
    [extract_shape_features(nucleus_img) for nucleus_img, label in all_train_labeled_nuclei]
)
test_shape_features = np.array(
    [extract_shape_features(nucleus_img) for nucleus_img, label in all_test_labeled_nuclei]
)

print(f"Shape of train_shape_features: {train_shape_features.shape}")
print(f"Shape of test_shape_features: {test_shape_features.shape}")

In [None]:
import mahotas as mh
from skimage import feature, measure
import cv2
import numpy as np
from skimage.filters import gaussian#, laplacian # Comment out laplacian import
from skimage.feature import SIFT
import scipy.ndimage # Import scipy for laplacian

def extract_texture_features(image):
    """Extract a fixed-length texture descriptor from a grayscale nucleus image.

    Each feature family is computed best-effort: if a family fails, its slot is
    filled with zeros of the same length, so every call returns a vector with
    the same layout and length (133 values):

        13  Haralick features (mahotas), averaged over the co-occurrence directions
        27  LBP histogram bins (skimage, P=8, R=1, 'uniform')
        48  GLCM properties (skimage): contrast, correlation, energy, homogeneity,
            each over 3 distances x 4 angles
         2  Laplacian-of-Gaussian response mean / std
        16  Gabor responses: 4 orientations x 2 wavelengths x (mean, std)
        25  Zernike moments (mahotas, degree 8) of the Otsu-binarised image
         2  SIFT descriptor mean / std

    Args:
        image: 2-D grayscale array; it is cast to uint8 before use.

    Returns:
        A list of 133 Python floats. All zeros if either image side is below 21 px.
    """
    n_haralick = 13
    n_lbp = 27
    glcm_distances = [1, 3, 5]
    glcm_angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]
    glcm_props = ['contrast', 'correlation', 'energy', 'homogeneity']
    n_glcm = len(glcm_props) * len(glcm_distances) * len(glcm_angles)
    n_log = 2
    n_gabor = 4 * 2 * 2  # 4 orientations * 2 wavelengths * (mean, std)
    n_zernike = 25
    n_sift = 2

    # Several of the libraries below expect uint8 input
    image = image.astype(np.uint8)

    # Too small for the 21x21 Gabor kernels: return an all-zero vector of the
    # same length as the normal path so rows stay stackable.
    if image.shape[0] < 21 or image.shape[1] < 21:
        return [0.0] * (n_haralick + n_lbp + n_glcm + n_log + n_gabor + n_zernike + n_sift)

    # Haralick texture features (mahotas). A constant image is skipped up front
    # because its co-occurrence statistics are degenerate.
    if np.all(image == image[0, 0]):
        haralick_features = [0.0] * n_haralick
    else:
        try:
            # mahotas returns one row of 13 features per direction; average the
            # rows so this slot holds 13 scalars rather than nested lists.
            haralick_features = np.asarray(mh.features.haralick(image)).mean(axis=0).tolist()
        except ValueError:
            haralick_features = [0.0] * n_haralick

    # Local Binary Patterns (skimage)
    try:
        lbp = feature.local_binary_pattern(image, P=8, R=1, method='uniform')
        # 27 bins are kept so the vector layout stays unchanged.
        # NOTE(review): with P=8 the 'uniform' codes presumably span only 0..9,
        # leaving the upper bins empty - confirm before shrinking the histogram.
        lbp_hist, _ = np.histogram(lbp, bins=np.arange(0, 28), density=True)
        lbp_features = lbp_hist.tolist()
    except Exception:
        lbp_features = [0.0] * n_lbp

    # Gray-level co-occurrence properties (skimage). graycomatrix/graycoprops
    # live in skimage.feature, and graycoprops computes one property per call.
    try:
        glcm = feature.graycomatrix(image, distances=glcm_distances, angles=glcm_angles,
                                    symmetric=True, normed=True)
        haralick_features_skimage = []
        for prop_name in glcm_props:
            haralick_features_skimage.extend(feature.graycoprops(glcm, prop_name).flatten().tolist())
    except Exception:
        haralick_features_skimage = [0.0] * n_glcm

    # Laplacian of Gaussian (cv2): Gaussian blur (sigma=1) followed by a Laplacian
    try:
        blurred_image = cv2.GaussianBlur(image, (0, 0), 1)
        log_image = cv2.Laplacian(blurred_image, cv2.CV_64F)
        log_features = [np.mean(log_image), np.std(log_image)]
    except Exception:
        log_features = [0.0] * n_log

    # Gabor filter bank (cv2)
    try:
        gabor_features = []
        kernels = []
        for theta in np.arange(0, np.pi, np.pi / 4):  # 4 orientations
            for freq in [5, 10]:  # passed to getGaborKernel as the wavelength (lambd)
                kern = cv2.getGaborKernel((21, 21), 5.0, theta, freq, 0.5, 0, ktype=cv2.CV_32F)
                kernels.append(kern)

        # Mean and std of each filter response
        for kernel in kernels:
            fimg = cv2.filter2D(image, cv2.CV_8UC3, kernel)
            # Collapse to one channel if the response came back multi-channel
            if len(fimg.shape) == 3:
                fimg = cv2.cvtColor(fimg, cv2.COLOR_BGR2GRAY)
            gabor_features.extend([np.mean(fimg), np.std(fimg)])
    except Exception:
        gabor_features = [0.0] * n_gabor

    # Zernike moments (mahotas) of the Otsu-binarised image
    try:
        _, binary_nucleus = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # Degree 8 yields 25 moments
        zernike_moments = mh.features.zernike_moments(binary_nucleus, radius=image.shape[0]//2, degree=8).tolist()
    except Exception:
        zernike_moments = [0.0] * n_zernike

    # SIFT (skimage): summarise all descriptors by their overall mean and std
    try:
        detector_extractor = SIFT()
        detector_extractor.detect_and_extract(image)
        descriptors = detector_extractor.descriptors

        if descriptors is not None:
            sift_features = [np.mean(descriptors), np.std(descriptors)]
        else:
            sift_features = [0.0] * n_sift
    except Exception:
        sift_features = [0.0] * n_sift

    # Concatenate in the documented order
    features = (haralick_features + lbp_features + haralick_features_skimage
                + log_features + gabor_features + zernike_moments + sift_features)

    # Plain Python floats, so downstream padding/stacking never drops a value
    return [float(value) for value in features]


# Extract texture features for training and testing data
# (one feature list per labeled nucleus; the label in each tuple is not used here)
train_texture_features = [extract_texture_features(nucleus_img) for nucleus_img, label in all_train_labeled_nuclei]
test_texture_features = [extract_texture_features(nucleus_img) for nucleus_img, label in all_test_labeled_nuclei]

# Convert to numpy arrays
# Ensure all feature lists have the same length before converting to numpy array
def pad_features(features_list):
    """Zero-pad ragged feature lists to a common length and stack them.

    Args:
        features_list: Sequence of per-sample feature sequences, possibly of
            different lengths.

    Returns:
        A float array of shape (n_samples, max_len). Short rows are padded on
        the right with 0.0. Entries that are not real numbers (strings, nested
        lists, None, ...) become 0.0; Python ints/floats and NumPy integer /
        floating scalars are kept. An empty `features_list` yields an array of
        shape (0, 0) instead of raising.
    """
    if len(features_list) == 0:
        return np.empty((0, 0), dtype=float)

    max_len = max(len(f) for f in features_list)
    # NumPy scalars such as np.float32 / np.int64 are not instances of the
    # builtin int/float, so they must be listed explicitly to be kept.
    numeric_types = (int, float, np.integer, np.floating)

    padded_features = np.zeros((len(features_list), max_len), dtype=float)
    for row, f in enumerate(features_list):
        for col, x in enumerate(f):
            if isinstance(x, numeric_types):
                padded_features[row, col] = float(x)
    return padded_features

# Replace the per-nucleus feature lists with 2-D arrays. Each split is padded
# independently, i.e. to the longest feature list found within that split.
train_texture_features = pad_features(train_texture_features)
test_texture_features = pad_features(test_texture_features)


# Report the resulting (n_samples, n_features) shapes
print(f"Shape of train_texture_features: {train_texture_features.shape}")
print(f"Shape of test_texture_features: {test_texture_features.shape}")

In [None]:
# Training split: join shape and texture descriptors column-wise, provided
# both arrays describe the same number of nuclei.
if train_shape_features.shape[0] != train_texture_features.shape[0]:
    print("Mismatch in the number of training samples for shape and texture features.")
    train_combined_features = None # Or handle the error appropriately
else:
    train_texture_features_flat = train_texture_features.reshape(train_texture_features.shape[0], -1)
    train_combined_features = np.concatenate([train_shape_features, train_texture_features_flat], axis=1)
    print(f"Shape of train_combined_features: {train_combined_features.shape}")


# Testing split: same join, same sample-count guard.
if test_shape_features.shape[0] != test_texture_features.shape[0]:
    print("Mismatch in the number of testing samples for shape and texture features.")
    test_combined_features = None # Or handle the error appropriately
else:
    test_texture_features_flat = test_texture_features.reshape(test_texture_features.shape[0], -1)
    test_combined_features = np.concatenate([test_shape_features, test_texture_features_flat], axis=1)
    print(f"Shape of test_combined_features: {test_combined_features.shape}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np # Ensure numpy is imported
from sklearn.feature_selection import SelectKBest, f_classif # Import feature selection

# Tune a Random Forest on the combined training features, optimising recall.
#
# SMOTE is placed inside an imbalanced-learn Pipeline so that, during
# cross-validation, synthetic samples are generated from the training folds
# only. Resampling the whole training set *before* GridSearchCV lets synthetic
# neighbours of validation samples leak into the training folds and inflates
# the cross-validated recall used to pick the hyper-parameters.
from imblearn.pipeline import Pipeline as ImbPipeline

if train_combined_features is not None and train_labels_for_model is not None:
    # Resampled copy of the full training set. It is no longer used for the
    # search itself; the names are kept for any cell that reads them.
    smote = SMOTE(random_state=42)
    train_features_smote, train_labels_smote = smote.fit_resample(train_combined_features, train_labels_for_model)

    # Random Forest hyper-parameter grid
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': [None, 'balanced', 'balanced_subsample'] # Experiment with class weighting
    }

    rf = RandomForestClassifier(random_state=42)

    # Resample-then-classify pipeline; the grid keys are prefixed with the
    # name of the pipeline step they belong to.
    resample_then_classify = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('rf', rf),
    ])
    pipeline_param_grid = {f"rf__{name}": values for name, values in param_grid.items()}

    grid_search = GridSearchCV(estimator=resample_then_classify, param_grid=pipeline_param_grid,
                               cv=5, scoring='recall', n_jobs=-1) # Optimize for recall

    # Fit on the ORIGINAL training data; SMOTE is applied per fold by the pipeline
    grid_search.fit(train_combined_features, train_labels_for_model)

    # Report the parameters without the pipeline prefix, and expose the fitted
    # forest itself (refit on the SMOTE-resampled full training set) so that
    # `.predict` and `.feature_importances_` keep working for later cells.
    best_params = {name[len("rf__"):]: value for name, value in grid_search.best_params_.items()}
    best_rf_classifier = grid_search.best_estimator_.named_steps['rf']

    print("Best parameters found by GridSearchCV:")
    print(best_params)

    print("\nOptimized Random Forest classifier trained successfully.")

    # Evaluate the best model on the original (non-SMOTE) test set
    if test_combined_features is not None and test_labels_for_model is not None:
        predictions = best_rf_classifier.predict(test_combined_features)
        print("\nOptimized Random Forest Classifier Evaluation on Test Set:")
        print(classification_report(test_labels_for_model, predictions))
    else:
        print("\nTest data not available for evaluating the optimized Random Forest classifier.")

else:
    print("Combined training features or labels not available for Random Forest optimization.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np # Ensure numpy is imported

# Score the tuned Random Forest on the test features
predictions = best_rf_classifier.predict(test_combined_features)

print("Optimized Random Forest Classifier Evaluation:")
print(classification_report(test_labels_for_model, predictions))

accuracy = accuracy_score(test_labels_for_model, predictions)
print(f"Accuracy Score: {accuracy}")

conf_matrix = confusion_matrix(test_labels_for_model, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Per-feature importances, in the column order of the combined feature matrix
# (shape columns first, then texture columns)
feature_importances = best_rf_classifier.feature_importances_

num_shape_features = train_shape_features.shape[1]

# Width of the texture part: read it from the flattened texture array when the
# combining cell created it, otherwise derive it from the combined width.
try:
    num_texture_features_flat = train_texture_features_flat.shape[1]
except NameError:
    num_texture_features_flat = (
        train_combined_features.shape[1] - train_shape_features.shape[1]
        if train_combined_features is not None and train_shape_features is not None
        else 0 # Default to 0 if cannot determine
    )

# Positional placeholder names: shape_0..., texture_0...
shape_feature_names = [f"shape_{i}" for i in range(num_shape_features)]
texture_feature_names = [f"texture_{i}" for i in range(num_texture_features_flat)]
all_feature_names = [*shape_feature_names, *texture_feature_names]

# Only build the ranking when the names line up with the model's feature count
if len(feature_importances) != len(all_feature_names):
    print(f"Mismatch in feature counts: Model has {len(feature_importances)}, generated names have {len(all_feature_names)}")
else:
    feature_importance_series = pd.Series(feature_importances, index=all_feature_names)
    sorted_feature_importances = feature_importance_series.sort_values(ascending=False)

    print("\nTop 20 Most Important Features:")
    print(sorted_feature_importances.head(20))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# CNN for 64x64 single-channel nucleus crops, assembled layer by layer:
# three conv/pool stages, then a dense head with dropout and a sigmoid output.
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid')) # Sigmoid for binary classification

# Binary cross-entropy with Adam, tracking accuracy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Augmentation: small rotations, zooms and shifts plus flips on both axes
datagen = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=20,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
)

# Fit the data generator on the training data
datagen.fit(train_images_for_model)

# Train on augmented batches of 32 for 50 epochs; the test arrays are passed
# as validation data, so their metrics are reported after every epoch.
history = model.fit(
    datagen.flow(train_images_for_model, train_labels_for_model, batch_size=32),
    validation_data=(test_images_for_model, test_labels_for_model),
    epochs=50,
)

print("CNN model training completed.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Make predictions on the test set
# (the sigmoid outputs are turned into 0/1 classes with a 0.5 cutoff)
predictions_prob = model.predict(test_images_for_model)
predictions = (predictions_prob > 0.5).astype("int32")

# Generate and print classification report
print("CNN Classification Report:")
print(classification_report(test_labels_for_model, predictions))

# Generate and print confusion matrix
conf_matrix_cnn = confusion_matrix(test_labels_for_model, predictions)
print("CNN Confusion Matrix:")
print(conf_matrix_cnn)

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

# Stack the CNN's predicted probability onto the handcrafted features and
# train a second-level Random Forest on the result.
# NOTE(review): the training-row probabilities come from the same `model` that
# was fitted on those rows, so this extra feature is optimistic on the training
# split; out-of-fold CNN predictions would avoid that - consider for follow-up.

# CNN probabilities as (n_samples, 1) column vectors
train_cnn_predictions_prob = model.predict(train_images_for_model).reshape(-1, 1)
test_cnn_predictions_prob = model.predict(test_images_for_model).reshape(-1, 1)


# Combine CNN probabilities with handcrafted features for training.
# The combined features may be None if the combining cell hit a sample-count
# mismatch, so guard against that before reading `.shape`.
if train_combined_features is not None and train_combined_features.shape[0] == train_cnn_predictions_prob.shape[0]:
    train_ensemble_features = np.concatenate((train_combined_features, train_cnn_predictions_prob), axis=1)
    print(f"Shape of train_ensemble_features: {train_ensemble_features.shape}")
else:
    print("Mismatch in the number of training samples for combined features and CNN predictions.")
    train_ensemble_features = None


# Combine CNN probabilities with handcrafted features for testing
if test_combined_features is not None and test_combined_features.shape[0] == test_cnn_predictions_prob.shape[0]:
    test_ensemble_features = np.concatenate((test_combined_features, test_cnn_predictions_prob), axis=1)
    print(f"Shape of test_ensemble_features: {test_ensemble_features.shape}")
else:
    print("Mismatch in the number of testing samples for combined features and CNN predictions.")
    test_ensemble_features = None


# --- Feature Selection (Optional but Recommended for Ensemble) ---
# SelectKBest keeps the top-k features by ANOVA F-value; the selector is fitted
# on the training split only and then applied unchanged to the test split.
if (train_ensemble_features is not None
        and test_ensemble_features is not None
        and train_labels_for_model is not None):
    # Replace NaN/Inf before selection and training
    train_ensemble_features_clean = np.nan_to_num(train_ensemble_features)
    test_ensemble_features_clean = np.nan_to_num(test_ensemble_features)

    # Selection needs more than one sample and at least one non-constant feature
    if train_ensemble_features_clean.shape[0] > 1 and np.var(train_ensemble_features_clean, axis=0).sum() > 0:
        k = min(100, train_ensemble_features_clean.shape[1]) # Top 100 features, or all if fewer exist
        try:
            selector = SelectKBest(score_func=f_classif, k=k)
            train_ensemble_features_selected = selector.fit_transform(train_ensemble_features_clean, train_labels_for_model)
            test_ensemble_features_selected = selector.transform(test_ensemble_features_clean)
            print(f"Shape of train_ensemble_features_selected: {train_ensemble_features_selected.shape}")
            print(f"Shape of test_ensemble_features_selected: {test_ensemble_features_selected.shape}")
        except ValueError as e:
            print(f"Could not perform feature selection: {e}")
            # Fall back to using all features if selection fails
            train_ensemble_features_selected = train_ensemble_features_clean
            test_ensemble_features_selected = test_ensemble_features_clean
            print("Using all features for ensemble training.")
    else:
        print("Not enough samples or variance for feature selection. Using all features.")
        train_ensemble_features_selected = train_ensemble_features_clean
        test_ensemble_features_selected = test_ensemble_features_clean
else:
    print("Ensemble features not available for selection.")
    train_ensemble_features_selected = None
    test_ensemble_features_selected = None


# --- Train Second-Level Random Forest Classifier ---
# Bound up front so the evaluation cell can always test it against None,
# whichever branch below runs.
ensemble_classifier = None
if train_ensemble_features_selected is not None and train_labels_for_model is not None:
    try:
        # SMOTE balances the classes of the selected training features
        smote_ensemble = SMOTE(random_state=42)
        train_ensemble_features_smote, train_labels_ensemble_smote = smote_ensemble.fit_resample(train_ensemble_features_selected, train_labels_for_model)

        ensemble_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        ensemble_classifier.fit(train_ensemble_features_smote, train_labels_ensemble_smote)

        print("\nEnsemble Random Forest classifier trained successfully with SMOTE.")
    except ValueError as e:
        print(f"Could not train ensemble classifier: {e}")
        ensemble_classifier = None
else:
    print("Could not train ensemble classifier due to missing data.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the second-level model on the test split, if both the selected
# test features and a trained classifier are available.
if test_ensemble_features_selected is None or ensemble_classifier is None:
    print("Ensemble model or test features not available for evaluation.")
else:
    ensemble_predictions = ensemble_classifier.predict(test_ensemble_features_selected)

    # Per-class precision / recall / F1
    print("Ensemble Model Classification Report:")
    print(classification_report(test_labels_for_model, ensemble_predictions))

    # Overall accuracy
    ensemble_accuracy = accuracy_score(test_labels_for_model, ensemble_predictions)
    print(f"Ensemble Model Accuracy Score: {ensemble_accuracy}")

    # Confusion matrix
    conf_matrix_ensemble = confusion_matrix(test_labels_for_model, ensemble_predictions)
    print("Ensemble Model Confusion Matrix:")
    print(conf_matrix_ensemble)

In [None]:
def pair_images(red_images, green_images):
    """Zip red and green images together by position.

    Element i of the result is (red_images[i], green_images[i]); one pair is
    produced per red image. Pairing is purely positional, so both sequences
    must already be in matching order (e.g. loaded from identically named,
    sorted files).
    """
    return [(red_images[idx], green_images[idx]) for idx in range(len(red_images))]

# Pair the red and green channel images of each split by position
# (training red with training green, testing red with testing green).
train_paired_images = pair_images(train_red_images, train_green_images)
test_paired_images = pair_images(test_red_images, test_green_images)


# Report how many pairs each split produced
print(f"Created {len(train_paired_images)} paired training images.")
print(f"Created {len(test_paired_images)} paired testing images.")

In [None]:
def segment_nuclei(image):
    """Build a binary nucleus mask via Otsu thresholding and marker-based watershed.

    The image is thresholded with Otsu's method, cleaned with a morphological
    opening, split with a distance-transform-seeded watershed, and finally
    stripped of connected components smaller than 100 pixels.

    Returns a uint8 mask shaped like `image`: 255 on kept nuclei, 0 elsewhere.
    """
    struct_elem = np.ones((3, 3), np.uint8)

    # Otsu foreground estimate, opened twice to remove speckle
    otsu_binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    opened = cv2.morphologyEx(otsu_binary, cv2.MORPH_OPEN, struct_elem, iterations=2)

    # Dilating the foreground gives the area outside of which is sure background
    background_region = cv2.dilate(opened, struct_elem, iterations=3)

    # Sure foreground: pixels whose distance to the background exceeds 70% of the maximum
    distance_map = cv2.distanceTransform(opened, cv2.DIST_L2, 5)
    core_region = np.uint8(cv2.threshold(distance_map, 0.7 * distance_map.max(), 255, 0)[1])

    # Band between sure foreground and sure background, to be decided by watershed
    undecided = cv2.subtract(background_region, core_region)

    # One seed label per foreground core, shifted by one so that the sure
    # background carries label 1 and label 0 is free to mark "unknown"
    seeds = cv2.connectedComponents(core_region)[1] + 1
    seeds[undecided == 255] = 0

    # Watershed expects a 3-channel image
    seeds = cv2.watershed(cv2.cvtColor(image, cv2.COLOR_GRAY2BGR), seeds)

    # Labels above 1 are nuclei
    nuclei_mask = np.zeros_like(image, dtype=np.uint8)
    nuclei_mask[seeds > 1] = 255

    # Drop components below the minimum nucleus area (component 0 is background)
    min_size = 100
    component_count, component_map, component_stats, _ = cv2.connectedComponentsWithStats(
        nuclei_mask, connectivity=8
    )
    large_enough = [
        idx for idx in range(1, component_count)
        if component_stats[idx, cv2.CC_STAT_AREA] >= min_size
    ]

    cleaned_mask = np.zeros_like(nuclei_mask)
    cleaned_mask[np.isin(component_map, large_enough)] = 255
    return cleaned_mask

# One nucleus mask per red-channel image, in the same order as the images
train_segmented_masks = list(map(segment_nuclei, train_red_images))
test_segmented_masks = list(map(segment_nuclei, test_red_images))

print(f"Generated {len(train_segmented_masks)} training segmented masks.")
print(f"Generated {len(test_segmented_masks)} testing segmented masks.")

In [None]:
def apply_global_thresholding(image, threshold_value=50):
    """Binarise a grayscale image with one global cutoff.

    Uses cv2.THRESH_BINARY with a maximum value of 255 and returns only the
    thresholded image (the threshold value echoed back by OpenCV is dropped).
    """
    return cv2.threshold(image, threshold_value, 255, cv2.THRESH_BINARY)[1]

# Binarise the green channel of both splits with a single global cutoff:
# the notebook-level `threshold_value` if some cell has defined it, else 50.
threshold_value_to_use = globals().get("threshold_value", 50)

train_green_thresholded = [
    apply_global_thresholding(green_image, threshold_value_to_use)
    for green_image in train_green_images
]
test_green_thresholded = [
    apply_global_thresholding(green_image, threshold_value_to_use)
    for green_image in test_green_images
]

print(f"Applied global thresholding to {len(train_green_thresholded)} training green images.")
print(f"Applied global thresholding to {len(test_green_thresholded)} testing green images.")

In [None]:
def detect_foci_and_label_nucleus(paired_image, segmented_mask, thresholded_green_image, foci_threshold=10, target_size=(64, 64)):
    """
    Detects green foci within segmented nuclei and labels each nucleus
    as "damaged" or "normal".

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).
            Only the red channel is read here; the green signal comes from
            ``thresholded_green_image``.
        segmented_mask: The binary segmented mask for the nuclei.
        thresholded_green_image: The thresholded green channel image.
        foci_threshold: A nucleus is "damaged" when the number of non-zero
            thresholded green pixels inside it is strictly greater than this value.
        target_size: ``(width, height)`` passed to ``cv2.resize`` for the
            normalized crop. Defaults to (64, 64).

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
    """
    red_img, _green_img = paired_image
    labeled_nuclei = []

    # Each external contour of the mask is treated as one nucleus
    contours, _ = cv2.findContours(segmented_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for contour in contours:
        # Filled mask covering only the current nucleus
        nucleus_mask = np.zeros_like(segmented_mask)
        cv2.drawContours(nucleus_mask, [contour], -1, 255, -1)

        # Keep only the thresholded green pixels that fall inside this nucleus
        masked_green = cv2.bitwise_and(thresholded_green_image, thresholded_green_image, mask=nucleus_mask)

        # The "foci count" is a pixel count, not a count of distinct foci
        foci_count = np.count_nonzero(masked_green)
        label = "damaged" if foci_count > foci_threshold else "normal"

        # Crop the red channel to the nucleus bounding box
        x, y, w, h = cv2.boundingRect(contour)
        red_roi = red_img[y:y+h, x:x+w]

        # Side of the square canvas: the enclosing-circle diameter, but never
        # smaller than the bounding box. Without this clamp a thin or tiny
        # contour can give ceil(2 * radius) < w (or h), producing a negative
        # pad width that np.pad rejects.
        _, radius = cv2.minEnclosingCircle(contour)
        square_size = max(int(np.ceil(2 * radius)), w, h)

        # Zero-pad the crop so the bounding box is centered in the square
        pad_x_before = (square_size - w) // 2
        pad_x_after = square_size - w - pad_x_before
        pad_y_before = (square_size - h) // 2
        pad_y_after = square_size - h - pad_y_before

        padded_red_roi = np.pad(red_roi, ((pad_y_before, pad_y_after), (pad_x_before, pad_x_after)), mode='constant')

        # Bring every nucleus to the same size for the downstream models
        normalized_red_nucleus = cv2.resize(padded_red_roi, target_size, interpolation=cv2.INTER_AREA)

        labeled_nuclei.append((normalized_red_nucleus, label))

    return labeled_nuclei

# Label every nucleus in both splits. The paired images, segmentation masks and
# thresholded green images are looked up by the same position in each list.
train_labeled_nuclei = []
for idx, image_pair in enumerate(train_paired_images):
    train_labeled_nuclei += detect_foci_and_label_nucleus(
        image_pair, train_segmented_masks[idx], train_green_thresholded[idx]
    )

test_labeled_nuclei = []
for idx, image_pair in enumerate(test_paired_images):
    test_labeled_nuclei += detect_foci_and_label_nucleus(
        image_pair, test_segmented_masks[idx], test_green_thresholded[idx]
    )

# Class balance per split (summing booleans counts the matches)
train_damaged_count = sum(label == "damaged" for _, label in train_labeled_nuclei)
train_normal_count = sum(label == "normal" for _, label in train_labeled_nuclei)
test_damaged_count = sum(label == "damaged" for _, label in test_labeled_nuclei)
test_normal_count = sum(label == "normal" for _, label in test_labeled_nuclei)

print(f"Training set: {train_damaged_count} damaged, {train_normal_count} normal.")
print(f"Testing set: {test_damaged_count} damaged, {test_normal_count} normal.")

In [None]:
def process_image_pair(paired_image):
    """
    Run one (red, green) image pair through the whole labeling pipeline:
    nucleus segmentation, green-channel thresholding, then per-nucleus
    foci detection and labeling.

    Args:
        paired_image: A tuple containing the red and green channel images (grayscale).

    Returns:
        A list of tuples, where each tuple contains the normalized
        red nucleus image and its corresponding label ("damaged" or "normal").
    """
    red_channel, green_channel = paired_image

    # Cutoff for the green channel: a module-level `threshold_value` wins if
    # one exists in the notebook namespace, otherwise 50 is used.
    green_cutoff = globals().get("threshold_value", 50)

    # Step 1: nuclei mask from the red channel
    nuclei_mask = segment_nuclei(red_channel)

    # Step 2: binary green image
    green_binary = apply_global_thresholding(green_channel, green_cutoff)

    # Step 3: one (crop, label) tuple per detected nucleus
    return detect_foci_and_label_nucleus(paired_image, nuclei_mask, green_binary)

# Flatten the per-image results into one list of (crop, label) tuples per split
all_train_labeled_nuclei = [
    nucleus
    for image_pair in train_paired_images
    for nucleus in process_image_pair(image_pair)
]

all_test_labeled_nuclei = [
    nucleus
    for image_pair in test_paired_images
    for nucleus in process_image_pair(image_pair)
]

# Report the totals
print(f"Total labeled nuclei in training set: {len(all_train_labeled_nuclei)}")
print(f"Total labeled nuclei in testing set: {len(all_test_labeled_nuclei)}")

In [None]:
# String label -> integer class id
label_mapping = {"normal": 0, "damaged": 1}

# Image tensors: stack the nucleus crops and append a single channel axis
train_images_for_model = np.array(
    [crop for crop, _ in all_train_labeled_nuclei]
).reshape(-1, 64, 64, 1)
test_images_for_model = np.array(
    [crop for crop, _ in all_test_labeled_nuclei]
).reshape(-1, 64, 64, 1)

# Label vectors: one integer per nucleus, in the same order as the images
train_labels_for_model = np.array(
    [label_mapping[name] for _, name in all_train_labeled_nuclei]
)
test_labels_for_model = np.array(
    [label_mapping[name] for _, name in all_test_labeled_nuclei]
)

# Sanity-check the resulting shapes
print(f"Shape of train_images_for_model: {train_images_for_model.shape}")
print(f"Shape of train_labels_for_model: {train_labels_for_model.shape}")
print(f"Shape of test_images_for_model: {test_images_for_model.shape}")
print(f"Shape of test_labels_for_model: {test_labels_for_model.shape}")

In [None]:
import mahotas
from skimage.feature import hog
import cv2
import numpy as np
from skimage import measure # Import measure for regionprops

def extract_shape_features(image):
    """Extracts shape features from a nucleus image.

    Every non-zero pixel is treated as foreground, both for the OpenCV contour
    step and for the scikit-image region step, so the two agree on what the
    object is.

    Args:
        image: 2-D array. If its maximum is <= 1 it is scaled by 255 first;
            it is then cast to uint8.

    Returns:
        A flat list: 8 scalar descriptors (area, perimeter, major axis,
        minor axis, axis ratio, eccentricity, solidity, roundness), 7 Hu
        moments, then the HOG vector. The list has the same length for every
        image of a given size, including the all-zero fallback returned when
        no foreground object is found.
    """
    n_scalar_features = 8
    n_hu_moments = 7
    hog_cell = 16
    hog_orientations = 9

    if image.max() <= 1:
        image = image * 255
    image = image.astype(np.uint8)

    # With 1x1 blocks HOG yields one histogram per whole cell, so its length
    # is known up front. The zero fallback must use it too, otherwise stacking
    # the per-nucleus lists produces a ragged array.
    hog_length = (image.shape[0] // hog_cell) * (image.shape[1] // hog_cell) * hog_orientations
    empty_features = [0] * (n_scalar_features + n_hu_moments + hog_length)

    # Contours are used for the Hu moments
    contours, _ = cv2.findContours(image.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return empty_features

    # Assuming the largest contour is the nucleus
    contour = max(contours, key=cv2.contourArea)

    # Label the foreground only. Labeling the raw gray levels would split the
    # image into patches of equal intensity instead of objects.
    labeled_image, num_labels = measure.label(image > 0, connectivity=2, return_num=True)
    if num_labels < 1:
        return empty_features

    # Use the largest region, not simply the first label
    properties = max(measure.regionprops(labeled_image), key=lambda region: region.area)

    area = properties.area
    perimeter = properties.perimeter
    major_axis_length = properties.major_axis_length if properties.major_axis_length is not None else 0
    minor_axis_length = properties.minor_axis_length if properties.minor_axis_length is not None else 0
    axis_ratio = major_axis_length / minor_axis_length if minor_axis_length > 0 else 0
    eccentricity = properties.eccentricity if properties.eccentricity is not None else 0
    solidity = properties.solidity if properties.solidity is not None else 0

    # Roundness / form factor: 4*pi*A / P^2 (guarded against zero perimeter)
    roundness = 4 * np.pi * area / (perimeter**2) if perimeter > 0 else 0

    # Hu moments of the largest contour
    hu_moments = cv2.HuMoments(cv2.moments(contour)).flatten()

    # Histogram of Oriented Gradients over the whole image
    try:
        hog_features = hog(
            image,
            pixels_per_cell=(hog_cell, hog_cell),
            cells_per_block=(1, 1),
            orientations=hog_orientations,
            feature_vector=True,
        )
    except ValueError:
        hog_features = [0] * hog_length

    features = [area, perimeter, major_axis_length, minor_axis_length, axis_ratio, eccentricity, solidity, roundness] + list(hu_moments) + list(hog_features)

    return features

# One shape-feature row per labeled nucleus, stacked into a NumPy array per split
train_shape_features = np.array(
    [extract_shape_features(entry[0]) for entry in all_train_labeled_nuclei]
)
test_shape_features = np.array(
    [extract_shape_features(entry[0]) for entry in all_test_labeled_nuclei]
)

print(f"Shape of train_shape_features: {train_shape_features.shape}")
print(f"Shape of test_shape_features: {test_shape_features.shape}")

In [None]:
import mahotas as mh
from skimage import feature, measure
import cv2
import numpy as np
from skimage.filters import gaussian#, laplacian # Comment out laplacian import
from skimage.feature import SIFT
import scipy.ndimage # Import scipy for laplacian

def extract_texture_features(image):
    """Extracts various texture features from a grayscale image.

    The returned list always has the same length and layout:
    13 mahotas Haralick features (averaged over directions), 27 LBP histogram
    bins, 48 GLCM properties (4 properties x 3 distances x 4 angles),
    2 Laplacian-of-Gaussian statistics, 16 Gabor statistics, 25 Zernike
    moments and 2 SIFT descriptor statistics. Any family that cannot be
    computed is replaced by zeros of the matching length.

    Args:
        image: 2-D grayscale image; it is cast to uint8.

    Returns:
        A flat list of numeric texture features (all zeros if either side of
        the image is shorter than 21 pixels).
    """
    n_haralick = 13
    n_lbp = 27
    glcm_distances = [1, 3, 5]
    glcm_angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]
    glcm_props = ['contrast', 'correlation', 'energy', 'homogeneity']
    n_glcm = len(glcm_props) * len(glcm_distances) * len(glcm_angles)
    n_log = 2
    n_gabor = 4 * 2 * 2  # 4 orientations * 2 wavelengths * (mean, std)
    n_zernike = 25
    n_sift = 2

    # Several of the libraries below expect uint8 input
    image = image.astype(np.uint8)

    # The Gabor kernels are 21x21; skip images smaller than that
    if image.shape[0] < 21 or image.shape[1] < 21:
        return [0] * (n_haralick + n_lbp + n_glcm + n_log + n_gabor + n_zernike + n_sift)

    # Haralick texture features (mahotas). haralick() returns one row of 13
    # features per direction; average the rows so this contributes 13 plain
    # floats instead of nested lists.
    if np.all(image == image[0, 0]):
        # A constant image can cause issues with haralick
        haralick_features = [0] * n_haralick
    else:
        try:
            haralick_features = mh.features.haralick(image).mean(axis=0).tolist()
        except ValueError:
            haralick_features = [0] * n_haralick

    # Local Binary Patterns (skimage), summarized as a fixed 27-bin histogram
    try:
        lbp = feature.local_binary_pattern(image, P=8, R=1, method='uniform')
        lbp_hist, _ = np.histogram(lbp, bins=np.arange(0, 28), density=True)
        lbp_features = lbp_hist.tolist()
    except Exception:
        lbp_features = [0] * n_lbp

    # Gray-level co-occurrence properties (skimage). graycomatrix/graycoprops
    # are provided by skimage.feature, and graycoprops computes one property
    # per call, returning a (distances, angles) array.
    try:
        glcm = feature.graycomatrix(image, distances=glcm_distances, angles=glcm_angles, symmetric=True, normed=True)
        haralick_features_skimage = []
        for prop_name in glcm_props:
            haralick_features_skimage.extend(feature.graycoprops(glcm, prop_name).flatten().tolist())
    except Exception:
        haralick_features_skimage = [0] * n_glcm

    # Laplacian of Gaussian (cv2): mean and std of the response
    try:
        blurred_image = cv2.GaussianBlur(image, (0, 0), 1)
        log_image = cv2.Laplacian(blurred_image, cv2.CV_64F)
        log_features = [np.mean(log_image), np.std(log_image)]
    except Exception:
        log_features = [0] * n_log

    # Gabor filter bank (cv2): mean and std of each filtered image
    try:
        gabor_features = []
        kernels = []
        for theta in np.arange(0, np.pi, np.pi / 4):  # 4 orientations
            for freq in [5, 10]:  # passed as the kernel's wavelength argument
                kern = cv2.getGaborKernel((21, 21), 5.0, theta, freq, 0.5, 0, ktype=cv2.CV_32F)
                kernels.append(kern)

        for kernel in kernels:
            fimg = cv2.filter2D(image, cv2.CV_8UC3, kernel)
            # Collapse to one channel if the filter output came back as BGR
            if len(fimg.shape) == 3:
                fimg = cv2.cvtColor(fimg, cv2.COLOR_BGR2GRAY)
            gabor_features.extend([np.mean(fimg), np.std(fimg)])
    except Exception:
        gabor_features = [0] * n_gabor

    # Zernike moments (mahotas) of the Otsu-binarized image, degree 8
    try:
        _, binary_nucleus = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        zernike_moments = mh.features.zernike_moments(binary_nucleus, radius=image.shape[0]//2, degree=8).tolist()
    except Exception:
        zernike_moments = [0] * n_zernike

    # SIFT (skimage): mean and std over all descriptor values
    try:
        detector_extractor = SIFT()
        detector_extractor.detect_and_extract(image)
        descriptors = detector_extractor.descriptors

        if descriptors is not None:
            sift_features = [np.mean(descriptors), np.std(descriptors)]
        else:
            sift_features = [0] * n_sift
    except Exception:
        sift_features = [0] * n_sift

    # Fixed layout: mahotas Haralick, LBP, GLCM, LoG, Gabor, Zernike, SIFT
    features = haralick_features + lbp_features + haralick_features_skimage + log_features + gabor_features + zernike_moments + sift_features

    return features


# One texture-feature list per labeled nucleus (labels are ignored here)
train_texture_features = [extract_texture_features(entry[0]) for entry in all_train_labeled_nuclei]
test_texture_features = [extract_texture_features(entry[0]) for entry in all_test_labeled_nuclei]

# Convert to numpy arrays
# Ensure all feature lists have the same length before converting to numpy array
def pad_features(features_list):
    """Stack variable-length feature lists into one rectangular float array.

    Each row is right-padded with 0.0 up to the length of the longest row.
    Real-valued entries, including NumPy scalars such as ``np.float32`` and
    ``np.int64``, are converted with ``float``; any other entry (for example
    a nested list) becomes 0.0.

    Args:
        features_list: Sequence of per-sample feature sequences.

    Returns:
        A 2-D NumPy array of shape (n_samples, max_row_length), or an empty
        (0, 0) array when ``features_list`` is empty.
    """
    import numbers

    # max() over an empty sequence would raise; return an empty matrix instead
    if len(features_list) == 0:
        return np.empty((0, 0))

    max_len = max(len(row) for row in features_list)
    padded_features = []
    for row in features_list:
        # NumPy registers its scalar types with the numbers ABCs, so this also
        # accepts np.float32 / np.int64, which a plain (int, float) check drops.
        numeric_row = [float(value) if isinstance(value, numbers.Real) else 0.0 for value in row]
        numeric_row.extend([0.0] * (max_len - len(row)))
        padded_features.append(numeric_row)
    return np.array(padded_features)

# Turn both lists of feature lists into rectangular arrays (each split is padded on its own)
train_texture_features, test_texture_features = map(
    pad_features, (train_texture_features, test_texture_features)
)

print(f"Shape of train_texture_features: {train_texture_features.shape}")
print(f"Shape of test_texture_features: {test_texture_features.shape}")

In [None]:
# Training matrix: shape columns first, then the flattened texture columns.
# The two feature sets must describe the same number of nuclei.
if train_shape_features.shape[0] != train_texture_features.shape[0]:
    print("Mismatch in the number of training samples for shape and texture features.")
    train_combined_features = None
else:
    train_texture_features_flat = train_texture_features.reshape(train_texture_features.shape[0], -1)
    train_combined_features = np.concatenate(
        [train_shape_features, train_texture_features_flat], axis=1
    )
    print(f"Shape of train_combined_features: {train_combined_features.shape}")


# Testing matrix, built the same way
if test_shape_features.shape[0] != test_texture_features.shape[0]:
    print("Mismatch in the number of testing samples for shape and texture features.")
    test_combined_features = None
else:
    test_texture_features_flat = test_texture_features.reshape(test_texture_features.shape[0], -1)
    test_combined_features = np.concatenate(
        [test_shape_features, test_texture_features_flat], axis=1
    )
    print(f"Shape of test_combined_features: {test_combined_features.shape}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np # Ensure numpy is imported
from sklearn.feature_selection import SelectKBest, f_classif # Import feature selection

# Tune a Random Forest on the combined handcrafted features.
#
# SMOTE runs as a pipeline step so that, during cross-validation, synthetic
# samples are generated from the training folds only. Oversampling the whole
# training set before GridSearchCV lets synthetic neighbours of validation
# samples leak into the training folds and inflates the CV recall.
from imblearn.pipeline import Pipeline as ImbPipeline

if train_combined_features is not None and train_labels_for_model is not None:
    smote = SMOTE(random_state=42)
    rf = RandomForestClassifier(random_state=42)
    rf_pipeline = ImbPipeline(steps=[("smote", smote), ("rf", rf)])

    # Hyper-parameter grid; the `rf__` prefix routes each entry to the forest step
    param_grid = {
        'rf__n_estimators': [100, 200, 300],
        'rf__max_depth': [None, 10, 20, 30],
        'rf__min_samples_split': [2, 5, 10],
        'rf__min_samples_leaf': [1, 2, 4],
        'rf__class_weight': [None, 'balanced', 'balanced_subsample'] # Experiment with class weighting
    }

    # Optimize for recall with 5-fold cross-validation
    grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=param_grid, cv=5, scoring='recall', n_jobs=-1)

    # Fit on the original training data; resampling happens inside each fit
    grid_search.fit(train_combined_features, train_labels_for_model)

    # Report parameters without the pipeline prefix, and expose the fitted
    # forest itself so later cells can read `feature_importances_`.
    best_params = {name.split("__", 1)[1]: value for name, value in grid_search.best_params_.items()}
    best_rf_classifier = grid_search.best_estimator_.named_steps["rf"]

    # Resampled copy of the full training set, kept only so these names stay
    # available for inspection; it plays no part in model selection above.
    train_features_smote, train_labels_smote = smote.fit_resample(train_combined_features, train_labels_for_model)

    print("Best parameters found by GridSearchCV:")
    print(best_params)

    print("\nOptimized Random Forest classifier trained successfully.")

    # Evaluate the best model on the original (non-SMOTE) test set
    if test_combined_features is not None and test_labels_for_model is not None:
        predictions = best_rf_classifier.predict(test_combined_features)
        print("\nOptimized Random Forest Classifier Evaluation on Test Set:")
        print(classification_report(test_labels_for_model, predictions))
    else:
        print("\nTest data not available for evaluating the optimized Random Forest classifier.")

else:
    print("Combined training features or labels not available for Random Forest optimization.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np # Ensure numpy is imported

# Score the tuned Random Forest on the held-out test features
predictions = best_rf_classifier.predict(test_combined_features)

print("Optimized Random Forest Classifier Evaluation:")
print(classification_report(test_labels_for_model, predictions))

accuracy = accuracy_score(test_labels_for_model, predictions)
print(f"Accuracy Score: {accuracy}")

conf_matrix = confusion_matrix(test_labels_for_model, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Per-feature importances of the fitted forest
feature_importances = best_rf_classifier.feature_importances_

# Feature names follow the column order used when the matrices were combined:
# shape columns first, then texture columns.
num_shape_features = train_shape_features.shape[1]

# Texture column count: read it from the flattened texture matrix when that
# variable exists, otherwise derive it from the combined width, else use 0.
if "train_texture_features_flat" in globals():
    num_texture_features_flat = train_texture_features_flat.shape[1]
elif train_combined_features is not None and train_shape_features is not None:
    num_texture_features_flat = train_combined_features.shape[1] - train_shape_features.shape[1]
else:
    num_texture_features_flat = 0

# Generic positional names
shape_feature_names = [f"shape_{i}" for i in range(num_shape_features)]
texture_feature_names = [f"texture_{i}" for i in range(num_texture_features_flat)]
all_feature_names = [*shape_feature_names, *texture_feature_names]

# Only build the ranking when names and importances line up one-to-one
if len(feature_importances) != len(all_feature_names):
    print(f"Mismatch in feature counts: Model has {len(feature_importances)}, generated names have {len(all_feature_names)}")
else:
    feature_importance_series = pd.Series(feature_importances, index=all_feature_names)
    sorted_feature_importances = feature_importance_series.sort_values(ascending=False)

    print("\nTop 20 Most Important Features:")
    print(sorted_feature_importances.head(20))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Build the CNN layer by layer: three conv/pool stages, then a dense head
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for the binary label

# Binary cross-entropy with Adam, tracking accuracy
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Augmentation: random rotations, zooms, shifts and flips
datagen = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
)
datagen.fit(train_images_for_model)

# Train on augmented batches; the test arrays are passed as validation data
train_batches = datagen.flow(train_images_for_model, train_labels_for_model, batch_size=32)
history = model.fit(
    train_batches,
    epochs=50,
    validation_data=(test_images_for_model, test_labels_for_model),
)

print("CNN model training completed.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Sigmoid outputs on the test images, cut at 0.5 to get hard 0/1 predictions
predictions_prob = model.predict(test_images_for_model)
predictions = (predictions_prob > 0.5).astype(np.int32)

# Per-class precision / recall / F1
print("CNN Classification Report:")
print(classification_report(test_labels_for_model, predictions))

# Confusion matrix of true versus predicted labels
conf_matrix_cnn = confusion_matrix(test_labels_for_model, predictions)
print("CNN Confusion Matrix:")
print(conf_matrix_cnn)

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

# Stacking ensemble: handcrafted features plus the CNN's output probability,
# fed to a second-level Random Forest.
#
# NOTE(review): the training-set CNN probabilities come from the same CNN that
# was fitted on those samples, so this meta-feature is likely optimistic on the
# training side. Out-of-fold CNN predictions would avoid that; confirm whether
# this matters for the reported results.

# Stays None unless the second-level model is trained successfully below, so
# the evaluation cell can always test `ensemble_classifier is not None`.
ensemble_classifier = None

# CNN output probability for every nucleus, as (n_samples, 1) columns
train_cnn_predictions_prob = model.predict(train_images_for_model).reshape(-1, 1)
test_cnn_predictions_prob = model.predict(test_images_for_model).reshape(-1, 1)


# Combine CNN probabilities with handcrafted features for training
if train_combined_features is not None and train_combined_features.shape[0] == train_cnn_predictions_prob.shape[0]:
    train_ensemble_features = np.concatenate((train_combined_features, train_cnn_predictions_prob), axis=1)
    print(f"Shape of train_ensemble_features: {train_ensemble_features.shape}")
else:
    print("Mismatch in the number of training samples for combined features and CNN predictions.")
    train_ensemble_features = None


# Combine CNN probabilities with handcrafted features for testing
if test_combined_features is not None and test_combined_features.shape[0] == test_cnn_predictions_prob.shape[0]:
    test_ensemble_features = np.concatenate((test_combined_features, test_cnn_predictions_prob), axis=1)
    print(f"Shape of test_ensemble_features: {test_ensemble_features.shape}")
else:
    print("Mismatch in the number of testing samples for combined features and CNN predictions.")
    test_ensemble_features = None

# NaN/Inf-free copy of the test features (None when they are unavailable)
test_ensemble_features_clean = (
    None if test_ensemble_features is None else np.nan_to_num(test_ensemble_features)
)


# --- Feature Selection ---
# Keep the top-k features by ANOVA F-value, fitted on the training data only.
if train_ensemble_features is not None and train_labels_for_model is not None:
    # Replace NaN/Inf before scoring
    train_ensemble_features_clean = np.nan_to_num(train_ensemble_features)

    # Default: use every feature; overwritten below if selection succeeds
    train_ensemble_features_selected = train_ensemble_features_clean
    test_ensemble_features_selected = test_ensemble_features_clean

    if train_ensemble_features_clean.shape[0] > 1 and np.var(train_ensemble_features_clean, axis=0).sum() > 0:
        k = min(100, train_ensemble_features_clean.shape[1]) # Top 100 features, or all if fewer exist
        try:
            selector = SelectKBest(score_func=f_classif, k=k)
            train_ensemble_features_selected = selector.fit_transform(train_ensemble_features_clean, train_labels_for_model)
            print(f"Shape of train_ensemble_features_selected: {train_ensemble_features_selected.shape}")
            if test_ensemble_features_clean is not None:
                # Apply the same column selection to the test data
                test_ensemble_features_selected = selector.transform(test_ensemble_features_clean)
                print(f"Shape of test_ensemble_features_selected: {test_ensemble_features_selected.shape}")
        except ValueError as e:
            print(f"Could not perform feature selection: {e}")
            # Fall back to all features for both splits so they stay aligned
            train_ensemble_features_selected = train_ensemble_features_clean
            test_ensemble_features_selected = test_ensemble_features_clean
            print("Using all features for ensemble training.")
    else:
        print("Not enough samples or variance for feature selection. Using all features.")
else:
    print("Ensemble features not available for selection.")
    train_ensemble_features_selected = None
    test_ensemble_features_selected = None


# --- Train Second-Level Random Forest Classifier ---
# SMOTE is applied to the selected training features to handle class imbalance
if train_ensemble_features_selected is not None and train_labels_for_model is not None:
    try:
        smote_ensemble = SMOTE(random_state=42)
        train_ensemble_features_smote, train_labels_ensemble_smote = smote_ensemble.fit_resample(train_ensemble_features_selected, train_labels_for_model)

        ensemble_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        ensemble_classifier.fit(train_ensemble_features_smote, train_labels_ensemble_smote)

        print("\nEnsemble Random Forest classifier trained successfully with SMOTE.")
    except ValueError as e:
        print(f"Could not train ensemble classifier: {e}")
        ensemble_classifier = None
else:
    print("Could not train ensemble classifier due to missing data.")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the second-level model, provided both it and the selected test
# features are available.
if test_ensemble_features_selected is None or ensemble_classifier is None:
    print("Ensemble model or test features not available for evaluation.")
else:
    ensemble_predictions = ensemble_classifier.predict(test_ensemble_features_selected)

    # Per-class precision / recall / F1
    print("Ensemble Model Classification Report:")
    print(classification_report(test_labels_for_model, ensemble_predictions))

    # Overall accuracy
    ensemble_accuracy = accuracy_score(test_labels_for_model, ensemble_predictions)
    print(f"Ensemble Model Accuracy Score: {ensemble_accuracy}")

    # Confusion matrix of true versus predicted labels
    conf_matrix_ensemble = confusion_matrix(test_labels_for_model, ensemble_predictions)
    print("Ensemble Model Confusion Matrix:")
    print(conf_matrix_ensemble)