In [1]:
import os
from datetime import datetime

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl

import matplotlib.pyplot as plt
%matplotlib inline

# Seed both random number generators (NumPy and TensorFlow) with the same value
# so that repeated runs draw the same random numbers.
for seed_fn in (np.random.seed, tf.random.set_seed):
    seed_fn(42)

# Report the library versions and how many GPUs TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {tfk.__version__}")
print(f"GPU devices: {len(gpu_devices)}")

TensorFlow version: 2.17.1
Keras version: 3.5.0
GPU devices: 0


In [2]:
# Location of the dataset archive. Defaults to the Kaggle input path; set the
# MARS_DATASET_PATH environment variable to run the notebook somewhere else.
DATA_PATH = os.environ.get(
    "MARS_DATASET_PATH", "/kaggle/input/marsdataset/mars_for_students.npz"
)

# np.load on a .npz archive returns a lazy NpzFile that keeps the file open.
# The context manager closes it as soon as the arrays have been read.
with np.load(DATA_PATH) as data:
    training_set = data["training_set"]
    X_test = data["test_set"]

# Along the second axis, index 0 is used as the image and index 1 as its label mask.
X_train = training_set[:, 0]
y_train = training_set[:, 1]

print(f"Training X shape: {X_train.shape}")
print(f"Training y shape: {y_train.shape}")
print(f"Test X shape: {X_test.shape}")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/marsdataset/mars_for_students.npz'

In [None]:
# Append a trailing channel axis and divide the pixel values by 255.
# NOTE: this rebinds X_train / X_test, so running the cell a second time would
# add another axis and rescale again.
X_train = np.expand_dims(X_train, axis=-1) / 255.0
X_test = np.expand_dims(X_test, axis=-1) / 255.0

# Per-sample shape (everything after the first axis) and the number of
# distinct values found in the label masks.
input_shape = X_train.shape[1:]
num_classes = np.unique(y_train).size

print(f"Input shape: {input_shape}")
print(f"Number of classes: {num_classes}")

In [None]:
def create_segmentation_colormap(num_classes):
    """
    Build a color table with one entry per class.

    Samples matplotlib's 'viridis' colormap at `num_classes` evenly spaced
    positions in [0, 1] and returns the sampled colors, one row per class.
    'viridis' is used because it is perceptually uniform and remains
    readable for colorblind viewers.
    """
    sample_positions = np.linspace(0, 1, num_classes)
    palette = plt.cm.viridis(sample_positions)
    return palette

def apply_colormap(label, colormap=None):
    """
    Map every class id in a label mask to its color.

    Args:
        label: Array of class ids. Singleton dimensions are squeezed away
            and the values are cast to int before the lookup.
        colormap: Color table indexed by class id (row i is the color of
            class i). When omitted, a table is generated that is large
            enough to cover the largest class id present in `label`.

    Returns:
        Array with the squeezed shape of `label` followed by the trailing
        axes of one `colormap` row.
    """
    # Drop singleton axes (e.g. a trailing channel axis) before the lookup
    label = np.squeeze(label)
    class_ids = label.astype(int)

    if colormap is None:
        # Size the table by the largest id, not only by the number of distinct
        # ids: a mask that contains just the classes {0, 3} has two distinct
        # values but still needs a row at index 3.
        distinct_count = len(np.unique(label))
        largest_needed = int(class_ids.max()) + 1 if class_ids.size else 0
        colormap = create_segmentation_colormap(max(distinct_count, largest_needed))

    # Fancy indexing replaces every class id with its color row
    return colormap[class_ids]

def plot_sample_batch(X_train, y_train, num_samples=3):
    """
    Display randomly chosen image / label pairs side by side.

    Args:
        X_train: Indexable collection of images; X_train[i] is drawn in grayscale.
        y_train: Label masks aligned with X_train; y_train[i] is colorized
            with one color per class id.
        num_samples: Maximum number of pairs to draw. Clamped to the number
            of available images; nothing is drawn when that leaves zero.
    """
    # Sampling without replacement cannot draw more items than exist, so a
    # small dataset simply shows everything it has instead of raising.
    num_samples = min(num_samples, len(X_train))
    if num_samples <= 0:
        return

    # Randomly select indices
    indices = np.random.choice(len(X_train), size=num_samples, replace=False)

    # One shared color table for all panels so a class keeps the same color.
    # It must reach the largest class id even when some ids never occur.
    labels = np.asarray(y_train)
    num_classes = max(len(np.unique(labels)), int(labels.max()) + 1)
    colormap = create_segmentation_colormap(num_classes)

    # squeeze=False keeps `axes` two-dimensional even for a single row
    fig, axes = plt.subplots(
        num_samples, 2, figsize=(15, 4 * num_samples), squeeze=False
    )

    for (image_ax, label_ax), idx in zip(axes, indices):
        # Original image
        image_ax.imshow(X_train[idx], cmap='gray')
        image_ax.set_title(f'Image {idx}')
        image_ax.axis('off')

        # Segmentation mask
        label_ax.imshow(apply_colormap(y_train[idx], colormap))
        label_ax.set_title(f'Label {idx}')
        label_ax.axis('off')

    fig.tight_layout()
    plt.show()
    plt.close(fig)

In [None]:
# Show three randomly picked image / label pairs from the training data
print("Visualizing examples from the training set:")
plot_sample_batch(X_train=X_train, y_train=y_train, num_samples=3)

In [None]:
def find_similar_masks(reference_index, y_train):
    """
    Return the positions of all masks in y_train that equal the reference mask.

    The reference is y_train[reference_index]; a mask counts as a match only
    when np.array_equal reports it identical to the reference.
    """
    target = y_train[reference_index]

    matches = []
    for position, candidate in enumerate(y_train):
        if np.array_equal(candidate, target):
            matches.append(position)
    return matches

def plot_similar_masks(reference_index, similar_indices, X_train, y_train, num_samples=5):
    """
    Draw image / mask pairs for the first entries of similar_indices.

    Each row shows the grayscale image on the left and its mask (rendered
    with the 'viridis' colormap) on the right. At most num_samples rows are
    drawn. reference_index is accepted but not used by the body.
    """
    # Never draw more rows than there are indices to show
    row_count = min(num_samples, len(similar_indices))
    chosen = similar_indices[:row_count]

    plt.figure(figsize=(15, 6 * row_count))

    for row, idx in enumerate(chosen):
        # (pixels, colormap name, title) for the left and the right panel
        panels = (
            (X_train[idx], 'gray', f'Image {idx}'),
            (y_train[idx], 'viridis', f'Mask {idx}'),
        )
        for col, (pixels, cmap_name, title) in enumerate(panels, start=1):
            plt.subplot(row_count, 2, row * 2 + col)
            plt.imshow(pixels, cmap=cmap_name)
            plt.title(title)
            plt.axis('off')

    plt.tight_layout()
    plt.show()


# Collect every training sample whose mask is identical to the one at index 1005
reference_index = 1005
similar_indices = find_similar_masks(reference_index=reference_index, y_train=y_train)

# Report the matching indices, then draw the first five image / mask pairs
print(f"Images with matching segmentation masks to image {reference_index}: {similar_indices}")
plot_similar_masks(
    reference_index=reference_index,
    similar_indices=similar_indices,
    X_train=X_train,
    y_train=y_train,
    num_samples=5,
)

In [None]:
import numpy as np

def split_dataset_based_on_mask(reference_index, X_train, y_train):
    """
    Split a dataset by whether each mask equals the reference mask.

    Args:
        reference_index: Position in y_train of the mask to compare against.
        X_train: Array of images, aligned with y_train along the first axis.
        y_train: Array of segmentation masks.

    Returns:
        Tuple (X_train_split, y_train_split, X_aliens, y_aliens):
        the first pair holds the samples whose mask differs from the
        reference, the second pair ("aliens") holds the samples whose mask
        is identical to it. The original order is kept in both pairs.
    """
    # Extract the reference mask
    reference_mask = y_train[reference_index]

    # True where a sample carries exactly the reference mask. A boolean mask
    # selects both splits in one pass each; the previous `i not in list`
    # test rescanned the list of matches for every sample.
    is_alien = np.array(
        [np.array_equal(mask, reference_mask) for mask in y_train], dtype=bool
    )

    # Samples to keep: mask differs from the reference
    X_train_split = X_train[~is_alien]
    y_train_split = y_train[~is_alien]

    # Aliens: mask identical to the reference
    X_aliens = X_train[is_alien]
    y_aliens = y_train[is_alien]

    return X_train_split, y_train_split, X_aliens, y_aliens

# Separate every sample that shares its mask with the reference image
reference_index = 1224  # Reference image index
X_train_split, y_train_split, X_aliens, y_aliens = split_dataset_based_on_mask(reference_index, X_train, y_train)

# Print the split sizes for verification. X_train_split holds the samples whose
# mask differs from the reference; X_aliens holds the ones that match it.
print(f"Number of images in X_train_split (masks differing from the reference): {len(X_train_split)}")
print(f"Number of images in X_aliens (masks identical to the reference): {len(X_aliens)}")

In [None]:
# Shapes of the cleaned split, the alien split and the test set, in that order
for array in (X_train_split, y_train_split, X_aliens, y_aliens, X_test):
    print(array.shape)

In [None]:
# Cleaned split: images together with their label masks
plot_sample_batch(X_train_split, y_train_split, num_samples=3)

# Samples that were separated out as aliens
plot_sample_batch(X_aliens, y_aliens, num_samples=3)

In [None]:
def save_cleaned_dataset(X_train, y_train, X_test, filename="cleanedDataset.npz"):
    """
    Write a dataset to a compressed .npz archive.

    Args:
        X_train: Training images, stored under the key "X_train".
        y_train: Training label masks, stored under the key "y_train".
        X_test: Test images, stored under the key "X_test".
        filename: Destination path of the archive.
    """
    # Store exactly what the caller passed in rather than reading
    # notebook-level variables, so the function saves the data it is given.
    np.savez_compressed(filename, X_train=X_train, y_train=y_train, X_test=X_test)
    print(f"Saved cleaned dataset to {filename}")

# Save the cleaned split (alien samples removed) together with the test set
save_cleaned_dataset(X_train_split, y_train_split, X_test)

Saved cleaned dataset to cleanedDataset.npz
