In [24]:
import os
import zipfile
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split

# FILE DIRECTORY

In [25]:
# Path to your ZIP file
zip_path = r"C:\Users\hoang le\Downloads\archive.zip"
extract_dir = r"C:\Users\hoang le\Downloads\extracted"

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)


# LABEL

In [26]:
categories = ["benign", "malignant", "normal"]  # Label names
label_mapping = {category: idx for idx, category in enumerate(categories)}

print(f"Label mapping: {label_mapping}")

Label mapping: {'benign': 0, 'malignant': 1, 'normal': 2}


# DATA PREPARING


In [27]:
from PIL import Image
import numpy as np

# Initialize storage for images and labels
images = []
labels = []

# Process each category
for category in categories:
    folder_path = os.path.join(extract_dir, category)
    label = label_mapping[category]  # Get the label for this category
    
    for file in os.listdir(folder_path):
        if file.endswith(('.png', '.jpg', '.jpeg', '.bmp')):  # Supported image formats
            file_path = os.path.join(folder_path, file)
            try:
                # Open, convert to grayscale, and resize image
                img = Image.open(file_path).convert('L').resize((500, 500))
                images.append(np.array(img))  # Convert image to numpy array
                labels.append(label)          # Store the label
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")

# Convert lists to numpy arrays
images = np.array(images)
labels = np.array(labels)

# Display shapes
print(f"Images shape: {images.shape}")
print(f"Labels shape: {labels.shape}")


Images shape: (1578, 500, 500)
Labels shape: (1578,)


# SPLIT

In [31]:

# Initialize lists to hold images (original and masks)
original_images = []
mask_images = []

# Folder paths for each category
categories = ['benign', 'normal', 'malignant']

# Iterate over each category and load original and mask images
for category in categories:
    category_folder = os.path.join(extract_dir, category)  # Path to category folder
    original_images_folder = os.path.join(category_folder)  # Path to original images
    mask_images_folder = os.path.join(category_folder)  # Path to mask images
    
    # Iterate over the files in the original folder
    for file in os.listdir(original_images_folder):
        if file.endswith(('.png', '.jpg', '.jpeg')):
            # Extract the base name (without extension or "_mask")
            base_name = os.path.splitext(file)[0]  # Remove file extension
            
            # Construct the path for the original image and corresponding mask image
            original_image_path = os.path.join(original_images_folder, file)
            mask_image_path = os.path.join(mask_images_folder, f"{base_name}_mask{os.path.splitext(file)[1]}")  # Add _mask to the base name
            
            if os.path.exists(mask_image_path):  # Ensure that mask image exists
                # Open and convert to grayscale (if needed) and resize
                original_image = Image.open(original_image_path).convert('L').resize((500, 500))
                mask_image = Image.open(mask_image_path).convert('L').resize((500, 500))
                
                # Append to the lists
                original_images.append(np.array(original_image))
                mask_images.append(np.array(mask_image))

# Convert lists to numpy arrays for easier manipulation
original_images = np.array(original_images)
mask_images = np.array(mask_images)

# Split the dataset into X (original images) and Y (mask images)
X = original_images
y = mask_images

# Now split the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Display dataset shapes
print(f"Training set: X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"Validation set: X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
print(f"Test set: X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

Training set: X_train shape: (468, 500, 500), y_train shape: (468, 500, 500)
Validation set: X_val shape: (156, 500, 500), y_val shape: (156, 500, 500)
Test set: X_test shape: (156, 500, 500), y_test shape: (156, 500, 500)


# NORMALIZATION

In [30]:
# Normalize the original images (X) by scaling pixel values to [0, 1]
X_train = X_train / 255.0
X_val = X_val / 255.0
X_test = X_test / 255.0


