In [None]:
# Mount Google Drive into the Colab filesystem; every dataset path used by the
# later cells lives under this mount point.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
from tqdm import tqdm
import cv2
import os
import numpy as np
import pandas as pd
import shutil
import matplotlib.pyplot as plt
import random

In [None]:
# Dataset locations on the mounted Drive: PNG masks, PNG images and the two stage-1 CSV files.
MskFolder = "/content/drive/MyDrive/siim-acr-pneumothorax/png_masks"
ImgFolder = "/content/drive/MyDrive/siim-acr-pneumothorax/png_images"
# TrainCsv must point at the *train* CSV and TestCsv at the *test* CSV: the
# "Train Data Sample" / "Test Data Sample" printouts below are only truthful
# when each variable names the file it claims to.
TrainCsv = "/content/drive/MyDrive/siim-acr-pneumothorax/stage_1_train_images.csv"
TestCsv = "/content/drive/MyDrive/siim-acr-pneumothorax/stage_1_test_images.csv"

# Sorted directory listings (os.listdir alone returns entries in arbitrary order).
MaskFiles = sorted(os.listdir(MskFolder))
ImgFiles = sorted(os.listdir(ImgFolder))

# Report how many entries each folder holds.
print(f"Total Images: {len(ImgFiles)}")
print(f"Total Masks: {len(MaskFiles)}")

# Load both CSV files into DataFrames.
DataTrain = pd.read_csv(TrainCsv)
DataTest = pd.read_csv(TestCsv)

# Preview the first rows of each table.
print("Train Data Sample:\n", DataTrain.head())
print("Test Data Sample:\n", DataTest.head())

In [None]:
# Compare the two directory listings by filename to find images that have no
# mask of the same name.
ImgNames = set(ImgFiles)
MskNames = set(MaskFiles)
MissingMsks = ImgNames - MskNames

# Report how many images are unmatched.
print(f"Images missing masks: {len(MissingMsks)}")
# Show up to five unmatched filenames; sets are unordered, so which five is arbitrary.
print("Examples:", list(MissingMsks)[:5])

In [None]:
# Filenames containing "(1)" are treated as duplicate copies and are left out
# of the cleaned listings; every other filename is kept in its original order.
ImgFilesClean = [name for name in ImgFiles if '(1)' not in name]
MskFilesClean = [name for name in MaskFiles if '(1)' not in name]

# Report how many image files remain after the filter.
print(f"Cleaned image files: {len(ImgFilesClean)}")

In [None]:
# Build a per-class (negative / positive) train/test split of the PNG images and
# copy every selected image together with its same-named mask into dedicated
# train/test folders on Drive. The source folders are left untouched.


def _label_from_filename(filename: str) -> str:
    """Return the class label ('0' = negative, '1' = positive) encoded in a filename.

    NOTE(review): assumes the label is the last non-empty underscore-separated
    token of the file stem, e.g. ``12_train_1_.png`` -> ``'1'`` -- confirm
    against the dataset's naming scheme. A plain substring test such as
    ``'1' in filename`` is not usable here: digits elsewhere in the name would
    match, so one file could be classified as both negative and positive (or as
    neither) and could then be copied into both the train and the test folders.

    Raises:
        ValueError: if the extracted token is neither '0' nor '1'.
    """
    stem = os.path.splitext(filename)[0]
    tokens = [token for token in stem.split('_') if token]
    label = tokens[-1] if tokens else ''
    if label not in ('0', '1'):
        raise ValueError(f"Cannot determine class label from filename: {filename!r}")
    return label


def _copy_pairs(filenames, image_dst_dir: str, mask_dst_dir: str, desc: str) -> int:
    """Copy each image and its same-named mask into the given destination folders.

    An image whose mask file does not exist is skipped with a warning (neither
    file is copied). Returns the number of image/mask pairs actually copied.
    """
    copied = 0
    for image in tqdm(filenames, desc=desc, unit="file"):
        mask = image  # masks share the image's filename
        ImageSource = os.path.join(ImgDir, image)
        MskSource = os.path.join(MaskDir, mask)
        ImageDestination = os.path.join(image_dst_dir, image)
        MskDestination = os.path.join(mask_dst_dir, mask)
        if os.path.exists(MskSource):
            shutil.copy(ImageSource, ImageDestination)
            shutil.copy(MskSource, MskDestination)
            copied += 1
        else:
            print(f"Warning: Mask file for {image} not found, skipping...")
    return copied


# Source folders (defined in the configuration cell).
MaskDir = MskFolder
ImgDir = ImgFolder

# List the PNG files. Names containing "(1)" are excluded, matching the duplicate
# filter applied in the cleaning cell, so a duplicate copy cannot end up in a
# different split than its original. Sorting makes the listing (and therefore the
# seeded shuffle below) independent of os.listdir's arbitrary ordering.
MaskFiles = sorted(f for f in os.listdir(MaskDir) if f.endswith('.png') and '(1)' not in f)
ImgFiles = sorted(f for f in os.listdir(ImgDir) if f.endswith('.png') and '(1)' not in f)

# Every image must have a mask of the same name and vice versa.
assert set(ImgFiles) == set(MaskFiles), "Image files and mask files don't match!"

# Classify each image exactly once.
ImgLabels = {f: _label_from_filename(f) for f in ImgFiles}
NegativeImgs = [f for f in ImgFiles if ImgLabels[f] == '0']  # no pneumothorax
PositiveImgs = [f for f in ImgFiles if ImgLabels[f] == '1']  # pneumothorax

# Shuffle with a fixed seed so that re-running the cell reproduces the same
# split; an unseeded shuffle would copy a different split into the same output
# folders on every run, mixing train and test files on disk.
SplitSeed = 42
SplitRng = random.Random(SplitSeed)
SplitRng.shuffle(NegativeImgs)
SplitRng.shuffle(PositiveImgs)

# Fraction of each class that goes to the training set.
# NOTE(review): 0.3 sends only 30% of the data to training and 70% to testing --
# confirm this is intended rather than the reverse.
TrainRatio = 0.3
TrainNegativeSize = int(len(NegativeImgs) * TrainRatio)
TrainPositiveSize = int(len(PositiveImgs) * TrainRatio)

# Split each class into its train and test part.
TrainNegative = NegativeImgs[:TrainNegativeSize]
TestNegative = NegativeImgs[TrainNegativeSize:]
TrainPositive = PositiveImgs[:TrainPositiveSize]
TestPositive = PositiveImgs[TrainPositiveSize:]

# Combine both classes per split.
TestImgs = TestPositive + TestNegative
TrainImgs = TrainPositive + TrainNegative

# Sanity checks: no file is in both splits and every listed file is in exactly one.
assert not set(TrainImgs) & set(TestImgs), "Train and test splits overlap!"
assert len(TrainImgs) + len(TestImgs) == len(ImgFiles), "Split does not cover all images!"

# Output directories for train_images, test_images, train_masks, test_masks.
TestMskDir = '/content/drive/MyDrive/siim-acr-pneumothorax/test_masks'
TrainMskDir = '/content/drive/MyDrive/siim-acr-pneumothorax/train_masks'
TrainImgDir = '/content/drive/MyDrive/siim-acr-pneumothorax/train_images'
TestImgDir = '/content/drive/MyDrive/siim-acr-pneumothorax/test_images'
for OutDir in (TestMskDir, TrainImgDir, TrainMskDir, TestImgDir):
    os.makedirs(OutDir, exist_ok=True)

# Copy the training set (files are copied, not moved; the originals stay in place).
print("Moving images and masks to training set...")
_copy_pairs(TrainImgs, TrainImgDir, TrainMskDir, "Training set")

# Copy the testing set.
print("Moving images and masks to testing set...")
_copy_pairs(TestImgs, TestImgDir, TestMskDir, "Testing set")

# Report the size of each split.
print(f"Train set: {len(TrainImgs)} images and masks")
print(f"Test set: {len(TestImgs)} images and masks")