# download and import modules

In [2]:
!pip install pydicom

Collecting pydicom
  Downloading pydicom-2.4.4-py3-none-any.whl (1.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.8 MB[0m [31m17.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m1.7/1.8 MB[0m [31m24.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.4.4


In [1]:
#load the modules for data preprocess
import os
import cv2
import shutil
import torch
from torchvision import transforms
import pydicom
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from google.colab import drive
import re


In [4]:
# Mount Google Drive so the files stored there are reachable under /content/drive
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
# Unzip the archive of images to process.
# Any other archive can be substituted here; the later cells are meant to be re-run per archive.
!unzip /content/drive/MyDrive/CBIS-DDSM/roi_crop_train.zip

# Define how to name the converted images

In [6]:
# Mass case-description tables for the train and test splits
train_csv_path = '/content/drive/MyDrive/CBIS-DDSM/mass_case_description_train_set.csv'
test_csv_path = '/content/drive/MyDrive/CBIS-DDSM/mass_case_description_test_set.csv'

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [7]:
# Only these columns are needed to name images and to link ROI, crop and pathology
needed_columns = [
    'patient_id',
    'pathology',
    'image file path',
    'cropped image file path',
    'ROI mask file path',
]
train_need = train[needed_columns]
test_need = test[needed_columns]

In [8]:
# Stack the train rows on top of the test rows (row-wise concatenation is the default axis)
merged_df = pd.concat([train_need, test_need])

In [9]:
# Replace the per-split row labels with one continuous 0..n-1 index
merged_df = merged_df.reset_index(drop=True)

In [12]:
# Preview the first two rows. The crop_-1 / ROI_-1 / full_-1 columns visible in the
# recorded output are created by the cell that follows in this file (it was executed
# earlier, as In [11]).
merged_df.iloc[:2]

Unnamed: 0,patient_id,pathology,image file path,cropped image file path,ROI mask file path,crop_-1,ROI_-1,full_-1
0,P_00001,MALIGNANT,Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00001_LEFT_CC_1,Mass-Training_P_00001_LEFT_CC_1,Mass-Training_P_00001_LEFT_CC
1,P_00001,MALIGNANT,Mass-Training_P_00001_LEFT_MLO/1.3.6.1.4.1.959...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00001_LEFT_MLO_1,Mass-Training_P_00001_LEFT_MLO_1,Mass-Training_P_00001_LEFT_MLO


In [11]:
# Keep only the top-level folder of each path (the text before the first '/');
# that folder name is what identifies the image.
for new_column, path_column in (
    ('crop_-1', 'cropped image file path'),
    ('ROI_-1', 'ROI mask file path'),
    ('full_-1', 'image file path'),
):
    merged_df[new_column] = merged_df[path_column].map(lambda path: path.split('/')[0])

In [13]:
# Row-wise check: the cropped-image folder and the ROI-mask folder must be identical.
merged_df['same_items'] = merged_df['crop_-1'] == merged_df['ROI_-1']

# Row-wise check: crop folder vs. full-image folder. In the rows previewed above the
# crop folder is the full-image folder plus a trailing "_<number>"
# (e.g. "..._LEFT_CC_1" vs "..._LEFT_CC"), so the suffix is stripped before comparing.
# NOTE(review): the previous `isin` test looked the suffix-less full-image folder up
# among the suffixed crop folders; confirm the suffix rule holds for every row.
merged_df['full_same_items'] = (
    merged_df['crop_-1'].str.rsplit('_', n=1).str[0] == merged_df['full_-1']
)


def _pathology_is_consistent(folder_column):
    """Return a boolean Series: True where all rows sharing the row's folder name
    in `folder_column` carry exactly one distinct pathology label."""
    return merged_df.groupby(folder_column)['pathology'].transform('nunique').eq(1)


# For each naming scheme, flag rows whose folder name maps to more than one pathology.
merged_df['pathology_crop'] = _pathology_is_consistent('crop_-1')
merged_df['pathology_roi'] = _pathology_is_consistent('ROI_-1')
merged_df['pathology_full'] = _pathology_is_consistent('full_-1')

# Count the rows that fail each check
count_false = (~merged_df['same_items']).sum()
full_false = (~merged_df['full_same_items']).sum()
pathology_crop = (~merged_df['pathology_crop']).sum()
pathology_roi = (~merged_df['pathology_roi']).sum()
pathology_full = (~merged_df['pathology_full']).sum()

# Display the counts (the second line previously printed count_false again)
print("Number of False values:", count_false)
print("Number of full_False values:", full_false)
print("Number of pathology crop:", pathology_crop)
print("Number of pathylogy roi:", pathology_roi)
print("Number of pathylogy full:", pathology_full)

Number of False values: 0
Number of full_False values: 0
Number of pathology crop: 0
Number of pathylogy roi: 0
Number of pathylogy full: 0


We have decided to use the Subject ID as a method to name the images and track the corresponding ROI and pathology. Please note that one full image may contain multiple ROIs. To avoid missing any of them, it is important to add different suffixes, prefixes, etc., while keeping the Subject ID the same before saving the converted version.
Furthermore, to match ROIs to the corresponding full image, we will leverage the shared patient ID, left or right breast, and image view information. When the ROI and full image share these three pieces of information, we will match the ROI to the corresponding full image.
Please note that one full image could have multiple ROIs.

In [14]:
## Double-check whether the pathology is determined by the folder name alone.
## This cell does it for "image file path"; the same check can be repeated for the
## ROI and cropped-image paths.

# Folder name of the full image (text before the first '/')
merged_df['name'] = merged_df['image file path'].str.split('/').str[0]

# Rows with a unique 'image file path' (kept for reference; not used by the check below)
is_duplicate = merged_df.duplicated(subset=['image file path'], keep='first')
unique_df = merged_df[~is_duplicate]

# The check has to run on ALL rows: after de-duplicating on 'image file path', rows that
# share a path (and therefore a name) collapse to one, leaving nothing to compare.
labels_per_name = merged_df.groupby('name')['pathology'].transform('nunique')

# Rows whose name is associated with more than one pathology label
filtered_df = merged_df[labels_per_name > 1].reset_index(drop=True)

for name, group in filtered_df.groupby('name'):
    print("Inconsistent pathology for name:", name)
    print(group)

# Number of rows involved in an inconsistency (0 means every name has a single pathology)
print(len(filtered_df))

0


# convert dcm to png and save to a folder

In [None]:
#make sure the source and destination folder is ready
#for full images, pixel_array = ds.pixel_array.astype('uint16').
#for ROI and original cropped images(provided by CBIS-DDSM contributors),
#pixel_array = ds.pixel_array.astype('uint8').

def convert_dcm_to_png(source_folder, destination_root, dtype='uint8'):
    """Recursively convert every ``.dcm`` file under ``source_folder`` to PNG.

    Each PNG is written to ``destination_root/<source_part>/_<name>.png`` where
    ``<source_part>`` is the fourth-from-last '/'-separated component of the DICOM
    file's path and ``<name>`` is the DICOM file name without its extension. If that
    PNG already exists, further leading underscores are added until the name is free,
    so an existing file is never overwritten.

    Args:
        source_folder: Directory to scan; sub-directories are scanned recursively.
        destination_root: Directory under which the per-image folders are created.
        dtype: NumPy dtype the pixel array is cast to before saving. The default
            'uint8' is what this notebook uses for ROI masks and cropped images;
            pass 'uint16' for full images. The cast is a plain ``astype`` (no
            rescaling).

    Returns:
        A tuple ``(failed_count, table)``: the number of files that could not be
        converted, and a DataFrame with columns ``source_path``, ``png_path``,
        ``source_part`` and ``png_part`` describing each converted file.
    """
    columns = ['source_path', 'png_path', 'source_part', 'png_part']
    failed_count = 0
    records = []

    # Sorted so that the order in which name collisions are resolved is reproducible
    for item in sorted(os.listdir(source_folder)):
        source_item_path = os.path.join(source_folder, item)

        if os.path.isdir(source_item_path):
            # Descend into the sub-directory and fold its results into ours
            sub_failed_count, sub_df = convert_dcm_to_png(source_item_path, destination_root, dtype)
            failed_count += sub_failed_count
            records.extend(sub_df.to_dict('records'))

        elif item.endswith('.dcm'):
            try:
                # force=True lets pydicom read files that lack the standard DICOM header
                ds = pydicom.dcmread(source_item_path, force=True)
                pixel_array = ds.pixel_array.astype(dtype)
                image = Image.fromarray(pixel_array)

                # File name without extension
                name = os.path.splitext(os.path.basename(source_item_path))[0]

                # Fourth-from-last path component names the destination sub-folder
                source_part = source_item_path.split('/')[-4]
                destination_folder = os.path.join(destination_root, source_part)
                os.makedirs(destination_folder, exist_ok=True)

                # Prepend underscores until the target name is unused
                png_path = os.path.join(destination_folder, f"_{name}.png")
                while os.path.exists(png_path):
                    name = f"_{name}"
                    png_path = os.path.join(destination_folder, f"_{name}.png")

                image.save(png_path, format='PNG')

                records.append({
                    'source_path': source_item_path,
                    'png_path': png_path,
                    'source_part': source_part,
                    'png_part': png_path.split('/')[-2],
                })

            except Exception as e:
                # Best effort: report the file, count it, and keep going
                print(f"Failed to convert {source_item_path}: {e}")
                failed_count += 1

    return failed_count, pd.DataFrame(records, columns=columns)

# Specify the source folder and destination root
source_folder = "/content/mass_train_ROI/manifest-LyDgOQGl3853937313152078328/CBIS-DDSM"
destination_root = "/content/roi_train"

# Created from Python instead of `!mkdir` so that re-running the cell does not error
os.makedirs(destination_root, exist_ok=True)

# Convert DICOM files to PNG; returns the number of failed conversions and a table of results
full_failed, full_df = convert_dcm_to_png(source_folder, destination_root)

In [None]:
# Count the PNG images under the root folder, grouped by base name
root_folder = "/content/roi_train"

# base name (file name without extension) -> number of files carrying that name
unique_base_names_counts = {}

for folder, _, files in os.walk(root_folder):
    # Only ".png" files are counted; change the suffix to inspect .dcm or other formats
    png_base_names = [os.path.splitext(file)[0] for file in files if file.endswith(".png")]
    for base_name in png_base_names:
        if base_name in unique_base_names_counts:
            unique_base_names_counts[base_name] += 1
        else:
            unique_base_names_counts[base_name] = 1

# How many distinct base names were seen
num_unique_base_names = len(unique_base_names_counts)
print(f"Number of unique base names of images ending with '.png': {num_unique_base_names}")

# One line per base name with its total count
print("Unique base names of images ending with '.png' and their total counts:")
for base_name in unique_base_names_counts:
    count = unique_base_names_counts[base_name]
    print(f"{base_name}: {count}")

In [None]:
#move converted png images to one folder
def move_images(root_dir, dest_dir):
    """Move every file under ``root_dir`` into the single flat folder ``dest_dir``.

    Each file is renamed to ``<source folder path with '/' replaced by '_'>_<file name>``
    so that files from different sub-folders cannot collide.

    Args:
        root_dir: Directory tree to empty (walked recursively).
        dest_dir: Flat destination directory; created if missing. If it lies inside
            ``root_dir`` it is skipped during the walk, so files already in it are
            left untouched.
    """
    # The destination is the same for every file, so create it once up front
    os.makedirs(dest_dir, exist_ok=True)
    dest_abs = os.path.abspath(dest_dir)

    for foldername, subfolders, filenames in os.walk(root_dir):
        # Do not descend into the destination itself, otherwise files that were
        # already moved would be picked up and renamed again
        subfolders[:] = [
            sub for sub in subfolders
            if os.path.abspath(os.path.join(foldername, sub)) != dest_abs
        ]
        if os.path.abspath(foldername) == dest_abs:
            continue

        prefix = foldername.replace('/', '_')
        for filename in filenames:
            source_path = os.path.join(foldername, filename)
            dest_path = os.path.join(dest_dir, prefix + '_' + filename)

            shutil.move(source_path, dest_path)
            print(f"Moved: {source_path} -> {dest_path}")

# Root directory containing the converted images
root_dir = "/content/roi_train"

# Flat destination directory; created from Python instead of `!mkdir` so that
# re-running the cell does not error
dest_dir = "/content/roi_train_needed"
os.makedirs(dest_dir, exist_ok=True)

# Move the images
move_images(root_dir, dest_dir)


Please run the previous cells multiple times, with different root and destination folders, to convert all mass full images and ROIs to PNG format.

To make it easier to identify and remove the converted cropped images:

Download the converted ROIs and cropped images.
Sort the images by size; most of the cropped images should have a larger size and contain only black and white dots.
Manually remove the identified images.

# Find the mismatched ROI and full images

Ensure that the PNG format full images and ROI images are ready.

In [None]:
def extract_names_from_folder(folder_path):
    """List the files in ``folder_path`` whose names look like CBIS-DDSM mass PNGs.

    A file matches when its name contains ``Mass-<...>_(MLO|CC)<...>.png``.

    Args:
        folder_path: Directory to list (not recursive).

    Returns:
        A list of dicts, one per matching file, in sorted file-name order:
            'name': ``"Mass-<...>_<view>"`` (text up to and including MLO/CC),
            '1or2': last run of digits found after the view, as a string, or
                    None when nothing but ".png" follows the view,
            'path': ``folder_path`` joined with the file name.
    """
    # group(1): text between "Mass-" and the view, group(2): view, group(3): the rest
    pattern = re.compile(r"Mass-(.*?)_(MLO|CC)(.*?)\.png")
    extracted_data = []

    # Sorted so the result does not depend on the file system's listing order
    for filename in sorted(os.listdir(folder_path)):
        match = pattern.search(filename)
        if not match:
            continue

        # A name with no digits after the view used to raise IndexError here
        digits = re.findall(r'\d+', match.group(3))
        extracted_data.append({
            'name': f"Mass-{match.group(1)}_{match.group(2)}",
            '1or2': digits[-1] if digits else None,
            'path': os.path.join(folder_path, filename),
        })

    return extracted_data

# The two folders to compare: converted ROI images and converted full images
folder1_path = "/content/roi_train_needed"
folder2_path = "/content/full_train_needed"

# Parse the file names in each folder, then turn each listing into a DataFrame
data_in_folder1 = extract_names_from_folder(folder1_path)
df_folder1 = pd.DataFrame(data_in_folder1)

data_in_folder2 = extract_names_from_folder(folder2_path)
df_folder2 = pd.DataFrame(data_in_folder2)

In [None]:
# Attach the matching full image to every ROI by 'name'. This is a left join: each ROI
# row is kept, and the *_Full columns are NaN when no full image shares its name.
common_data_df = df_folder1.merge(
    df_folder2,
    how='left',
    on='name',
    suffixes=('_ROI', '_Full'),
)

In [None]:
# !find /content/roi -type f -name "*.png" | wc -l

378


In [None]:
# common_data_df has the columns 'path_ROI', 'path_Full' and 'name'

mismated_info = []  # (image name, full image size, ROI size) for every mismatched pair

for index, row in common_data_df.iterrows():
    roi_path = row['path_ROI']
    full_path = row['path_Full']
    name = roi_path.split('/')[-1].replace("_content_mass_test_failed_", "")

    # The left join leaves path_Full empty for ROIs without a matching full image
    if pd.isna(full_path):
        print(f"No full image found for {name}; skipped.")
        continue

    # Only the sizes are needed, so close both files right away
    with Image.open(roi_path) as roi_image, Image.open(full_path) as full_image:
        roi_size = roi_image.size
        full_size = full_image.size

    if roi_size != full_size:
        mismated_info.append((name, full_size, roi_size))
        print(f"Found one difference in sizes for {name}.")

print("All images successfully processed.")

# Save the mismatch information to CSV (change the path to save it elsewhere)
mismatch_csv = 'mismated_image_info.csv'
if mismated_info:
    df_mismated = pd.DataFrame(mismated_info, columns=['Image Name', 'Full Image Size', 'ROI Size'])
    df_mismated.to_csv(mismatch_csv, index=False)
    # Report the file name that was actually written
    print(f"Mismated images information saved to {mismatch_csv}.")
else:
    print("No mismated images found.")

# map ROI to the original image

In [None]:
# Folder for the mapped images; it has to exist before the first save
mapped_dir = "/content/train_same_size"
os.makedirs(mapped_dir, exist_ok=True)

for index, row in common_data_df.iterrows():
    roi_path = row['path_ROI']
    full_path = row['path_Full']
    name = roi_path.split('/')[-1].replace("_content_mass_test_failed_", "")

    # The left join leaves path_Full empty for ROIs without a matching full image
    if pd.isna(full_path):
        print(f"No full image found for {name}; skipped.")
        continue

    with Image.open(roi_path) as roi_file, Image.open(full_path) as full_file:
        roi_pixels = np.array(roi_file.convert("RGBA"))
        full_image = full_file.copy()  # detached copy, usable after the file is closed

    # Pixels whose first channel is exactly 255 become fully transparent white;
    # all other pixels keep their value and stay opaque.
    roi_pixels[roi_pixels[..., 0] == 255] = (255, 255, 255, 0)
    roi_image = Image.fromarray(roi_pixels)

    # Resize the ROI image to the full image size when they differ
    if roi_image.size != full_image.size:
        roi_image = roi_image.resize(full_image.size)
        print(f"found one difference sizes {name} ")

    # Paste the ROI at the top-left corner, using its own alpha channel as the mask:
    # opaque ROI pixels overwrite the full image, transparent ones leave it visible
    full_image.paste(roi_image, (0, 0), roi_image)

    # Save the mapped image under the ROI's file name
    full_image.save(os.path.join(mapped_dir, name))

print("All images successfully processed")

# extract 598 by 598 pixel

Ensure the mapped images are ready

In [None]:
def find_nonzero_center(img):
    """Return the integer centroid ``(x, y)`` of the non-zero pixels of ``img``.

    Args:
        img: A PIL image, or anything ``np.asarray`` turns into a
            ``(height, width)`` or ``(height, width, channels)`` array.
            For multi-channel input a pixel counts as non-zero when any of
            its channels is non-zero.

    Returns:
        ``(center_x, center_y)``: floor of the mean column and mean row of the
        non-zero pixels, as Python ints. When every pixel is zero, the image
        centre ``(width // 2, height // 2)`` is returned instead.
    """
    pixels = np.asarray(img)
    height, width = pixels.shape[:2]

    nonzero = pixels != 0
    if nonzero.ndim > 2:
        # Collapse the channel axis so each pixel is counted once
        nonzero = nonzero.any(axis=-1)

    rows, cols = np.nonzero(nonzero)
    if rows.size == 0:
        return (width // 2, height // 2)

    center_x = int(cols.sum(dtype=np.int64) // cols.size)
    center_y = int(rows.sum(dtype=np.int64) // rows.size)
    return (center_x, center_y)

def extract_center_for_folder(input_folder, output_folder, output_size):
    """Crop every PNG in ``input_folder`` around the centre of its non-zero pixels.

    The crop window is ``output_size`` pixels wide and high, positioned around the
    point returned by ``find_nonzero_center`` and clipped to the image borders, so a
    crop taken near an edge can be smaller than ``output_size``. Each crop is saved
    to ``output_folder`` under the same file name.

    Args:
        input_folder: Directory containing the source PNG files (not recursive).
        output_folder: Directory for the crops; created if it does not exist.
        output_size: Side length of the square crop window, in pixels.
    """
    os.makedirs(output_folder, exist_ok=True)
    half = output_size // 2

    for filename in os.listdir(input_folder):
        if not filename.endswith(".png"):
            continue

        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        # `with` closes the source file once the crop has been written
        with Image.open(input_path) as img:
            center_x, center_y = find_nonzero_center(img)

            # Window corners, clipped to the image. The far edge is measured from
            # the unclipped near edge so the window spans output_size pixels even
            # when output_size is odd.
            left = max(0, center_x - half)
            top = max(0, center_y - half)
            right = min(center_x - half + output_size, img.width)
            bottom = min(center_y - half + output_size, img.height)

            cropped_img = img.crop((left, top, right, bottom))
            cropped_img.save(output_path)

        print(f"Processed {filename}")

# Crop a 598-pixel window around the non-zero centre of every mapped image
extract_center_for_folder(
    input_folder="/content/train_same_size",
    output_folder="/content/train_598",
    output_size=598,
)

In [None]:
print("Image process finished")

Image process finished


# review examples which have clear white edge in mapped images

In [None]:
###
# We went back to the suspicious original .dcm images to double-check that they
# contain white edges.

import os
import shutil

# Source folders to search. This must be a list: iterating over a bare string
# would hand os.walk one character at a time (including "/").
source_folders = ["/content/mass_train/CBIS-DDSM"]
destination_folder = "/content/test"
os.makedirs(destination_folder, exist_ok=True)

# Example keywords to search for in folder paths
keywords = ["Mass-Training_P_00110_LEFT_CC",
            "Mass-Training_P_00708_RIGHT_CC",
            "Mass-Training_P_01493_RIGHT_CC"]

for source_folder in source_folders:
    for root, dirs, files in os.walk(source_folder):
        # First keyword contained in this folder's path, if any
        keyword_name = next((keyword for keyword in keywords if keyword in root), None)
        if keyword_name is None:
            continue

        for file in files:
            if file.endswith(".dcm"):
                # The copy is named after the keyword, so several .dcm files under
                # the same keyword would overwrite one another
                new_file_name = f"{keyword_name}.dcm"

                source_path = os.path.join(root, file)
                destination_path = os.path.join(destination_folder, new_file_name)
                shutil.copyfile(source_path, destination_path)
                print(f"File '{file}' copied and saved as '{new_file_name}' in '{destination_folder}'")

print("All .dcm files copied and saved successfully.")

File '1-1.dcm' copied and saved as 'Mass-Training_P_01493_RIGHT_CC.dcm' in '/content/test'
File '1-1.dcm' copied and saved as 'Mass-Training_P_00708_RIGHT_CC.dcm' in '/content/test'
File '1-1.dcm' copied and saved as 'Mass-Training_P_00110_LEFT_CC.dcm' in '/content/test'
All .dcm files copied and saved successfully.


In [None]:
###
# The selected example .dcm files cannot be viewed directly, so they are converted
# to .png for visualization.
###

import os
import pydicom
from PIL import Image

# Source folder containing .dcm files and destination folder for .png files
source_folder = "/content/test"
destination_folder = "/content/test_"
# Created from Python instead of `!mkdir` so that re-running the cell does not error
os.makedirs(destination_folder, exist_ok=True)

for filename in os.listdir(source_folder):
    if not filename.endswith(".dcm"):
        continue

    # Load the DICOM file and wrap its pixel data in a PIL image (no dtype change)
    dcm_path = os.path.join(source_folder, filename)
    dcm_data = pydicom.dcmread(dcm_path)
    image = Image.fromarray(dcm_data.pixel_array)

    # Save as PNG under the same base name
    png_path = os.path.join(destination_folder, os.path.splitext(filename)[0] + ".png")
    image.save(png_path)

    print(f"Converted '{filename}' to '{png_path}'")

print("Conversion completed.")


Converted 'Mass-Training_P_01493_RIGHT_CC.dcm' to '/content/test_/Mass-Training_P_01493_RIGHT_CC.png'
Converted 'Mass-Training_P_00708_RIGHT_CC.dcm' to '/content/test_/Mass-Training_P_00708_RIGHT_CC.png'
Converted 'Mass-Training_P_00110_LEFT_CC.dcm' to '/content/test_/Mass-Training_P_00110_LEFT_CC.png'
Conversion completed.


In [None]:
### Collect the ROI images and the converted full images (PNG) of the examples
### into one folder for review.

import os
import shutil

# Source folders and destination folder
source_folders = ["/content/roi_crop_train/roi_train_needed", "/content/full_train_needed"]
destination_folder = "/content/review"
# Created from Python instead of `!mkdir` so that re-running the cell does not error
os.makedirs(destination_folder, exist_ok=True)

# Keywords to search for in file names
keywords = ["train_Mass-Training_P_00110_LEFT_CC",
            "train_Mass-Training_P_00708_RIGHT_CC",
            "train_Mass-Training_P_01493_RIGHT_CC"]

for source_folder in source_folders:
    for file in os.listdir(source_folder):
        # Copy every file whose name contains one of the keywords
        if any(keyword in file for keyword in keywords):
            source_path = os.path.join(source_folder, file)
            destination_path = os.path.join(destination_folder, file)

            shutil.copyfile(source_path, destination_path)
            print(f"File '{file}' copied to '{destination_folder}'")

print("All files copied successfully.")


File '_content_roi_train_Mass-Training_P_00110_LEFT_CC_1__1-2.png' copied to '/content/review'
File '_content_roi_train_Mass-Training_P_00708_RIGHT_CC_1__1-1.png' copied to '/content/review'
File '_content_roi_train_Mass-Training_P_01493_RIGHT_CC_1__1-1.png' copied to '/content/review'
File '_content_full_train_Mass-Training_P_00708_RIGHT_CC__1-1.png' copied to '/content/review'
File '_content_full_train_Mass-Training_P_01493_RIGHT_CC__1-1.png' copied to '/content/review'
File '_content_full_train_Mass-Training_P_00110_LEFT_CC__1-1.png' copied to '/content/review'
All files copied successfully.


In [None]:
#generate transparent images of ROI
import os
import cv2
import numpy as np

# Function to process images
def process_image(input_path, output_path):
    """Write a transparent image containing green outlines of the edges in an image.

    The input is read with OpenCV, converted to grayscale and run through Canny
    edge detection; the external contours of the edge map are drawn as green lines
    3 pixels thick on a black canvas, and every pixel that stayed black is made
    fully transparent.

    Args:
        input_path: Path of the image to read.
        output_path: Path the BGRA result is written to.

    Raises:
        ValueError: If OpenCV cannot read ``input_path``.
    """
    image = cv2.imread(input_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising
        raise ValueError(f"Could not read image: {input_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)

    # [-2] selects the contour list under both the 2-value (OpenCV 4) and the
    # 3-value (OpenCV 3) return signature of findContours
    contours = cv2.findContours(edges.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Black canvas with the same shape as the input
    output = np.zeros_like(image, dtype=np.uint8)

    # Draw the contour outlines in green (BGR order)
    cv2.drawContours(output, contours, -1, (0, 255, 0), thickness=3)

    # Add an alpha channel: opaque where something was drawn, transparent elsewhere
    output = cv2.cvtColor(output, cv2.COLOR_BGR2BGRA)
    output[..., 3] = np.where(output[..., :3].any(axis=2), 255, 0)

    cv2.imwrite(output_path, output)
    print(f"Processed {input_path} and saved to {output_path}")

# Input and output folders
input_folder = '/content/roi_crop_train/roi_train_needed'
output_folder = '/content/train_roi_transparent'

# Create the output folder if it doesn't exist. (The earlier `!mkdir` created a
# differently named folder, /content/full_trainsparent, that this cell never used.)
os.makedirs(output_folder, exist_ok=True)

# Process each image in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith(('.png', '.jpg')):  # Adjust file extensions as needed
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)
        process_image(input_path, output_path)

print("All images processed and saved successfully.")


compare and ensure that the white edge is pure background.

# take care white edge

ensure the extracted images which contain unwanted white edges are ready.

In [None]:
# Unpack the archive of extracted images that show unwanted white edges
# (the path is quoted because it contains a space).
!unzip "/content/white edge.zip"

In [None]:
# Source and destination folders
source_folder = '/content/white edge'
destination_folder = '/content/adjusted/'

# Create the destination folder if it does not exist
os.makedirs(destination_folder, exist_ok=True)

# Pure white in the 16-bit images; only pixels with exactly this value are blanked
PURE_WHITE = 65535

for filename in os.listdir(source_folder):
    if not filename.endswith('.png'):
        continue

    file_path = os.path.join(source_folder, filename)

    with Image.open(file_path) as img:
        pixels = np.array(img)
        if pixels.ndim == 2 and pixels.dtype.itemsize > 1:
            # Single-band image wide enough to hold 65535: blank the pure-white
            # pixels in one vectorised step instead of a per-pixel loop
            pixels[pixels == PURE_WHITE] = 0
            cleaned = Image.fromarray(pixels)
        else:
            # 8-bit or multi-band pixels can never equal 65535; save unchanged
            cleaned = img.copy()

    destination_filename = filename.replace('_content_', 'adjusted_')
    destination_file_path = os.path.join(destination_folder, destination_filename)
    cleaned.save(destination_file_path)

print("All images processed and saved in the destination folder.")

All images processed and saved in the destination folder.


# size adjustment: 598 by 598

In [None]:
def adjust_and_save_image(image_path, target_size=(598, 598), save_dir='same_images', adjusted_dir='adjusted'):
    """Save an image either as-is or centred on a black canvas of ``target_size``.

    Args:
        image_path: Path of the image to read.
        target_size: ``(width, height)`` every output image should have.
        save_dir: Folder for images that already have ``target_size``; they are
            re-saved there under the same file name.
        adjusted_dir: Folder for images of any other size; they are pasted onto a
            zero-filled canvas of ``target_size`` with equal margins on both sides
            (the extra pixel of an odd margin goes to the right/bottom). An image
            larger than the canvas gets a negative offset, so the paste keeps only
            its central part.

    Both folders are created if they do not exist.
    """
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(adjusted_dir, exist_ok=True)

    file_name = os.path.basename(image_path)

    # `with` closes the source file once the output has been written
    with Image.open(image_path) as image:
        width, height = image.size

        if width == target_size[0] and height == target_size[1]:
            # Already the right size: store it with the unchanged images
            image.save(os.path.join(save_dir, file_name))
            return

        # Offset of the original image's top-left corner on the new canvas
        left = (target_size[0] - width) // 2
        top = (target_size[1] - height) // 2

        # Same mode as the source, filled with 0 (black)
        new_image = Image.new(image.mode, target_size, 0)
        new_image.paste(image, (left, top))
        new_image.save(os.path.join(adjusted_dir, file_name))

# Folder with the 598-pixel crops produced earlier
image_folder = '/content/train_598'

# Destination for images that had to be padded to the target size
adjusted_folder = '/content/train_598_updated'

# Destination for images that already had the target size
same_folder = '/content/train_598_same'

# Pad or copy every PNG found in the input folder
png_filenames = [entry for entry in os.listdir(image_folder) if entry.endswith('.png')]
for filename in png_filenames:
    image_path = os.path.join(image_folder, filename)
    adjust_and_save_image(image_path, save_dir=same_folder, adjusted_dir=adjusted_folder)

In [None]:
import os
from PIL import Image

def check_image_size(folder_path, expected_size=(598, 598)):
    """Report the PNG files in ``folder_path`` whose size differs from ``expected_size``.

    Args:
        folder_path: Directory to check (not recursive; only ``.png`` files).
        expected_size: ``(width, height)`` every image is expected to have.

    Returns:
        A list of ``(filename, (width, height))`` tuples, one per image with a
        different size; empty when all images match. Each mismatch is also printed.
    """
    mismatches = []
    for filename in os.listdir(folder_path):
        if not filename.endswith(".png"):  # Check only PNG files, adjust if needed
            continue

        # Only the size is needed, so close the file right away
        with Image.open(os.path.join(folder_path, filename)) as image:
            width, height = image.size

        if width != expected_size[0] or height != expected_size[1]:
            print(f"Image '{filename}' has size {width}x{height} pixels.")
            mismatches.append((filename, (width, height)))

    return mismatches

# Verify both output folders of the padding step; nothing is printed for a folder
# in which every PNG is 598x598
for checked_folder in ('/content/train_598_same', '/content/train_598_updated'):
    check_image_size(checked_folder)
print("Done")

Done


# data augmentation

In [None]:
import os

# Folder holding the images to augment
folder_path = '/content/all_598'

# Names of all entries in that folder
with os.scandir(folder_path) as entries:
    file_list = [entry.name for entry in entries]

# Number of entries found
num_images = len(file_list)

print(f"There are {num_images} images in the folder.")


There are 1696 images in the folder.


In [None]:
import cv2
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
import os

# Directory containing the original images
original_dir = '/content/all_598'

# Directory the augmented images are written to
augmented_dir = '/content/all_598_augmented'

# Number of augmented copies generated per original image
AUGMENTATIONS_PER_IMAGE = 5

# Base seed so that a re-run produces the same augmented images
AUGMENTATION_SEED = 42

# Ensure the augmented directory exists
os.makedirs(augmented_dir, exist_ok=True)

# Random geometric transforms applied to every image
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True
)

# Sorted so that the seed assigned to each file does not depend on listing order
original_files = sorted(os.listdir(original_dir))


def save_augmented_images(img, prefix, idx):
    """Write ``img`` to ``augmented_dir`` as ``<prefix>_<idx>.png``."""
    filename = os.path.join(augmented_dir, f"{prefix}_{idx}.png")
    cv2.imwrite(filename, img)


for file_index, filename in enumerate(original_files):
    # Load the image with its original bit depth and channel count
    img = cv2.imread(os.path.join(original_dir, filename), cv2.IMREAD_UNCHANGED)
    if img is None:
        # cv2.imread returns None for anything it cannot decode
        print(f"Skipped unreadable file: {filename}")
        continue

    # Scale by the maximum of the image's own integer type (65535 for the 16-bit
    # images this notebook produces); fall back to 16-bit for non-integer data
    source_dtype = img.dtype if np.issubdtype(img.dtype, np.integer) else np.dtype(np.uint16)
    max_value = float(np.iinfo(source_dtype).max)

    # Add a channel dimension if it's missing
    if img.ndim == 2:
        img = np.expand_dims(img, axis=-1)

    # Rescale to [0, 1], then add the batch dimension expected by the generator
    img = img.astype(np.float32) / max_value
    img = np.expand_dims(img, axis=0)

    # A distinct seed per file keeps the run reproducible without giving every
    # image the same sequence of transforms
    batches = datagen.flow(
        img,
        batch_size=1,
        seed=AUGMENTATION_SEED + file_index * AUGMENTATIONS_PER_IMAGE,
    )

    # The generator is endless, so stop after the requested number of copies
    for i, batch in enumerate(batches, start=1):
        # Round (instead of truncating) and clip before converting back to integers
        augmented_img = np.clip(np.rint(batch[0] * max_value), 0, max_value).astype(source_dtype)
        save_augmented_images(augmented_img, os.path.splitext(filename)[0], i)
        print(f"Augmented image {i} saved to : {filename}")
        if i >= AUGMENTATIONS_PER_IMAGE:
            break

print("Data augmentation completed.")

In [None]:
# Move the original images into the augmented folder so it holds both the originals and their augmented copies
!mv /content/all_598/* /content/all_598_augmented/

In [None]:
import os

# Folder to inspect
folder_path = '/content/all_598_augmented'

# Names of every entry in that folder
files_list = os.listdir(folder_path)

# Number of entries, and whether every name occurs exactly once
total_files = len(files_list)
unique_paths = total_files == len(set(files_list))

print(f"Are all paths unique? {unique_paths}")
print(f"Total amount of files: {total_files}")


Are all paths unique? True
Total amount of files: 10176


In [None]:
# Recursively archive the augmented folder into a zip file on Google Drive
!zip -r /content/drive/MyDrive/CBIS-DDSM/all_598_augmented_mode.zip /content/all_598_augmented

# Merge the train and test description CSV files

In [None]:
# Locations of the mass case description tables on Google Drive
train_csv_path = '/content/drive/MyDrive/CBIS-DDSM/mass_case_description_train_set.csv'
test_csv_path = '/content/drive/MyDrive/CBIS-DDSM/mass_case_description_test_set.csv'

# Load both tables
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Columns kept from each description table
needed_columns = ['patient_id', 'pathology', 'image file path', 'cropped image file path', 'ROI mask file path']

train_need = train[needed_columns]
test_need = test[needed_columns]

In [None]:
# Stack the training rows on top of the test rows
merged_df = pd.concat((train_need, test_need), axis='index')

In [None]:
# Renumber the rows 0..n-1, discarding the index values inherited from the two source tables
merged_df = merged_df.reset_index(drop=True)

In [None]:
# For each path column, keep only the text before the first '/' as a new "name" column
name_column_sources = (
    ('crop_name', 'cropped image file path'),
    ('ROI_name', 'ROI mask file path'),
    ('full_name', 'image file path'),
)
for name_column, path_column in name_column_sources:
    merged_df[name_column] = merged_df[path_column].apply(lambda path: path.split('/')[0])

In [None]:
# A run of digits immediately followed by the literal ".dcm" extension.
# The dot is escaped: unescaped, it matches any character, so text such as
# "12xdcm" would also have been accepted.
_DCM_NUMBER_RE = re.compile(r'\d+(?=\.dcm)')


def _last_dcm_digit(path):
    """Return the final digit of the last number that precedes ".dcm" in ``path``.

    Only the last character of the last match is kept (e.g. "000001.dcm" gives
    "1"). Returns ``None`` when ``path`` contains no such number.
    """
    matches = _DCM_NUMBER_RE.findall(path)  # evaluated once per path
    return matches[-1][-1] if matches else None


merged_df['ROI number'] = merged_df['ROI mask file path'].apply(_last_dcm_digit)
merged_df['Crop number'] = merged_df['cropped image file path'].apply(_last_dcm_digit)

In [None]:
# Count how many rows carry each extracted ROI number
merged_df['ROI number'].value_counts()

1    1593
0     103
Name: ROI number, dtype: int64

In [None]:
# Save the merged table, without its index, under the current working directory.
# NOTE(review): a later cell reads '/content/content/all_mass_pathology.csv' -- confirm
# that this relative path resolves to the same file.
merged_df.to_csv('all_mass_pathology.csv', index=False)

# calculate the non-0 area and full image size

Calculate the non-zero area percentage of each 598 × 598 cropped image.

Calculate the size (width and height) of each full image.

In [None]:
#for non-zero area calculation
def process_image_and_extract_groups(image_path, dataset='Test'):
    """Derive the case name from ``image_path`` and measure its non-zero area.

    Args:
        image_path: Path of the image; it must contain
            ``Mass-<dataset>_P_<digits>_<word>_<word>_``.
        dataset: Split name expected in the path, e.g. ``'Test'`` (default)
            or ``'Training'``.

    Returns:
        tuple: ``(name, image, nonzero_area_percentage)`` where ``name`` is
        ``Mass-<dataset>_P_<number>_<left>_<cc>``, ``image`` is the opened PIL
        image and the percentage is ``100 * non-zero pixels / all pixels``.
        ``(None, None, None)`` is returned, and a message printed, when the
        path does not match the pattern.
    """
    # Build the pattern from the requested split. Previously two hard-coded
    # patterns were assigned in a row, so the first one was always discarded.
    pattern = rf'Mass-{re.escape(dataset)}_P_(\d+)_(\w+)_(\w+)_'

    match = re.search(pattern, image_path)
    if not match:
        print(f"No match found for file: {image_path}")
        return None, None, None

    number, left, cc = match.groups()
    result = f'Mass-{dataset}_P_{number}_{left}_{cc}'

    # Open the image and convert it to a NumPy array
    image = Image.open(image_path)
    image_array = np.array(image)

    # Share of array elements that are non-zero, as a percentage
    nonzero_pixels = np.count_nonzero(image_array)
    total_pixels = image_array.size
    nonzero_area_percentage = (nonzero_pixels / total_pixels) * 100

    return result, image, nonzero_area_percentage

# Folder containing the images to measure
folder_path = '/content/test_598_same'

# Collect one record per image and build the DataFrame once at the end.
# Growing a DataFrame row by row is quadratic, and DataFrame.append no longer
# exists in pandas >= 2.0.
records = []
for filename in os.listdir(folder_path):
    if filename.endswith(".png"):  # Check only PNG files, adjust if needed
        image_path = os.path.join(folder_path, filename)
        results, image, nonzero_area_percentage = process_image_and_extract_groups(image_path)
        # Skip only the "no match" result. A plain truthiness test would also
        # drop images whose non-zero area is exactly 0.0 %.
        if results is not None:
            records.append({'name': results, 'file_path': image_path, 'area_percentage': nonzero_area_percentage})

df_test = pd.DataFrame(records, columns=['name', 'file_path', 'area_percentage'])

# Save the df
# df.to_csv('output.csv', index=False)
print("Done")


In [None]:
# Combine the two area tables, then order the rows by name with a fresh 0..n-1 index.
# NOTE(review): `df` is presumably the training-set counterpart of df_test -- confirm
# it exists with the same columns before running this cell.
merged_df = pd.concat([df, df_test])
sorted_merged_df = merged_df.sort_values(by='name').reset_index(drop=True)

In [None]:
# Attach the pathology label to every row of the area table
pathology = pd.read_csv('/content/content/all_mass_pathology.csv')
sorted_pathology = pathology.sort_values(by='full_name').reset_index(drop=True)

# Create the pathology column in sorted_merged_df
sorted_merged_df['pathology'] = None


def copy_pathology(row):
    """Return the pathology of the first ``full_name`` contained in ``row['name']``.

    ``sorted_pathology`` is walked in order, and the label of the first row
    whose ``full_name`` is a substring of ``row['name']`` is returned.
    Returns ``None`` when nothing matches.
    """
    candidates = zip(sorted_pathology['full_name'], sorted_pathology['pathology'])
    for full_name, label in candidates:
        if full_name in row['name']:
            return label
    return None


sorted_merged_df['pathology'] = sorted_merged_df.apply(copy_pathology, axis=1)

In [None]:
# Save the area table together with its pathology column.
# index=False is not passed, so the DataFrame index is written as the first CSV column.
sorted_merged_df.to_csv('/content/content/598_percentage_all.csv')

In [None]:
#ensure all full images are ready at hand
#####
#put all full images in side the folder /content/full_train_needed
#####

# Folder containing the images
folder_path = '/content/content/full_train_needed'

# Parallel lists: image name (text between 'full_' and '.png'), width, height
image_path_list = []
width_list = []
height_list = []

# Loop through each file in the folder
for file_name in os.listdir(folder_path):
    if not file_name.endswith('.png'):  # Only PNG images are measured
        continue
    if 'full_' not in file_name:
        # Without the marker, split('full_')[1] below would raise IndexError
        print(f"Skipping file without 'full_' in its name: {file_name}")
        continue

    # Construct the full path to the image
    image_path = os.path.join(folder_path, file_name)

    # Extract the part between 'full_' and '.png' in the filename
    image_name = file_name.split('full_')[1].split('.png')[0]

    # Image.open keeps the file open until the image is loaded or closed.
    # Only the size is read here, so close the handle explicitly.
    with Image.open(image_path) as image:
        width, height = image.size

    # Append image name, width, and height to lists
    image_path_list.append(image_name)
    width_list.append(width)
    height_list.append(height)

# Create a DataFrame from the lists
data = {'Subject_ID': image_path_list, 'Width': width_list, 'Height': height_list}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('image_pixel.csv', index=False)

In [None]:
# Create the pathology column in df
df['pathology'] = None


def copy_pathology(row):
    """Return the pathology of the first ``full_name`` contained in ``row['Subject_ID']``.

    ``sorted_pathology`` is walked in order, and the label of the first row
    whose ``full_name`` is a substring of ``row['Subject_ID']`` is returned.
    Returns ``None`` when nothing matches.

    Note: this redefines the ``copy_pathology`` from the earlier cell, which
    read ``row['name']`` instead.
    """
    candidates = zip(sorted_pathology['full_name'], sorted_pathology['pathology'])
    for full_name, label in candidates:
        if full_name in row['Subject_ID']:
            return label
    return None


df['pathology'] = df.apply(copy_pathology, axis=1)

In [None]:
# Order the rows by Subject_ID; sort_values returns a new DataFrame, so df itself is unchanged
df_sort = df.sort_values(by='Subject_ID')

In [None]:
# Write the sorted size table to CSV; the index is included as the first column.
# NOTE(review): the '/content/metadata' directory is assumed to exist already -- confirm.
df_sort.to_csv('/content/metadata/heaght_width_FULL.csv')