In [27]:
import os
import glob
import shutil
import random
import pandas as pd

### Rename images

In [49]:
# Path to the main directory containing subfolders (each subfolder name is an MRN)
main_dir = 'C:/Users/m294666/Documents/nevus_data_500_risk_factors_processed'

# Path to the Excel file
excel_file_path = 'C:/Users/m294666/Documents/nevus_data/230929_Ch_Nevus_deleted_items_to_ignore.xlsx'

# Load the Excel file into a pandas DataFrame
df = pd.read_excel(excel_file_path)

# String form of the MRN column, computed once instead of once per folder
mrn_strings = df['MRN'].astype(str)

# Laterality code written into the new file name when only one eye is affected
single_eye_codes = {'Left': 'L', 'Right': 'R'}

# Iterate over each subfolder in the main directory (sorted so numbering is stable between runs)
for folder_name in sorted(os.listdir(main_dir)):
    folder_path = os.path.join(main_dir, folder_name)

    # Only subfolders are processed; stray files in main_dir are ignored
    if not os.path.isdir(folder_path):
        continue

    # The folder name is the MRN
    mrn = folder_name

    # Find the corresponding row in the Excel file based on MRN
    row = df[mrn_strings == mrn]

    # Folders without a matching row are left untouched
    if row.empty:
        continue

    affected_eye = row['Affected Eye'].values[0]
    eye_code = single_eye_codes.get(affected_eye)

    # Separate running counters for left/right images when both eyes are affected
    l, r = 0, 0

    for i, filename in enumerate(sorted(os.listdir(folder_path))):
        source = os.path.join(folder_path, filename)
        if not os.path.isfile(source):
            continue

        if eye_code is not None:
            # Single affected eye: every file gets that eye's code and its listing index
            new_filename = f'{mrn}_{eye_code}_{i}.jpg'
        elif affected_eye == 'Both':
            # Both eyes: laterality is taken from the existing '_L.jpg' / '_R.jpg' suffix
            if filename.endswith('_L.jpg'):
                new_filename = f'{mrn}_L_{l}.jpg'
                l += 1
            elif filename.endswith('_R.jpg'):
                new_filename = f'{mrn}_R_{r}.jpg'
                r += 1
            else:
                # No recognisable suffix: keep the name
                new_filename = filename
        else:
            # Any other 'Affected Eye' value (including missing): keep the name
            new_filename = filename

        # Nothing to do when the file already has its target name
        if new_filename == filename:
            continue

        target = os.path.join(folder_path, new_filename)

        # Never rename onto an existing file: os.rename would replace it on POSIX
        # (losing an image) and raise on Windows. This also makes re-runs safe.
        if os.path.exists(target):
            print(f"Skipping '{source}': target '{new_filename}' already exists")
            continue

        os.rename(source, target)

### Create directories for RETFound (dia5)

In [50]:
def create_directories(base_path, folders):
    """Create ``<base_path>/<folder>`` with class subfolders ``0`` and ``1``.

    Args:
        base_path: Directory under which the folders are created.
        folders: Iterable of folder names (e.g. ``['train', 'val', 'test']``).

    Directories that already exist are left as they are.
    """
    for split_name in folders:
        split_root = os.path.join(base_path, split_name)
        targets = (
            split_root,
            os.path.join(split_root, '0'),
            os.path.join(split_root, '1'),
        )
        for target in targets:
            os.makedirs(target, exist_ok=True)

# Source directory: one subfolder of images per MRN
mrn_directory_path = 'C:/Users/m294666/Documents/nevus_data_500_risk_factors_processed'

# Output directory for the diameter (> 5 mm) dataset
base_directory_path = 'C:/Users/m294666/Documents/data_dia5_retfound'

# Start from a clean output tree. The split below is random, so leftovers from a
# previous run would mix old and new train/val/test assignments. A failed delete
# (e.g. a file still open) now raises instead of being silently ignored.
if os.path.exists(base_directory_path):
    shutil.rmtree(base_directory_path)

# Create train, val, and test directories
create_directories(base_directory_path, ['train', 'val', 'test'])

# Specify the path to your Excel file
excel_file_path = 'C:/Users/m294666/Documents/nevus_data/230929_Ch_Nevus_deleted_items_to_ignore.xlsx'

# Load the Excel file into a pandas DataFrame
df = pd.read_excel(excel_file_path)

def determine_set(mrn_list, seed=None):
    """Randomly split MRNs into train/val/test lists (60% / 20% / remainder).

    Args:
        mrn_list: Sequence of MRNs to split. It is not modified.
        seed: Optional seed for a reproducible split. When ``None`` the global
            ``random`` state is used, so a prior ``random.seed(...)`` is honoured.

    Returns:
        Tuple ``(train_set, val_set, test_set)`` of lists. Sizes are
        ``int(0.6 * n)``, ``int(0.2 * n)`` and whatever is left.
    """
    # Shuffle a copy so the caller's list keeps its order
    shuffled = list(mrn_list)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    total_mrn = len(shuffled)
    train_size = int(0.6 * total_mrn)
    val_size = int(0.2 * total_mrn)

    train_set = shuffled[:train_size]
    val_set = shuffled[train_size:train_size + val_size]
    test_set = shuffled[train_size + val_size:]

    return train_set, val_set, test_set

def copy_images(source_path, dest_path, mrn, set_type, df,
                measurement='diameter', threshold=5.0):
    """Copy one MRN's images into ``<dest_path>/<set_type>/<label>``.

    The label is ``'1'`` when the eye's largest tumor measurement is greater
    than ``threshold`` and ``'0'`` otherwise. Files whose name contains ``'_R'``
    are looked up in the ``'OD Largest tumor <measurement> (mm)'`` column; all
    other files use the ``'OS ...'`` column.

    Args:
        source_path: Directory containing one subfolder per MRN.
        dest_path: Root of the output tree; the label folders must already exist.
        mrn: MRN as a string (also the subfolder name).
        set_type: Split folder name, e.g. ``'train'``, ``'val'`` or ``'test'``.
        df: DataFrame with an ``'MRN'`` column and the measurement columns.
        measurement: Word used in the column name (default ``'diameter'``).
        threshold: Value above which an image is labelled ``'1'`` (default 5.0).

    MRNs without a matching row, and images whose measurement is missing, are
    skipped with a message rather than crashing or being labelled ``'0'``.
    """
    mrn_row = df[df['MRN'].astype(str) == mrn]
    if mrn_row.empty:
        print(f"Skipping MRN {mrn}: no matching row in the spreadsheet")
        return

    mrn_folder = os.path.join(source_path, mrn)
    for image_name in os.listdir(mrn_folder):
        image_path = os.path.join(mrn_folder, image_name)

        # Only regular files are copied; nested folders are ignored
        if not os.path.isfile(image_path):
            continue

        eye = 'OD' if '_R' in image_name else 'OS'
        value = mrn_row[f'{eye} Largest tumor {measurement} (mm)'].iloc[0]

        # NaN > threshold is False, so a missing value would otherwise become label '0'
        if pd.isna(value):
            print(f"Skipping {image_name}: missing {eye} {measurement} for MRN {mrn}")
            continue

        label = '1' if value > threshold else '0'
        shutil.copy(image_path, os.path.join(dest_path, set_type, label))

# Get the list of MRNs from the MRN directory. Only subfolders count as MRNs;
# a stray file here would make copy_images fail when it tries to list it.
mrn_list = [
    entry for entry in os.listdir(mrn_directory_path)
    if os.path.isdir(os.path.join(mrn_directory_path, entry))
]

# Split MRNs into train, val, test sets
train_set, val_set, test_set = determine_set(mrn_list)

# Copy each split's images, iterating the splits directly instead of testing
# every MRN for membership in each list
for set_type, set_mrns in (('train', train_set), ('val', val_set), ('test', test_set)):
    for mrn in set_mrns:
        copy_images(mrn_directory_path, base_directory_path, mrn, set_type, df)

In [27]:
# # Perform train/val/test split
# train_df = filtered_df.sample(frac=0.6, random_state=1)
# val_test_df = filtered_df.drop(train_df.index)
# val_df = val_test_df.sample(frac=0.5, random_state=1)
# test_df = val_test_df.drop(val_df.index)

# # Define a function to copy images based on 'dia>5' column
# def copy_images(source_folder, destination_folder, mrn_list, dia_value):
#     dia_value = dia_value.astype(str).tolist()
#     for i,mrn in enumerate(mrn_list):
#         source_path = os.path.join(source_folder, str(mrn))
#         destination_path = os.path.join(destination_folder, dia_value[i])
#         os.makedirs(destination_path, exist_ok=True)
#         files = os.listdir(source_path)
#         for file in files:
#             shutil.copy(os.path.join(source_path, file), destination_path)

# # Copy images for the train set
# copy_images(mrn_directory_path, os.path.join(base_directory_path, 'train'), train_df['MRN'], train_df['dia>5'].astype(str))

# # Copy images for the val set
# copy_images(mrn_directory_path, os.path.join(base_directory_path, 'val'), val_df['MRN'], val_df['dia>5'].astype(str))

# # Copy images for the test set
# copy_images(mrn_directory_path, os.path.join(base_directory_path, 'test'), test_df['MRN'], test_df['dia>5'].astype(str))

### Create directories for RETFound (thick2)

In [65]:
def create_directories(base_path, folders):
    """Make each folder under ``base_path`` together with its ``0`` and ``1`` subfolders.

    Args:
        base_path: Parent directory for the created folders.
        folders: Iterable of folder names to create.

    Existing directories are kept; nothing is raised for them.
    """
    for folder in folders:
        root = os.path.join(base_path, folder)
        os.makedirs(root, exist_ok=True)
        for label in ('0', '1'):
            os.makedirs(os.path.join(root, label), exist_ok=True)

# Source directory: one subfolder of images per MRN
mrn_directory_path = 'C:/Users/m294666/Documents/nevus_data_500_risk_factors_processed'

# Output directory for the thickness (> 2 mm) dataset
base_directory_path = 'C:/Users/m294666/Documents/data_thick2_retfound'

# Start from a clean output tree. The split below is random, so leftovers from a
# previous run would mix old and new train/val/test assignments. A failed delete
# (e.g. a file still open) now raises instead of being silently ignored.
if os.path.exists(base_directory_path):
    shutil.rmtree(base_directory_path)

# Create train, val, and test directories
create_directories(base_directory_path, ['train', 'val', 'test'])

# Specify the path to your Excel file
excel_file_path = 'C:/Users/m294666/Documents/nevus_data/230929_Ch_Nevus_deleted_items_to_ignore.xlsx'

# Load the Excel file into a pandas DataFrame
df = pd.read_excel(excel_file_path)

def determine_set(mrn_list, seed=None):
    """Randomly split MRNs into train/val/test lists (60% / 20% / remainder).

    Args:
        mrn_list: Sequence of MRNs to split. It is not modified.
        seed: Optional seed for a reproducible split. When ``None`` the global
            ``random`` state is used, so a prior ``random.seed(...)`` is honoured.

    Returns:
        Tuple ``(train_set, val_set, test_set)`` of lists. Sizes are
        ``int(0.6 * n)``, ``int(0.2 * n)`` and whatever is left.
    """
    # Work on a copy: shuffling in place would reorder the caller's list
    shuffled = list(mrn_list)
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    shuffle(shuffled)

    total_mrn = len(shuffled)
    train_end = int(0.6 * total_mrn)
    val_end = train_end + int(0.2 * total_mrn)

    return shuffled[:train_end], shuffled[train_end:val_end], shuffled[val_end:]

def copy_images(source_path, dest_path, mrn, set_type, df,
                measurement='thickness', threshold=2.0):
    """Copy one MRN's images into ``<dest_path>/<set_type>/<label>``.

    The label is ``'1'`` when the eye's largest tumor measurement is greater
    than ``threshold`` and ``'0'`` otherwise. Files whose name contains ``'_R'``
    are looked up in the ``'OD Largest tumor <measurement> (mm)'`` column; all
    other files use the ``'OS ...'`` column.

    Args:
        source_path: Directory containing one subfolder per MRN.
        dest_path: Root of the output tree; the label folders must already exist.
        mrn: MRN as a string (also the subfolder name).
        set_type: Split folder name, e.g. ``'train'``, ``'val'`` or ``'test'``.
        df: DataFrame with an ``'MRN'`` column and the measurement columns.
        measurement: Word used in the column name (default ``'thickness'``).
        threshold: Value above which an image is labelled ``'1'`` (default 2.0).

    MRNs without a matching row, and images whose measurement is missing, are
    skipped with a message rather than crashing or being labelled ``'0'``.
    """
    mrn_row = df[df['MRN'].astype(str) == mrn]
    if mrn_row.empty:
        print(f"Skipping MRN {mrn}: no matching row in the spreadsheet")
        return

    mrn_folder = os.path.join(source_path, mrn)
    for image_name in os.listdir(mrn_folder):
        image_path = os.path.join(mrn_folder, image_name)

        # Only regular files are copied; nested folders are ignored
        if not os.path.isfile(image_path):
            continue

        eye = 'OD' if '_R' in image_name else 'OS'
        value = mrn_row[f'{eye} Largest tumor {measurement} (mm)'].iloc[0]

        # NaN > threshold is False, so a missing value would otherwise become label '0'
        if pd.isna(value):
            print(f"Skipping {image_name}: missing {eye} {measurement} for MRN {mrn}")
            continue

        label = '1' if value > threshold else '0'
        shutil.copy(image_path, os.path.join(dest_path, set_type, label))

# Get the list of MRNs from the MRN directory. Only subfolders count as MRNs;
# a stray file here would make copy_images fail when it tries to list it.
mrn_list = [
    entry for entry in os.listdir(mrn_directory_path)
    if os.path.isdir(os.path.join(mrn_directory_path, entry))
]

# Split MRNs into train, val, test sets
train_set, val_set, test_set = determine_set(mrn_list)

# Copy each split's images, iterating the splits directly instead of testing
# every MRN for membership in each list
for set_type, set_mrns in (('train', train_set), ('val', val_set), ('test', test_set)):
    for mrn in set_mrns:
        copy_images(mrn_directory_path, base_directory_path, mrn, set_type, df)

### Create CSV file for CNN-based approach

In [64]:
def process_images(main_dir, excel_path, output_csv_path):
    """Write a CSV of image paths and binary diameter labels.

    Every subfolder of ``main_dir`` is treated as an MRN and matched against the
    ``'MRN'`` column of the Excel sheet. For each image file, the diameter is read
    from ``'OD Largest tumor diameter (mm)'`` when the file name contains ``'_R'``
    and from ``'OS Largest tumor diameter (mm)'`` otherwise. The label is 1 when
    the diameter is greater than 5.0 and 0 otherwise.

    Args:
        main_dir: Directory containing one subfolder per MRN.
        excel_path: Path to the Excel file with the MRN and diameter columns.
        output_csv_path: Destination CSV with columns ``'Image Path'`` and ``'Label'``.

    MRNs without a spreadsheet row are skipped with a message. Images whose
    diameter is missing are skipped (and counted) instead of being labelled 0.
    """
    # Load excel file; the string form of the MRN column is computed once
    excel_data = pd.read_excel(excel_path)
    mrn_strings = excel_data['MRN'].astype(str)

    # Get all MRNs (subfolder names in main_dir)
    mrns = [folder for folder in os.listdir(main_dir) if os.path.isdir(os.path.join(main_dir, folder))]

    output_data = []
    missing_diameter = 0

    for mrn in mrns:
        folder_path = os.path.join(main_dir, mrn)

        # Select the spreadsheet row whose MRN matches the folder name
        mrn_row = excel_data[mrn_strings == mrn]
        if mrn_row.empty:
            print(f"Skipping MRN {mrn}: no matching row in {excel_path}")
            continue

        for image_name in os.listdir(folder_path):
            image_path = os.path.join(folder_path, image_name)

            # Only regular files are listed in the CSV
            if not os.path.isfile(image_path):
                continue

            if '_R' in image_name:
                diameter = mrn_row['OD Largest tumor diameter (mm)'].values[0]
            else:
                diameter = mrn_row['OS Largest tumor diameter (mm)'].values[0]

            # NaN > 5.0 is False, so a missing diameter would otherwise become label 0
            if pd.isna(diameter):
                missing_diameter += 1
                continue

            # 1 when diameter is greater than 5.0, 0 when it is 5.0 or less
            label = 1 if diameter > 5.0 else 0

            # Rewrite the local Windows prefix to the cluster path and normalise separators
            image_path = image_path.replace('C:/Users/m294666/Documents/', '/research/labs/ophthalmology/iezzi/m294666/').replace('\\', '/')

            output_data.append({'Image Path': image_path, 'Label': label})

    if missing_diameter:
        print(f"Skipped {missing_diameter} image(s) with a missing diameter")

    # Explicit columns keep the header row even when no image qualified
    output_df = pd.DataFrame(output_data, columns=['Image Path', 'Label'])

    # Save the dataframe to CSV
    output_df.to_csv(output_csv_path, index=False)
    print(f"Output CSV saved to: {output_csv_path}")

# Example usage: build the label CSV for the CNN (ResNet) pipeline.
# main_directory holds one subfolder per MRN; process_images reads the Excel
# sheet and writes one row per image to output_csv_file_path.
main_directory = 'C:/Users/m294666/Documents/nevus_data_500_risk_factors_processed'
excel_file_path = 'C:/Users/m294666/Documents/nevus_data/230929_Ch_Nevus_deleted_items_to_ignore.xlsx'
output_csv_file_path = 'C:/Users/m294666/Documents/nevus_data_500_for_resnet.csv'

# Paths written to the CSV have the 'C:/Users/m294666/Documents/' prefix replaced inside process_images
process_images(main_directory, excel_file_path, output_csv_file_path)

Output CSV saved to: C:/Users/m294666/Documents/nevus_data_500_for_resnet.csv


### Count total images in given directory

In [55]:
def count_images(directory_path, image_extensions=('jpg', 'jpeg', 'png', 'gif')):
    """Recursively count image files under ``directory_path``.

    Args:
        directory_path: Root directory to search.
        image_extensions: Extensions to count, with or without a leading dot.
            Matching ignores case, so ``photo.JPG`` counts on every platform.

    Returns:
        Tuple ``(num_images, image_files)``: the number of matching files and
        the list of their paths.
    """
    # Normalise to lowercase '.ext' suffixes for a single endswith() check
    suffixes = tuple('.' + extension.lower().lstrip('.') for extension in image_extensions)

    # One recursive listing instead of one glob per extension
    candidates = glob.glob(os.path.join(directory_path, '**', '*'), recursive=True)

    # Keep regular files only; a directory named like 'x.jpg' is not an image
    image_files = [
        path for path in candidates
        if os.path.isfile(path) and path.lower().endswith(suffixes)
    ]

    # Count the number of image files
    num_images = len(image_files)

    return num_images, image_files

# Alternative target (disabled): the nevus_data_500_risk_factors folder
# directory_path = 'C:/Users/m294666/Documents/nevus_data_500_risk_factors'
# Count every image under the dia5 RETFound output tree (all splits and labels)
directory_path = 'C:/Users/m294666/Documents/data_dia5_retfound'
num_images, image_files = count_images(directory_path)

# Report the total; image_files keeps the matching paths for inspection
print(f"Number of images in '{directory_path}': {num_images}")

Number of images in 'C:/Users/m294666/Documents/data_dia5_retfound': 501


### Find max images in a folder

In [17]:
import os

def count_images_in_folder(folder_path):
    """Return how many image files sit directly inside ``folder_path``.

    Only regular files are counted (no recursion into subfolders). A file is an
    image when its lower-cased name ends with one of the known extensions.
    """
    # Add more extensions if needed
    known_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')

    return sum(
        1
        for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
        and entry.lower().endswith(known_suffixes)
    )

def top_n_folders_with_images(main_directory, n):
    """Print the ``n`` subfolders of ``main_directory`` that hold the most images.

    Each subfolder is counted with ``count_images_in_folder``. Results are
    printed one per line as ``"<folder>: <count> images"``, highest count
    first. Fewer lines are printed when there are fewer than ``n`` subfolders.
    """
    # (folder name, image count) for every immediate subfolder
    counts = [
        (entry, count_images_in_folder(os.path.join(main_directory, entry)))
        for entry in os.listdir(main_directory)
        if os.path.isdir(os.path.join(main_directory, entry))
    ]

    # Highest image count first
    ranked = sorted(counts, key=lambda pair: pair[1], reverse=True)

    # max(n, 0) keeps a negative n printing nothing, as an empty range would
    for folder_name, image_count in ranked[:max(n, 0)]:
        print(f"{folder_name}: {image_count} images")

# Example usage: list the 10 subfolders with the most images.
# NOTE(review): the subfolder names printed here look like MRNs (patient
# identifiers) — confirm, and clear this cell's output before sharing the notebook.
main_directory_path = "C:/Users/m294666/Documents/nevus_data_500_risk_factors"
top_n_folders_with_images(main_directory_path, 10)

12252292: 4 images
1455618: 4 images
1767396: 4 images
2007399: 4 images
2248076: 4 images
2578116: 4 images
1770547: 3 images
2439833: 3 images
2498796: 3 images
2635577: 3 images
