# Generates raw nnUNet data from a CSV listing the oversampled cases

In [None]:
# Note: This notebook should be run only after running the upsample_large_lesions.ipynb 
# notebook as it uses the CSV generated by that script to copy the oversampled images
# with new IDs.

import pandas as pd
import os
import shutil
import sys

In [None]:
# Read the oversampled training data CSV file (produced by upsample_large_lesions.ipynb).
# The path is relative to the current working directory. The 'case_name' column of this
# table is used further down to derive the numeric case IDs.
train_csv = pd.read_csv('SegFormer3D-main/data/train_all_data_oversampled.csv')

In [None]:
import re

# Root folders of the new (oversampled) dataset and of the original dataset.
# Both end with a trailing slash because sub-paths are appended by plain string formatting.
dst_root_dir = '/d/hpc/home/jf73497/projects//aimi-project-data/raw_oversampled_complete/Dataset002_MIX/'
ori_root_dir = '/d/hpc/home/jf73497/projects//aimi-project-data/raw_complete/Dataset001_MIX/'

# Create the destination layout; exist_ok=True makes re-running this cell harmless.
os.makedirs(dst_root_dir, exist_ok=True)
for subdir in ('imagesTr', 'labelsTr', 'imagesTs', 'labelsTs'):
    os.makedirs(f'{dst_root_dir}{subdir}/', exist_ok=True)

def get_raw_data_path(case_number):
    """
    Build the image and label file paths of a case in the original dataset.

    Args:
        case_number (int): Case number; it is zero-padded to five digits in the file names.

    Returns:
        tuple: ``(image_path, label_path)`` located in the ``imagesTr/`` and ``labelsTr/``
        folders below ``ori_root_dir``.
    """
    # Image and label share the same case ID; the image carries the extra '_0000' suffix.
    case_id = f'MIX_{case_number:05d}'
    raw_path_im = os.path.join(f'{ori_root_dir}imagesTr/', f'{case_id}_0000.nii.gz')
    raw_path_label = os.path.join(f'{ori_root_dir}labelsTr/', f'{case_id}.nii.gz')
    return raw_path_im, raw_path_label

def get_case_number(case_name):
    """
    Parse the numeric case ID out of a case name.

    Args:
        case_name (str): Name containing the pattern 'MIX_<digits>', e.g. 'MIX_00042'.

    Returns:
        int or None: The digits following 'MIX_' as an integer, or None (after printing
        a warning) when the pattern is not found anywhere in the name.
    """
    found = re.search(r'MIX_(\d+)', case_name)
    if found is None:
        print(f"Warning: Case name '{case_name}' does not match expected format.")
        return None
    # int() drops the zero padding: 'MIX_00042' -> 42
    return int(found.group(1))

def copy_case(raw_path_im, raw_path_label, dest_path_im, dest_path_label):
    """
    Copy one case (image and label) to its destination unless it is already there.

    Args:
        raw_path_im (str): Path of the source image file.
        raw_path_label (str): Path of the source label file.
        dest_path_im (str): Path the image is copied to.
        dest_path_label (str): Path the label is copied to.

    Raises:
        OSError: Propagated from ``shutil.copy`` (e.g. ``FileNotFoundError`` when a
            source file is missing).
    """
    # Skip only when BOTH destination files exist; a half-copied case is copied again.
    if os.path.exists(dest_path_im) and os.path.exists(dest_path_label):
        print(f"Destination {dest_path_im} already exists. Skipping copy for this case.")
        return

    shutil.copy(raw_path_im, dest_path_im)
    shutil.copy(raw_path_label, dest_path_label)
    # Report success only after both copies went through, so a failed copy
    # never leaves a misleading "Copied ..." line in the log.
    print(f"Copied {raw_path_im} to {dest_path_im} and {raw_path_label} to {dest_path_label}.")

def copy_deduplicated_cases(csv:pd.DataFrame):
    """
    Copy every listed case from the original dataset into the destination dataset,
    storing it under its deduplicated case number.

    Args:
        csv (pd.DataFrame): Table with a 'case_number' column (ID of the case in the
            original dataset) and a 'case_number_dedup' column (ID the copy is stored under).
    """
    # os.path.join avoids the doubled separator that plain string formatting produced,
    # since dst_root_dir already ends with a slash.
    images_dir = os.path.join(dst_root_dir, 'imagesTr')
    labels_dir = os.path.join(dst_root_dir, 'labelsTr')

    # Iterate the two columns directly: unlike iterrows() this keeps the integer dtype
    # of each column, which the ':05d' format below requires.
    for case_number_ori, case_number_new in zip(csv['case_number'], csv['case_number_dedup']):
        raw_path_im, raw_path_label = get_raw_data_path(case_number_ori)
        dest_path_im = os.path.join(images_dir, f'MIX_{case_number_new:05d}_0000.nii.gz')
        dest_path_label = os.path.join(labels_dir, f'MIX_{case_number_new:05d}.nii.gz')
        copy_case(raw_path_im, raw_path_label, dest_path_im, dest_path_label)

def deduplicate_case_numbers(df):
    """
    Give every repeated case number a fresh, unique number.

    The first occurrence of a case number keeps its value. Each later occurrence
    receives the next unused number, counting upwards from ``max(case_number) + 1``.
    The result is written to a new 'case_number_dedup' column.

    Args:
        df (pd.DataFrame): DataFrame with a 'case_number' column. It is modified in place.

    Returns:
        pd.DataFrame: The same DataFrame object, with 'case_number_dedup' added.
    """
    case_numbers = list(df['case_number'])
    # default=0 lets an empty table pass through instead of crashing in max().
    # New numbers start above the current maximum, so they can never collide with
    # an existing number or with one another.
    next_number = max(case_numbers, default=0) + 1
    seen = set()
    new_numbers = []
    for num in case_numbers:
        if num in seen:
            # Repeated case: hand out a brand-new number
            new_numbers.append(next_number)
            next_number += 1
        else:
            seen.add(num)
            new_numbers.append(num)
    df['case_number_dedup'] = new_numbers
    return df

# Parse the numeric ID out of every case name (None where the name has no 'MIX_<digits>' part),
# then add a 'case_number_dedup' column in which repeated IDs are replaced by new unique ones.
train_csv['case_number'] = train_csv['case_name'].apply(get_case_number)
train_csv = deduplicate_case_numbers(train_csv)


In [None]:
# Verify that duplicates were generated correctly:
# count the rows per original case number, most frequently repeated cases first.
train_csv.groupby('case_number').size().sort_values(ascending=False)

case_number
4447    157
2963    150
302     136
3216    115
5919     42
       ... 
2501      1
2500      1
2499      1
2498      1
2512      1
Length: 7208, dtype: int64

In [None]:
# Copy the deduplicated cases to the new directory.
# Every row of train_csv is copied under its 'case_number_dedup' ID; cases whose
# destination files already exist are skipped, so this cell can be re-run.
copy_deduplicated_cases(train_csv)

Copied ../aimi-project-data/raw_complete/Dataset001_MIX/imagesTr/MIX_03618_0000.nii.gz to ../aimi-project-data/raw_oversampled_complete/Dataset002_MIX//imagesTr/MIX_03618_0000.nii.gz and ../aimi-project-data/raw_complete/Dataset001_MIX/labelsTr/MIX_03618.nii.gz to ../aimi-project-data/raw_oversampled_complete/Dataset002_MIX//labelsTr/MIX_03618.nii.gz.
Copied ../aimi-project-data/raw_complete/Dataset001_MIX/imagesTr/MIX_01349_0000.nii.gz to ../aimi-project-data/raw_oversampled_complete/Dataset002_MIX//imagesTr/MIX_01349_0000.nii.gz and ../aimi-project-data/raw_complete/Dataset001_MIX/labelsTr/MIX_01349.nii.gz to ../aimi-project-data/raw_oversampled_complete/Dataset002_MIX//labelsTr/MIX_01349.nii.gz.
Copied ../aimi-project-data/raw_complete/Dataset001_MIX/imagesTr/MIX_03101_0000.nii.gz to ../aimi-project-data/raw_oversampled_complete/Dataset002_MIX//imagesTr/MIX_03101_0000.nii.gz and ../aimi-project-data/raw_complete/Dataset001_MIX/labelsTr/MIX_03101.nii.gz to ../aimi-project-data/raw_ov