In [None]:
import pandas as pd
import os
import shutil
import sys

In [4]:
# List the entries of the current working directory (the data paths used below are relative to it).
os.listdir()

['data_training.py',
 'src',
 'aimi-project-storage',
 'README.md',
 'experiments',
 '.git',
 'scripts',
 'customTrainer.py',
 'test',
 'oversampling_nnunet.ipynb',
 'data',
 'visualize_data.ipynb',
 'nnUNet3DEnsemble.zip',
 'requirements.txt',
 '.env',
 'logs',
 'venv',
 'nnUNet',
 '.gitignore',
 'SegFormer3D-main']

In [6]:
# Load the train and validation case lists; both paths are relative to the
# notebook's working directory.
train_csv = pd.read_csv('data/train_oversampling.csv')
val_csv = pd.read_csv('data/validation_oversampling.csv')

In [7]:
# Preview the first rows of the training case list.
train_csv.head()

Unnamed: 0,data_path,case_name
0,../../../data/uls2023_seg/ULS2023_Training_Dat...,MIX_03216
1,../../../data/uls2023_seg/ULS2023_Training_Dat...,MIX_01472
2,../../../data/uls2023_seg/ULS2023_Training_Dat...,MIX_01640
3,../../../data/uls2023_seg/ULS2023_Training_Dat...,MIX_01142
4,../../../data/uls2023_seg/ULS2023_Training_Dat...,MIX_03216


In [38]:
import re

def get_raw_data_path(case_number):
    """Build the raw image and label file paths for a single case.

    Args:
        case_number: Integer case id. It is zero-padded to five digits when
            the file names are formatted.

    Returns:
        A tuple ``(image_path, label_path)``. The image file name carries a
        ``_0000`` suffix before the extension; the label file name does not.
    """
    image_file = f'MIX_{case_number:05d}_0000.nii.gz'
    label_file = f'MIX_{case_number:05d}.nii.gz'
    return (
        os.path.join('./data/raw/Dataset001_MIX/imagesTr/', image_file),
        os.path.join('./data/raw/Dataset001_MIX/labelsTr/', label_file),
    )

def get_case_number(case_name):
    """Extract the integer case number from a name such as ``MIX_03216``.

    The first ``MIX_<digits>`` occurrence anywhere in ``case_name`` is used.

    Args:
        case_name: String containing the case identifier.

    Returns:
        The digits after ``MIX_`` as an ``int``, or ``None`` (after printing
        a warning) when the name contains no such pattern.
    """
    found = re.search(r'MIX_(\d+)', case_name)
    if found is None:
        print(f"Warning: Case name '{case_name}' does not match expected format.")
        return None
    return int(found.group(1))

def copy_case(raw_path_im, raw_path_label, dest_path_im, dest_path_label):
    """Copy one case's image and label files unless both copies already exist.

    Args:
        raw_path_im: Source path of the image file.
        raw_path_label: Source path of the label file.
        dest_path_im: Destination path for the image file.
        dest_path_label: Destination path for the label file.

    Raises:
        OSError: Propagated from ``os.makedirs`` / ``shutil.copy``, e.g. when
            a source file is missing.
    """
    # Skip only when BOTH destinations exist; a half-copied case is redone.
    if os.path.exists(dest_path_im) and os.path.exists(dest_path_label):
        print(f"Destination {dest_path_im} already exists. Skipping copy for this case.")
        return

    # shutil.copy does not create missing parent directories, so make sure
    # they exist before copying.
    for dest_path in (dest_path_im, dest_path_label):
        parent_dir = os.path.dirname(dest_path)
        if parent_dir:  # a bare file name has no parent directory to create
            os.makedirs(parent_dir, exist_ok=True)

    shutil.copy(raw_path_im, dest_path_im)
    shutil.copy(raw_path_label, dest_path_label)
    # Report success only after both copies completed, so a failed copy never
    # leaves a misleading "Copied" line in the log.
    print(f"Copied {raw_path_im} to {dest_path_im} and {raw_path_label} to {dest_path_label}.")

def copy_deduplicated_cases(csv:pd.DataFrame, dest_root='./data/raw_oversampled/Dataset001_MIX'):
    """Copy every row's raw image/label pair to its deduplicated file name.

    For each row, the source files are located from ``case_number`` via
    ``get_raw_data_path`` and copied with ``copy_case`` to
    ``<dest_root>/imagesTr`` and ``<dest_root>/labelsTr`` under the name
    derived from ``case_number_dedup``.

    Args:
        csv: Frame with ``case_number`` and ``case_number_dedup`` columns.
        dest_root: Dataset directory that receives the copies. The default is
            the location that was previously hard-coded.

    Raises:
        KeyError: If either required column is missing.
        ValueError: If a case number is NaN and cannot be converted to int.
    """
    # Iterate only the two columns that are needed instead of building a
    # Series per row with iterrows().
    for number_ori, number_new in zip(csv['case_number'], csv['case_number_dedup']):
        # The ':05d' format rejects floats; cast so whole-number floats
        # (e.g. a column upcast to float64) still format correctly.
        number_ori = int(number_ori)
        number_new = int(number_new)
        raw_path_im, raw_path_label = get_raw_data_path(number_ori)
        dest_path_im = f'{dest_root}/imagesTr/MIX_{number_new:05d}_0000.nii.gz'
        dest_path_label = f'{dest_root}/labelsTr/MIX_{number_new:05d}.nii.gz'
        copy_case(raw_path_im, raw_path_label, dest_path_im, dest_path_label)

def deduplicate_case_numbers(df):
    """Add a ``case_number_dedup`` column in which every case number is unique.

    The first occurrence of each value in ``case_number`` keeps its number.
    Every later repeat is given a fresh number, counting up from one past the
    largest existing case number, in row order.

    Args:
        df: Frame with a ``case_number`` column. It is modified in place: the
            ``case_number_dedup`` column is added (or overwritten).

    Returns:
        The same ``df`` object, for call-chaining convenience.
    """
    case_numbers = list(df['case_number'])
    # An empty frame has no maximum; the start value is then irrelevant
    # because there are no repeats to renumber.
    next_number = max(case_numbers) + 1 if case_numbers else 0

    seen = set()
    new_numbers = []
    for num in case_numbers:
        if num in seen:
            # next_number starts above every existing number and only grows,
            # so it can never collide with an original or an assigned value.
            new_numbers.append(next_number)
            next_number += 1
        else:
            seen.add(num)
            new_numbers.append(num)

    df['case_number_dedup'] = new_numbers
    return df

# Parse the integer case number out of each case name (get_case_number
# returns None for names that do not match the MIX_<digits> pattern).
train_csv['case_number'] = train_csv['case_name'].apply(get_case_number)
# Add a 'case_number_dedup' column in which repeated case numbers receive
# new, unique numbers.
train_csv = deduplicate_case_numbers(train_csv)


In [39]:
# Copy each row's raw image/label pair to the file name derived from its
# deduplicated case number; cases whose destination files both exist are skipped.
copy_deduplicated_cases(train_csv)

Destination ./data/raw_oversampled/Dataset001_MIX/imagesTr/MIX_03216_0000.nii.gz already exists. Skipping copy for this case.
Destination ./data/raw_oversampled/Dataset001_MIX/imagesTr/MIX_01472_0000.nii.gz already exists. Skipping copy for this case.
Destination ./data/raw_oversampled/Dataset001_MIX/imagesTr/MIX_01640_0000.nii.gz already exists. Skipping copy for this case.
Destination ./data/raw_oversampled/Dataset001_MIX/imagesTr/MIX_01142_0000.nii.gz already exists. Skipping copy for this case.
Copied ./data/raw/Dataset001_MIX/imagesTr/MIX_03216_0000.nii.gz to ./data/raw_oversampled/Dataset001_MIX/imagesTr/MIX_05773_0000.nii.gz and ./data/raw/Dataset001_MIX/labelsTr/MIX_03216.nii.gz to ./data/raw_oversampled/Dataset001_MIX/labelsTr/MIX_05773.nii.gz.
Destination ./data/raw_oversampled/Dataset001_MIX/imagesTr/MIX_04517_0000.nii.gz already exists. Skipping copy for this case.
Destination ./data/raw_oversampled/Dataset001_MIX/imagesTr/MIX_05644_0000.nii.gz already exists. Skipping copy

In [23]:
# List the label files present in the raw dataset's labelsTr directory.
os.listdir("./data/raw/Dataset001_MIX/labelsTr/")

['MIX_04323.nii.gz',
 'MIX_04504.nii.gz',
 'MIX_04739.nii.gz',
 'MIX_00187.nii.gz',
 'MIX_02881.nii.gz',
 'MIX_00475.nii.gz',
 'MIX_00252.nii.gz',
 'MIX_00648.nii.gz',
 'MIX_02300.nii.gz',
 'MIX_02527.nii.gz',
 'MIX_02969.nii.gz',
 'MIX_05020.nii.gz',
 'MIX_05607.nii.gz',
 'MIX_03003.nii.gz',
 'MIX_03624.nii.gz',
 'MIX_01905.nii.gz',
 'MIX_01776.nii.gz',
 'MIX_01151.nii.gz',
 'MIX_01284.nii.gz',
 'MIX_03857.nii.gz',
 'MIX_03419.nii.gz',
 'MIX_02669.nii.gz',
 'MIX_00321.nii.gz',
 'MIX_00506.nii.gz',
 'MIX_00948.nii.gz',
 'MIX_02781.nii.gz',
 'MIX_02454.nii.gz',
 'MIX_02273.nii.gz',
 'MIX_04839.nii.gz',
 'MIX_04477.nii.gz',
 'MIX_04250.nii.gz',
 'MIX_04185.nii.gz',
 'MIX_03757.nii.gz',
 'MIX_03170.nii.gz',
 'MIX_03482.nii.gz',
 'MIX_01876.nii.gz',
 'MIX_01438.nii.gz',
 'MIX_03098.nii.gz',
 'MIX_01022.nii.gz',
 'MIX_01605.nii.gz',
 'MIX_03924.nii.gz',
 'MIX_05549.nii.gz',
 'MIX_05286.nii.gz',
 'MIX_05153.nii.gz',
 'MIX_04879.nii.gz',
 'MIX_04210.nii.gz',
 'MIX_04437.nii.gz',
 'MIX_04991.n