In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import pickle

# Constants

In [2]:
RANDOM_STATE = 42
TEST_SIZE = 0.2
VAL_SIZE = 0.5
DATASET_CSV = "organs_selected_downsampled.csv"
DATASET_FOLDER = "./dataset_obj/"

# Load data

In [3]:
organs = pd.read_csv(DATASET_CSV)
organs.rename(columns={
    "Unnamed: 0": "original_index",
    "Unnamed: 0.1": "new_index"
}, inplace=True)

In [4]:
available_latents_paths = [os.path.join(DATASET_FOLDER, latent) for latent in os.listdir(DATASET_FOLDER)]

In [5]:
organs

Unnamed: 0,new_index,original_index,Name,Category,URL,SubCategory,id
0,1061,45713,s0178_aorta.nii.g_1.stl,aorta,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,aorta,0s0178_aorta.nii.g_1
1,127,8720,016793_kidneyright.stl,kidneyright,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,kidney,1016793_kidneyright
2,714,30529,086609_aorta.stl,aorta,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,aorta,2086609_aorta
3,2285,92354,s1016_liver.nii.g_1.stl,liver,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,liver,3s1016_liver.nii.g_1
4,1687,68838,s0611_aorta.nii.g_1.stl,aorta,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,aorta,4s0611_aorta.nii.g_1
...,...,...,...,...,...,...,...
2495,2385,96504,s1094_liver.nii.g_1.stl,liver,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,liver,2495s1094_liver.nii.g_1
2496,1088,46642,s0196_liver.nii.g_1.stl,liver,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,liver,2496s0196_liver.nii.g_1
2497,426,19830,047727_kidneyright.stl,kidneyright,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,kidney,2497047727_kidneyright
2498,2608,105140,s1257_liver.nii.g_1.stl,liver,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,liver,2498s1257_liver.nii.g_1


In [6]:
len(organs), len(available_latents_paths)

(2500, 1381)

# Delete corrupted .pt files (if any)

In [7]:
for file in tqdm(available_latents_paths):
    try:
        if file.endswith(".pt") and ("ipynb_checkpoints" not in file):
            torch.load(file)
    except:
        print(f"Delete corrupted file: {file}")
        os.remove(file)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1381/1381 [00:03<00:00, 413.48it/s]


# Train-Val-Test split

In [8]:
available_latents = [latent.replace(".pt", "") for latent in os.listdir(DATASET_FOLDER)]

In [9]:
available_latents

['727_087979_kidneyright',
 '1634_s0578_liver.nii.g_1',
 '970_s0107_aorta.nii.g_1',
 '362_041469_aorta',
 '1494_s0475_aorta.nii.g_1',
 '704_085229_liver',
 '1974_s0791_liver.nii.g_1',
 '2434_s1131_aorta.nii.g_1',
 '2573_s1237_liver.nii.g_1',
 '1161_s0245_liver.nii.g_1',
 '2516_s1189_liver.nii.g_1',
 '1789_s0674_liver.nii.g_1',
 '864_s0031_liver.nii.g_1',
 '2279_s1012_aorta.nii.g_1',
 '662_078645_liver',
 '2262_s0999_liver.nii.g_1',
 '96_012980_aorta',
 '1265_s0324_aorta.nii.g_1',
 '1428_s0429_liver.nii.g_1',
 '431_048249_aorta',
 '938_s0082_liver.nii.g_1',
 '1537_s0505_aorta.nii.g_1',
 '1886_s0732_aorta.nii.g_1',
 '2157_s0924_aorta.nii.g_1',
 '1146_s0238_aorta.nii.g_1',
 '1007_s0137_aorta.nii.g_1',
 '2163_s0928_liver.nii.g_1',
 '2430_s1128_aorta.nii.g_1',
 '391_044694_liver',
 '756_091235_aorta',
 '1340_s0369_liver.nii.g_1',
 '513_057428_kidneyright',
 '738_089268_aorta',
 '1104_s0211_liver.nii.g_1',
 '369_042071_kidneyright',
 '1234_s0303_aorta.nii.g_1',
 '1868_s0720_aorta.nii.g_1',
 

In [10]:
len(organs), len(available_latents)

(2500, 1381)

In [11]:
# id setting was bugged, had to do it again
organs['id']  = organs["new_index"].astype(str) + '_' + organs['Name'].apply(lambda x: x.split('.stl')[0])

In [12]:
organs

Unnamed: 0,new_index,original_index,Name,Category,URL,SubCategory,id
0,1061,45713,s0178_aorta.nii.g_1.stl,aorta,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,aorta,1061_s0178_aorta.nii.g_1
1,127,8720,016793_kidneyright.stl,kidneyright,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,kidney,127_016793_kidneyright
2,714,30529,086609_aorta.stl,aorta,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,aorta,714_086609_aorta
3,2285,92354,s1016_liver.nii.g_1.stl,liver,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,liver,2285_s1016_liver.nii.g_1
4,1687,68838,s0611_aorta.nii.g_1.stl,aorta,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,aorta,1687_s0611_aorta.nii.g_1
...,...,...,...,...,...,...,...
2495,2385,96504,s1094_liver.nii.g_1.stl,liver,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,liver,2385_s1094_liver.nii.g_1
2496,1088,46642,s0196_liver.nii.g_1.stl,liver,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,liver,1088_s0196_liver.nii.g_1
2497,426,19830,047727_kidneyright.stl,kidneyright,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,kidney,426_047727_kidneyright
2498,2608,105140,s1257_liver.nii.g_1.stl,liver,https://uni-duisburg-essen.sciebo.de/s/HeShw1G...,liver,2608_s1257_liver.nii.g_1


In [13]:
organs = organs[organs["id"].isin(available_latents)]
len(organs), len(available_latents)

(1376, 1381)

In [14]:
organs_train, organs_test = train_test_split(
    organs,
    test_size=TEST_SIZE,
    stratify=organs["Category"]
)
organs_test, organs_val = train_test_split(
    organs_test,
    test_size=VAL_SIZE,
    stratify=organs_test["Category"]
)

organs_train.shape, organs_test.shape, organs_val.shape

((1100, 7), (138, 7), (138, 7))

In [15]:
organs_train["Category"].value_counts()

Category
aorta          540
liver          437
kidneyright    123
Name: count, dtype: int64

In [16]:
organs_test["Category"].value_counts()

Category
aorta          68
liver          55
kidneyright    15
Name: count, dtype: int64

In [17]:
organs_val["Category"].value_counts()

Category
aorta          68
liver          54
kidneyright    16
Name: count, dtype: int64

In [18]:
lista_ids_train = list(organs_train.id.values)
lista_ids_test = list(organs_test.id.values)
lista_ids_val = list(organs_val.id.values)

In [19]:
with open('train_organs.pickle', 'wb') as file:
    pickle.dump(lista_ids_train, file)

with open('test_organs.pickle', 'wb') as file:
    pickle.dump(lista_ids_test, file)

with open('val_organs.pickle', 'wb') as file:
    pickle.dump(lista_ids_val, file)

In [20]:
csv_organs = organs[['id', 'Category']]

csv_organs.to_csv('organs_selected_complete.csv', index = False, header=False)