In [1]:
import os
import csv
import pandas as pd
import pickle
import time
import random

### Preparation of Dental data

#### 1. Construct dataframe with all information

In [None]:
root_path = "/home/vlab/Documents/Collections/Dental"

collections_list = ["STS-Tooth",
                    # "Medicim", 
                    "Xiang_li"]

list_files = []
list_error_files = []
df_data = []
for collection_name in collections_list:
    db_path = os.path.join(root_path, collection_name, "db_organization.csv")
    df = pd.read_csv(db_path)
    relative_paths = df["relative_path"].tolist()
    file_names = df["file_name"].tolist()

    for idx, row in df.iterrows():
        if row["useful"] != 1: continue
        if row['file_name'] in relative_paths[idx]:
            relative_paths[idx] = relative_paths[idx].replace(row['file_name'], "")

        file_path = os.path.join(
            root_path, collection_name, relative_paths[idx], row['file_name']
        )
        if not os.path.exists(file_path):
            list_error_files.append(file_path)
            print("ERROR", collection_name, file_path)
        list_files.append(file_path)
        if row['file_name'].endswith(".nii.gz"):
          file_name_noext = row['file_name'][:-7]
        else:
          file_name_noext = row['file_name'][:-4]
          
        df_data.append(
            {
                "database": collection_name,
                "relative_path": relative_paths[idx],
                "file_name": row['file_name'],
                "full_path": file_path,
                "id_name":collection_name +'_'+ file_name_noext
            }
        )

# dataframe_path = 'database_dental.csv'
# pd.DataFrame(df_data).to_csv(dataframe_path, index=False)

#### 2. Convert data to nrrd using plastimatch_convert.sh which read the database_dental.csv file


#### 3. Create scan_list.pkl completo

In [2]:
path_nrrd_files = '/data/home/jaruanob/Documents/Collections/Dental/CT-FM_data_v1/'
# path_nrrd_files = '/home/vlab/Documents/Collections/Dental/CT-FM_data_v1/'

list_files = os.listdir(path_nrrd_files)
print(list_files)
print(len(list_files))

root_data_path = '/data/home/jaruanob/Documents/Repositories/Foundational_models/Experiments/CT-FM'
# root_data_path = '/home/vlab/Documents/Repositories/Foundational_models/Experiments/CT-FM'

# expe_name = 'test'
expe_name = str(int(time.time()))
# expe_name = '1761820031'
expe_path = os.path.join(root_data_path,expe_name)
if not os.path.exists(expe_path): os.mkdir(expe_path)

pkl_name = 'scan_list-CT-FM_data_v1.pkl'
pkl_path = os.path.join(expe_path,pkl_name)

list_path_files = []
for file_name in list_files:
  list_path_files.append(os.path.join(path_nrrd_files,file_name))

with open(pkl_path, 'wb') as file:
    pickle.dump(list_path_files, file)
    
print(list_path_files)
print('Expe name:', expe_path)

['STS-Tooth_Integrity_U_149.nrrd', 'Xiang_li_054_cbct.nrrd', 'STS-Tooth_Integrity_U_102.nrrd', 'STS-Tooth_Integrity_U_066.nrrd', 'Xiang_li_227_cbct.nrrd', 'Xiang_li_279_cbct.nrrd', 'Xiang_li_091_cbct.nrrd', 'Xiang_li_134_cbct.nrrd', 'Xiang_li_142_cbct.nrrd', 'Xiang_li_294_cbct.nrrd', 'STS-Tooth_Integrity_U_191.nrrd', 'STS-Tooth_Integrity_U_057.nrrd', 'Xiang_li_022_cbct.nrrd', 'Xiang_li_187_cbct.nrrd', 'STS-Tooth_Integrity_U_178.nrrd', 'Xiang_li_251_cbct.nrrd', 'STS-Tooth_Integrity_U_097.nrrd', 'Xiang_li_278_cbct.nrrd', 'STS-Tooth_Integrity_U_214.nrrd', 'Xiang_li_226_cbct.nrrd', 'Xiang_li_055_cbct.nrrd', 'STS-Tooth_Integrity_U_035.nrrd', 'STS-Tooth_Integrity_U_151.nrrd', 'Xiang_li_090_cbct.nrrd', 'Xiang_li_295_cbct.nrrd', 'Xiang_li_143_cbct.nrrd', 'STS-Tooth_Integrity_U_160.nrrd', 'STS-Tooth_Integrity_U_004.nrrd', 'STS-Tooth_Integrity_U_225.nrrd', 'STS-Tooth_Integrity_U_189.nrrd', 'Xiang_li_250_cbct.nrrd', 'Xiang_li_186_cbct.nrrd', 'Xiang_li_023_cbct.nrrd', 'Xiang_li_168_cbct.nrrd', 'ST

#### 4. Create partitions fos train, validation and test

In [3]:

random.seed(1)
random.shuffle(list_path_files)

# Define split ratios
train_ratio = 0.75
val_ratio = 0.05
test_ratio = 0.20

# Compute split indices
n_total = len(list_path_files)
n_train = int(n_total * train_ratio)
n_val = int(n_total * val_ratio)

# Split
train_paths = list_path_files[:n_train]
val_paths = list_path_files[n_train:n_train + n_val]
test_paths = list_path_files[n_train + n_val:]

print("Train:", len(train_paths))
print(train_paths)
pkl_name = 'scan_list-CT-FM_data_v1_train.pkl'
with open(os.path.join(expe_path,pkl_name), 'wb') as file:
    pickle.dump(train_paths, file)
    
print("Validation:", len(val_paths))
print(val_paths)
pkl_name = 'scan_list-CT-FM_data_v1_validation.pkl'
with open(os.path.join(expe_path,pkl_name), 'wb') as file:
    pickle.dump(val_paths, file)

print("Test:", len(test_paths))
print(test_paths)
pkl_name = 'scan_list-CT-FM_data_v1_test.pkl'
with open(os.path.join(expe_path,pkl_name), 'wb') as file:
    pickle.dump(test_paths, file)

Train: 386
['/data/home/jaruanob/Documents/Collections/Dental/CT-FM_data_v1/Xiang_li_130_cbct.nrrd', '/data/home/jaruanob/Documents/Collections/Dental/CT-FM_data_v1/Xiang_li_173_cbct.nrrd', '/data/home/jaruanob/Documents/Collections/Dental/CT-FM_data_v1/STS-Tooth_Integrity_U_037.nrrd', '/data/home/jaruanob/Documents/Collections/Dental/CT-FM_data_v1/STS-Tooth_Integrity_U_101.nrrd', '/data/home/jaruanob/Documents/Collections/Dental/CT-FM_data_v1/STS-Tooth_Integrity_U_105.nrrd', '/data/home/jaruanob/Documents/Collections/Dental/CT-FM_data_v1/STS-Tooth_Integrity_U_187.nrrd', '/data/home/jaruanob/Documents/Collections/Dental/CT-FM_data_v1/Xiang_li_133_cbct.nrrd', '/data/home/jaruanob/Documents/Collections/Dental/CT-FM_data_v1/STS-Tooth_Integrity_U_147.nrrd', '/data/home/jaruanob/Documents/Collections/Dental/CT-FM_data_v1/Xiang_li_174_cbct.nrrd', '/data/home/jaruanob/Documents/Collections/Dental/CT-FM_data_v1/Xiang_li_118_cbct.nrrd', '/data/home/jaruanob/Documents/Collections/Dental/CT-FM_da