In [2]:
import sys; 
import ast
from glob import glob
import cv2
from skimage import io
import os
from datetime import datetime
import time
import random
from tqdm import tqdm
from contextlib import contextmanager
import math

import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import roc_auc_score, log_loss
from sklearn import metrics
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
import torch
import torchvision
from torchvision import transforms
from torch import nn
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F
import matplotlib.pyplot as plt

from torch.optim import Adam, SGD, AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
from warmup_scheduler import GradualWarmupScheduler
import timm
import warnings
import joblib
from scipy.ndimage.interpolation import zoom
import nibabel as nib
import pydicom as dicom

  from scipy.ndimage.interpolation import zoom


In [3]:
datadir = "/Volumes/Data/kaggle/raw"

In [4]:

# Store segmentation paths in a dataframe
seg_paths = glob(f"{datadir}/segmentations/*")
seg_df = pd.DataFrame({'path': seg_paths})
seg_df['StudyInstanceUID'] = seg_df['path'].apply(lambda x:x.split('/')[-1][:-4])
seg_df = seg_df[['StudyInstanceUID','path']]
print('seg_df shape:', seg_df.shape)
seg_df.head(3)

seg_df shape: (87, 2)


Unnamed: 0,StudyInstanceUID,path
0,1.2.826.0.1.3680043.2243,/Volumes/Data/kaggle/raw/segmentations/1.2.826...
1,1.2.826.0.1.3680043.3376,/Volumes/Data/kaggle/raw/segmentations/1.2.826...
2,1.2.826.0.1.3680043.18906,/Volumes/Data/kaggle/raw/segmentations/1.2.826...


In [5]:
def load_dicom(path):
    """
    This supports loading both regular and compressed JPEG images. 
    See the first sell with `pip install` commands for the necessary dependencies
    """
    img = dicom.dcmread(path)
    img.PhotometricInterpretation = 'YBR_FULL'
    data = img.pixel_array
    data = data - np.min(data)
    if np.max(data) != 0:
        data = data / np.max(data)
    data = (data * 255).astype(np.uint8)
    return cv2.cvtColor(data, cv2.COLOR_GRAY2RGB), img

In [6]:
study_uid_list = seg_df["StudyInstanceUID"].tolist()

In [7]:
dataframe_list = []
os.makedirs(f"{datadir}/seg_25d_image", exist_ok=True)
os.makedirs(f"{datadir}/seg_25d_mask", exist_ok=True)

for file_name in tqdm(study_uid_list):
    ex_path = f"{datadir}/segmentations/{file_name}.nii"
    mask = nib.load(ex_path)
    mask = mask.get_fdata()  # convert to numpy array
    mask = mask[:, ::-1, ::-1].transpose(1, 0, 2)
    mask = np.clip(mask,0,8).astype(np.uint8)
    mask = np.ascontiguousarray(mask)

    train_image_path = glob(f"{datadir}/train_images/{file_name}/*")
    train_image_path = sorted(train_image_path, key=lambda x:int(x.split("/")[-1].replace(".dcm","")))
    image_list = []
    for path in train_image_path:
        im, meta = load_dicom(path)
        image_list.append(im[:,:,0])
    image = np.stack(image_list, axis=2)
    
    assert image.shape == mask.shape, f"Image and mask {file_name} should be the same size, but are {image.shape} and {mask.shape}"
    slice_num = image.shape[2]

    for i in range(1, slice_num-1):
        image_25d = image[:,:, i-1:i+2]
        mask_25d = mask[:,:, i-1:i+2]
        assert image_25d.shape == mask_25d.shape == (512, 512, 3), f"Image and mask {file_name} should be (512, 512, 3), but are {image_25d.shape} and {mask_25d.shape}"
        image_save_path = f"{datadir}/seg_25d_image/{file_name}_{i}.npy"
        mask_save_path =  f"{datadir}/seg_25d_mask/{file_name}_{i}.npy"
        np.save(image_save_path, image_25d)
        np.save(mask_save_path, mask_25d)
        dataframe_list.append([f"{file_name}_{i}", file_name, i, image_save_path, mask_save_path])

100%|██████████| 87/87 [2:07:23<00:00, 87.85s/it]   


In [8]:
seg_25d_df = pd.DataFrame(dataframe_list, columns=["id", "StudyInstanceUID", "slice_num", "image_path", "mask_path"])
seg_25d_df["fold"] = -1

gkf = GroupKFold(n_splits=5)
for idx, (train_index, test_index) in enumerate(gkf.split(X=seg_25d_df, groups=seg_25d_df['StudyInstanceUID'].values)):
    seg_25d_df.loc[test_index, 'fold'] = idx


In [9]:
for i in range(5):
    study_num = len(np.unique(seg_25d_df[seg_25d_df["fold"] == i]["StudyInstanceUID"]))
    print(f"fold{i} num: {study_num}")

fold0 num: 17
fold1 num: 18
fold2 num: 17
fold3 num: 17
fold4 num: 18


In [10]:
seg_25d_df.to_csv(f"{datadir}/seg_25d.csv", index=False)