In [39]:
import os
import cv2
import glob
import h5py
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action='ignore')


def generate_patch_df(flist, label):
    """Build a per-patch metadata table from a list of patch image paths.

    The slide id is the file's base name cut at the first "." and then at the
    first "_"; the patient id is the slide id cut at the first "-".

    Args:
        flist: List of patch image file paths.
        label: Value written to the ``target`` column of every row.

    Returns:
        pandas.DataFrame with one row per path and the columns
        ``patient_id``, ``slide_id``, ``fpath``, ``target`` (in that order).
    """
    def _slide_id(fpath):
        # os.path.basename follows the running platform's path separators
        # instead of assuming "/" is the only one.
        return os.path.basename(fpath).split(".")[0].split("_")[0]

    df = pd.DataFrame({"fpath": flist})
    df["slide_id"] = df["fpath"].map(_slide_id)
    df["patient_id"] = df["slide_id"].map(lambda slide_id: slide_id.split("-")[0])
    df["target"] = label

    return df.loc[:, ["patient_id", "slide_id", "fpath", "target"]]


def _split_train_valid_test(df, sampling_rate, random_state):
    """Split one class's ``fpath``/``target`` columns into train/valid/test parts."""
    X_rest, X_test, y_rest, y_test = train_test_split(
        df['fpath'], df['target'], test_size=sampling_rate, random_state=random_state)
    # The validation part is carved out of what remains after the test split,
    # so it is `sampling_rate` of the remainder, not of the whole class.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_rest, y_rest, test_size=sampling_rate, random_state=random_state)

    return (X_train, X_valid, X_test), (y_train, y_valid, y_test)


def define_dataset(positive_df, negative_df, normal_df, sampling_rate=0.2, random_state=1234):
    """Split each class into train/valid/test and concatenate the results.

    Every class frame is split independently with the same ``sampling_rate``
    and ``random_state``: first a test part is held out, then a validation
    part is held out of the remainder.

    NOTE(review): rows are split individually on ``fpath``; the ``patient_id``
    and ``slide_id`` columns are not consulted, so patches from one slide can
    end up in different subsets. Confirm that this is intended.

    Args:
        positive_df: Frame with ``fpath`` and ``target`` columns for the positive class.
        negative_df: Same, for the negative class.
        normal_df: Same, for the normal class.
        sampling_rate: ``test_size`` passed to both split stages.
        random_state: Seed passed to every ``train_test_split`` call.

    Returns:
        Tuple ``(X_train, X_valid, X_test, y_train, y_valid, y_test)`` of 1-D
        numpy arrays. Within each array the entries are ordered positive,
        then negative, then normal; the classes are not interleaved.
    """
    per_class = [
        _split_train_valid_test(class_df, sampling_rate, random_state)
        for class_df in (positive_df, negative_df, normal_df)
    ]
    X_by_class, y_by_class = zip(*per_class)

    # Transpose from "per class -> (train, valid, test)" to
    # "per subset -> (pos, neg, normal)" before stacking.
    X_train, X_valid, X_test = (np.hstack(parts) for parts in zip(*X_by_class))
    y_train, y_valid, y_test = (np.hstack(parts) for parts in zip(*y_by_class))

    return X_train, X_valid, X_test, y_train, y_valid, y_test
    
    
# Root directory holding one sub-folder of patch PNGs per class.
PATCH_DIR = "../data/LVI_dataset/patch_image_size-300_overlap-0"

# glob returns files in arbitrary order; sorting keeps the fixed-seed split
# below reproducible across runs and machines.
positive_flist = sorted(glob.glob(f"{PATCH_DIR}/LVI/*.png"))
negative_flist = sorted(glob.glob(f"{PATCH_DIR}/Negative/*.png"))
normal_flist = sorted(glob.glob(f"{PATCH_DIR}/Normal/*.png"))

# Class labels: LVI -> 1, Negative -> 0, Normal -> 2.
positive_df = generate_patch_df(positive_flist, 1)
negative_df = generate_patch_df(negative_flist, 0)
normal_df = generate_patch_df(normal_flist, 2)

X_train, X_valid, X_test, y_train, y_valid, y_test = define_dataset(positive_df, negative_df, normal_df, sampling_rate=0.2)
print(f"X train: {X_train.shape}\nX valid: {X_valid.shape}\nX test: {X_test.shape}")
print(f"y train: {y_train.shape}\ny valid: {y_valid.shape}\ny test: {y_test.shape}")


X train: (122737,)
X valid: (30686,)
X test: (38358,)
y train: (122737,)
y valid: (30686,)
y test: (38358,)


In [45]:
# Number of entries in the test split.
len(X_test)

38358

In [47]:
# Shape the test image array will have. Built as a plain tuple so that no
# array has to be allocated just to look at its shape.
(X_test.shape[0], 300, 300, 3)

(38358, 300, 300, 3)

In [None]:
import pickle

# cv2.imread with IMREAD_COLOR yields 8-bit, 3-channel images, so store them
# as uint8; np.empty's default float64 would take eight times the memory.
image_ds = np.empty((X_test.shape[0], 300, 300, 3), dtype=np.uint8)

for i, path in enumerate(tqdm(X_test)):
    image = cv2.imread(path, cv2.IMREAD_COLOR)
    if image is None:
        # imread signals an unreadable or missing file by returning None
        # rather than raising; fail loudly instead of storing garbage.
        raise OSError(f"cv2.imread could not read image: {path}")
    image_ds[i, :, :, :] = image

 32%|███████████▌                        | 12381/38358 [01:11<02:18, 187.30it/s]

In [33]:
# Disabled draft: writes images and targets into an HDF5 file, one patch per
# loop iteration, into gzip-compressed chunked datasets.
# NOTE(review): the 'images' dataset is declared with dtype=int rather than an
# 8-bit type; presumably this inflates the file and slows each write. Confirm
# before re-enabling.
# def make_h5_dataset(fname, X, y):
#     with h5py.File(fname, 'w', libver='latest') as f:
#         image_ds = f.create_dataset('images', shape=(len(X), 300, 300, 3), dtype=int,
#                                     compression='gzip', compression_opts=2, chunks=True)
#         target_ds = f.create_dataset('target', shape=(len(X)), dtype=int,
#                                     compression='gzip', compression_opts=2, chunks=True)
    
#         for i, path in enumerate(tqdm(X)):
#             image = cv2.imread(path, cv2.IMREAD_COLOR)
#             image_ds[i, :, :, :] = image

#             target = y[i]
#             target_ds[i] = target

In [34]:
# # make_h5_dataset("../data/LVI_dataset/patch_300_train_dataset.h5", X_train, y_train)
# # make_h5_dataset("../data/LVI_dataset/patch_300_valid_dataset.h5", X_valid, y_valid)
# make_h5_dataset("../data/LVI_dataset/patch_300_test_dataset.h5", X_test, y_test)

  0%|                                      | 6/38358 [00:31<56:33:41,  5.31s/it]


KeyboardInterrupt: 