In [1]:
import os 
import glob
import numpy as np
import pandas as pd

positive_flist = glob.glob("../data/LVI_dataset/patch_image_size-400_overlap-100/positive/*.png")
negative_flist = glob.glob("../data/LVI_dataset/patch_image_size-400_overlap-100/negative/*.png")

In [2]:
def generate_patch_df(flist, label):
    df = pd.DataFrame({"fpath": flist})
    df['slide_id'] = df['fpath'].map(lambda x: x.split("/")[-1].split(".")[0].split("_")[0])
    df['patient_id'] = df['slide_id'].map(lambda x: x.split("-")[0])
    df['target'] = label

    df = df.loc[:, ["patient_id", "slide_id", "fpath", "target"]]
    
    return df

positive_df = generate_patch_df(positive_flist, 1)
negative_df = generate_patch_df(negative_flist, 0)

negative_df.head()

Unnamed: 0,patient_id,slide_id,fpath,target
0,351,351-21,../data/LVI_dataset/patch_image_size-400_overl...,0
1,351,351-21,../data/LVI_dataset/patch_image_size-400_overl...,0
2,351,351-21,../data/LVI_dataset/patch_image_size-400_overl...,0
3,351,351-21,../data/LVI_dataset/patch_image_size-400_overl...,0
4,351,351-21,../data/LVI_dataset/patch_image_size-400_overl...,0


In [15]:
def train_test_split(positive_df, negative_df, sampling_level=2, sampling_rate=0.2):
    # sampling_level {0: "patient_id", 1: "slide_id", 2: "patch"}
    if sampling_level == 0:
        column_name = "patient_id"
    elif sampling_level == 1:
        column_name = "slide_id"
    elif sampling_level == 2:
        column_name = "fpath"
    else:
        print("Set sampling level in [0, 1, 2]")
        raise
    
    N = len(pd.unique(positive_df[column_name]))
    
    test_index = np.random.choice(pd.unique(positive_df[column_name]), round(N * sampling_rate), replace=False)
    
    train_positive = positive_df[~positive_df[column_name].isin(test_index)]
    test_positive = positive_df[positive_df[column_name].isin(test_index)]

    train_negative = negative_df[~negative_df[column_name].isin(test_index)]
    test_negative = negative_df[negative_df[column_name].isin(test_index)]
    
    train_df = pd.concat([train_positive, train_negative]).reset_index(drop=True)
    test_df = pd.concat([test_positive, test_negative]).reset_index(drop=True)
    
    train_df, valid_df = train_valid_split(train_df, column_name, sampling_rate)
    
    return train_df, valid_df, test_df


def train_valid_split(train_df, column_name, sampling_rate):
    N = len(pd.unique(train_df[column_name]))
    valid_index = np.random.choice(pd.unique(train_df[column_name]), round(N * sampling_rate), replace=False)
        
    valid_df = train_df[train_df[column_name].isin(valid_index)]
    train_df = train_df[~train_df[column_name].isin(valid_index)]
    
    return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)


train_test_split(positive_df, negative_df, sampling_level=2)

(     patient_id slide_id                                              fpath  \
 0           351   351-21  ../data/LVI_dataset/patch_image_size-400_overl...   
 1           351   351-21  ../data/LVI_dataset/patch_image_size-400_overl...   
 2           351   351-21  ../data/LVI_dataset/patch_image_size-400_overl...   
 3           351   351-21  ../data/LVI_dataset/patch_image_size-400_overl...   
 4           351   351-21  ../data/LVI_dataset/patch_image_size-400_overl...   
 ...         ...      ...                                                ...   
 1486        351   351-21  ../data/LVI_dataset/patch_image_size-400_overl...   
 1487        351   351-21  ../data/LVI_dataset/patch_image_size-400_overl...   
 1488        351   351-21  ../data/LVI_dataset/patch_image_size-400_overl...   
 1489        351   351-21  ../data/LVI_dataset/patch_image_size-400_overl...   
 1490        351   351-21  ../data/LVI_dataset/patch_image_size-400_overl...   
 
       target  
 0          1  
 1    