In [22]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os
## Pin the CUDA device enumeration order and expose only GPU "0" to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # enumerate GPUs by PCI bus id
    "CUDA_VISIBLE_DEVICES": "0",         # specify which GPU(s) to be used
})

In [23]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

### Define training folders and files

In [24]:
## CSV holding one row of labels per image_id.
LABELS = "./prostate-cancer-grade-assessment/train.csv"
## Directory of tile files; the first 32 characters of each entry name are used as the image id.
TRAIN = "./panda-16x128x128-tiles-data/train/"

### Define K-fold and random seed

In [29]:
nfolds = 5 ## number of cross-validation folds (n_splits of the StratifiedKFold; also part of the output file name)
SEED = 2020 ## random_state of the shuffled StratifiedKFold, so the fold assignment is reproducible

### Assign a fold to each case

In [37]:
## Read "train.csv" with 'image_id' as the index so rows can be selected by id.
labels_by_id = pd.read_csv(LABELS).set_index('image_id')
## Unique image ids present in the tile folder: the first 32 characters of every entry name.
files = sorted({p[:32] for p in os.listdir(TRAIN)})
## Keep only the cases that have tiles in TRAIN, then turn 'image_id' back into a regular column.
df = labels_by_id.loc[files].reset_index()
## StratifiedKFold preserves the percentage of each isup_grade class in every fold.
fold_splitter = StratifiedKFold(n_splits=nfolds, random_state=SEED, shuffle=True)
## list[(fold1_train_idx, fold1_test_idx), (fold2_train_idx, fold2_test_idx), ...]
splits = list(fold_splitter.split(df, df.isup_grade))

## Mark each sample with the fold in which it serves as a test (held-out) case.
## The builtin `int` is used as dtype: the `np.int` alias was deprecated in NumPy 1.20
## and removed in NumPy 1.24, where `np.int` raises AttributeError.
folds_splits = np.zeros(len(df), dtype=int) ## [0, 0, ...] with one entry per case
for fold, (train_idx, test_idx) in enumerate(splits):
    folds_splits[test_idx] = fold ## test_idx: positional indices of the cases held out in this fold

df['split'] = folds_splits ## add the K fold assignment column
df = df.set_index('image_id')
df.head(10)

Unnamed: 0_level_0,data_provider,isup_grade,gleason_score,split
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0005f7aaab2800f6170c399693a96917,karolinska,0,0+0,0
000920ad0b612851f8e01bcc880d9b3d,karolinska,0,0+0,1
0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4,1
001c62abd11fa4b57bf7a6c603a11bb9,karolinska,4,4+4,0
001d865e65ef5d2579c190a0e0350d8f,karolinska,0,0+0,1
002a4db09dad406c85505a00fb6f6144,karolinska,0,0+0,4
003046e27c8ead3e3db155780dc5498e,karolinska,1,3+3,2
0032bfa835ce0f43a92ae0bbab6871cb,karolinska,1,3+3,1
003a91841da04a5a31f808fb5c21538a,karolinska,1,3+3,1
003d4dd6bd61221ebc0bfb9350db333f,karolinska,1,3+3,4


### Save df to file

In [38]:
## Persist the labels together with their fold assignment; the file name carries the fold count.
fold_csv_path = './panda-16x128x128-tiles-data/{}_fold_train.csv'.format(nfolds)
df.to_csv(fold_csv_path)

## Read the file back and check the fold indices

In [41]:
## Path of the saved fold-assignment CSV; the fold count fills the '{}' placeholder.
fold_csv_template = './panda-16x128x128-tiles-data/{}_fold_train.csv'
TRAINFOLD = fold_csv_template.format(nfolds)

In [44]:
## Read the saved fold file back. No index column is requested here, so 'image_id'
## is an ordinary column and the rows get the default integer index (see output below).
df = pd.read_csv(TRAINFOLD)
df.head(10)

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score,split
0,0005f7aaab2800f6170c399693a96917,karolinska,0,0+0,0
1,000920ad0b612851f8e01bcc880d9b3d,karolinska,0,0+0,1
2,0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4,1
3,001c62abd11fa4b57bf7a6c603a11bb9,karolinska,4,4+4,0
4,001d865e65ef5d2579c190a0e0350d8f,karolinska,0,0+0,1
5,002a4db09dad406c85505a00fb6f6144,karolinska,0,0+0,4
6,003046e27c8ead3e3db155780dc5498e,karolinska,1,3+3,2
7,0032bfa835ce0f43a92ae0bbab6871cb,karolinska,1,3+3,1
8,003a91841da04a5a31f808fb5c21538a,karolinska,1,3+3,1
9,003d4dd6bd61221ebc0bfb9350db333f,karolinska,1,3+3,4


In [51]:
## Index labels of every case assigned to fold 1, shown here as the validation set.
in_val_fold = df['split'] == 1
val_inx = df.index[in_val_fold].tolist()
print(val_inx[:10])

[1, 2, 4, 7, 8, 16, 19, 22, 32, 36]


In [57]:
## Training indices: every row position 0..len(df)-1 that is not in the validation list.
## An ordered filter replaces list(set(...) - set(...)): a set has no guaranteed
## iteration order, so the previous result was only sorted by implementation accident.
val_lookup = set(val_inx) ## set membership keeps the filter cheap per row
train_inx = [pos for pos in range(len(df)) if pos not in val_lookup]
print(train_inx[:10])

[0, 3, 5, 6, 9, 10, 11, 12, 13, 14]


In [58]:
## Size of the training and validation index lists for the selected fold.
n_train, n_val = len(train_inx), len(val_inx)
n_train, n_val

(8413, 2103)

In [64]:
## Spot check: look up the image id and ISUP grade of the first validation case by its index label.
first_val = val_inx[0]
df.loc[first_val, 'image_id'], df.loc[first_val, 'isup_grade']

('000920ad0b612851f8e01bcc880d9b3d', 0)