In [11]:
# Split options: when `stratified` is True the folds are built with StratifiedKFold
# (stratified on the `labels` column), otherwise with plain KFold.
# `k` is the number of folds (passed as n_splits).
stratified = True
k = 5

save_org = False  # whether to also save the original train/test sets handed out by the competition as CSV

# Seeds for Python's `random`, NumPy's global RNG, and the K-fold splitter's random_state.
py_random_seed = 42
np_random_seed = 42
kfold_random_state = 42

# Dataset root and the sub-directories (relative to base_path) that are listed to collect images.
# Images from the "positive" directories get label 1, from the "negative" directories label 0.
base_path = '/opt/ml/input/a-trac-colon'
colon_positive_dir = 'colon_positive'
colon_negative_dir = 'colon_negative'
colon_positive_test_dir = 'colon_positive_test'
colon_negative_test_dir = 'colon_negative_test'

# Output CSV path prefixes; a suffix ('org' or '<fold>_<k>') plus '.csv' is appended when saving.
train_csv_file_prefix = './train_'
test_csv_file_prefix = './test_'

In [20]:
import os
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold

## Set random seed

In [3]:
# Seed Python's built-in RNG and NumPy's global RNG with the configured values.
# NOTE(review): the K-fold splitters below receive their own `random_state`
# (kfold_random_state), so they do not draw from these global generators.
random.seed(py_random_seed)
np.random.seed(np_random_seed)

## Collect All Images

In [30]:
all_images = []  # accumulates (image_id, image_path, labels) records

In [31]:
# List the image file names of each class directory.
# os.listdir() returns entries in arbitrary (filesystem-dependent) order, and that
# order becomes the row order handed to the shuffled K-fold splitter. Sorting here
# makes fold membership reproducible for a given random_state on any machine.
train_positive_file_names = sorted(os.listdir(os.path.join(base_path, colon_positive_dir)))
train_negative_file_names = sorted(os.listdir(os.path.join(base_path, colon_negative_dir)))
test_positive_file_names = sorted(os.listdir(os.path.join(base_path, colon_positive_test_dir)))
test_negative_file_names = sorted(os.listdir(os.path.join(base_path, colon_negative_test_dir)))

In [32]:
# Turn every listed file name into an (image_id, image_path, labels) record.
# image_id is the bare file name; labels is 0 for the negative directories
# and 1 for the positive directories.
train_negative_images = [
    (name, os.path.join(base_path, colon_negative_dir, name), 0)
    for name in train_negative_file_names
]
train_positive_images = [
    (name, os.path.join(base_path, colon_positive_dir, name), 1)
    for name in train_positive_file_names
]
test_negative_images = [
    (name, os.path.join(base_path, colon_negative_test_dir, name), 0)
    for name in test_negative_file_names
]
test_positive_images = [
    (name, os.path.join(base_path, colon_positive_test_dir, name), 1)
    for name in test_positive_file_names
]

In [33]:
# Pool the original train and test records into one list, in the order
# train-negative, train-positive, test-negative, test-positive.
# The list is rebuilt from scratch (rather than extended in place) so that
# re-running this cell cannot append the same records a second time, which
# would put duplicate images into the K-fold splits.
all_images = (
    train_negative_images
    + train_positive_images
    + test_negative_images
    + test_positive_images
)

## Save Original Train, Test Set to CSV

In [34]:
if save_org:
    # Export the competition's original train/test partition, each sorted by
    # image path with a fresh 0..n-1 index, to '<prefix>org.csv'.
    train_pd = (
        pd.DataFrame(
            train_negative_images + train_positive_images,
            columns=['image_id', 'image_path', 'labels'],
        )
        .sort_values(by=['image_path'])
        .reset_index(drop=True)
    )
    test_pd = (
        pd.DataFrame(
            test_negative_images + test_positive_images,
            columns=['image_id', 'image_path', 'labels'],
        )
        .sort_values(by=['image_path'])
        .reset_index(drop=True)
    )

    train_pd.to_csv(train_csv_file_prefix + 'org.csv')
    test_pd.to_csv(test_csv_file_prefix + 'org.csv')

## Build the K-Fold Train/Test Sets

In [35]:
# One row per collected image (original train and test pooled together),
# with columns image_id, image_path and labels.
all_images_pd = pd.DataFrame(all_images, columns = ['image_id','image_path','labels'])

In [36]:
# Create the fold generator: label-stratified folds when `stratified` is set,
# otherwise plain shuffled K-fold. Both use the same fold count and random_state.
if stratified:
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=kfold_random_state)
    # A zero array of the right length is passed as a placeholder X;
    # the labels column is what is supplied as y.
    k_split_sets = skf.split(X=np.zeros(len(all_images_pd)), y=all_images_pd['labels'])
else:
    skf = KFold(n_splits=k, shuffle=True, random_state=kfold_random_state)
    k_split_sets = skf.split(X=all_images_pd)

# Write one train CSV and one test CSV per fold, named '<prefix><fold>_<k>.csv'
# with a 1-based fold number. Rows are sorted by image path and re-indexed.
for i_set, (train_indexes, test_indexes) in enumerate(k_split_sets):
    split_train_images_pd = (
        all_images_pd.iloc[train_indexes]
        .sort_values(by=['image_path'])
        .reset_index(drop=True)
    )
    split_test_images_pd = (
        all_images_pd.iloc[test_indexes]
        .sort_values(by=['image_path'])
        .reset_index(drop=True)
    )

    split_train_images_pd.to_csv(f'{train_csv_file_prefix}{i_set+1}_{k}.csv')
    split_test_images_pd.to_csv(f'{test_csv_file_prefix}{i_set+1}_{k}.csv')
