# Code for creating test/train splits in different folders

To keep the data organised, the training and testing images are saved in separate folders (`./Training/` and `./Testing/`), each containing a `PHASE` and a `DAPI` subfolder.

Imports: 

In [46]:
import os
import random
from random import Random
from random import shuffle
from math import floor
from shutil import copyfile


## 1. Get a list of the files
(thanks, https://stackoverflow.com/questions/42471570/how-to-split-documents-into-training-set-and-test-set)

In [47]:
def get_file_list_from_dir(datadir, extension='.png'):
    """Return the names of the entries in `datadir` that end with `extension`.

    Parameters
    ----------
    datadir : str
        Directory to scan (top level only, not recursive). Relative paths
        are resolved against the current working directory.
    extension : str, optional
        Suffix an entry name must end with to be kept. Defaults to '.png'.

    Returns
    -------
    list of str
        Bare entry names (no directory part), sorted alphabetically.

    Raises
    ------
    OSError
        Propagated from `os.listdir` when `datadir` is missing or is not a
        directory.
    """
    all_files = os.listdir(os.path.abspath(datadir))
    # os.listdir returns entries in arbitrary order; sorting here makes the
    # result deterministic so a later seeded shuffle is reproducible.
    return sorted(name for name in all_files if name.endswith(extension))

# Gather the PNG file names of both image channels.
phase_list = get_file_list_from_dir('./PHASE/')
dapi_list = get_file_list_from_dir('./DAPI/')

# Report how many files each channel contains (PHASE first, then DAPI).
for channel_files in (phase_list, dapi_list):
    print(len(channel_files))

# Put both lists into alphabetical order so they start from a known state.
phase_list.sort()
dapi_list.sort()

# Peek at the first few names of each channel.
for channel_files in (phase_list, dapi_list):
    print(channel_files[:3])


12288
12288
['PHASE000001.png', 'PHASE000002.png', 'PHASE000003.png']
['DAPI000001.png', 'DAPI000002.png', 'DAPI000003.png']


## 2. Randomize the files

In [48]:
seed = 42

# Start from the sorted order every time: shuffle() works in place, so without
# this re-sort, running the cell a second time would shuffle the already
# shuffled lists and produce a different split.
phase_list = sorted(phase_list)
dapi_list = sorted(dapi_list)

# Two generators with the same seed apply the same permutation only to lists
# of the same length; otherwise the PHASE/DAPI pairing would silently break.
assert len(phase_list) == len(dapi_list), (
    "PHASE and DAPI lists differ in length; "
    "shuffling with the same seed would not keep them paired"
)

random.Random(seed).shuffle(phase_list)
random.Random(seed).shuffle(dapi_list)

print(phase_list[:3])
print(dapi_list[:3])

['PHASE008950.png', 'PHASE005442.png', 'PHASE007077.png']
['DAPI008950.png', 'DAPI005442.png', 'DAPI007077.png']


## 3. Split into training and test sets 

In [49]:
def get_training_and_testing_sets(file_list, split=0.8):
    """Cut `file_list` into a leading training part and a trailing testing part.

    The list is not shuffled here; the order of `file_list` is preserved in
    both returned parts.

    Parameters
    ----------
    file_list : list
        Items to split.
    split : float, optional
        Fraction of the items that goes into the training part, between 0
        and 1 inclusive. Defaults to 0.8.

    Returns
    -------
    tuple of (list, list)
        `(training, testing)`, where `training` holds the first
        `floor(len(file_list) * split)` items and `testing` holds the rest.

    Raises
    ------
    ValueError
        If `split` is outside the range [0, 1].
    """
    if not 0 <= split <= 1:
        raise ValueError("split must be between 0 and 1, got %r" % (split,))
    split_index = floor(len(file_list) * split)
    training = file_list[:split_index]
    testing = file_list[split_index:]
    return training, testing

# Apply the same split to both channels; the lists were shuffled with the
# same seed, so matching positions refer to matching images.
phase_train, phase_test = get_training_and_testing_sets(phase_list)
dapi_train, dapi_test = get_training_and_testing_sets(dapi_list)

# Show the first three names of every subset as a quick sanity check.
for subset in (phase_train, phase_test, dapi_train, dapi_test):
    print(subset[:3])

['PHASE008950.png', 'PHASE005442.png', 'PHASE007077.png']
['PHASE000612.png', 'PHASE008768.png', 'PHASE011036.png']
['DAPI008950.png', 'DAPI005442.png', 'DAPI007077.png']
['DAPI000612.png', 'DAPI008768.png', 'DAPI011036.png']


## 4. Find these files and save them into their own directories

In [50]:
def get_and_copy_files(file_list, old_dir, new_dir):
    """Copy every file named in `file_list` from `old_dir` into `new_dir`.

    Parameters
    ----------
    file_list : iterable of str
        Bare file names (no directory part) to copy.
    old_dir : str
        Directory that currently holds the files.
    new_dir : str
        Directory to copy the files into. It is created, including any
        missing parent directories, if it does not exist yet.

    Raises
    ------
    OSError
        Propagated from `os.makedirs` or `shutil.copyfile`, e.g. when a
        source file is missing.
    """
    # copyfile() does not create missing directories, so make sure the
    # destination exists first. An empty string means "current directory".
    if new_dir:
        os.makedirs(new_dir, exist_ok=True)
    for name in file_list:
        # os.path.join works whether or not the directories end with a slash.
        src = os.path.join(old_dir, name)
        dst = os.path.join(new_dir, name)
        copyfile(src, dst)


# Each entry: (file names, directory they come from, directory they go to).
copy_jobs = [
    (phase_train, './PHASE/', './Training/PHASE/'),
    (phase_test, './PHASE/', './Testing/PHASE/'),
    (dapi_train, './DAPI/', './Training/DAPI/'),
    (dapi_test, './DAPI/', './Testing/DAPI/'),
]

# Copy every subset into its own folder, in the order listed above.
for subset_files, source_dir, target_dir in copy_jobs:
    get_and_copy_files(subset_files, source_dir, target_dir)


