# Creation of an all-in-one dataset

This notebook combines the cut-out training and test images from all regions with validated, labeled data into a single dataset with one folder each for training, validation and test images.

The validation set is _randomly sampled_ from the training data.

In [1]:
from os import makedirs
from os.path import exists, join, isfile
from os import listdir
import numpy as np

In [2]:
# Region names; each one is used as a sub-folder name below the data directory.
regions = [
    'borde_rural',
    'borde_soacha',
    'mixco_3',
    'mixco_1_and_ebenezer',
    'dennery',
]

Define the paths

In [3]:
# Source folder holding the per-region data.
data_dir = join('..', '..', 'data', 'data2')

# Destination folders of the combined dataset, one per split.
train_dir, valid_dir, test_dir = (
    join('..', '..', 'data', 'all_in_one', split_name)
    for split_name in ('train', 'valid', 'test')
)

In [4]:
# Maps each material name to an integer id (ids follow the listing order).
# The names are also used as sub-folder names in the cells below.
materials = {
    material_name: material_id
    for material_id, material_name in enumerate([
        'concrete_cement',
        'healthy_metal',
        'incomplete',
        'irregular_metal',
        'other',
    ])
}

Create the necessary folders

In [5]:
from shutil import copy

In [None]:
# Create one sub-folder per material below the training and validation folders.
# exist_ok=True makes the cell safe to re-run without a separate exists() check.
for folder in [train_dir, valid_dir]:
    for mat in materials.keys():
        makedirs(join(folder, mat), exist_ok=True)

# The test images are copied into test_dir itself (no material sub-folders).
# The folder has to exist beforehand: shutil.copy() treats a non-existent
# destination as a *file name*, so every test image would otherwise overwrite
# one single file called "test".
makedirs(test_dir, exist_ok=True)

Copy the training files into the right directory: for every region and material, a random share of the files (`validation_ratio`) goes into the validation folder and the rest goes into the training folder.

In [None]:
# Share of every region/material folder that is held out for validation.
validation_ratio = 0.3

# Dedicated, seeded generator: together with the sorted file listing below it
# makes the split reproducible, so re-running this cell copies every image to
# the same destination again instead of leaking images between train and valid.
rng = np.random.RandomState(42)

for mat in materials.keys():
    train_mat_dir = join(train_dir, mat)
    valid_mat_dir = join(valid_dir, mat)

    for region in regions:
        region_mat_dir = join(data_dir, region, 'roofs_train', mat)
        print(region_mat_dir)

        # Keep regular files only and sort them, because listdir() returns
        # entries in arbitrary order and may include sub-directories.
        all_files = sorted(
            name for name in listdir(region_mat_dir)
            if isfile(join(region_mat_dir, name))
        )
        n_valid = int(len(all_files) * validation_ratio)

        # Shuffle once and slice. This samples WITHOUT replacement, so the
        # validation set has exactly n_valid distinct files and is disjoint
        # from the training set by construction. (np.random.choice defaults to
        # replace=True, which yields duplicates and a smaller validation set.)
        shuffled = rng.permutation(all_files).tolist()
        valid_files = shuffled[:n_valid]
        train_files = shuffled[n_valid:]

        for file_name in valid_files:
            copy(join(region_mat_dir, file_name), valid_mat_dir)

        for file_name in train_files:
            copy(join(region_mat_dir, file_name), train_mat_dir)

Do the same for the test files: copy them from every region into the single test folder.

In [6]:
# Make sure the destination folder exists before copying: shutil.copy() treats
# a non-existent destination as a *file name*, so every test image would
# otherwise overwrite one single file called "test".
makedirs(test_dir, exist_ok=True)

# Copy the test images of every region into the one shared test folder.
# NOTE(review): files with the same name in different regions would overwrite
# each other here - confirm that file names are unique across regions.
for region in regions:
    region_dir = join(data_dir, region, 'roofs_test')
    print(region_dir)
    all_files = listdir(region_dir)

    for file_name in all_files:
        full_file_name = join(region_dir, file_name)
        # Skip sub-directories and other non-file entries.
        if isfile(full_file_name):
            copy(full_file_name, test_dir)

../../data/data2/borde_rural/roofs_test
../../data/data2/borde_soacha/roofs_test
../../data/data2/mixco_3/roofs_test
../../data/data2/mixco_1_and_ebenezer/roofs_test
../../data/data2/dennery/roofs_test
