In [1]:
# Split the original ASL alphabet training images into separate train/test folders
from sklearn.model_selection import train_test_split
import os
import shutil

In [2]:
# Root of the data tree, relative to this notebook's location.
DATA_DIRECTORY = '../../data'

# Input dataset and the location where derived datasets are written.
ORIGINAL_DATA = os.path.join(DATA_DIRECTORY, 'original', 'asl-alphabet')
FABRICATED_DATA = os.path.join(DATA_DIRECTORY, 'fabricated', 'asl_alphabet')

# Source images: one sub-folder per sign (A, B, C, ...).
ORIGINAL_TRAIN_DIR = os.path.join(ORIGINAL_DATA, 'asl_alphabet_train')

# Destination roots for the new train/test split.
NEW_TRAIN_DIR = os.path.join(FABRICATED_DATA, 'split_asl_alphabet_train')
NEW_TEST_DIR = os.path.join(FABRICATED_DATA, 'split_asl_alphabet_test')

# Every directory that must contain one sub-folder per sign.
FABRICATED_DIRS = [NEW_TRAIN_DIR, NEW_TEST_DIR]

In [3]:
# Fraction of each sign's images that goes to the training set (passed as
# train_size to train_test_split); the remainder becomes the test set.
TEST_TRAIN_SPLIT = 0.95

In [4]:
# Go through training folder, get just the filenames of each folder. Split these files into test/training sets

In [5]:
# Sign folder names: A, B, C, D, DELETE, etc.
# Keep only real sub-directories: a stray file in the training directory
# (e.g. .DS_Store) would otherwise be treated as a sign and make the later
# per-sign os.listdir fail. Sorted so the order is stable across runs.
ASL_FOLDERS = sorted(
    entry for entry in os.listdir(ORIGINAL_TRAIN_DIR)
    if os.path.isdir(os.path.join(ORIGINAL_TRAIN_DIR, entry))
)

In [6]:
def create_test_folders_with_letter(letter, base_dirs=None):
    """Create the sub-folder for one sign inside every split directory.

    Args:
        letter: Name of the sign folder to create (e.g. 'A', 'DELETE').
        base_dirs: Optional iterable of parent directories. Defaults to
            FABRICATED_DIRS.

    Folders that already exist are left untouched, so the call can be
    repeated safely.
    """
    if base_dirs is None:
        base_dirs = FABRICATED_DIRS
    for base_dir in base_dirs:
        # exist_ok=True: a re-run must not abort on the first folder that is
        # already present and leave the remaining folders uncreated.
        os.makedirs(os.path.join(base_dir, letter), exist_ok=True)

def make_sure_folders_exist():
    """Create the per-sign destination folders for every entry in ASL_FOLDERS."""
    for folder_name in ASL_FOLDERS:
        create_test_folders_with_letter(folder_name)

In [7]:
def get_folder_for_batch_type(batch_type='train'):
    """Return the destination root for a batch type.

    'train' maps to NEW_TRAIN_DIR; any other value maps to NEW_TEST_DIR.
    """
    return NEW_TRAIN_DIR if batch_type == 'train' else NEW_TEST_DIR

In [8]:
def copy_files(sign, files, batch_type='train'):
    """Copy one sign's images from the original training set into a split folder.

    Args:
        sign: Name of the sign sub-folder (same name in source and destination).
        files: Iterable of file names (not full paths) inside the source folder.
        batch_type: 'train' copies into the new training directory; any other
            value copies into the new test directory.

    The destination folder must already exist; shutil.copyfile does not
    create missing directories.
    """
    source_dir = os.path.join(ORIGINAL_TRAIN_DIR, sign)
    # The destination depends only on sign and batch_type, so resolve it once
    # instead of once per file.
    target_dir = os.path.join(get_folder_for_batch_type(batch_type), sign)
    for filename in files:
        shutil.copyfile(
            os.path.join(source_dir, filename),
            os.path.join(target_dir, filename),
        )

In [9]:
# Create the destination folders before copying. Only filesystem errors
# (OSError, which includes "already exists") are caught, and the reason is
# printed; anything else, e.g. KeyboardInterrupt, propagates.
try:
    make_sure_folders_exist()
except OSError as err:
    print('Cannot create all folders: {}'.format(err))

In [10]:
# Take a random sample of each sign's images from the original dataset and
# copy them into the new train/test folders.
def split_data(random_state=42):
    """Split every sign folder into train/test subsets and copy the files.

    Args:
        random_state: Seed forwarded to train_test_split. A fixed seed makes
            the split reproducible, so re-running writes the same files to the
            same folders instead of adding a different random selection on top
            of the previous one (which could place one image in both sets).
            Pass None for a fresh random split.
    """
    for sign in ASL_FOLDERS:
        # Sort first: os.listdir order is arbitrary, and the seed only gives
        # the same split when the input order is the same.
        filenames = sorted(os.listdir(os.path.join(ORIGINAL_TRAIN_DIR, sign)))
        train, test = train_test_split(
            filenames,
            train_size=TEST_TRAIN_SPLIT,
            random_state=random_state,
        )
        copy_files(sign, train, 'train')
        copy_files(sign, test, 'test')

# Run the split; the IPython %time magic reports the CPU and wall time of the call.
%time split_data()



CPU times: user 5.96 s, sys: 14.9 s, total: 20.9 s
Wall time: 47.4 s


In [11]:
# Sanity check: count the images that landed in each split for the sign 'A'.
train_sample_dir = os.path.join(NEW_TRAIN_DIR, 'A')
test_sample_dir = os.path.join(NEW_TEST_DIR, 'A')

training_length = len(os.listdir(train_sample_dir))
testing_length = len(os.listdir(test_sample_dir))

summary = '{} images in training dir; and {} images in testing dir'
print(summary.format(training_length, testing_length))

2850 images in training dir; and 150 images in testing dir
