In [1]:
# Split the ASL alphabet images into separate train/test folders
from sklearn.model_selection import train_test_split
import os
import shutil

In [2]:
# Root of the data tree, relative to the notebook's working directory.
DATA_DIRECTORY = os.path.join('..', '..', 'data')

# Input locations: the folders whose sub-folders hold the images to split.
ORIGINAL_DATA = os.path.join(DATA_DIRECTORY, 'original', 'asl-alphabet')
ORIGINAL_TRAIN_DIR = os.path.join(ORIGINAL_DATA, 'asl_alphabet_train')
IMAGES_FROM_MOVIES_DIR = os.path.join(DATA_DIRECTORY, 'fabricated', 'movies')

# Output locations: where the train and test copies are written.
FABRICATED_DATA = os.path.join(DATA_DIRECTORY, 'fabricated', 'asl_alphabet')
NEW_TRAIN_DIR = os.path.join(FABRICATED_DATA, 'split_asl_alphabet_train')
NEW_TEST_DIR = os.path.join(FABRICATED_DATA, 'split_asl_alphabet_test')

# Every output root that needs one sub-folder per sign.
FABRICATED_DIRS = [NEW_TRAIN_DIR, NEW_TEST_DIR]

In [3]:
# Fraction of each sign's files that goes to the training set; it is passed
# as `train_size` when splitting, so the remainder becomes the test set.
TEST_TRAIN_SPLIT = 0.95
# When True, the split sizes are printed but no files are copied.
DRY_RUN = False

In [4]:
# Go through the training folder and collect the filenames inside each sign folder, then split those files into test/training sets.

In [5]:
# https://stackoverflow.com/questions/141291/how-to-list-only-top-level-directories-in-python
# Only the "folders" directly inside the given directory are wanted, not files.
def get_real_directories(directory):
    """Return the names of the immediate sub-directories of ``directory``.

    Only the top level is inspected; nested folders are not included.

    Raises:
        StopIteration: when ``os.walk`` yields nothing for ``directory``
            (for example because the path does not exist).
    """
    # The first tuple yielded by os.walk describes `directory` itself:
    # (path, sub-directory names, file names).
    top_level = next(os.walk(directory))
    _, subdirectory_names, _ = top_level
    return subdirectory_names


In [6]:
def get_sign_names_from_directory(directory):
    """Return the sign names (sub-folder names) found inside ``directory``.

    This is a best-effort lookup: when the directory cannot be listed, a
    message is printed and an empty list is returned instead of raising.

    Args:
        directory: Path of the folder whose sub-folders are named after signs.

    Returns:
        A list of sub-folder names, or ``[]`` if the directory is unavailable.
    """
    try:
        return get_real_directories(directory)
    except (StopIteration, OSError):
        # StopIteration: os.walk yielded nothing for the path (missing or
        # unreadable directory). OSError: any filesystem error raised while
        # listing. Other exceptions (KeyboardInterrupt, NameError, ...) are
        # deliberately not swallowed, so real bugs are not reported as a
        # missing directory.
        print('{} directory appears to not exist'.format(directory))
        return []

In [7]:
# Each sub-folder name is a sign label: A, B, C, D, DELETE, etc
TRAIN_NAMES = get_sign_names_from_directory(ORIGINAL_TRAIN_DIR)
MOVIE_NAMES = get_sign_names_from_directory(IMAGES_FROM_MOVIES_DIR)

# Full path of every sign folder, in the same order as the names above.
TRAIN_PATHS = [
    os.path.join(ORIGINAL_TRAIN_DIR, sign_name) for sign_name in TRAIN_NAMES
]
MOVIE_PATHS = [
    os.path.join(IMAGES_FROM_MOVIES_DIR, sign_name) for sign_name in MOVIE_NAMES
]

# Names and folders are kept as parallel lists (index i refers to the same sign).
ASL_SIGNS_NAMES = TRAIN_NAMES + MOVIE_NAMES
ASL_FOLDERS = TRAIN_PATHS + MOVIE_PATHS
print('{} total signs'.format(len(ASL_FOLDERS)))

30 total signs


In [8]:
def create_test_folders_with_letter(letter, directories=None):
    """Create a ``letter`` sub-folder inside every output directory.

    Folders that already exist are left untouched. Any other filesystem
    failure is reported on stdout and the remaining directories are still
    attempted, so one bad location does not stop the others.

    Args:
        letter: Sign name, used as the name of the sub-folder to create.
        directories: Parent directories to create the sub-folder in.
            Defaults to ``FABRICATED_DIRS``.
    """
    if directories is None:
        directories = FABRICATED_DIRS
    for test_dir in directories:
        test_dir_with_letter = os.path.join(test_dir, letter)
        try:
            # exist_ok=True makes re-runs a no-op for existing folders, so the
            # except clause below only sees genuine failures.
            os.makedirs(test_dir_with_letter, exist_ok=True)
        except OSError as error:
            print('Could not create {}: {}'.format(test_dir_with_letter, error))

def make_sure_folders_exist():
    """Create the output sub-folders for every name in ``ASL_SIGNS_NAMES``."""
    for sign_name in ASL_SIGNS_NAMES:
        create_test_folders_with_letter(sign_name)

In [9]:
def get_folder_for_batch_type(batch_type='train'):
    """Map a batch type to its output root directory.

    ``'train'`` selects ``NEW_TRAIN_DIR``; every other value selects
    ``NEW_TEST_DIR``.
    """
    return NEW_TRAIN_DIR if batch_type == 'train' else NEW_TEST_DIR

In [10]:
def copy_files(sign, infolder, files, batch_type='train'):
    """Copy ``files`` from ``infolder`` into the ``sign`` folder of one split.

    Args:
        sign: Sign name; the destination sub-folder inside the split root.
        infolder: Directory that currently contains ``files``.
        files: Iterable of file names (not paths) to copy.
        batch_type: ``'train'`` or any other value for the test split, as
            resolved by ``get_folder_for_batch_type``.
    """
    # The destination depends only on the arguments, so resolve it once
    # instead of on every iteration.
    outdir = os.path.join(get_folder_for_batch_type(batch_type), sign)
    # Make sure the destination exists so the copy does not fail when the
    # folder was not (or could not be) created beforehand.
    os.makedirs(outdir, exist_ok=True)
    for filename in files:
        infile = os.path.join(infolder, filename)
        outfile = os.path.join(outdir, filename)
        shutil.copyfile(infile, outfile)

In [11]:
# Create the train/test output folder for every sign before any file is copied.
# Only filesystem errors are reported here; anything else (for example a name
# that is undefined because cells were run out of order) is allowed to surface
# instead of being reported as a folder problem.
try:
    make_sure_folders_exist()
except OSError as error:
    print('Cannot create all folders: {}'.format(error))

In [12]:
# take random samples from the original dataset and create test/train images out of this
def split_data(random_state=42):
    """Split every sign folder into train/test files and copy them out.

    For each (sign, folder) pair from ``ASL_SIGNS_NAMES`` / ``ASL_FOLDERS``
    the regular files in the folder are split with
    ``train_size=TEST_TRAIN_SPLIT`` and, unless ``DRY_RUN`` is set, copied to
    the train and test output folders. One summary row per sign is printed.

    Args:
        random_state: Seed passed to ``train_test_split``. With a fixed seed
            and sorted filenames, re-running over unchanged input folders
            reproduces the same split, so a re-run cannot copy a file that
            already sits in the train output into the test output (or vice
            versa). Pass ``None`` for a different random split on each call.
    """
    if DRY_RUN:
        print('This is just a dry run - will not create files')
        print()
    print('# train | # test | sign')
    print('-----------------------')
    for sign, input_folder_path in zip(ASL_SIGNS_NAMES, ASL_FOLDERS):
        # os.listdir returns entries in arbitrary order, so sort them to let
        # the seed alone determine the split. Sub-directories (for example
        # .ipynb_checkpoints) are skipped because the copy step uses
        # shutil.copyfile, which only handles regular files.
        filenames = sorted(
            name for name in os.listdir(input_folder_path)
            if os.path.isfile(os.path.join(input_folder_path, name))
        )
        train, test = train_test_split(
            filenames,
            train_size=TEST_TRAIN_SPLIT,
            random_state=random_state,
        )
        print('{}    | {}    | {}'.format(len(train), len(test), sign))
        if not DRY_RUN:
            copy_files(sign, input_folder_path, train, 'train')
            copy_files(sign, input_folder_path, test, 'test')

# Run the split once; the IPython %time magic reports how long the call took.
%time split_data()

# train | # test | sign
-----------------------
2850    | 150    | R




2850    | 150    | U
2850    | 150    | I
2850    | 150    | N
2850    | 150    | G
2850    | 150    | Z
2850    | 150    | T
2850    | 150    | S
2850    | 150    | A
2850    | 150    | F
2850    | 150    | O
2850    | 150    | H
2850    | 150    | del
2850    | 150    | nothing
2850    | 150    | space
2850    | 150    | M
2850    | 150    | J
2850    | 150    | C
2850    | 150    | D
2850    | 150    | V
2850    | 150    | Q
2850    | 150    | X
2850    | 150    | E
2850    | 150    | B
2850    | 150    | K
2850    | 150    | L
2850    | 150    | Y
2850    | 150    | P
2850    | 150    | W
137    | 8    | mother
CPU times: user 6.73 s, sys: 20.8 s, total: 27.6 s
Wall time: 55.9 s
