# This will take the original training set (and images scraped from movies) and create a test/training set

In [1]:
# Imports used to sample image files and split them into train/test folders
from sklearn.model_selection import train_test_split
import os
import shutil
import random


In [2]:
# Root of the data tree, relative to this notebook
DATA_DIRECTORY = os.path.join('..', '..', 'data')

# Inputs: the original dataset and the images scraped from movies
ORIGINAL_DATA = os.path.join(DATA_DIRECTORY, 'original', 'asl-alphabet')
ORIGINAL_TRAIN_DIR = os.path.join(ORIGINAL_DATA, 'asl_alphabet_train')
IMAGES_FROM_MOVIES_DIR = os.path.join(DATA_DIRECTORY, 'fabricated', 'movies')

# Outputs: the generated train/test folders
FABRICATED_DATA = os.path.join(DATA_DIRECTORY, 'fabricated', 'asl_alphabet')
NEW_TRAIN_DIR = os.path.join(FABRICATED_DATA, 'split_asl_alphabet_train')
NEW_TEST_DIR = os.path.join(FABRICATED_DATA, 'split_asl_alphabet_test')

# Every folder this notebook creates (and deletes when DELETE_FIRST is set)
FABRICATED_DIRS = [NEW_TRAIN_DIR, NEW_TEST_DIR]

In [3]:
# Fraction of each sign's sampled files that goes to the training set
# (despite the name, this is the train size; the remainder is the test set)
TEST_TRAIN_SPLIT = 0.95

# When True, split_data only prints the per-sign counts and copies nothing
DRY_RUN = False

# Upper bound on files sampled per sign; set to None to use every file
MAX_PER_CLASS = 300

# Delete new folders before starting
DELETE_FIRST = True

# Which image sources are included in the split
TRAIN_WITH_ORIGINAL_IMAGES = False
TRAIN_WITH_MOVIES_IMAGES = True

In [4]:
# Go through training folder, get just the filenames of each folder. Split these files into test/training sets

In [5]:
# We only want the "folders" sitting directly inside the given directory, not its files.
# See https://stackoverflow.com/questions/141291/how-to-list-only-top-level-directories-in-python
def get_real_directories(directory):
    """Return the names of the sub-directories directly inside `directory`.

    Only the first item produced by os.walk is consumed, so nested folders
    are never visited. If os.walk produces nothing, next() raises
    StopIteration.
    """
    _top, subfolder_names, _file_names = next(os.walk(directory))
    return subfolder_names


In [6]:
def get_sign_names_from_directory(directory):
    """Return the sign names (sub-folder names) found in `directory`.

    Prints a notice and returns an empty list when the directory cannot be
    listed, so that a missing image source simply contributes no signs.
    """
    try:
        return get_real_directories(directory)
    except (StopIteration, OSError):
        # get_real_directories calls next() on os.walk; os.walk ignores
        # listing errors by default, so a missing directory surfaces as
        # StopIteration rather than as an OSError. Catching only these two
        # keeps KeyboardInterrupt and genuine programming errors visible.
        print('{} directory appears to not exist'.format(directory))
        return []

In [7]:
# Sign names are the sub-folder names of each image source: A, B, C, D, DELETE, etc
TRAIN_NAMES = get_sign_names_from_directory(ORIGINAL_TRAIN_DIR)
MOVIE_NAMES = get_sign_names_from_directory(IMAGES_FROM_MOVIES_DIR)

# Full path of every sign folder, in the same order as the names above
TRAIN_PATHS = [os.path.join(ORIGINAL_TRAIN_DIR, sign) for sign in TRAIN_NAMES]
MOVIE_PATHS = [os.path.join(IMAGES_FROM_MOVIES_DIR, sign) for sign in MOVIE_NAMES]

# ASL_FOLDERS[i] is the input folder for ASL_SIGNS_NAMES[i]; both lists are
# always extended together so they stay index-aligned.
ASL_FOLDERS = []
ASL_SIGNS_NAMES = []

if TRAIN_WITH_ORIGINAL_IMAGES:
    print('Training with original images')
    ASL_FOLDERS.extend(TRAIN_PATHS)
    ASL_SIGNS_NAMES.extend(TRAIN_NAMES)

if TRAIN_WITH_MOVIES_IMAGES:
    print('Training with images from movies')
    ASL_FOLDERS.extend(MOVIE_PATHS)
    ASL_SIGNS_NAMES.extend(MOVIE_NAMES)

print('{} total signs'.format(len(ASL_FOLDERS)))

Training with images from movies
41 total signs


In [8]:
def create_test_folders_with_letter(letter):
    """Create the `letter` sub-folder inside every directory in FABRICATED_DIRS.

    Folders that already exist are left alone. Creation is best-effort: a
    failure for one folder is reported and the remaining folders are still
    attempted.
    """
    for base_dir in FABRICATED_DIRS:
        test_dir_with_letter = os.path.join(base_dir, letter)
        try:
            # exist_ok: an already-present folder is the normal case on a
            # re-run, not an error worth raising.
            os.makedirs(test_dir_with_letter, exist_ok=True)
        except OSError as err:
            # Keep going, but do not hide real problems (permissions, a file
            # occupying the path, ...) the way a bare except would.
            print('Could not create {}: {}'.format(test_dir_with_letter, err))

def make_sure_folders_exist():
    """Create the output folders for every sign listed in ASL_SIGNS_NAMES.

    Delegates the actual folder creation to create_test_folders_with_letter,
    one sign name at a time.
    """
    for sign_name in ASL_SIGNS_NAMES:
        create_test_folders_with_letter(sign_name)

In [9]:
def get_folder_for_batch_type(batch_type='train'):
    """Map a batch type to its output root folder.

    Returns NEW_TRAIN_DIR for 'train'; any other value yields NEW_TEST_DIR.
    """
    return NEW_TRAIN_DIR if batch_type == 'train' else NEW_TEST_DIR

In [10]:
def copy_files(sign, infolder, files, batch_type='train'):
    """Copy `files` from `infolder` into the output folder for `sign`.

    Args:
        sign: Name of the sign; used as the sub-folder name in the output.
        infolder: Directory that currently holds the files.
        files: File names (not paths) inside `infolder` to copy.
        batch_type: 'train' or 'test'; selects the output root via
            get_folder_for_batch_type.
    """
    # The destination depends only on sign and batch_type, so resolve it once
    # instead of once per file.
    outdir = os.path.join(get_folder_for_batch_type(batch_type), sign)
    # Make sure the destination exists so the copy does not fail with
    # FileNotFoundError when the folder was not created beforehand.
    os.makedirs(outdir, exist_ok=True)
    for filename in files:
        infile = os.path.join(infolder, filename)
        outfile = os.path.join(outdir, filename)
        shutil.copyfile(infile, outfile)

In [11]:
def delete_folders(folders):
    """Recursively delete every directory in `folders`.

    A folder that does not exist is skipped, so one missing folder no longer
    prevents the remaining folders from being deleted. Other errors (for
    example permission problems) still propagate.
    """
    for folder in folders:
        try:
            shutil.rmtree(folder)
        except FileNotFoundError:
            # Nothing to delete here; carry on with the remaining folders.
            continue

In [12]:
# Start from a clean slate so files from a previous run cannot leak into the
# new train/test folders.
if DELETE_FIRST:
    try:
        delete_folders(FABRICATED_DIRS)
    except FileNotFoundError:
        # Only a missing folder is harmless here. Any other failure would
        # leave old files behind, so it is allowed to surface instead of
        # being reported as "folders may not exist".
        print('It looks like the folders may not exist (no need to delete)')

In [13]:
# Create one output folder per sign. Per-folder failures are already handled
# inside create_test_folders_with_letter, so this guard only covers
# unexpected OS-level errors; KeyboardInterrupt and programming errors are
# deliberately not swallowed.
try:
    make_sure_folders_exist()
except OSError:
    print('Cannot create all folders')

In [14]:
def get_random_n_items_from_list(items, numer_of_items):
    """Pick `numer_of_items` distinct random elements from `items`.

    When `items` holds fewer elements than requested, the list itself is
    returned unchanged (same object, original order). Otherwise the result
    is a new list produced by random.sample.
    """
    has_enough = len(items) >= numer_of_items
    return random.sample(items, numer_of_items) if has_enough else items

In [15]:
def retrieve_filenames(directory):
    """Return the names of the files in `directory`, capped at MAX_PER_CLASS.

    Only regular files are returned. When MAX_PER_CLASS is None every file
    name is returned; otherwise a random sample of at most MAX_PER_CLASS
    names is returned.
    """
    # os.listdir also lists sub-directories, which shutil.copyfile (used by
    # copy_files) cannot copy, so keep regular files only.
    all_filenames = [
        entry for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]
    if MAX_PER_CLASS is None:
        return all_filenames
    return get_random_n_items_from_list(all_filenames, MAX_PER_CLASS)

In [16]:
# take random samples from the input folders and create test/train images out of this
def split_data():
    """Split every sign's files into train/test sets and copy them over.

    For each (sign, folder) pair from ASL_SIGNS_NAMES / ASL_FOLDERS, the file
    names returned by retrieve_filenames are split with train_size set to
    TEST_TRAIN_SPLIT. One table row with the resulting counts is printed per
    sign. When DRY_RUN is set, only the table is printed and nothing is
    copied.
    """
    if DRY_RUN:
        print('This is just a dry run - will not create files')
        print()
    print('# train | # test | sign')
    print('-----------------------')
    for sign, input_folder_path in zip(ASL_SIGNS_NAMES, ASL_FOLDERS):
        filenames = retrieve_filenames(input_folder_path)
        if len(filenames) < 2:
            # train_test_split cannot divide fewer than two samples into two
            # non-empty sets and raises ValueError, which would abort the
            # whole run midway. Keep what little there is for training.
            train, test = list(filenames), []
        else:
            train, test = train_test_split(filenames, train_size=TEST_TRAIN_SPLIT)
        print('{}    | {}    | {}'.format(len(train), len(test), sign))
        if not DRY_RUN:
            copy_files(sign, input_folder_path, train, 'train')
            copy_files(sign, input_folder_path, test, 'test')

%time split_data()

# train | # test | sign
-----------------------
166    | 9    | father
168    | 9    | R




166    | 9    | U
208    | 11    | 9
164    | 9    | 7
198    | 11    | I
116    | 7    | N
107    | 6    | G
142    | 8    | 6
199    | 11    | Z
97    | 6    | 1
171    | 9    | 10
199    | 11    | 8
151    | 8    | T
199    | 11    | S
173    | 10    | A
149    | 8    | F
128    | 7    | O
187    | 10    | H
139    | 8    | me
126    | 7    | my
285    | 15    | nothing
275    | 15    | mother
240    | 13    | M
225    | 12    | J
194    | 11    | C
154    | 9    | D
198    | 11    | V
160    | 9    | Q
141    | 8    | 4
171    | 9    | X
208    | 11    | 3
154    | 9    | E
175    | 10    | B
192    | 11    | K
217    | 12    | L
124    | 7    | 2
116    | 7    | Y
109    | 6    | 5
128    | 7    | P
128    | 7    | W
CPU times: user 704 ms, sys: 2.28 s, total: 2.98 s
Wall time: 8.27 s
