# Create Training/Test Sets

This is an older notebook where we tried to predict if there was a hand present in an image. It turns out this really was not very helpful.

Also, note that this notebook uses [11K Images of hands](https://sites.google.com/view/11khands) which is not used in the other notebooks.

### Create Cross Validation

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import shutil

In [2]:
DATA_DIRECTORY = '../../data'

ORIGINAL_DATA = os.path.join(DATA_DIRECTORY, 'original')
FABRICATED_DATA = os.path.join(DATA_DIRECTORY, 'fabricated')

STATIC_SIGNS_DIRECTORY = os.path.join(ORIGINAL_DATA, 'sign-language-static/user_3')
HANDS_DATA_DIRECTORY = os.path.join(ORIGINAL_DATA, 'hands')
NOT_HANDS_DATA_DIRECTORY = os.path.join(FABRICATED_DATA, 'not-hands')

IMAGES_VALIDATION_DIR = os.path.join(FABRICATED_DATA, 'hands_validation')
IMAGES_TRAIN_DIR = os.path.join(FABRICATED_DATA, 'hands_train')
IMAGES_TEST_DIR = os.path.join(FABRICATED_DATA, 'hands_test')

IMAGES_HANDS_TRAIN_DIR = os.path.join(IMAGES_TRAIN_DIR, 'hands')
IMAGES_NOT_HANDS_TRAIN_DIR = os.path.join(IMAGES_TRAIN_DIR, 'not_hands')
IMAGES_HANDS_VALIDATION_DIR = os.path.join(IMAGES_VALIDATION_DIR, 'hands')
IMAGES_NOT_HANDS_VALIDATION_DIR = os.path.join(IMAGES_VALIDATION_DIR, 'not_hands')
IMAGES_HANDS_TEST_DIR = os.path.join(IMAGES_TEST_DIR, 'hands')
IMAGES_NOT_HANDS_TEST_DIR = os.path.join(IMAGES_TEST_DIR, 'not_hands')

directories_to_create = [
    IMAGES_VALIDATION_DIR,
    IMAGES_TRAIN_DIR,
    IMAGES_TEST_DIR,
    IMAGES_HANDS_TRAIN_DIR,
    IMAGES_NOT_HANDS_TRAIN_DIR,
    IMAGES_HANDS_VALIDATION_DIR,
    IMAGES_NOT_HANDS_VALIDATION_DIR,
    IMAGES_HANDS_TEST_DIR,
    IMAGES_NOT_HANDS_TEST_DIR,
]

In [3]:
def get_filenames_for_hands_dataset():
    files = os.listdir(HANDS_DATA_DIRECTORY)
    return files
def get_filenames_for_not_hands_dataset():
    files = os.listdir(NOT_HANDS_DATA_DIRECTORY)
    return files
def get_filenames_for_signs_dataset(number_items_to_pull):
    files = os.listdir(STATIC_SIGNS_DIRECTORY)
    return files[:number_items_to_pull]

In [4]:

def get_cross_validated_data():
    TRAIN_SIZE = .85
    hands_filenames = get_filenames_for_hands_dataset()
    all_not_hands_filenames = get_filenames_for_not_hands_dataset()
    
    # only take the number of "hands" images so taht the they are equal - use the rest for test data
    number_of_not_hands_to_pull = len(hands_filenames)
    not_hands_filenames = all_not_hands_filenames[:number_of_not_hands_to_pull]
    
    # Create test data
    not_hands_test = all_not_hands_filenames[-number_of_not_hands_to_pull:]
    print('Using {} of {} "not-hands" files for test'.format(len(not_hands_filenames), len(not_hands_test)))
    hands_test = get_filenames_for_signs_dataset(number_of_not_hands_to_pull)
    test_hands_df = pd.DataFrame({
        'filenames': hands_test
    })
    test_hands_df['hands'] = True
    test_hands_df['dir'] = STATIC_SIGNS_DIRECTORY
    
    test_not_hands_df = pd.DataFrame({
        'filenames': not_hands_test
    })
    test_not_hands_df['hands'] = False
    test_not_hands_df['dir'] = NOT_HANDS_DATA_DIRECTORY
    
    X_test = pd.concat([test_hands_df, test_not_hands_df])
    

    # Create Train/Validation Data
    hands_df = pd.DataFrame({
        'filenames': hands_filenames
    })
    hands_df['hands'] = True
    hands_df['dir'] = HANDS_DATA_DIRECTORY
    
    not_hands_df = pd.DataFrame({
        'filenames': not_hands_filenames
    })
    not_hands_df['hands'] = False
    not_hands_df['dir'] = NOT_HANDS_DATA_DIRECTORY
    
    df = pd.concat([hands_df, not_hands_df])

    X_train, X_validation = train_test_split(df, train_size=TRAIN_SIZE)
    
    return X_train, X_validation, X_test
print(get_cross_validated_data()[2][:10])

Using 11076 of 11076 "not-hands" files for test
  filenames  hands                                              dir
0    A0.jpg   True  ../../data/original/sign-language-static/user_3
1    A1.jpg   True  ../../data/original/sign-language-static/user_3
2    A2.jpg   True  ../../data/original/sign-language-static/user_3
3    A3.jpg   True  ../../data/original/sign-language-static/user_3
4    A4.jpg   True  ../../data/original/sign-language-static/user_3
5    A5.jpg   True  ../../data/original/sign-language-static/user_3
6    A6.jpg   True  ../../data/original/sign-language-static/user_3
7    A7.jpg   True  ../../data/original/sign-language-static/user_3
8    A8.jpg   True  ../../data/original/sign-language-static/user_3
9    A9.jpg   True  ../../data/original/sign-language-static/user_3




In [5]:
def make_sure_directories_exist():
    for directory in directories_to_create:
        try:
            os.mkdir(directory)
        except:
            print('{} already exists'.format([directory]))
            
def copy_files_per_batch(batch_type, filenames):
    file_to = ''
    for index, row in filenames.iterrows():
        filename = row.filenames
        _dir = row.dir
        file_from = os.path.join(_dir, filename)
        if row.hands and batch_type == 'train':
            file_to = os.path.join(IMAGES_HANDS_TRAIN_DIR, filename)
        elif row.hands and batch_type == 'validation':
            file_to = os.path.join(IMAGES_HANDS_VALIDATION_DIR, filename)
        elif row.hands and batch_type == 'test':
            file_to = os.path.join(IMAGES_HANDS_TEST_DIR, filename)
        elif not row.hands and batch_type == 'train':
            file_to = os.path.join(IMAGES_NOT_HANDS_TRAIN_DIR, filename)
        elif not row.hands and batch_type == 'validation':
            file_to = os.path.join(IMAGES_NOT_HANDS_VALIDATION_DIR, filename)
        elif not row.hands and batch_type == 'test':
            file_to = os.path.join(IMAGES_NOT_HANDS_TEST_DIR, filename)
        shutil.copyfile(file_from, file_to)

def copy_files_to_new_directory():
    make_sure_directories_exist()
    
    X_train, X_test, X_validation = get_cross_validated_data()
    
    copy_files_per_batch('train', X_train)
    copy_files_per_batch('validation', X_validation)
    copy_files_per_batch('test', X_test)

In [6]:
copy_files_to_new_directory()

Using 11076 of 11076 "not-hands" files for test


