# Create Training/Test Sets

### Create Cross Validation

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import shutil

In [2]:
DATA_DIRECTORY = '../../data'
HANDS_DATA_DIRECTORY = os.path.join(DATA_DIRECTORY, 'hands')
NOT_HANDS_DATA_DIRECTORY = os.path.join(DATA_DIRECTORY, 'not-hands')

images_test_dir = os.path.join(DATA_DIRECTORY, 'hands_test')
images_train_dir = os.path.join(DATA_DIRECTORY, 'hands_train')

images_hands_train_dir = os.path.join(images_train_dir, 'hands')
images_not_hands_train_dir = os.path.join(images_train_dir, 'not_hands')
images_hands_test_dir = os.path.join(images_test_dir, 'hands')
images_not_hands_test_dir = os.path.join(images_test_dir, 'not_hands')

directories_to_create = [
    images_test_dir,
    images_train_dir,
    images_hands_train_dir,
    images_not_hands_train_dir,
    images_hands_test_dir,
    images_not_hands_test_dir,
]

In [3]:
def get_filenames_for_hands_dataset():
    files = os.listdir(HANDS_DATA_DIRECTORY)
    return files
def get_filenames_for_not_hands_dataset():
    files = os.listdir(NOT_HANDS_DATA_DIRECTORY)
    return files

In [4]:
def get_cross_validated_data():
    TRAIN_SIZE = .85
    hands_filenames = get_filenames_for_hands_dataset()
    not_hands_filenames = get_filenames_for_not_hands_dataset()
    
    hands_df = pd.DataFrame({
        'filenames': hands_filenames
    })
    hands_df['hands'] = True
    hands_df['dir'] = HANDS_DATA_DIRECTORY
    
    not_hands_df = pd.DataFrame({
        'filenames': not_hands_filenames
    })
    not_hands_df['hands'] = False
    not_hands_df['dir'] = NOT_HANDS_DATA_DIRECTORY
    
    df = pd.concat([hands_df, not_hands_df])

    X_train, X_test = train_test_split(df, train_size=TRAIN_SIZE)
    
    return X_train, X_test
print(get_cross_validated_data()[0][:10])

                 filenames  hands                   dir
908    n00015388_66265.jpg  False  ../../data/not-hands
5877      Hand_0006304.jpg   True      ../../data/hands
349    n00015388_30189.jpg  False  ../../data/not-hands
10529     Hand_0011175.jpg   True      ../../data/hands
9985      Hand_0010609.jpg   True      ../../data/hands
5272   n00433661_1288.jpeg  False  ../../data/not-hands
2015    n00288000_7785.jpg  False  ../../data/not-hands
1962      Hand_0002268.jpg   True      ../../data/hands
5430      Hand_0005836.jpg   True      ../../data/hands
6713      Hand_0007153.jpg   True      ../../data/hands




In [5]:
def make_sure_directories_exist():
    for directory in directories_to_create:
        try:
            os.mkdir(directory)
        except:
            print('{} already exists'.format([directory]))

def copy_files_to_new_directory():
    make_sure_directories_exist()
    
    X_train, X_test = get_cross_validated_data()

    train_filenames = X_train
    for index, row in train_filenames.iterrows():
        filename = row.filenames
        _dir = row.dir
        file_from = os.path.join(_dir, filename)
        file_to = ''
        if row.hands:
            file_to = os.path.join(images_hands_train_dir, filename)
        else:
            file_to = os.path.join(images_not_hands_train_dir, filename)        
        shutil.copyfile(file_from, file_to)

    test_filenames = X_test
    for index, row in test_filenames.iterrows():
        filename = row.filenames
        _dir = row.dir
        file_from = os.path.join(_dir, filename)
        file_to = ''
        if row.hands:
            file_to = os.path.join(images_hands_test_dir, filename)
        else:
            file_to = os.path.join(images_not_hands_test_dir, filename)        
        shutil.copyfile(file_from, file_to)

In [6]:
copy_files_to_new_directory()



['../../data/hands_test'] already exists
['../../data/hands_train'] already exists
