In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import keras
import os
from shutil import copyfile
from PIL import Image

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Root of the data tree; a relative path, so it is resolved against the
# current working directory of the kernel.
DATA_DIRECTORY = '../../data'

# Two sub-trees of the data root used by this notebook.
ORIGINAL_DATA = os.path.join(DATA_DIRECTORY, 'original')
FABRICATED_DATA = os.path.join(DATA_DIRECTORY, 'fabricated')

# HANDS_DATA_DIRECTORY and NOT_HANDS_ORIGINAL_DATA_DIRECTORY are only read
# (listed / copied from); NOT_HANDS_DATA_DIRECTORY is the folder this notebook
# creates and copies the filtered not-hands images into.
HANDS_DATA_DIRECTORY = os.path.join(ORIGINAL_DATA, 'hands')
NOT_HANDS_ORIGINAL_DATA_DIRECTORY = os.path.join(ORIGINAL_DATA, 'not-hands-original')
NOT_HANDS_DATA_DIRECTORY = os.path.join(FABRICATED_DATA, 'not-hands')

# Create not-hands Directory

In [3]:
def get_filenames_for_hands_dataset():
    """Return the entry names found directly inside HANDS_DATA_DIRECTORY.

    The list is exactly what ``os.listdir`` yields: bare names without the
    directory prefix, in no particular order.
    """
    return os.listdir(HANDS_DATA_DIRECTORY)

def get_filenames_for_not_hands_original_dataset():
    """Return the entry names found directly inside NOT_HANDS_ORIGINAL_DATA_DIRECTORY.

    The list is exactly what ``os.listdir`` yields: bare names without the
    directory prefix, in no particular order.
    """
    return os.listdir(NOT_HANDS_ORIGINAL_DATA_DIRECTORY)


In [4]:
def can_load_image(filepath):
    """Report whether PIL can open the file at ``filepath`` as an image.

    Only ``Image.open`` is attempted; no further decoding or verification is
    requested, so ``True`` means "PIL accepted the file", nothing stronger.

    Args:
        filepath: Path of the candidate image file.

    Returns:
        True if ``Image.open`` succeeded, False if it raised any ordinary
        exception (missing file, unrecognised format, ...).
    """
    try:
        # The context manager closes the opened image as soon as the check is
        # done, so scanning thousands of files does not pile up open handles.
        with Image.open(filepath):
            return True
    except Exception:
        # Any ordinary failure means "not loadable". Catching Exception rather
        # than using a bare except lets KeyboardInterrupt / SystemExit through,
        # so a long scan can still be interrupted.
        return False

def large_enough(filepath, threshold_bytes=2051):
    """Report whether the file at ``filepath`` is big enough to be usable.

    A file passes only when its size on disk is strictly greater than
    ``threshold_bytes``.

    NOTE(review): the previous comment here said "at least 2,000 bytes" while
    the check has always been ``> 2051``. The code's value is kept as the
    default so results are unchanged; confirm which figure was intended.

    Args:
        filepath: Path of the file to measure.
        threshold_bytes: Size in bytes that the file must exceed. Defaults to
            2051, the value previously hard-coded.

    Returns:
        True if the file size exceeds ``threshold_bytes``, otherwise False.

    Raises:
        OSError: If the file does not exist or cannot be accessed
            (from ``os.path.getsize``).
    """
    return os.path.getsize(filepath) > threshold_bytes

def filter_good_images(filenames):
    """Keep only the filenames whose files are loadable and large enough.

    Each name is resolved inside NOT_HANDS_ORIGINAL_DATA_DIRECTORY. A file is
    first checked with ``can_load_image``; only if that succeeds is it checked
    with ``large_enough``. Counts of both kinds of rejects are printed.

    Args:
        filenames: Iterable of bare filenames (no directory part).

    Returns:
        List of the filenames that passed both checks, in input order.
    """
    kept = []
    unloadable = 0
    undersized = 0

    for name in filenames:
        path = os.path.join(NOT_HANDS_ORIGINAL_DATA_DIRECTORY, name)
        if not can_load_image(path):
            unloadable += 1
        elif not large_enough(path):
            undersized += 1
        else:
            kept.append(name)

    print('Unable to load {} images'.format(unloadable))
    print('Found {} images that are too small'.format(undersized))
    return kept



In [5]:


def ensure_new_directory_exists():
    """Create NOT_HANDS_DATA_DIRECTORY if it is not already present.

    Missing parent directories are created as well. If the path already
    exists, a notice is printed and the function returns normally.

    Raises:
        OSError: For any failure other than "already exists" (for example a
            permission problem). These are no longer reported as
            "already exists", so the real cause is visible before any files
            are copied.
    """
    try:
        # makedirs (rather than mkdir) also creates a missing parent folder.
        os.makedirs(NOT_HANDS_DATA_DIRECTORY)
    except FileExistsError:
        print('Tried to create "{}" but it already exists'.format(NOT_HANDS_DATA_DIRECTORY))

def copy_files_to_new_dataset(filenames):
    """Copy the named files into the new not-hands directory.

    Every name is read from NOT_HANDS_ORIGINAL_DATA_DIRECTORY and written
    under the same name in NOT_HANDS_DATA_DIRECTORY. A one-line summary with
    the number of files is printed before copying starts.

    Args:
        filenames: Sequence of bare filenames to copy (``len`` is taken of it).
    """
    print('Copying {} files to new "not-hands" directory'.format(len(filenames)))
    for name in filenames:
        copyfile(
            os.path.join(NOT_HANDS_ORIGINAL_DATA_DIRECTORY, name),
            os.path.join(NOT_HANDS_DATA_DIRECTORY, name),
        )

def create_dataset_for_nothands():
    """Build the filtered not-hands dataset on disk and print a summary.

    Steps, in order:
      1. List the hands dataset (only its size is reported).
      2. List the original not-hands dataset.
      3. Drop files rejected by ``filter_good_images``.
      4. Make sure the destination directory exists.
      5. Copy the surviving files into it.
    """
    # List the hands directory once and reuse the result; the earlier version
    # listed it a second time just to print the count.
    filenames_for_hands_dataset = get_filenames_for_hands_dataset()
    print('Found {} files from the hands dataset'.format(len(filenames_for_hands_dataset)))

    filenames_for_not_hands_original = get_filenames_for_not_hands_original_dataset()
    print('Found {} files from the not-hands dataset'.format(len(filenames_for_not_hands_original)))
    good_filenames_for_not_hands = filter_good_images(filenames_for_not_hands_original)
    print('Found {} files from the not-hands dataset after removing bad files'.format(len(good_filenames_for_not_hands)))

    ensure_new_directory_exists()

    copy_files_to_new_dataset(good_filenames_for_not_hands)
    
# Run the pipeline: list both datasets, filter the not-hands images, and copy
# the usable ones into NOT_HANDS_DATA_DIRECTORY (progress is printed).
create_dataset_for_nothands()

Found 11076 files from the hands dataset
Found 20971 files from the not-hands dataset
Unable to load 6486 images
Found 2013 images that are too small
Found 12472 files from the not-hands dataset after removing bad files
Copying 12472 files to new "not-hands" directory
