Explore data. 
statistics

In [2]:
import numpy as np
from os.path import join
import tensorflow as tf
import read_data

In [3]:

DATA_DIR = 'data/records'
TRAIN_DATA_PATH = 'data/thumb280_train'
TEST_DATA_PATH = 'data/test/'


In [4]:
IMG_CLASSES = read_data.IMG_CLASSES
NUM_CLASSES = len(read_data.IMG_CLASSES)
IMG_HEIGHT = read_data.IMG_HEIGHT
IMG_WIDTH = read_data.IMG_WIDTH
IMG_CHANNELS = read_data.IMG_CHANNELS
IMG_PIXELS = IMG_HEIGHT * IMG_WIDTH * IMG_CHANNELS
NUM_TRAIN_EXAMPLES = read_data.NUM_TRAIN_EXAMPLES
NUM_VALIDATION_EXAMPLES = read_data.NUM_VALIDATION_EXAMPLES
NUM_TEST_EXAMPLES = read_data.NUM_TEST_EXAMPLES


In [5]:
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

In [6]:
from scipy.misc import imread, imresize
from os import  walk

def read_images(path, classes, img_height = 128, img_width = 128, img_channels = 3, log=False):
    filenames = next(walk(path))[2]
    num_files = len(filenames)
    
    if log:
        print("Reading images from %s ... %d files are there" % (path, num_files))

    images = np.zeros((num_files, img_height, img_width, img_channels), dtype=np.uint8)
    labels = np.zeros((num_files, ), dtype=np.uint8)
    min_w = 10000
    min_h = 10000
    max_h = 0
    max_w = 0
    count = float(len(filenames))
    sum_shape = [0.0, 0.0]
    for i, filename in enumerate(filenames):
        if i % 2000 == 0 and i != 0:
            if log:
                print("%d / %d" % (i, len(filenames))) 
        
        img = imread(join(path, filename))
        w, h, _ = img.shape
        
        if w < min_w:
            min_w = w
            min_w_shape = img.shape
        if w > max_w:
            max_w = w
            max_w_shape = img.shape
        if h < min_h:
            min_h = h
            min_h_shape = img.shape
        if h > max_h:
            max_h = h
            max_h_shape = img.shape
        sum_shape[0] = sum_shape[0] + h
        sum_shape[1] = sum_shape[1] + w
        # img = imresize(img, (img_height, img_width))
        # images[i, :, :, :] = img
        class_name = filename[0:3].lower() # Luckily both 'cat' and 'dog' have 3 characters
        if class_name == 'cat' or class_name == 'dog':
            labels[i] = classes.index(class_name)
    
    print("min_w: {}, {}".format(min_w, min_w_shape))

    print("max_w: {}, {}".format(max_w, max_w_shape))

    print("min_h: {}, {}".format(min_h, min_h_shape))

    print("max_h: {}, {}".format(max_h, max_h_shape))

    print("mean shape(h, w) : {} {}".format(sum_shape[0]/count, sum_shape[1]/count))
    print("Done!") 

    return images, labels

In [7]:
log_flag = True

train_images, train_labels = read_images(TRAIN_DATA_PATH, IMG_CLASSES,
                                                       IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS, log_flag)


Reading images from data/thumb280_train ... 25000 files are there
2000 / 25000
4000 / 25000
6000 / 25000
8000 / 25000
10000 / 25000
12000 / 25000
14000 / 25000
16000 / 25000
18000 / 25000
20000 / 25000
22000 / 25000
24000 / 25000
min_w: 32, (32, 60, 3)
max_w: 280, (280, 210, 3)
min_h: 42, (62, 42, 3)
max_h: 280, (210, 280, 3)
mean shape(h, w) : 255.27684 228.71004
Done!


In [39]:
train_labels[:-5]

array([1, 1, 1, ..., 0, 0, 0], dtype=uint8)

In [56]:
cat_idx = np.where(train_labels == 1)[0]
print("number of cats:", len(cat_idx))
dog_idx = np.where(train_labels == 0)[0]
print("number of dogs:", len(dog_idx))

cat_val_idx = np.random.choice(cat_idx, 2500, replace=False)
print(cat_val_idx[:5])

dog_val_idx = np.random.choice(dog_idx, 2500, replace=False)
print(dog_val_idx[:5])
cat_train_idx = np.setdiff1d(cat_idx, cat_val_idx)

print("val_cats::", len(cat_val_idx))
print("train_cats::", len(cat_train_idx))

number of cats: 12500
number of dogs: 12500
[ 1262  3796   962 10183 10783]
[24452 22067 23385 18260 20726]
val_cats:: 2500
train_cats:: 10000


In [57]:
val_idx = np.random.permutation(np.array(cat_val_idx.tolist() + dog_val_idx.tolist()))
print("val_idx len:", val_idx.size)
val_idx[:5]

val_idx len: 5000


array([21731, 13506,  1180,  8276, 11055])

In [49]:
arr = np.array(range(5))
print(arr)
b = arr[[2, 1, 4]]
print(b)

[0 1 2 3 4]
[2 1 4]


In [30]:


def convert_to(images, labels, name, log=False):
    if log:
        print("converting %s to tfrecords..."%(name))
    num_examples = labels.shape[0]
    if images.shape[0] != num_examples:
        raise ValueError("Images size %d does not match label size %d." %
                     (images.shape[0], num_examples))
    rows = images.shape[1]
    cols = images.shape[2]
    depth = images.shape[3]

    filename = join(DATA_DIR, name + '.tfrecords')
    print('Writing', filename)
    writer = tf.python_io.TFRecordWriter(filename)
    for index in range(num_examples):
        if index % 2000 == 0:
            if log:
                print("%d / %d" % (index, num_examples)) 
        image_raw = images[index].tostring()
        example = tf.train.Example(features=tf.train.Features(feature={
            'height': _int64_feature(rows),
            'width': _int64_feature(cols),
            'depth': _int64_feature(depth),
            'label': _int64_feature(int(labels[index])),   # NOT assuming one-hot format of original data
            'image_raw': _bytes_feature(image_raw)}))
        writer.write(example.SerializeToString())
    writer.close()
    if log:
        print("Done!") 



In [32]:
log_flag = True

train_images, train_labels = read_images(TRAIN_DATA_PATH, IMG_CLASSES,
                                                       IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS, log_flag)

# Generate a validation set.
validation_size = int(VALIDATION_SET_FRACTION * train_images.shape[0])
validation_images = train_images[:validation_size, :, :, :]
validation_labels = train_labels[:validation_size]
train_images = train_images[validation_size:, :, :, :]
train_labels = train_labels[validation_size:]

# Convert to Examples and write the result to TFRecords.

convert_to(train_images, train_labels, 'train', log_flag)
convert_to(validation_images, validation_labels, 'validation', log_flag)



Reading images from data/train/ ... 25000 files are there
0 / 25000
2000 / 25000
4000 / 25000
6000 / 25000
8000 / 25000
10000 / 25000
12000 / 25000
14000 / 25000
16000 / 25000
18000 / 25000
20000 / 25000
22000 / 25000
24000 / 25000
Done!
converting train to tfrecords...
Writing data/train.tfrecords
0 / 22500
2000 / 22500
4000 / 22500
6000 / 22500
8000 / 22500
10000 / 22500
12000 / 22500
14000 / 22500
16000 / 22500
18000 / 22500
20000 / 22500
22000 / 22500
Done!
converting validation to tfrecords...
Writing data/validation.tfrecords
0 / 2500
2000 / 2500
Done!
Reading test images...
Reading images from data/test/ ... 12500 files are there
0 / 12500


ValueError: '237' is not in list

In [34]:
print("Reading test images...")
test_images, test_labels = read_images(TEST_DATA_PATH, IMG_CLASSES,
                                                     IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS, log_flag)

convert_to(test_images, test_labels, 'test', log_flag)


Reading test images...
Reading images from data/test/ ... 12500 files are there
0 / 12500
2000 / 12500
4000 / 12500
6000 / 12500
8000 / 12500
10000 / 12500
12000 / 12500
Done!
converting test to tfrecords...
Writing data/test.tfrecords
0 / 12500
2000 / 12500
4000 / 12500
6000 / 12500
8000 / 12500
10000 / 12500
12000 / 12500
Done!
