### TFRecord Converter

This notebook converts a dataset from a raw collection of jpg files in a folder into TFRecord files (one for train, one for test). It also contains a function for splitting the dataset into train and test sets.

**Sources**

[1] [Why Every Tensorflow Developer Should Know about TFRecord](https://www.skcript.com/svr/why-every-tensorflow-developer-should-know-about-tfrecord/)

[2] [TFRecords Guide](http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/)

In [20]:
import tensorflow as tf
import numpy as np
import glob
from PIL import Image
import re
import os
import sys
mod_path = os.path.abspath(os.path.join('..'))
sys.path.append(mod_path)
import src.config as config

### Separating Test and Train Data

In [21]:
def seperate_test_train(dataset_name, split=0.9):
    """Randomly split a dataset's jpg images into ``train`` and ``test`` folders.

    The images are looked up directly inside ``config.data_dir/dataset_name``
    and each one is moved (not copied) into a ``train`` or ``test``
    subdirectory of that folder. If either subdirectory already exists the
    dataset is treated as already split: a message is printed and nothing is
    touched.

    Args:
        dataset_name: Name of the dataset folder inside ``config.data_dir``.
        split: Fraction of the images assigned to the train set, in ``[0, 1]``.
            The train count is ``int(split * total)``; the remainder is test.

    Raises:
        ValueError: If ``split`` is outside ``[0, 1]``.
    """
    dataset_path = os.path.join(config.data_dir, dataset_name)
    train_dir = os.path.join(dataset_path, 'train')
    test_dir = os.path.join(dataset_path, 'test')
    if os.path.exists(train_dir) or os.path.exists(test_dir):
        print('Dataset is already seperated')
        return
    # Validate before creating any directory so a bad call leaves no debris
    # that would make the next call report "already seperated".
    if not 0 <= split <= 1:
        raise ValueError('split must be between 0 and 1, got %r' % (split,))
    # Create fresh train and test directories
    os.mkdir(train_dir)
    os.mkdir(test_dir)
    # Get list of image paths in dataset
    image_paths = glob.glob(os.path.join(dataset_path, '*.jpg'))
    total_images = len(image_paths)
    num_train = int(split * total_images)
    num_test = total_images - num_train
    print('There are %d images in %s. Using a %0.2f split, we get %d train and %d test' %
         (total_images, dataset_name, split, num_train, num_test))
    # Pick the test indices WITHOUT replacement: slicing a permutation yields
    # exactly num_test distinct indices (and copes with an empty dataset).
    # A set keeps the per-image membership check O(1).
    test_idx = set(np.random.permutation(total_images)[:num_test].tolist())
    # Move each image to either the test or train folder
    for i, path in enumerate(image_paths):
        target_dir = test_dir if i in test_idx else train_dir
        os.rename(path, os.path.join(target_dir, os.path.basename(path)))

In [22]:
# Split the configured dataset into train/ and test/ folders (no-op if either folder already exists).
seperate_test_train(config.dataset_name)

Dataset is already seperated


### Writing to TFRecords

In [37]:
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _extract_target(path_string, filename_regex=None):
    # Extract the label from the image path name
    if filename_regex is None:
        filename_regex = config.dataset_regex
    m = re.search(filename_regex, os.path.basename(path_string))
    gaze_x = int(float(m.group(1))*100)
    gaze_y = int(float(m.group(2))*100)
    return gaze_x, gaze_y

def _write_tfrecord(tf_record_filename, dataset_path, image_size=None,
                    extensions=('png', 'jpg')):
    """Serialize every image in a folder, with its gaze label, to a TFRecord file.

    Each image is resized to ``image_size``, converted to a NumPy array and
    stored as raw bytes next to the label parsed from its filename by
    ``_extract_target``. Only the bytes are stored; the array shape is not
    written to the record, so readers have to know it.

    Args:
        tf_record_filename: Path of the TFRecord file to create.
        dataset_path: Folder whose images are written.
        image_size: ``(width, height)`` passed to ``Image.resize``. Defaults to
            ``(config.image_width, config.image_height)``.
        extensions: File extensions (without the dot) to include. The default
            covers png as before plus jpg, which is the extension
            ``seperate_test_train`` moves into the train/test folders.
    """
    if image_size is None:
        image_size = (config.image_width, config.image_height)
    image_paths = []
    for ext in extensions:
        image_paths.extend(glob.glob(os.path.join(dataset_path, '*.' + ext)))
    # glob order is filesystem-dependent; sort so the record order is stable.
    image_paths.sort()
    writer = tf.python_io.TFRecordWriter(tf_record_filename)
    try:
        for image_path in image_paths:
            # Get image and label from image path; the context manager closes
            # the file handle once the resized copy has been materialized.
            with Image.open(image_path) as image_raw:
                img = np.array(image_raw.resize(image_size))
            gaze_x, gaze_y = _extract_target(image_path)
            # Convert image to bytes for storage in tfrecords
            # (ndarray.tostring() is deprecated and removed in NumPy 2.0).
            img_raw = img.tobytes()
            # Feature defines each discrete entry in the tfrecords file
            feature = {
                'gaze_x': _int64_feature(gaze_x),
                'gaze_y': _int64_feature(gaze_y),
                'image_raw': _bytes_feature(img_raw)
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
    finally:
        # Always release the output file, even if an image fails to convert.
        writer.close()
    
def to_tfrecord(dataset_name):
    """Write ``train.tfrecords`` and ``test.tfrecords`` for the named dataset.

    Both files are created inside ``config.data_dir/dataset_name`` from the
    images in that folder's ``train`` and ``test`` subdirectories.

    Args:
        dataset_name: Name of the dataset folder inside ``config.data_dir``.
    """
    root = os.path.join(config.data_dir, dataset_name)
    # (output file name, source subdirectory) -- train is written first.
    outputs = (('train.tfrecords', 'train'), ('test.tfrecords', 'test'))
    for record_name, subdir in outputs:
        _write_tfrecord(os.path.join(root, record_name), os.path.join(root, subdir))

In [38]:
# Write train.tfrecords and test.tfrecords for the configured dataset.
to_tfrecord(config.dataset_name)