In [1]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
import numpy as np
import json
import random
import os
import sys
import hashlib
import shutil
import cv2

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train_dir_name = 'data/train/'
eval_dir_name = 'data/eval/'
test_dir_name = 'data/test/'

def build_dataset(input_path):
    distribution = 6 
    TRAINING_SET_NAME = "img/train_images"
    EVALUATION_SET_NAME = "img/eval_images"
    TESTING_SET_NAME = "img/test_images"

    paths = []
    for f in os.listdir(input_path):
        if f.find(".gui") != -1:
            path_gui = "{}/{}".format(input_path, f)
            file_name = f[:f.find(".gui")]

            if os.path.isfile("{}/{}.png".format(input_path, file_name)):
                path_img = "{}/{}.png".format(input_path, file_name)
                paths.append(file_name)
    
    
    testing_samples_number = len(paths) / (distribution + 1)
    training_samples_number = (testing_samples_number * distribution) - 100
    evaluation_samples_number = 100

    assert training_samples_number + evaluation_samples_number + testing_samples_number == len(paths)

    print("Splitting datasets, training samples: {}, testing samples: {}, evaluation samples: {}".format(training_samples_number,
                                                                                                         testing_samples_number,
                                                                                                         evaluation_samples_number))

    np.random.shuffle(paths)

    eval_set = []
    train_set = []
    test_set = []
    hashes = []
    for path in paths:
        if sys.version_info >= (3,):
            f = open("{}/{}.gui".format(input_path, path), 'r', encoding='utf-8')
        else:
            f = open("{}/{}.gui".format(input_path, path), 'r')

        with f:
            chars = ""
            for line in f:
                chars += line
            content_hash = chars.replace(" ", "").replace("\n", "")
            content_hash = hashlib.sha256(content_hash.encode('utf-8')).hexdigest()

            if len(eval_set) == evaluation_samples_number and len(test_set) == testing_samples_number:
                train_set.append(path)
            else:
                is_unique = True
                for h in hashes:
                    if h is content_hash:
                        is_unique = False
                        break

                if is_unique and len(test_set) != testing_samples_number:
                    test_set.append(path)
                elif is_unique and len(eval_set) != evaluation_samples_number:
                    eval_set.append(path)
                else:
                    train_set.append(path)
                

            hashes.append(content_hash)
    assert len(test_set) == testing_samples_number
    assert len(eval_set) == evaluation_samples_number
    assert len(train_set) == training_samples_number

    if not os.path.exists("{}/{}".format(os.path.dirname(input_path), TESTING_SET_NAME)):
        os.makedirs("{}/{}".format(os.path.dirname(input_path), TESTING_SET_NAME))
    
    if not os.path.exists("{}/{}".format(os.path.dirname(input_path), EVALUATION_SET_NAME)):
        os.makedirs("{}/{}".format(os.path.dirname(input_path), EVALUATION_SET_NAME))

    if not os.path.exists("{}/{}".format(os.path.dirname(input_path), TRAINING_SET_NAME)):
        os.makedirs("{}/{}".format(os.path.dirname(input_path), TRAINING_SET_NAME))
    
    for path in test_set:
        shutil.copyfile("{}/{}.png".format(input_path, path), "{}/{}/{}.png".format(os.path.dirname(input_path), TESTING_SET_NAME, path))
        shutil.copyfile("{}/{}.gui".format(input_path, path), "{}/{}/{}.gui".format(os.path.dirname(input_path), TESTING_SET_NAME, path))

    
    for path in eval_set:
        shutil.copyfile("{}/{}.png".format(input_path, path), "{}/{}/{}.png".format(os.path.dirname(input_path), EVALUATION_SET_NAME, path))
        shutil.copyfile("{}/{}.gui".format(input_path, path), "{}/{}/{}.gui".format(os.path.dirname(input_path), EVALUATION_SET_NAME, path))

    for path in train_set:
        shutil.copyfile("{}/{}.png".format(input_path, path), "{}/{}/{}.png".format(os.path.dirname(input_path), TRAINING_SET_NAME, path))
        shutil.copyfile("{}/{}.gui".format(input_path, path), "{}/{}/{}.gui".format(os.path.dirname(input_path), TRAINING_SET_NAME, path))

    print("Training dataset: {}/training_set".format(os.path.dirname(input_path), path))
    print("Evaluation dataset: {}/eval_set".format(os.path.dirname(input_path), path))
    print("Testing dataset: {}/test_set".format(os.path.dirname(input_path), path))


def get_preprocessed_image(img_path, image_size):
    img = cv2.imread(img_path)
    img = cv2.resize(img, (image_size, image_size))
    img = img.astype('float32')
    img /= 255
    return img


def convert_image_to_array(input_path, output_path):
    IMAGE_SIZE = 256
    print("Converting images to numpy arrays...")
    for f in os.listdir(input_path):
        if f.find(".png") != -1:
            img = get_preprocessed_image("{}/{}".format(input_path, f), IMAGE_SIZE)
            file_name = f[:f.find(".png")]
            np.savez_compressed("{}/{}".format(output_path, file_name), features=img)
            retrieve = np.load("{}/{}.npz".format(output_path, file_name))["features"]

            assert np.array_equal(img, retrieve)

            shutil.copyfile("{}/{}.gui".format(input_path, file_name), "{}/{}.gui".format(output_path, file_name))

    print("Numpy arrays saved in {}".format(output_path))

# build_dataset('datasets/all_data')

if not os.path.exists('data/img/train_images'):
    build_dataset('data/all_data')
else:
    print('Training set images already exist at datasets/img/train_images and are ready to be converted to arrays')
    
assert os.path.exists('data/img/eval_images')

assert os.path.exists('data/img/test_images')

if not os.path.exists(train_dir_name):
    os.makedirs(train_dir_name)
    convert_image_to_array('data/img/train_images', train_dir_name)
else:
    print('Training set already exists at %s' % train_dir_name)

if not os.path.exists(eval_dir_name):
    os.makedirs(eval_dir_name)
    convert_image_to_array('data/img/eval_images', eval_dir_name)
else:
     print('Evaluation set already exists at %s' % eval_dir_name)

if not os.path.exists(test_dir_name):
    os.makedirs(test_dir_name)
    convert_image_to_array('data/img/test_images', test_dir_name)
else:
     print('Test set already exists at %s' % test_dir_name)

Training set images already exist at datasets/img/train_images and are ready to be converted to arrays
Training set already exists at data/train/
Evaluation set already exists at data/eval/
Test set already exists at data/test/


In [3]:
train_dir_name = 'data/train/'
test_dir_name = 'data/eval/'

# Read a file and return a string
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

def load_data(data_dir):
    text = []
    images = []
    # Load all the files and order them
    all_filenames = os.listdir(data_dir)
    all_filenames.sort()
    for filename in (all_filenames):
        if filename[-3:] == "npz":
            # Load the images already prepared in arrays
            image = np.load(data_dir+filename)
            images.append(image['features'])
        else:
            # Load the boostrap tokens and rap them in a start and end tag
            syntax = '<START> ' + load_doc(data_dir+filename) + ' <END>'
            # Seperate all the words with a single space
            syntax = ' '.join(syntax.split())
            # Add a space after each comma
            syntax = syntax.replace(',', ' ,')
            text.append(syntax)
    images = np.array(images, dtype=np.float32)
    return images, text

train_features, train_texts = load_data(train_dir_name)
test_features, test_texts = load_data(test_dir_name)

In [4]:
train_features.shape

(1400, 256, 256, 3)

In [5]:
# Initialize the function to create the vocabulary 
tokenizer = Tokenizer(filters='', split=" ", lower=False)
# Create the vocabulary 
tokenizer.fit_on_texts([load_doc('data/bootstrap.vocab')])

# Add one spot for the empty word in the vocabulary (17 vocabulary words + 1 = 18 (vocab_size))
vocab_size = len(tokenizer.word_index) + 1
max_length = 48

def preprocess_data(texts, features, max_sequence):
    X, y, image_data = list(), list(), list()
#     X = np.empty(dtype=object)
#     y = np.empty(dtype=object)
#     image_data = np.empty(dtype=float)
    sequences = tokenizer.texts_to_sequences(texts)
    for img_no, seq in enumerate(sequences):
        for i in range(1, len(seq)):
            # Add the sentence until the current count(i) and add the current count to the output
            in_seq, out_seq = seq[:i], seq[i]
            # Pad all the input token sentences to max_sequence
            in_seq = pad_sequences([in_seq], maxlen=max_sequence)[0]
            # Turn the output into one-hot encoding
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # Add the corresponding image to the boostrap token file
            image_data.append(features[img_no])
            # Cap the input sentence to 48 tokens and add it
            X.append(in_seq[-48:])
            y.append(out_seq)
#     return image_data, X, y
    return np.array(image_data), np.array(X), np.array(y)

In [6]:
# data generator, intended to be used in a call to model.fit_generator()
# This will save memory and prevent MemoryErrors from numpy
def data_generator(descriptions, features, n_step, max_sequence):
    # loop until we finish training
    while 1:
        # loop over photo identifiers in the dataset
        for i in range(0, len(descriptions), n_step):
            Ximages, XSeq, y = list(), list(),list()
            for j in range(i, min(len(descriptions), i+n_step)):
                image = features[j]
                # retrieve text input
                desc = descriptions[j]
                # generate input-output pairs
                in_img, in_seq, out_word = preprocess_data([desc], [image], max_sequence)
                for k in range(len(in_img)):
                    Ximages.append(in_img[k])
                    XSeq.append(in_seq[k])
                    y.append(out_word[k])
            # yield this batch of samples to the model
            yield (np.array(Ximages), np.array(XSeq), np.array(y))
#             yield ({'images': np.array(Ximages), 'texts': np.array(XSeq)}, np.array(y))
#             yield [[np.array(Ximages), np.array(XSeq)], np.array(y)]
#             yield [[np.transpose(np.array(Ximages), [0, 3, 1, 2]), np.array(XSeq)], np.array(y)]

In [7]:
# x_images, x_texts, y_labels = preprocess_data(train_texts[:20], train_features[:20], 150)

In [8]:
# Training Parameters
learning_rate = 0.0001
num_steps = 1500
batch_size = 1
display_step = 1

# Add one spot for the empty word in the vocabulary (17 vocabulary words + 1 = 18 (vocab_size))
vocab_size = len(tokenizer.word_index) + 1
max_length = 48

# Network Parameters
num_input =  256 # MNIST data input (img shape: 28*28)
num_classes = vocab_size # MNIST total classes (0-9 digits)
dropout = 0.3 # Dropout, probability to keep units


# # tf Graph input
# X = tf.placeholder(tf.float32, [None, num_input])
# Y = tf.placeholder(tf.float32, [None, num_classes])
# keep_prob = tf.placeholder(tf.float32) # dropout (keep probability)

In [9]:
def encoder_convnet(x_dict, is_training, reuse, max_length):
    with tf.variable_scope("encoder_convnet", reuse=reuse):
#         with tf.device('/gpu:0'):
        input_images = x_dict['images']

        # x = tf.reshape(x_image, shape=[None, 3, 256, 256])
        conv1 = tf.layers.conv2d(input_images, 16, 3, padding='valid', activation='relu')
        conv2 = tf.layers.conv2d(conv1, 16, 3, strides=2, padding='same', activation='relu')
        conv3 = tf.layers.conv2d(conv2, 32, 3, padding='same', activation='relu')
        conv4 = tf.layers.conv2d(conv3, 32, 3, strides=2, padding='same', activation='relu')
        conv5 = tf.layers.conv2d(conv4, 64, 3, padding='same', activation='relu')
        conv6 = tf.layers.conv2d(conv5, 64, 3, strides=2, padding='same', activation='relu')
        conv7 = tf.layers.conv2d(conv6, 128, 3, padding='same', activation='relu')
        flatten = tf.layers.flatten(conv7)
        fc1 = tf.layers.dense(flatten, 1024, activation='relu')
        drop1 = tf.layers.dropout(fc1, rate=0.3, training=is_training)
        fc2 = tf.layers.dense(drop1, 1024, activation='relu')
        drop2 = tf.layers.dropout(fc2, rate=0.3, training=is_training)
        repeat_vec = tf.reshape(tf.tile(tf.expand_dims(drop2, axis=1), [1, 1, 48]), [1, 48, 1024])
        # Concatenate a None dimensio
        shape = tf.TensorShape([None]).concatenate(repeat_vec.get_shape()[1:])
        out = tf.placeholder_with_default(repeat_vec, shape=shape)
    return out

In [10]:
def encoder_gru(x_dict, vocab_size):
    with tf.variable_scope("encoder_gru"):
#         with tf.device('/cpu:0'):
        input_texts = x_dict['texts']            
        # Create the embedding variable (each row represent a word embedding vector)
        embedding = tf.get_variable("embedding", [vocab_size, 50])
        # embedding = tf.Variable(tf.random_normal([self.vocab_size, 50]))
        # Lookup the corresponding embedding vectors for each sample in X
        X_embed = tf.nn.embedding_lookup(embedding, input_texts)

        # sequence_len = tf.count_nonzero(input_texts, axis=-1)
        gru_layers = [tf.contrib.rnn.GRUCell(num_units=units) for units in [128, 128]]
        # create a RNN cell composed sequentially of a number of RNNCells
        multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(gru_layers)
        # 'outputs' is a tensor of shape [batch_size, max_time, 256]
        # 'state' is a N-tuple where N is the number of LSTMCells containing a
        # tf.contrib.rnn.LSTMStateTuple for each cell
        outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                           inputs=X_embed,
                                           dtype=tf.float32)

    return outputs

In [11]:
def decoder_GRU(encoder_convnet_out, encoder_gru_out, vocab_size, reuse):
    with tf.variable_scope("decoder_gru", reuse=reuse):
        concat1 = tf.concat([encoder_convnet_out, encoder_gru_out], -1)
        gru_layers = [tf.contrib.rnn.GRUCell(num_units=units) for units in [512, 512]]
        # create a RNN cell composed sequentially of a number of RNNCells
        multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(gru_layers)
        # 'outputs' is a tensor of shape [batch_size, max_time, 256]
        # 'state' is a N-tuple where N is the number of LSTMCells containing a
        # tf.contrib.rnn.LSTMStateTuple for each cell
        outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                           inputs=concat1,
                                           dtype=tf.float32)
        logits = tf.layers.dense(outputs[:, -1, :], vocab_size)
        out = tf.nn.softmax(logits)
        # out = tf.layers.dense(state.h, self.vocab_size, activation='softmax')

    return out, logits

In [12]:
# Define the model function (following TF Estimator Template)
def model_fn(features, labels, mode):
    
    # Build the neural network
    # Because Dropout have different behavior at training and prediction time, we
    # need to create 2 distinct computation graphs that still share the same weights.
#     logits_train = conv_net(features, num_classes, dropout, reuse=False, is_training=True)
#     logits_test = conv_net(features, num_classes, dropout, reuse=True, is_training=False)
    encoder_cnn_out_train = encoder_convnet(features, is_training=True, reuse=False, max_length=max_length)
    encoder_cnn_out_test = encoder_convnet(features, is_training=False, reuse=True, max_length=max_length)
    encoder_gru_out = encoder_gru(features, vocab_size)
    
    out1, logits_train = decoder_GRU(encoder_cnn_out_train, encoder_gru_out, vocab_size, reuse=False)
    out2, logits_test = decoder_GRU(encoder_cnn_out_test, encoder_gru_out, vocab_size, reuse=True)


    # Labels (convert from one hot encoding)
    actual_labels = tf.argmax(labels, axis=1)
    
    
    # Predictions
    pred_classes = tf.argmax(logits_test, axis=1)
    pred_probas = tf.nn.softmax(logits_test)
    
    # If prediction mode, early return
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=pred_classes) 
        
    # Define loss and optimizer
#     loss_op = tf.nn.softmax_cross_entropy_with_logits_v2(labels=actual_labels, logits=logits_train)
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits_train, labels=tf.cast(labels, dtype=tf.int32))
    loss_op = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    gradients = optimizer.compute_gradients(loss)
    capped_gradients = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients, global_step=tf.train.get_global_step())
#     train_op = optimizer.minimize(loss_op, global_step=tf.train.get_global_step())
    
    # Evaluate the accuracy of the model
    acc_op = tf.metrics.accuracy(labels=actual_labels, predictions=pred_classes)
    
    # TF Estimators requires to return a EstimatorSpec, that specify
    # the different ops for training, evaluating, ...
    estim_specs = tf.estimator.EstimatorSpec(
      mode=mode,
      predictions=pred_classes,
      loss=loss_op,
      train_op=train_op,
      eval_metric_ops={'accuracy': acc_op})

    return estim_specs

In [13]:
# Build the Estimator
run_config = tf.estimator.RunConfig().replace(
            session_config=tf.ConfigProto(device_count={'GPU': 0}))
model = tf.estimator.Estimator(model_fn, config=run_config)

INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\kevin\\AppData\\Local\\Temp\\tmprnsd1vqh', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': device_count {
  key: "GPU"
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000242CC896E10>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [14]:
def input_fn(descriptions, features, n_step, max_length):
    ds = tf.data.Dataset.from_generator(lambda: data_generator(descriptions, features, n_step, max_length), 
                                        (tf.float32, tf.int32, tf.int32), 
                                        (tf.TensorShape([None, 256, 256, 3]), tf.TensorShape([None, 48]), tf.TensorShape([None, 18])))
    ds = ds.batch(1)
    image, text, label = ds.make_one_shot_iterator().get_next()
    return {'images': image, 'texts': text}, label



In [15]:
# Define the input function for training
# input_fn = tf.estimator.inputs.numpy_input_fn(
#     x={'texts': x_texts, 'images': x_images}, y=y_labels,
#     batch_size=batch_size, num_epochs=50, shuffle=True)
# Train the Model
# model.train(input_fn, steps=1500)

for n_epoch in range(50):
    model.train(input_fn = lambda: input_fn(train_texts, train_features, 1, 150), steps=1500)

INFO:tensorflow:Calling model_fn.


ValueError: Input 0 of layer conv2d_1 is incompatible with the layer: expected ndim=4, found ndim=5. Full shape received: [None, None, 256, 256, 3]

In [None]:
# Evaluate the Model
# Define the input function for evaluating

# input_fn = tf.estimator.inputs.numpy_input_fn(
#    x={'texts': test_texts, 'images': test_images}, y=y_labels,
#     batch_size=batch_size, shuffle=False)
# # Use the Estimator 'evaluate' method
# model.evaluate(input_fn)