Steps:

1. Create Pandas DF with {Index, img, svg}
2. Preprocess svg:
    1. Split into 'words'
    2. Add start and end tokens
    3. Tokenize svg items
    4. Calculate maximum length of svg files (in words)
    5. Create Sequences for the svg files
3. Create and Train model

In [1]:
#Imports

import os

import numpy as np
import pandas
from PIL import Image

#Keras 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout,\
                        Conv2D, MaxPooling2D, Flatten, GRU
from keras.layers.merge import add
from keras.utils import plot_model
from keras.backend import clear_session
import tensorflow as tf


from charactertable import CharacterTable


Using TensorFlow backend.


In [2]:
# Creating DF
def createDataFrame(index_range):
    """Load the PNG/SVG pair of every requested sample into one DataFrame.

    Parameters
    ----------
    index_range : iterable of int
        Sample ids. For each id ``i`` the files ``train/png/png/{i}.png`` and
        ``train/svg/{i}.svg`` are read.

    Returns
    -------
    pandas.DataFrame
        One row per id with the columns ``index`` (the id), ``img`` (numpy
        array holding the first three channels of the PNG) and ``svg`` (the
        SVG file content as a single string).

    Raises
    ------
    FileNotFoundError
        If the PNG or SVG file of a requested id does not exist.
    """
    d = {'index': [], 'img': [], 'svg': []}
    for index in index_range:
        # Context managers release both file handles right away instead of
        # leaving thousands of them open until garbage collection.
        with Image.open(f'train/png/png/{index}.png') as png:
            # The array is materialised inside the `with` so the pixel data is
            # read before the file is closed.
            img = np.array(png)[:, :, :3]  # TODO Check alpha dimension
        with open(f'train/svg/{index}.svg') as svg_file:
            svg = svg_file.read()

        d['index'].append(index)
        d['img'].append(img)
        d['svg'].append(svg)
    return pandas.DataFrame(data=d)

In [3]:
def preprocess_svg(svg_col):
    """Tokenise SVG documents and collect their vocabulary.

    Parameters
    ----------
    svg_col : iterable of str
        Raw SVG documents.

    Returns
    -------
    new_col : list of list of str
        One token list per document: the whitespace-separated words of the
        document wrapped in ``'<start>'`` and ``'<end>'`` markers.
    vocab : set of str
        Every distinct token found in ``new_col`` (markers included).
    max_len : int
        Token count of the longest document, markers included; ``0`` when
        ``svg_col`` is empty.
    """
    # Split into words and add the start and end markers.
    new_col = [['<start>', *doc.split(), '<end>'] for doc in svg_col]

    # Compile a set of all unique svg elements from all the SVG files
    # (a set, so repeated tokens are stored once).
    vocab = set()
    for doc in new_col:
        vocab.update(doc)

    # Longest document in words; `default` keeps an empty column from raising.
    max_len = max(map(len, new_col), default=0)

    return new_col, vocab, max_len

In [89]:
# Number of PNG/SVG pairs to load. Rerun all code if changing this.
num_examples = 10000
df = createDataFrame(range(num_examples))

# Swap the raw SVG text for token lists; keep the vocabulary and the token
# count of the longest document for the encoding step.
svg_tokens, vocab, max_len = preprocess_svg(df['svg'])
df['svg'] = svg_tokens

In [90]:
# Token table built from the vocabulary collected during preprocessing.
chars = vocab
ctable = CharacterTable(chars)

# x1: stacked images. x2 / y: one (max_len, num_tokens) slot per sample.
onehot_shape = (num_examples, max_len, ctable.num_tokens)
x1 = np.stack(df['img'])
x2 = np.zeros(onehot_shape)
y = np.zeros(onehot_shape)

# x2 holds each token list without its first and last entry (the markers).
for row, tokens in enumerate(df['svg']):
    x2[row] = ctable.encode(tokens[1:-1], max_len)

# y holds the complete token list, markers included.
for row, tokens in enumerate(df['svg']):
    y[row] = ctable.encode(tokens, max_len)

# Explicitly set apart 10% for validation and 10% for test data, taken from
# the end of the arrays (no shuffling).
total = len(x1)
num_examples_val = total // 10
print(num_examples_val)
split_at_test = total - num_examples_val
split_at_val = split_at_test - num_examples_val

cut_points = [split_at_val, split_at_test]
x1_train, x1_val, x1_test = np.split(x1, cut_points)
x2_train, x2_val, x2_test = np.split(x2, cut_points)
y_train, y_val, y_test = np.split(y, cut_points)

# Report the shape of every array, split by split.
for title, arrays in (
    ('Training Data:', (x1_train, x2_train, y_train)),
    ('Validation Data:', (x1_val, x2_val, y_val)),
    ('Test Data:', (x1_test, x2_test, y_test)),
):
    print(title)
    for arr in arrays:
        print(arr.shape)

1000
Training Data:
(8000, 64, 64, 3)
(8000, 63, 60)
(8000, 63, 60)
Validation Data:
(1000, 64, 64, 3)
(1000, 63, 60)
(1000, 63, 60)
Test Data:
(1000, 64, 64, 3)
(1000, 63, 60)
(1000, 63, 60)


In [91]:
## Fix for strange errors during training
clear_session() #clear any previous keras sessions
# Handle on the default TensorFlow graph; not referenced again in the visible cells.
graph = tf.get_default_graph()
##

# Parameters for the model.
# NOTE(review): hidden_size has to match the size of the flattened image
# features below (8 * 8 * 8 = 512) because both tensors are summed by `add`;
# changing one without the other will presumably break the merge — confirm.
hidden_size = 512

#Image:
# Image branch: 64x64 RGB input.
input_img = Input(shape=(64, 64, 3))   

# Three conv + 2x2 max-pool stages. With 'same' padding every pool halves the
# spatial size (64 -> 32 -> 16 -> 8), so Flatten yields 8 * 8 * 8 = 512 values.
x = Conv2D(16, (3, 3), activation='relu', padding='same')(input_img)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2), padding='same')(x)
encoded_img = Flatten()(x)

# Define an input sequence and process it.
# The encoder consumes a fixed-length (max_len) one-hot token sequence.
encoder_inputs = Input(shape=(max_len, ctable.num_tokens))
encoder_rnn_layer = GRU(hidden_size, return_state=True)
# We discard the output of the layer and only keep the state.
_, encoder_state = encoder_rnn_layer(encoder_inputs)

# Set up the decoder, using `encoder_state` as initial state.
# The time dimension is None so the same layers can later be run one step at
# a time during inference.
decoder_inputs = Input(shape=(None, ctable.num_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_rnn_layer = GRU(hidden_size, return_sequences=True, return_state=True)

decoder_outputs, _ = decoder_rnn_layer(decoder_inputs,
                                       initial_state=encoder_state)

# Merge the image features into the decoder output before the softmax.
# NOTE(review): encoded_img is (batch, 512) while decoder_outputs is
# (batch, time, 512); presumably Keras broadcasts the image vector over the
# time axis here — confirm.
decoder_add = add([encoded_img, decoder_outputs])
# Per-step softmax over the token vocabulary; `decoder_outputs` is rebound to
# the final prediction tensor.
decoder_dense = Dense(ctable.num_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_add)

# Define the model that will turn
# `input_img`, `encoder_inputs` & `decoder_inputs` into `decoder_outputs`
model = Model([input_img, encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
model.summary()
#plot_model(model, to_file='model.png')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 64, 64, 3)    0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 64, 64, 16)   448         input_1[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 32, 32, 16)   0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 32, 32, 8)    1160        max_pooling2d_1[0][0]            
__________________________________________________________________________________________________
max_poolin

In [92]:
# Training schedule.
num_epochs = 10
batch_size = 16

# Teacher forcing: the decoder is fed every step but the last (`[:, :-1]`) and
# is trained to predict the same sequence shifted one step ahead (`[:, 1:]`).
history = model.fit(
    x=[x1_train, x2_train, y_train[:, :-1]],
    y=y_train[:, 1:],
    validation_data=([x1_val, x2_val, y_val[:, :-1]], y_val[:, 1:]),
    epochs=num_epochs,
    batch_size=batch_size,
    verbose=1,
)

Train on 8000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [93]:
# Stand-alone encoder for inference: maps an encoder token sequence to the GRU
# state that seeds the decoder.
encoder_model = Model(encoder_inputs, encoder_state)

### PUT YOUR CODE HERE ###
# Inference-time decoder wiring. It reuses the same layer objects
# (`decoder_rnn_layer`, `decoder_dense`) as the training model, so it shares
# their weights, but takes the recurrent state as an explicit input so that
# decoding can proceed one step at a time.
# NOTE: this rebinds `decoder_outputs` and `decoder_add`; after this cell those
# names refer to the inference graph, not to the training graph.
decoder_state_input = Input(shape=(hidden_size,))
decoder_outputs, decoder_state = decoder_rnn_layer(
    decoder_inputs, initial_state=decoder_state_input)
decoder_add = add([encoded_img, decoder_outputs])
decoder_outputs = decoder_dense(decoder_add)
### END ###


# Inputs: image, decoder token(s) fed so far, previous state.
# Outputs: token probabilities and the updated state to feed back in.
decoder_model = Model(
    [input_img, decoder_inputs, decoder_state_input],
    [decoder_outputs, decoder_state])

In [94]:
### UPDATE CODE HERE ###
def decode_sequence(input_seq, img, max_decoder_seq_length):
    """Greedily decode one SVG token sequence with the inference models.

    Parameters
    ----------
    input_seq : array
        Encoder input passed to ``encoder_model.predict`` (a batch of size 1
        is assumed).
    img : array
        Image batch passed to ``decoder_model.predict`` at every step.
    max_decoder_seq_length : int
        Decoding stops once more than this many tokens have been produced.

    Returns
    -------
    list of str
        The sampled tokens in order. The list ends with ``'<end>'`` when the
        decoder produced it; otherwise it holds
        ``max_decoder_seq_length + 1`` tokens.
    """
    def one_hot(token_index):
        # Single-step, single-sample decoder input with one token switched on.
        step = np.zeros((1, 1, ctable.num_tokens))
        step[0, 0, token_index] = 1.
        return step

    # Encode the input as the initial decoder state.
    state_value = encoder_model.predict(input_seq)

    # Decoding starts from the start marker.
    target_seq = one_hot(ctable.char_indices['<start>'])

    decoded_svg = []
    while True:
        output_tokens, state_value = decoder_model.predict(
            [img, target_seq, state_value])

        # Greedy choice: most probable token of the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = ctable.indices_char[sampled_token_index]
        decoded_svg.append(sampled_char)

        # Stop on the end marker or once the length limit is exceeded.
        if sampled_char == '<end>':
            break
        if len(decoded_svg) > max_decoder_seq_length:
            break

        # The sampled token becomes the next decoder input.
        target_seq = one_hot(sampled_token_index)
    return decoded_svg
### END ###

In [95]:
class colors:
    """ANSI escape sequences for colouring terminal output."""
    ok = '\033[92m'  # bright green
    fail = '\033[91m'  # bright red
    close = '\033[0m'  # reset to the default style
    
# Select 10 samples from the test set at random, decode them and write the
# predicted and the reference SVG token strings to disk so that errors can be
# inspected side by side.

# Built with os.path.join so the files land inside a `genSVG` directory on
# every platform (a literal backslash is only a separator on Windows); the
# trailing separator keeps the f-string file names below unchanged.
save_dir = os.path.join('genSVG', '')
os.makedirs(save_dir, exist_ok=True)

for i in range(10):
    ind = np.random.randint(0, len(x2_test))
    # Index with a one-element list to keep the leading batch dimension.
    batch = [ind]
    img, encoder_in, expected = x1_test[batch], x2_test[batch], y_test[batch]

    decoded = decode_sequence(encoder_in, img, max_len)
    # Drop the terminator only when the decoder actually produced one; a
    # sequence cut off at the length limit ends in a real token that must stay.
    if decoded and decoded[-1] == '<end>':
        decoded = decoded[:-1]
    pred = ' '.join(decoded)

    # Recover the token ids from the one-hot rows. Id 0 is skipped as padding
    # (assumes CharacterTable reserves index 0 for padding — TODO confirm),
    # then the first and last remaining tokens (the markers) are sliced off.
    token_ids = np.argmax(expected[0], axis=-1)
    correct = ' '.join(
        [ctable.indices_char[t] for t in token_ids if t != 0][1:-1])

    with open(f'{save_dir}{i}-pred.svg', 'w') as fp:
        fp.write(pred)
    with open(f'{save_dir}{i}-corr.svg', 'w') as fp:
        fp.write(correct)