### Optical character recognition using RNNs

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import os
import gzip
import csv

In [3]:
import numpy as np
import tensorflow as tf

In [None]:
from six.moves import urllib

In [5]:
URL_PATH = 'http://ai.stanford.edu/~btaskar/ocr/letter.data.gz'
DOWNLOADED_FILENAME = 'letter.data.gz'

def download_data():
    if not os.path.exists(DOWNLOADED_FILENAME):
        filename, _ = urllib.request.urlretrieve(URL_PATH, DOWNLOADED_FILENAME)
    
    print'Found and verified file from this path: ', URL_PATH
    print'Downloaded file: ', DOWNLOADED_FILENAME

In [6]:
download_data()

Found and verified file from this path:  http://ai.stanford.edu/~btaskar/ocr/letter.data.gz
Downloaded file:  letter.data.gz


In [7]:
def read_lines():
    with gzip.open(DOWNLOADED_FILENAME, 'rt') as f:
        reader = csv.reader(f, delimiter='\t')
        lines = list(reader)
        return lines

In [8]:
lines = read_lines()

In [9]:
lines[1][:20]

['2',
 'm',
 '3',
 '1',
 '2',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0']

In [10]:
len(lines)

52152

In [11]:
def get_features_labels(lines):
    lines = sorted(lines, key=lambda x: int(x[0]))
    data, target = [], []
    
    next_id = -1
    
    word = []
    word_pixels = []

    for line in lines:
         # The index for the next_id column
        next_id = int(line[2])

        # An image for a single character, reshaped
        pixels = np.array([int(x) for x in line[6:134]])
        pixels = pixels.reshape((16, 8))
        
        # Word pixels are a list of 16x8 images which form a single word
        word_pixels.append(pixels)
        
        # Append together the characters which make up a word
        word.append(line[1])
        
        if next_id == -1:
            data.append(word_pixels)
            target.append(word) 
            word = []
            word_pixels = []


    return data, target

In [12]:
data, target = get_features_labels(lines)

#### The total number of words in our dataset

In [13]:
len(data), len(target)

(6877, 6877)

#### All words lengths should be the same

* Get every word to be the same length as the longest word in our dataset
* Pad the words with empty characters

In [14]:
def pad_features_labels(data, target):    
    max_length = max(len(x) for x in target)
    
    # Set up image representations for the empty string (all pixels set to 0)
    padding = np.zeros((16, 8))

    # Pad the image data with the empty string images
    data = [x + ([padding] * (max_length - len(x))) for x in data]
    
    # Pad the words with empty string characters
    target = [x + ([''] * (max_length - len(x))) for x in target]
    
    return np.array(data), np.array(target)

In [15]:
padded_data, padded_target = pad_features_labels(data, target)

In [16]:
padded_target[:10]

array([['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', '']], 
      dtype='<U1')

In [17]:
padded_target[200:210]

array([['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', ''],
       ['m', 'b', 'r', 'a', 'c', 'e', 's', '', '', '', '', '', '', '']], 
      dtype='<U1')

#### The length of each sequence

We've padded all words so that their lengths are all equal to the length of the longest word

In [18]:
word_length = len(padded_target[0])

In [19]:
word_length

14

#### Tensor shape

* 6877 words
* Each word padded to have 14 characters
* Each character represented by 16x8 image

In [20]:
padded_data.shape

(6877, 14, 16, 8)

In [21]:
padded_data.shape[:2] + (-1,)

(6877, 14, -1)

In [22]:
reshaped_data = padded_data.reshape(padded_data.shape[:2] + (-1,))

#### Reshape the data so the image is a 1-D array of pixels

In [23]:
reshaped_data.shape

(6877, 14, 128)

#### Tensor shape

* 6877 words
* Each an array with 14 characters (padded with empty strings as needed)

In [24]:
padded_target.shape

(6877, 14)

#### One-hot representation

* Each character has a feature vector of 26 (only lower case characters)

In [25]:
padded_target.shape + (26,)

(6877, 14, 26)

In [26]:
one_hot_target = np.zeros(padded_target.shape + (26,))

### Numpy.ndenumerate is a way to get all indices needed to access elements of a matrix
<pre>
a = numpy.array([[1,2],[3,4],[5,6]])
for (x,y), value in numpy.ndenumerate(a):
  print x,y 
</pre>
 
0 0 <br>
0 1 <br>
1 0 <br>
1 1 <br>
2 0 <br>
2 1 <br>

In [27]:
for index, letter in np.ndenumerate(padded_target):
    if letter:
        one_hot_target[index][ord(letter) - ord('a')] = 1

#### One-hot representation of the letter 'o'

* The letter 'o' represented by a 1 at the 14th index 
* Index positions start at 0

In [28]:
one_hot_target[0][0]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [29]:
shuffled_indices = np.random.permutation(len(reshaped_data))

shuffled_data = reshaped_data[shuffled_indices]
shuffled_target = one_hot_target[shuffled_indices]

#### Split into training and test data

In [30]:
split = int(0.66 * len(shuffled_data))

train_data = shuffled_data[:split]
train_target = shuffled_target[:split]

test_data = shuffled_data[split:]
test_target = shuffled_target[split:]

In [31]:
train_data.shape

(4538, 14, 128)

In [32]:
_, num_steps, num_inputs = train_data.shape

In [38]:
train_target.shape

(4538, 14, 26)

In [39]:
num_classes = train_target.shape[2]

In [40]:
tf.reset_default_graph()

In [41]:
X = tf.placeholder(tf.float64, [None, num_steps, num_inputs])

y = tf.placeholder(tf.float64, [None, num_steps, num_classes])

#### Sequence length calculation

*['How', 'are', 'you', 'doing'] ==> [14, 14, 14, 14] ==> [3, 3, 3, 5]*
 
 The actual length of each word (without the padding) in the input batch

In [42]:
# All real characters will have a max value of 1, padded characters will be represented by 0s
used = tf.sign(tf.reduce_max(tf.abs(X), reduction_indices=2))

# Sum up the number of real characters for each word
length = tf.reduce_sum(used, reduction_indices=1)
sequence_length = tf.cast(length, tf.int32)

In [43]:
sequence_length

<tf.Tensor 'Cast:0' shape=(?,) dtype=int32>

#### RNN for training and prediction

In [44]:
num_neurons = 300

In [45]:
cell = tf.nn.rnn_cell.GRUCell(num_neurons)

#### *sequence_length* is the length of the valid input for each batch

Included to improve accuracy and not for performance

In [46]:
output, _ = tf.nn.dynamic_rnn(cell, X, dtype=tf.float64, sequence_length=sequence_length)

In [47]:
output.shape

TensorShape([Dimension(None), Dimension(14), Dimension(300)])

#### Shared softmax layer

In [48]:
weight = tf.Variable(tf.truncated_normal([num_neurons, num_classes], stddev=0.01, dtype=tf.float64))

In [49]:
bias = tf.Variable(tf.constant(0.1, shape=[num_classes], dtype=tf.float64))

In [50]:
flattened_output = tf.reshape(output, [-1, num_neurons])

In [51]:
flattened_output

<tf.Tensor 'Reshape:0' shape=(?, 300) dtype=float64>

In [52]:
logits = tf.matmul(flattened_output, weight) + bias

In [53]:
logits_reshaped = tf.reshape(logits, [-1, num_steps, num_classes])

#### Cost calculation

In [54]:
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [55]:
loss = tf.reduce_mean(cross_entropy)

#### Error calculation

* For every word calculate how many of the characters we predicted correctly
* Use the mask to not consider (leave out) the padded characters on which our prediction was wrong
* Find the fraction of each word where we made mistakes in our character prediction
* Find the average fraction of each word that were mistakes

In [56]:
mistakes = tf.not_equal(
            tf.argmax(y, 2), tf.argmax(logits_reshaped, 2))
mistakes = tf.cast(mistakes, tf.float64)
mask = tf.sign(tf.reduce_max(tf.abs(y), reduction_indices=2))
mistakes *= mask

In [57]:
mistakes = tf.reduce_sum(mistakes, reduction_indices=1)
mistakes /= tf.cast(sequence_length, tf.float64)

In [58]:
error = tf.reduce_mean(mistakes)

#### Optimizer

In [59]:
optimizer = tf.train.RMSPropOptimizer(0.002)

In [60]:
gradient = optimizer.compute_gradients(loss)

In [61]:
optimize = optimizer.apply_gradients(gradient)

In [62]:
def batched(data, target, batch_size):
    epoch = 0
    offset = 0
    while True:
        old_offset = offset
        offset = (offset + batch_size) % (target.shape[0] - batch_size)

        # Offset wrapped around to the beginning so new epoch
        if offset < old_offset:
            # New epoch, need to shuffle data
            shuffled_indices = np.random.permutation(len(data))
            
            data = data[shuffled_indices]
            target = target[shuffled_indices]

            epoch += 1

        batch_data = data[offset:(offset + batch_size), :]
        
        batch_target = target[offset:(offset + batch_size), :]

        yield batch_data, batch_target, epoch

In [63]:
batch_size = 20
batches = batched(train_data, train_target, batch_size)

In [64]:
epochs = 5

In [65]:
with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())

    for index, batch in enumerate(batches):
        batch_data = batch[0]
        batch_target = batch[1]
    
        epoch = batch[2]

        if epoch >= epochs:
            break
        
        feed = {X: batch_data, y: batch_target}
        train_error, _ = sess.run([error, optimize], feed)
        
        print('{}: {:3.6f}%'.format(index + 1, 100 * train_error))

    test_feed = {X: test_data, y: test_target}
    test_error, _ = sess.run([error, optimize], test_feed)
    
    print('Test error: {:3.6f}%'.format(100 * test_error))

1: 96.507937%
2: 94.188312%
3: 98.202686%
4: 99.166667%
5: 97.230159%
6: 97.515873%
7: 99.375000%
8: 96.750000%
9: 97.989899%
10: 94.434829%
11: 94.657051%
12: 94.795635%
13: 96.041667%
14: 95.708333%
15: 96.942280%
16: 96.013709%
17: 91.539627%
18: 90.376984%
19: 94.045455%
20: 97.531746%
21: 95.351190%
22: 90.063492%
23: 91.362734%
24: 93.755952%
25: 86.968323%
26: 93.025794%
27: 94.507937%
28: 92.019481%
29: 93.936508%
30: 92.494048%
31: 93.974026%
32: 91.010046%
33: 91.984127%
34: 91.887210%
35: 96.136142%
36: 87.591270%
37: 92.419261%
38: 86.050866%
39: 90.924603%
40: 91.439103%
41: 93.244048%
42: 96.869658%
43: 87.898268%
44: 92.257881%
45: 88.691919%
46: 91.973485%
47: 84.680986%
48: 86.155067%
49: 89.682540%
50: 91.529526%
51: 89.920455%
52: 90.040598%
53: 90.514319%
54: 86.238095%
55: 86.718198%
56: 90.807179%
57: 87.794913%
58: 87.966880%
59: 88.946553%
60: 86.400683%
61: 92.625000%
62: 88.965146%
63: 90.847527%
64: 92.705128%
65: 89.140873%
66: 87.197330%
67: 87.297536%
68: 

## Bi-directional RNN

In [66]:
new_output, _ = tf.nn.bidirectional_dynamic_rnn(tf.nn.rnn_cell.GRUCell(num_neurons), 
                                            tf.nn.rnn_cell.GRUCell(num_neurons),
                                            X,
                                            dtype=tf.float64, sequence_length=sequence_length)

In [67]:
new_output

(<tf.Tensor 'bidirectional_rnn/fw/fw/transpose_1:0' shape=(?, 14, 300) dtype=float64>,
 <tf.Tensor 'ReverseSequence:0' shape=(?, 14, 300) dtype=float64>)

In [68]:
new_output = tf.concat([new_output[0], new_output[1]], axis=2)

In [69]:
new_output.shape

TensorShape([Dimension(None), Dimension(14), Dimension(600)])

In [74]:
new_weight = tf.Variable(tf.truncated_normal([num_neurons * 2, num_classes], stddev=0.01, dtype=tf.float64))

In [75]:
new_bias = tf.Variable(tf.constant(0.1, shape=[num_classes], dtype=tf.float64))

In [76]:
new_flattened_output = tf.reshape(new_output, [-1, num_neurons * 2])

In [77]:
new_flattened_output

<tf.Tensor 'Reshape_3:0' shape=(?, 600) dtype=float64>

In [78]:
new_logits = tf.matmul(new_flattened_output, new_weight) + new_bias

In [79]:
new_logits_reshaped = tf.reshape(new_logits, [-1, num_steps, num_classes])

In [80]:
new_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=new_logits, labels=y)

In [81]:
new_loss = tf.reduce_mean(new_cross_entropy)

In [82]:
new_mistakes = tf.not_equal(
            tf.argmax(y, 2), tf.argmax(new_logits_reshaped, 2))
new_mistakes = tf.cast(new_mistakes, tf.float64)
new_mask = tf.sign(tf.reduce_max(tf.abs(y), reduction_indices=2))
new_mistakes *= new_mask

In [83]:
new_mistakes = tf.reduce_sum(new_mistakes, reduction_indices=1)
new_mistakes /= tf.cast(sequence_length, tf.float64)

In [96]:
new_error = tf.reduce_mean(new_mistakes)

In [97]:
new_optimizer = tf.train.RMSPropOptimizer(0.002)

In [98]:
new_gradient = new_optimizer.compute_gradients(new_loss)

In [99]:
new_optimize = new_optimizer.apply_gradients(new_gradient)

In [100]:
new_batch_size = 10
new_batches = batched(train_data, train_target, batch_size)

In [101]:
epochs = 5

In [102]:
with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())

    for index, batch in enumerate(new_batches):
        batch_data = batch[0]
        batch_target = batch[1]
    
        epoch = batch[2]

        if epoch >= epochs:
            break
        
        feed = {X: batch_data, y: batch_target}
        train_error, _ = sess.run([new_error, new_optimize], feed)
        
        print('{}: {:3.6f}%'.format(index + 1, 100 * train_error))

    test_feed = {X: test_data, y: test_target}
    test_error, _ = sess.run([new_error, new_optimize], test_feed)
    
    print('Test error: {:3.6f}%'.format(100 * test_error))

1: 94.835317%
2: 95.771645%
3: 95.962302%
4: 97.083333%
5: 97.833333%
6: 94.634921%
7: 96.754274%
8: 96.234432%
9: 96.709957%
10: 90.875000%
11: 95.105284%
12: 96.291667%
13: 94.428571%
14: 91.462121%
15: 97.934343%
16: 98.276099%
17: 95.279762%
18: 95.307540%
19: 97.355284%
20: 94.527778%
21: 94.535714%
22: 94.962302%
23: 93.702325%
24: 94.734127%
25: 91.563617%
26: 95.684524%
27: 94.494172%
28: 93.122835%
29: 93.227994%
30: 91.396825%
31: 93.168651%
32: 95.512210%
33: 97.021645%
34: 89.133547%
35: 87.401709%
36: 90.138889%
37: 90.149725%
38: 85.769300%
39: 91.716270%
40: 89.998321%
41: 91.791667%
42: 93.797924%
43: 86.864899%
44: 96.720058%
45: 85.143939%
46: 90.830808%
47: 85.748876%
48: 87.949023%
49: 88.696429%
50: 88.022145%
51: 89.785354%
52: 88.344475%
53: 82.385226%
54: 86.859127%
55: 87.028208%
56: 83.237734%
57: 86.271465%
58: 88.228244%
59: 76.482087%
60: 93.111361%
61: 83.587302%
62: 87.428460%
63: 78.310440%
64: 84.064560%
65: 81.557540%
66: 82.932179%
67: 84.621129%
68: 