In [1]:
import os, glob
import numpy as np
import random
import json
import copy
import time
import tensorflow as tf
import datetime
import tqdm

  return f(*args, **kwds)


In [2]:
class TextDataSet(object):
    """Iterator yielding class-balanced batches of character-encoded text files.

    Each sub-directory of ``filepath`` is treated as one class.  After
    ``load`` has been called, every ``next()`` pops one filename per class,
    encodes each file as a fixed-length vector of Unicode code points and
    returns it together with a one-hot label matrix.

    Args:
        filepath: Directory whose entries are the class names.
        length: Number of characters kept per document; shorter documents
            are zero-padded, longer ones are truncated.
    """

    def __init__(self, filepath='../data/20news-18828', length=20000):
        self.basepath = filepath
        self.length = length
        self.classes = os.listdir(filepath)
        # Class name -> integer label, in directory-listing order.
        self.class_map = {name: index for index, name in enumerate(self.classes)}
        self.dataset = None

    def load(self, class_map, dataset):
        """Load a class map and a split from JSON files and shuffle each class.

        Args:
            class_map: Path to a JSON file mapping class name -> label.
            dataset: Path to a JSON file mapping str(label) -> list of filenames.
        """
        with open(class_map, 'r') as _file:
            self.class_map = json.load(_file)
        with open(dataset, 'r') as _file:
            self.dataset = json.load(_file)
        # JSON object keys are strings, hence the str() around the label.
        for label in self.class_map.values():
            random.shuffle(self.dataset[str(label)])

    def create_datasets(self):
        """Randomly split the files of every class and write the splits to disk.

        A file goes to test when the random draw is > 0.95, to validation when
        it is > 0.9, and to train otherwise.  Writes ``train.json``,
        ``test.json``, ``val.json`` and ``class_map.json`` to the current
        working directory.
        """
        train = {}
        val = {}
        test = {}

        for name in self.classes:
            label = self.class_map[name]
            train[label] = []
            val[label] = []
            test[label] = []
            for filename in glob.glob(os.path.join(self.basepath, name, '*')):
                r = np.random.random_sample()
                if r > 0.95:
                    test[label].append(filename)
                elif r > 0.9:
                    val[label].append(filename)
                else:
                    train[label].append(filename)
            random.shuffle(train[label])
            random.shuffle(test[label])
            random.shuffle(val[label])

        with open('train.json', 'w') as output:
            json.dump(train, output)
        with open('test.json', 'w') as output:
            json.dump(test, output)
        with open('val.json', 'w') as output:
            json.dump(val, output)

        with open('class_map.json', 'w') as output:
            json.dump(self.class_map, output)

    def get_text(self, filename):
        """Encode the first ``self.length`` characters of a file as code points.

        Args:
            filename: Path of the text file (decoded as UTF-8, undecodable
                bytes are ignored).

        Returns:
            An integer array of shape ``(self.length,)``; positions past the
            end of the file are 0.
        """
        # np.zeros (not np.ndarray) so the padding of short files is
        # well-defined instead of uninitialised memory.
        output = np.zeros(shape=(self.length,), dtype=np.int64)
        with open(filename, 'r', encoding='utf-8', errors='ignore') as input_file:
            # Only the first `length` characters are ever used.
            text = input_file.read(self.length)
        for index, char in enumerate(text):
            output[index] = self.decode_character(char)
        return output

    def decode_character(self, char):
        """Return the code point of ``char``, or 0 on UnicodeDecodeError."""
        try:
            return ord(char)
        except UnicodeDecodeError:
            return 0

    def get_random_filenames(self):
        """Pop one filename per class and return them in random order.

        Returns:
            A pair ``(filenames, labels)`` of equally long lists.

        Raises:
            StopIteration: As soon as any class has no filenames left.
        """
        picked = []
        for label in self.class_map.values():
            try:
                picked.append((self.dataset[str(label)].pop(), label))
            except IndexError:
                raise StopIteration
        random.shuffle(picked)
        return [pair[0] for pair in picked], [pair[1] for pair in picked]

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Return the next batch.

        Returns:
            A pair ``(x, y)``: ``x`` has shape ``(n_classes, self.length)`` and
            holds the encoded documents, ``y`` has shape
            ``(n_classes, n_classes)`` and holds the one-hot labels, where
            ``n_classes == len(self.class_map)``.

        Raises:
            StopIteration: When any class has run out of files.
        """
        filenames, labels = self.get_random_filenames()
        n_classes = len(self.class_map)

        batch_x = np.zeros(shape=(n_classes, self.length))
        for row, filename in enumerate(filenames):
            batch_x[row] = self.get_text(filename)

        # One row per sample, one column per class (was hard-coded to 20x20).
        batch_y = np.zeros(shape=(len(labels), n_classes))
        for row, label in enumerate(labels):
            batch_y[row, label] = 1
        return batch_x, batch_y
    
def get_train_gen():
    """Build a TextDataSet and populate it from the training split.

    Reads ``class_map.json`` and ``train.json`` from the current working
    directory and returns the loaded data set.
    """
    dataset = TextDataSet()
    dataset.load('class_map.json', 'train.json')
    return dataset

def get_test_gen():
    """Build a TextDataSet and populate it from the test split.

    Reads ``class_map.json`` and ``test.json`` from the current working
    directory and returns the loaded data set.
    """
    dataset = TextDataSet()
    dataset.load('class_map.json', 'test.json')
    return dataset

In [1]:
class Model(object):
    """Character-level 1-D convolutional classifier (TensorFlow 1.x graph API).

    After ``compile`` the following attributes hold graph tensors:

    * ``X`` / ``Y``  - input and one-hot label placeholders,
    * ``logits``     - unscaled class scores,
    * ``softmax``    - class probabilities,
    * ``predictions``- arg-max class index per sample.

    Args:
        input_width: Number of characters per input sample.
        n_classes: Number of output classes.
    """

    def __init__(self, input_width=20000, n_classes=20):
        self.input_width = input_width
        self.n_classes = n_classes
        # Populated by compile(); declared here so they always exist.
        self.X = None
        self.Y = None
        self.logits = None
        self.softmax = None
        self.predictions = None

    @staticmethod
    def _valid_output_width(width, kernel_shape, stride):
        """Output width of one VALID convolution/pooling pass."""
        return int((width - kernel_shape) / stride + 1)

    @staticmethod
    def _layer_output_width(width, kernel_shape, stride):
        """Output width of create_conv_layer, i.e. a conv pass followed by a pool pass."""
        after_conv = Model._valid_output_width(width, kernel_shape, stride)
        return Model._valid_output_width(after_conv, kernel_shape, stride)

    def compile(self, train=True):
        """Build the network in the current default graph.

        Args:
            train: When true, dropout (keep probability 0.5) is applied after
                each hidden dense layer.
        """
        # Keep the placeholders on the instance so callers can feed them.
        self.X = tf.placeholder(tf.float32, [None, self.input_width], name="X")
        self.Y = tf.placeholder(tf.float32, [None, self.n_classes], name="Y")

        net = tf.reshape(self.X, shape=[-1, self.input_width, 1])

        # Three conv+pool stages; track the width so reshapes stay consistent
        # for any input_width, not just the default.
        width = self.input_width
        in_channels = 1
        for n_filters in (128, 256, 512):
            net = self.create_conv_layer(net, 3, 2, n_filters, in_channels, width)
            width = self._layer_output_width(width, 3, 2)
            in_channels = n_filters

        net = tf.reshape(net, [-1, width * 512])

        net = tf.layers.dense(inputs=net, units=int(width / 2),
                              activation=tf.nn.relu, name='dense1')
        if train:
            net = tf.nn.dropout(net, 0.5)
        net = tf.layers.dense(inputs=net, units=int(width / 4),
                              activation=tf.nn.relu, name='dense2')
        if train:
            net = tf.nn.dropout(net, 0.5)

        # Logits are left linear: a ReLU here would clamp every class score
        # to be non-negative before the softmax.
        self.logits = tf.layers.dense(inputs=net, units=self.n_classes,
                                      activation=None, name='logits')

        self.softmax = tf.nn.softmax(self.logits, name="softmax_tensor")
        self.predictions = tf.argmax(self.logits, 1, name="predictions")

    def create_conv_layer(self, input_tensor, kernel_shape=3, stride=1,
                          n_filters=64, in_channels=1, input_width=20000):
        """Add a conv1d + ReLU + max-pool stage and return its output tensor.

        Args:
            input_tensor: Tensor of shape ``[batch, input_width, in_channels]``.
            kernel_shape: Kernel width used for both the convolution and the pool.
            stride: Stride used for both the convolution and the pool.
            n_filters: Number of output channels.
            in_channels: Number of input channels.
            input_width: Width of ``input_tensor``; needed for the reshapes.

        Returns:
            Tensor of shape ``[batch, pooled_width, n_filters]``.
        """
        with tf.name_scope('convlayer_{}_{}_{}_{}'.format(kernel_shape, stride, n_filters, in_channels)):
            filter_shape = [kernel_shape, in_channels, n_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[n_filters]), name="b")
            conv = tf.nn.conv1d(input_tensor, W, stride=stride, padding="VALID", name="conv")
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")

            conv_width = self._valid_output_width(input_width, kernel_shape, stride)
            # max_pool needs a 4-D tensor: pool along the width axis only.
            pooled = tf.nn.max_pool(tf.reshape(h, [-1, conv_width, n_filters, 1]),
                                    ksize=[1, kernel_shape, 1, 1],
                                    strides=[1, stride, 1, 1],
                                    padding='VALID',
                                    name="pool")
            pooled_width = self._valid_output_width(conv_width, kernel_shape, stride)
            return tf.reshape(pooled, [-1, pooled_width, n_filters])

In [None]:
class Trainer(object):
    """Builds the loss, accuracy and optimisation ops for a model.

    After ``train`` the attributes ``loss``, ``accuracy``, ``global_step``,
    ``train_op`` and ``session`` refer to the objects created for the model.
    """

    def __init__(self):
        self.loss = None
        self.accuracy = None
        self.global_step = None
        self.train_op = None
        self.session = None

    def train(self, model, n_epochs):
        """Compile ``model`` in a fresh graph and create its training ops.

        Args:
            model: Object exposing ``compile(train=...)`` and, afterwards,
                ``Y``, ``softmax`` and ``predictions`` tensors.
            n_epochs: Accepted for interface compatibility; not used here
                because this method only builds the graph.
        """
        with tf.Graph().as_default():
            sess = tf.Session()
            with sess.as_default():
                model.compile(train=True)

                # The cross-entropy op applies softmax itself, so prefer
                # unscaled logits when the model exposes them and only fall
                # back to model.softmax otherwise.
                logits = getattr(model, "logits", None)
                if logits is None:
                    logits = model.softmax
                losses = tf.nn.softmax_cross_entropy_with_logits(labels=model.Y, logits=logits)
                self.loss = tf.reduce_mean(losses)
                correct_predictions = tf.equal(model.predictions, tf.argmax(model.Y, 1))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

                # The optimiser must be created while the model's graph is
                # still the default graph, i.e. inside this with-block.
                self.global_step = tf.Variable(0, name="global_step", trainable=False)
                optimizer = tf.train.AdamOptimizer(2e-1)
                grads_and_vars = optimizer.compute_gradients(self.loss)
                self.train_op = optimizer.apply_gradients(grads_and_vars,
                                                          global_step=self.global_step)
            # Keep a handle so the caller can run the ops and close the session.
            self.session = sess

In [None]:


    def train(self, sess, n_epochs=100):
        """Run the training loop, logging train and dev summaries.

        For every training batch one optimisation step is executed and its
        summaries are written; directly afterwards one batch from the held-out
        generator is evaluated (without an optimisation step) and logged as the
        dev summary.

        Args:
            sess: TensorFlow session used to run the ops.
            n_epochs: Number of passes over the training generator.

        NOTE(review): ``train_op`` and ``global_step`` are not defined in this
        method; they are resolved as globals at run time - confirm they exist.
        """
        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", self.loss)
        acc_summary = tf.summary.scalar("accuracy", self.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpointing
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")

        # Tensorflow assumes this directory already exists so we need to create it
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())

        sess.run(tf.global_variables_initializer())

        try:
            for epoch in tqdm.tqdm(range(n_epochs)):
                train_gen = get_train_gen()
                # Held-out data: evaluating on the training split would make
                # the dev summaries meaningless.
                test_gen = get_test_gen()

                for x_batch, y_batch in tqdm.tqdm(train_gen):
                    # A single training step
                    feed_dict = {
                        self.X: x_batch,
                        self.Y: y_batch
                    }
                    _, step, summaries, loss, accuracy = sess.run(
                        [train_op, global_step, train_summary_op, self.loss, self.accuracy],
                        feed_dict)
                    train_summary_writer.add_summary(summaries, step)

                    # Fetch one held-out batch, restarting the generator
                    # once it is exhausted.
                    try:
                        dev_x, dev_y = next(test_gen)
                    except StopIteration:
                        test_gen = get_test_gen()
                        dev_x, dev_y = next(test_gen)

                    # Evaluates model on a dev set; inputs and labels must
                    # come from the same held-out batch.
                    feed_dict = {
                        self.X: dev_x,
                        self.Y: dev_y
                    }
                    step, summaries, loss, accuracy = sess.run(
                        [global_step, dev_summary_op, self.loss, self.accuracy],
                        feed_dict)
                    dev_summary_writer.add_summary(summaries, step)
        finally:
            # Flush and release the event files even if training is interrupted.
            train_summary_writer.close()
            dev_summary_writer.close()

In [None]:
# Driver cell: build everything in a fresh graph and start training.
with tf.Graph().as_default():
    # The session is created inside the graph context so it is bound to the new graph.
    sess = tf.Session()
    # Make sess the default session for ops executed during training.
    with sess.as_default():
        # NOTE(review): TextCNN is not defined in any visible cell - confirm
        # which class this is meant to be before running on a fresh kernel.
        cnn = TextCNN()
        cnn.train(sess)

Writing to /home/kwierman/Desktop/textrecog/notebook/runs/1516566887



  0%|          | 0/100 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:08,  8.05s/it][A
2it [00:16,  8.48s/it][A
3it [00:27,  9.07s/it][A
4it [00:38,  9.58s/it][A
5it [00:49,  9.89s/it][A
6it [01:00, 10.14s/it][A
7it [01:12, 10.32s/it][A
8it [01:22, 10.37s/it][A
9it [01:33, 10.36s/it][A
10it [01:43, 10.39s/it][A
11it [01:54, 10.39s/it][A
12it [02:04, 10.41s/it][A
13it [02:15, 10.42s/it][A
14it [02:26, 10.49s/it][A
15it [02:37, 10.50s/it][A
16it [02:48, 10.52s/it][A
17it [02:59, 10.54s/it][A
18it [03:09, 10.55s/it][A
19it [03:20, 10.56s/it][A
20it [03:31, 10.57s/it][A
21it [03:42, 10.58s/it][A
22it [03:53, 10.60s/it][A
23it [04:04, 10.64s/it][A
24it [04:15, 10.66s/it][A
25it [04:26, 10.66s/it][A
26it [04:37, 10.68s/it][A
27it [04:48, 10.70s/it][A
28it [05:00, 10.72s/it][A
29it [05:11, 10.74s/it][A
30it [05:22, 10.76s/it][A
31it [05:33, 10.76s/it][A
32it [05:45, 10.79s/it][A
33it [05:56, 10.79s/it][A
34it [06:08, 10.83s/it][A
35it [06:19, 10.83s/it][A
36