In [51]:
import tensorflow as tf
import numpy as np
import random
import datetime
import time

import os
import os.path

In [3]:
def iterate_over_dirs(pth, fun, y):
    for x in os.listdir(pth):
        xf = pth + x
        if os.path.isdir(xf):
            y = iterate_over_dirs(xf + '/', fun, y)
        elif os.path.isfile(xf):
            y = fun(y, xf)
    return y

def sample_corpora(files, file_num=10):
    ids = np.random.choice(len(files), size=file_num)
    res = []
    for i in ids:
        with open(file=files[i], mode='rb') as f:
            res += [f.read()]
    return res

def subsample_trunc_corpora(corpora, batch_num=100, batch_size=32):
    data = np.random.choice(corpora, size=batch_num)
    tmp = [(x, random.randint(0, len(x)-batch_size)) for x in data if len(x) >= batch_size]
    return [x[i:(i+batch_size)] for (x,i) in tmp]
    

def gather_chars(files):
    s = []
    for fn in files:
        with open(fn, 'rb') as f:
            s = set(list(s) + [x for x in f.read()])
    return list(s)

def create_encoding(files, chars):
    chars = gather_chars(files)
    encoding = np.zeros((255, len(chars)), dtype=np.float32)
    for i in range(len(chars)):
        encoding[chars[i], i] = 1.0
    return encoding

def encode_file(file, encoding):
    return np.array([encoding[int(i),:] for i in file])

In [23]:
all_files = iterate_over_dirs('D:/Jupyter/Datasets/PaulGraham/', lambda y,x: y + [x], [])
file_corpora = [x for x in all_files if os.path.splitext(x)[1] in ('.txt')]
chars = gather_chars(file_corpora)
encoding = create_encoding(file_corpora, chars)

def next_batch(batch_num=50, batch_size=32, files_num=10):
    texts = subsample_trunc_corpora(sample_corpora(all_files, files_num), batch_num, batch_size)
    return (np.array([[int(i) for i in f] for f in texts]), np.array([encode_file(x, encoding) for x in texts]))

In [16]:
np.argmax(encoding,axis=1)

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  3,
        4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29,  0, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
       54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
       88, 89, 90, 91, 92, 93, 94,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [81]:
sequence_length = 32
sequence_train_len = sequence_length - 1
vocabulary_size = len(chars)

rnn_state_size = 200

tf.reset_default_graph()

tf_input = tf.placeholder(shape=(None, sequence_length), dtype=tf.int32)
tf_coded_input = tf.one_hot(tf_input, vocabulary_size)
tf_x = tf_coded_input[:-1]
tf_y = tf_coded_input[1:]
tf_keep = tf.placeholder(shape=(), dtype=tf.float32)

tf_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(rnn_state_size, state_is_tuple=True),
                                        state_keep_prob=tf_keep)
#tf_cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(rnn_state_size, state_is_tuple=True) for _ in range(3)], state_is_tuple=True)

tf_output, _ = tf.nn.dynamic_rnn(tf_cell, inputs=tf_x, dtype=tf.float32)

tf_out_res = tf.layers.dense(tf_output, vocabulary_size)

tf_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=tf_out_res, labels=tf_y))
tf_accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(tf_y,axis=2), tf.argmax(tf_out_res,axis=2)), dtype=tf.float32))
tf_train = tf.train.AdamOptimizer(1e-1).minimize(tf_loss)

tf_1char = tf.placeholder(shape=(), dtype=tf.int32)
tf_state1 = tf.placeholder(shape=(rnn_state_size), dtype=tf.float32)
tf_state2 = tf.placeholder(shape=(rnn_state_size), dtype=tf.float32)
tf_1x = tf.one_hot(tf_input, vocabulary_size)
tf_1y, tf_newstate = tf_cell(tf_1x, (tf_state1, tf_state2))

tffw = tf.summary.FileWriter('D:/Jupyter/Logs/09B_Test1', tf.get_default_graph())

ValueError: Shape (?, 32, 95) must have rank 2

In [78]:
num_epoch = 20
num_step = 20
batch_size = 100

valid_x,_ = next_batch(batch_size=sequence_length, files_num=50, batch_num=1000)
valid_batch = {tf_input: valid_x, tf_keep: 1.0}

with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())
    for i in range(num_epoch):
        train_x,_ = next_batch(batch_size=sequence_length, files_num=10, batch_num=batch_size)
        train_batch = {tf_input: train_x, tf_keep:0.5}
        
        train0 = tf_loss.eval(feed_dict=train_batch)
        time0 = time.perf_counter()
        for j in range(num_step):
            tf_train.run(feed_dict=train_batch)
        time1 = time.perf_counter()
        train1 = tf_loss.eval(feed_dict=train_batch)
        
        valid_loss, valid_acc = tfs.run([tf_loss, tf_accuracy], feed_dict=valid_batch)
        print('Epoch: {0}, {1:1.3} -> {2:1.3} in {3:1.2} sec\tvalidation loss={4:1.3}, accuracy={5:1.3}'.format(i,
                        train0, train1, time1-time0, valid_loss, valid_acc))


Epoch: 0, 1.09 -> 0.48 in 4.7 sec	validation loss=0.579, accuracy=0.0243
Epoch: 1, 0.57 -> 0.484 in 5.2 sec	validation loss=0.492, accuracy=0.156
Epoch: 2, 0.557 -> 0.56 in 4.7 sec	validation loss=0.523, accuracy=0.0242
Epoch: 3, 0.489 -> 0.467 in 4.6 sec	validation loss=0.46, accuracy=0.152
Epoch: 4, 0.508 -> 0.504 in 5.0 sec	validation loss=0.497, accuracy=0.152
Epoch: 5, 0.554 -> 0.596 in 4.8 sec	validation loss=0.602, accuracy=0.156
Epoch: 6, 0.63 -> 0.593 in 4.6 sec	validation loss=0.464, accuracy=0.156


KeyboardInterrupt: 

In [79]:
tf_cell.name

AttributeError: 'DropoutWrapper' object has no attribute 'name'