In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib
import random
import datetime

import os
import os.path

In [2]:
def iterate_over_dirs(pth, fun, y):
    for x in os.listdir(pth):
        xf = pth + x
        if os.path.isdir(xf):
            y = iterate_over_dirs(xf + '/', fun, y)
        elif os.path.isfile(xf):
            y = fun(y, xf)
    return y

def sample_corpora(files, file_num=10):
    ids = np.random.choice(len(files), size=file_num)
    res = []
    for i in ids:
        with open(file=files[i], mode='rb') as f:
            res += [f.read()]
    return res

def subsample_trunc_corpora(corpora, batch_num=100, batch_size=32):
    data = np.random.choice(corpora, size=batch_num)
    tmp = [(x, random.randint(0, len(x)-batch_size)) for x in data if len(x) >= batch_size]
    return [x[i:(i+batch_size)] for (x,i) in tmp]
    

def gather_chars(files):
    s = []
    for fn in files:
        with open(fn, 'rb') as f:
            s = set(list(s) + [x for x in f.read()])
    return list(s)

def create_encoding(files, chars):
    chars = gather_chars(files)
    encoding = np.zeros((255, len(chars)), dtype=np.float32)
    for i in range(len(chars)):
        encoding[chars[i], i] = 1.0
    return encoding

def encode_file(file, encoding):
    return np.array([encoding[int(i),:] for i in file])

In [3]:
all_files = iterate_over_dirs('D:/Jupyter/Datasets/PaulGraham/', lambda y,x: y + [x], [])
file_corpora = [x for x in all_files if os.path.splitext(x)[1] in ('.txt')]
chars = gather_chars(file_corpora)
encoding = create_encoding(file_corpora, chars)

def next_batch(batch_num=50, batch_size=32, files_num=10):
    texts = subsample_trunc_corpora(sample_corpora(all_files, files_num), batch_num, batch_size)
    return np.array([encode_file(x, encoding) for x in texts])

In [39]:
print(len(file_corpora))
print(encoding.shape)

172
(255, 95)


In [54]:
num_units = 512
num_layers = 10
num_out = encoding.shape[1]

def basic_cell():
    return tf.contrib.rnn.BasicRNNCell(num_units=num_units, activation=tf.nn.relu)

tf.reset_default_graph()

tfLength = tf.placeholder(shape=(None), dtype=tf.int32)
tfSample = tf.placeholder(shape=(None, None, num_out), dtype=tf.float32)
#tfRNN = tf.contrib.rnn.BasicRNNCell(num_units=num_units, activation=tf.nn.relu)
#tfRNN = tf.nn.rnn_cell.GRUCell(num_units=num_units, activation=tf.nn.relu)
#tfRNN0 = tf.contrib.rnn.BasicRNNCell(num_units=num_units, activation=tf.nn.relu)
tfRNN  = tf.contrib.rnn.MultiRNNCell([basic_cell() for _ in range(num_layers)], state_is_tuple=True)

tfTOut,_ = tf.nn.dynamic_rnn(tfRNN, inputs=tfSample, sequence_length=tfLength, dtype=tf.float32)

tfTRes = tf.layers.dense(tfTOut, num_out, name='FNN')
tfTW = tf.get_default_graph().get_tensor_by_name(os.path.split(tfTRes.name)[0] + '/kernel:0')
tfTB = tf.get_default_graph().get_tensor_by_name(os.path.split(tfTRes.name)[0] + '/bias:0')

tfLoss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tfSample[:,1:,:], logits=tfTRes[:,:-1,:]))

#tfGenInput = tf.Variable(tf.zeros((1, num_out)), dtype=tf.float32)
#tfGenState = tf.Variable(tf.zeros((1, num_units)), dtype=tf.float32)
#tfOutput, tfNewState = tfRNN(inputs=tfGenInput, state=tfGenState)
#tfOutputF = tf.matmul(tfOutput, tfTW) + tfTB
#tfOutputP = tf.nn.softmax(tfOutputF)
#tfNewInput = tf.one_hot(tf.multinomial(logits=tfOutputF, num_samples=1), depth=encoding.shape[1])

#tfRunInit = tf.group(tf.assign(tfGenState, tf.zeros((1, num_units))), tf.assign(tfGenInput, tf.zeros((1, num_out))))
#tfRunStep = tf.group(tf.assign(tfGenState, tfNewState), tf.assign(tfGenInput, tf.reshape(tfNewInput, (1, num_out))))

tfTrain = tf.train.AdamOptimizer(1e-2).minimize(tfLoss)

tfLossSummary = tf.summary.scalar('Loss', tfLoss)

tffw = tf.summary.FileWriter('D:/Jupyter/Logs/TEST_RNN_{0}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")), tf.get_default_graph())

In [55]:
num_steps = 5
num_epochs = 200

valid_corpora = next_batch(batch_num=1000)
valid_batch = {tfSample: valid_corpora, tfLength: valid_corpora.shape[1]}
with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())
    
    for i in range(num_epochs):
        batch = next_batch(batch_num=100, batch_size=32)
        train_batch = {tfSample: batch, tfLength: batch.shape[1]}
        start_loss = tfLoss.eval(feed_dict=train_batch)
        for j in range(num_steps):
            tfTrain.run(feed_dict=train_batch)
            
        #tfRunInit.run()
        #res = []
        #for k in range(20):
        #    tfRunStep.run()
        #    res += [np.argmax(tfOutputP.eval())]
        
        tffw.add_summary(tfLossSummary.eval(feed_dict=valid_batch), i)
        print('{0:0.5} -> {1:0.5}'.format(start_loss, tfLoss.eval(feed_dict=train_batch)))
        #if i%10 == 0:
            #print(res)
        #    print(''.join([chr(chars[x]) for x in res]))

4.5542 -> 56.921
57.462 -> 4.0074
3.971 -> 3.9858
3.9055 -> 3.2575
3.27 -> 3.2455
3.3049 -> 3.2192
3.1755 -> 3.1279
3.161 -> 3.1493
3.2191 -> 3.2015
3.1324 -> 3.1231
3.1697 -> 3.1601
3.1803 -> 3.1691
3.1728 -> 3.1658
3.1749 -> 3.1677
3.1725 -> 3.1613
3.1793 -> 3.1709
3.1929 -> 3.1814
3.1774 -> 3.1698
3.1188 -> 3.1134
3.1416 -> 3.1322
3.1679 -> 3.1581
3.1654 -> 3.1551
3.1669 -> 3.1505
3.156 -> 3.1365
3.1519 -> 3.0989
3.1489 -> 3.0666
3.0536 -> 3.0021
3.0162 -> 2.8799
2.9419 -> 2.865
2.9112 -> 2.7965
2.9248 -> 2.8304


KeyboardInterrupt: 

In [105]:
encoding[3,:-1].shape

(94,)

In [148]:
random.choice(['a','b','c'])

'a'

TypeError: randint() missing 1 required positional argument: 'b'