In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib
import random
import datetime

import os
import os.path

In [2]:
def iterate_over_dirs(pth, fun, y):
    """Fold ``fun`` over every regular file found under ``pth``, recursively.

    Args:
        pth: Directory to walk. A trailing path separator is optional.
        fun: Callable ``fun(acc, file_path) -> acc`` invoked once per file.
        y: Initial accumulator value.

    Returns:
        The accumulator after ``fun`` has been applied to every file.
        Entries are visited in ``os.listdir`` order, which is not sorted.
    """
    for x in os.listdir(pth):
        # os.path.join only inserts a separator when ``pth`` lacks one, so a
        # caller passing 'data' instead of 'data/' no longer produces bogus
        # paths such as 'datafile.txt' that match neither isdir nor isfile.
        xf = os.path.join(pth, x)
        if os.path.isdir(xf):
            # Keep the explicit '/' so nested paths are spelled exactly as
            # they were when ``pth`` already ended with a separator.
            y = iterate_over_dirs(xf + '/', fun, y)
        elif os.path.isfile(xf):
            y = fun(y, xf)
    return y

def sample_corpora(files, file_num=10):
    """Read the raw bytes of ``file_num`` randomly selected files.

    Indices are drawn with ``np.random.choice`` using its default settings,
    so the same file can be picked (and read) more than once.

    Args:
        files: Sequence of file paths to choose from.
        file_num: How many files to draw.

    Returns:
        A list of ``bytes`` objects, one per drawn index, in draw order.
    """
    picked = np.random.choice(len(files), size=file_num)
    contents = []
    for idx in picked:
        with open(file=files[idx], mode='rb') as handle:
            contents.append(handle.read())
    return contents

def subsample_trunc_corpora(corpora, batch_num=100, batch_size=32):
    """Draw ``batch_num`` random fixed-length chunks from ``corpora``.

    Args:
        corpora: Sequence of sliceable items supporting ``len`` (for example
            the ``bytes`` objects produced by ``sample_corpora``).
        batch_num: Number of chunks to draw.
        batch_size: Length of every chunk.

    Returns:
        A list of ``batch_num`` slices, each ``batch_size`` long, taken at a
        uniformly random offset of a randomly chosen corpus. Returns an
        empty list when no corpus is at least ``batch_size`` long.
    """
    # Drop too-short corpora *before* sampling so that the result always has
    # batch_num entries instead of silently shrinking when a short one is drawn.
    eligible = [c for c in corpora if len(c) >= batch_size]
    if not eligible:
        return []

    # Sample indices rather than the items themselves: handing a list of bytes
    # to np.random.choice converts it to a fixed-width 'S' array, which strips
    # trailing NUL bytes and pads every corpus to the longest one.
    picks = np.random.choice(len(eligible), size=batch_num)

    chunks = []
    for k in picks:
        text = eligible[k]
        # random.randint is inclusive on both ends, so the last valid start
        # offset (len - batch_size) can be chosen.
        start = random.randint(0, len(text) - batch_size)
        chunks.append(text[start:start + batch_size])
    return chunks
    

def gather_chars(files):
    """Collect the distinct byte values that occur in ``files``.

    Args:
        files: Iterable of file paths; each file is read in binary mode.

    Returns:
        A list of the unique byte values (ints) found across all files, in
        ascending order. Sorting makes the position of each value in the
        list independent of set iteration order.
    """
    seen = set()
    for fn in files:
        with open(fn, 'rb') as f:
            # Iterating bytes yields ints; update() adds them in place
            # instead of rebuilding the whole set for every file.
            seen.update(f.read())
    return sorted(seen)

def create_encoding(files, chars=None):
    """Build a one-hot lookup table from byte value to vocabulary column.

    Args:
        files: File paths used to derive the vocabulary when ``chars`` is
            not supplied.
        chars: Sequence of byte values (ints in 0..255); the value at
            position ``i`` is assigned column ``i``. When ``None`` the
            vocabulary is computed with ``gather_chars(files)``.

    Returns:
        A ``float32`` array of shape ``(256, len(chars))``. Row ``b`` is the
        one-hot vector of byte ``b``; rows of bytes that are not in
        ``chars`` are all zeros.
    """
    if chars is None:
        chars = gather_chars(files)
    # A byte can take 256 distinct values (0..255); a 255-row table would
    # raise IndexError for byte 0xFF.
    encoding = np.zeros((256, len(chars)), dtype=np.float32)
    for column, byte_value in enumerate(chars):
        encoding[byte_value, column] = 1.0
    return encoding

def encode_file(file, encoding):
    """Map each element of ``file`` to its row in ``encoding``.

    Args:
        file: Iterable of values convertible with ``int`` (e.g. ``bytes``).
        encoding: 2-D array indexed by those integer values along axis 0.

    Returns:
        ``np.array`` built from the selected rows, in input order.
    """
    rows = []
    for byte_value in file:
        rows.append(encoding[int(byte_value), :])
    return np.array(rows)

In [3]:
# Recursively list every file below the dataset root (machine-specific path).
all_files = iterate_over_dirs('D:/Jupyter/Datasets/PaulGraham/', lambda y,x: y + [x], [])
# Keep only '.txt' files. The extension is compared for equality because
# `in ('.txt')` is a substring test against the plain string '.txt' (the
# parentheses do not make a tuple) and would also accept files with no
# extension, '.t' or '.tx'.
file_corpora = [x for x in all_files if os.path.splitext(x)[1] == '.txt']
# Vocabulary of byte values and the matching one-hot lookup table.
chars = gather_chars(file_corpora)
encoding = create_encoding(file_corpora, chars)

def next_batch(batch_num=50, batch_size=32, files_num=10):
    """Produce one batch of one-hot encoded text chunks.

    Args:
        batch_num: Number of chunks requested from ``subsample_trunc_corpora``.
        batch_size: Length, in bytes, of each chunk.
        files_num: Number of corpus files read to draw the chunks from.

    Returns:
        ``np.array`` stacking ``encode_file(chunk, encoding)`` for every
        chunk returned by ``subsample_trunc_corpora`` (up to ``batch_num``
        of them).
    """
    # Draw from file_corpora, not all_files: the module-level ``encoding`` was
    # built from the '.txt' corpora only, so other files may contain bytes
    # that are outside the vocabulary.
    texts = subsample_trunc_corpora(sample_corpora(file_corpora, files_num), batch_num, batch_size)
    return np.array([encode_file(x, encoding) for x in texts])

In [None]:
def build_basic_rnn_graph_with_list(
    state_size = 100,
    num_classes = vocab_size,
    batch_size = 32,
    num_steps = 200,
    learning_rate = 1e-4):
    """Build a basic-RNN training graph that handles the sequence as a list.

    NOTE(review): the calls ``tf.split(1, num_steps, ...)``, ``tf.nn.rnn`` and
    ``tf.nn.seq2seq.sequence_loss_by_example`` look like a legacy TensorFlow
    API -- confirm against the installed version. ``vocab_size`` and
    ``reset_graph`` are not defined in the visible part of this notebook.

    Args:
        state_size: Size passed to ``BasicRNNCell``.
        num_classes: Depth of the one-hot input and width of the logits.
        batch_size: First dimension of both placeholders.
        num_steps: Second dimension of both placeholders (sequence length).
        learning_rate: Learning rate given to ``AdamOptimizer``.

    Returns:
        dict with the keys ``x``, ``y``, ``init_state``, ``final_state``,
        ``total_loss`` and ``train_step`` pointing at the matching graph nodes.
    """
    reset_graph()

    def per_step(tensor):
        # One entry per time step: split into num_steps pieces and squeeze
        # dimension 1 out of each piece.
        return [tf.squeeze(piece, squeeze_dims=[1])
                for piece in tf.split(1, num_steps, tensor)]

    inputs_ph = tf.placeholder(tf.int32, [batch_size, num_steps], name='input_placeholder')
    labels_ph = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder')

    # One-hot encode the integer inputs, then turn them into a per-step list.
    step_inputs = per_step(tf.one_hot(inputs_ph, num_classes))

    cell = tf.nn.rnn_cell.BasicRNNCell(state_size)
    init_state = cell.zero_state(batch_size, tf.float32)
    step_outputs, final_state = tf.nn.rnn(cell, step_inputs, initial_state=init_state)

    # Projection from RNN state to class logits, shared by all time steps.
    with tf.variable_scope('softmax'):
        softmax_w = tf.get_variable('W', [state_size, num_classes])
        softmax_b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
    logits = []
    for output in step_outputs:
        logits.append(tf.matmul(output, softmax_w) + softmax_b)

    step_labels = per_step(labels_ph)

    # Weight of 1 for every example at every step.
    unit_weights = [tf.ones([batch_size]) for _ in range(num_steps)]
    per_example_loss = tf.nn.seq2seq.sequence_loss_by_example(logits, step_labels, unit_weights)
    total_loss = tf.reduce_mean(per_example_loss)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

    return {
        'x': inputs_ph,
        'y': labels_ph,
        'init_state': init_state,
        'final_state': final_state,
        'total_loss': total_loss,
        'train_step': train_step,
    }