In [56]:
# import data

import os

original_dir_path = '../data/shakespeare/original/'
modern_dir_path = '../data/shakespeare/modern/'

def read_all_files(dir_path):
    docs = ""
    for filename in os.listdir(dir_path):
        with open(dir_path + filename, 'r') as file:
            docs += file.read()
    return docs

original_docs = read_all_files(original_dir_path)
modern_docs = read_all_files(modern_dir_path)

print(original_docs[:100])
print(modern_docs[:100])

I have a mind to strike thee ere thou speak’st.
Yet if thou say Antony lives, is well, Or friends wi
I have half a mind to hit you before you speak again.
But if Antony is alive, healthy, friendly with


In [48]:
# One-hot encode entire dataset

from sklearn.preprocessing import LabelEncoder

dataset = (original_docs + " " + modern_docs).replace(".","").replace("\n", " ").split(" ")
print(dataset[:100])

enc = LabelEncoder()
enc.fit(dataset)
V = len(enc.classes_) #size of vocabulary

['I', 'have', 'a', 'mind', 'to', 'strike', 'thee', 'ere', 'thou', 'speak’st', 'Yet', 'if', 'thou', 'say', 'Antony', 'lives,', 'is', 'well,', 'Or', 'friends', 'with', 'Caesar,', 'or', 'not', 'captive', 'to', 'him,', 'I’ll', 'set', 'thee', 'in', 'a', 'shower', 'of', 'gold', 'and', 'hail', 'Rich', 'pearls', 'upon', 'thee', 'Madam,', 'he’s', 'well', 'Well', 'said', 'And', 'friends', 'with', 'Caesar', 'Th’', 'art', 'an', 'honest', 'man', 'Caesar', 'and', 'he', 'are', 'greater', 'friends', 'than', 'ever', 'Make', 'thee', 'a', 'fortune', 'from', 'me', 'But', 'yet,', 'madam—', 'I', 'do', 'not', 'like', '“But', 'yet”', 'It', 'does', 'allay', 'The', 'good', 'precedence', 'Fie', 'upon', '“But', 'yet”', '“But', 'yet”', 'is', 'as', 'a', 'jailer', 'to', 'bring', 'forth', 'Some', 'monstrous', 'malefactor']


In [133]:
import numpy as np

original_sentences = original_docs.replace('.',"").split('\n')
modern_sentences = modern_docs.replace('.',"").split('\n')

original_sentences = original_sentences
modern_sentences = modern_sentences

max_length = 30

X_org = []
for sentence in original_sentences:
    words = sentence.split(" ")
    try:
        words = words[:30]
    except:
        pass
    word_idx = np.array(enc.transform(words))
    
    arr = np.zeros(max_length)
    arr[:len(words)] = word_idx
    X_org.append(arr)
    
X_modern = []
for sentence in modern_sentences:
    words = sentence.split(" ")
    try:
        words = words[:30]
    except:
        pass
    word_idx = np.array(enc.transform(words))
    
    arr = np.zeros(max_length)
    arr[:len(words)] = word_idx
    X_modern.append(arr)

X_org = np.array(X_org)
X_mod = np.array(X_modern)

In [134]:
import pickle


X_dict = {'X_org': X_org, 'X_mod': X_mod}
pickle_path = '../models/X_shakespeare_ohe.pickle'

with open(pickle_path, 'wb') as f:
    pickle.dump(X_dict, f)

X_dict_loaded = None
with open(pickle_path, 'rb') as f:
    X_dict_loaded = pickle.load(f)

In [85]:
# Overfit Autoencoder

import tensorflow as tf

def dense(x, n1, n2, name):
    with tf.variable_scope(name, reuse=None):
        weights = tf.get_variable("weights", shape=[n1, n2], initializer=tf.random_normal_initializer(mean=0, stddev=0.01))
        bias = tf.get_variable("bias", shape=[n2], initializer=tf.constant_initializer(0.0))
        out = tf.add(tf.matmul(x, weights), bias, name='matmul')
        return out

input_dim = max_length
n_l1 = 100
n_l2 = 100
z_dim = 2

def encoder(x, reuse=False):
    if reuse:
        tf.get_variable_scope().reuse_variables()
    with tf.name_scope('Encoder'):
        e_dense_1 = tf.nn.relu(dense(x, input_dim, n_l1, 'e_dense_1'))
        e_dense_2 = tf.nn.relu(dense(e_dense_1, n_l1, n_l2, 'e_dense_2'))
        latent_variable = dense(e_dense_2, n_l2, z_dim, 'e_latent_variable')
        return latent_variable
    
def decoder(z, reuse=False):
    if reuse:
        tf.get_varaiable_scope().reuse_variables()
    with tf.name_scope('Decodr'):
        d_dense_1 = tf.nn.relu(dense(z, z_dim, n_l2, 'd_dense_1'))
        d_dense_2 = tf.nn.relu(dense(d_dense_1, n_l2, n_l1, 'd_dense_2'))
        output = dense(d_dense_2, n_l2, input_dim, 'd_output')
        return output

In [86]:
tf.reset_default_graph()

learning_rate = 0.01
beta1 = 0.9

x_input = tf.placeholder(tf.float32, [None, max_length])

encoder_output = encoder(x_input)
decoder_output = decoder(encoder_output)

loss = tf.reduce_mean(tf.square(decoder_output - x_input))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta1).minimize(loss)

In [87]:
init = tf.global_variables_initializer()
num_epochs = 10
X = X_org
batch_size = 100

step = 0
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_epochs):
        num_batches = int(X.shape[0] / batch_size)
        for b in range(num_batches):
            batch_x = X[:(b + 1) * batch_size]
            sess.run(optimizer, feed_dict={x_input: batch_x})
            if i % 100 == 0:
                batch_loss = sess.run(loss, feed_dict={x_input: batch_x})
                print("Loss: {0}".format(batch_loss))

Loss: 10412233.0
Loss: 4148458.5
Loss: 4148045.25
Loss: 4146912.75
Loss: 4145893.5
Loss: 4147863.5
Loss: 4146901.0
Loss: 4146691.0
Loss: 4146446.25
Loss: 4146140.25


# LSTM Autoencoder

In [9]:
def lstm_encoder(x, lstm_units=2, reuse=False):
    if reuse:
        tf.get_variable_scope().reuse_variables()
    with tf.variable_scope('Encoder'):
        initializer = tf.contrib.layers.xavier_initializer()
        lstm_fw = tf.nn.rnn_cell.LSTMCell(lstm_units, initializer=initializer)

        outputs, state = tf.nn.dynamic_rnn(lstm_fw, x, dtype=tf.float32)
        return state

def lstm_decoder(x, z, lstm_units=2, reuse=False):
    if reuse:
        tf.get_variable_scope().reuse_variables()
    with tf.variable_scope('Decoder'):
        initializer = tf.contrib.layers.xavier_initializer()
        lstm_fw = tf.nn.rnn_cell.LSTMCell(lstm_units, initializer=initializer)
        
        zero_tensor = tf.zeros_like(x)
        outputs, state = tf.nn.dynamic_rnn(lstm_fw, zero_tensor, initial_state=z)
        return outputs

In [149]:
tf.reset_default_graph()

learning_rate = 0.01
beta1 = 0.9

x_input = tf.placeholder(tf.int32, [None, 30])
#embedding = tf.expand_dims(x_input, axis=2)
embedding = tf.one_hot(x_input, V)

encoder_output = lstm_encoder(embedding, lstm_units=V)
decoder_output = lstm_decoder(embedding, encoder_output, lstm_units=V)

loss = tf.reduce_mean(tf.square(decoder_output - embedding))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta1).minimize(loss)

(?, 30, 29309)


In [None]:
init = tf.global_variables_initializer()
num_epochs = 1000
X = X_org
batch_size = 100

losses = []
step = 0
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_epochs):
        num_batches = int(X.shape[0] / batch_size)
        for b in range(num_batches):
            batch_x = X[:(b + 1) * batch_size]
            sess.run(optimizer, feed_dict={x_input: batch_x})
            
            if b % 10 == 0:
                loss_i = sess.run(loss, feed_dict={x_input: X})
                print("Loss: {0}".format(loss_i))
                losses.append(loss_i)

# Char-Level LSTM Autoencoder

In [1]:
# import data

import os

original_dir_path = '../data/shakespeare/original/'
modern_dir_path = '../data/shakespeare/modern/'

def read_all_files(dir_path):
    docs = ""
    for filename in os.listdir(dir_path):
        with open(dir_path + filename, 'r') as file:
            docs += file.read()
    return docs

original_docs = read_all_files(original_dir_path)
modern_docs = read_all_files(modern_dir_path)

print(original_docs[:100])
print(modern_docs[:100])

I have a mind to strike thee ere thou speak’st.
Yet if thou say Antony lives, is well, Or friends wi
I have half a mind to hit you before you speak again.
But if Antony is alive, healthy, friendly with


In [3]:
# One-hot encode entire dataset

from sklearn.preprocessing import LabelEncoder

dataset = list((original_docs + " " + modern_docs).replace(".","").replace("\n", " "))
print(dataset[:100])

enc = LabelEncoder()
enc.fit(dataset)
V = len(enc.classes_) #size of vocabulary
print(V)

['I', ' ', 'h', 'a', 'v', 'e', ' ', 'a', ' ', 'm', 'i', 'n', 'd', ' ', 't', 'o', ' ', 's', 't', 'r', 'i', 'k', 'e', ' ', 't', 'h', 'e', 'e', ' ', 'e', 'r', 'e', ' ', 't', 'h', 'o', 'u', ' ', 's', 'p', 'e', 'a', 'k', '’', 's', 't', ' ', 'Y', 'e', 't', ' ', 'i', 'f', ' ', 't', 'h', 'o', 'u', ' ', 's', 'a', 'y', ' ', 'A', 'n', 't', 'o', 'n', 'y', ' ', 'l', 'i', 'v', 'e', 's', ',', ' ', 'i', 's', ' ', 'w', 'e', 'l', 'l', ',', ' ', 'O', 'r', ' ', 'f', 'r', 'i', 'e', 'n', 'd', 's', ' ', 'w', 'i', 't']
80


In [7]:
import numpy as np

def matrify_sentences(sentences, encoder, max_length=200):
    X = []
    for sentence in sentences:
        letters = list(sentence)
        try:
            letters = letters[:max_length]
        except:
            pass
        letters_idx = np.array(encoder.transform(letters))
        
        arr = np.full(max_length, -1)
        arr[:len(letters)] = letters_idx
        X.append(arr)
    
    return np.array(X)


original_sentences = original_docs.replace('.',"").split('\n')
modern_sentences = modern_docs.replace('.',"").split('\n')

X_org = matrify_sentences(original_sentences, enc)
X_mod = matrify_sentences(modern_sentences, enc)



In [17]:
import tensorflow as tf

tf.reset_default_graph()

learning_rate = 0.01
beta1 = 0.9
max_length = 20

x_input = tf.placeholder(tf.int32, [None, max_length])
#embedding = tf.expand_dims(x_input, axis=2)
embedding = tf.one_hot(x_input, V)

encoder_output = lstm_encoder(embedding, lstm_units=V)
decoder_output = lstm_decoder(embedding, encoder_output, lstm_units=V)

loss = tf.reduce_mean(tf.square(decoder_output - embedding))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta1).minimize(loss)

In [None]:
init = tf.global_variables_initializer()
num_epochs = 1000
X = X_org[:,:max_length]
batch_size = 100
saved_model_path = '../models/lstm_ae.ckpt'

saver = tf.train.Saver()
losses = []
step = 0
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_epochs):
        num_batches = int(X.shape[0] / batch_size)
        for b in range(num_batches):
            batch_x = X[:(b + 1) * batch_size]
            sess.run(optimizer, feed_dict={x_input: batch_x})
            
            if b % 50 == 0:
                loss_i = sess.run(loss, feed_dict={x_input: batch_x})
                print("Loss: {0}".format(loss_i))
                losses.append(loss_i)
                
                saver.save(sess, saved_model_path, global_step=step)
            step += batch_size
        