In [1]:
from utils import *

import numpy as np
import pickle
import sys

from keras.initializers import glorot_uniform
from keras.layers import Dense, Activation, Dropout, Input, LSTM, Reshape, Lambda, RepeatVector
from keras.models import load_model, Model
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras import backend as K

Using TensorFlow backend.


In [2]:
data_path = "data/parsed_text.txt"

# Read the parsed corpus from disk when it is available; otherwise fall back to
# get_raw_text() (provided by the `utils` star import — presumably it rebuilds
# the corpus; confirm against utils).
try:
    with open(data_path, 'r', encoding='utf-8') as f:
        text = f.read()
except (OSError, UnicodeError):
    # Only a missing/unreadable/undecodable file triggers the fallback. A bare
    # `except:` would also swallow KeyboardInterrupt and genuine bugs.
    text = get_raw_text()
print(text[:500])

Trip hop  eventually became a  90s punchline  a music press shorthand for  overhyped hotel lounge music.  But today  the much maligned subgenre almost feels like a secret precedent. Listen to any of the canonical Bristol scene albums of the mid late  90s  when the genre was starting to chafe against its boundaries  and you d think the claustrophobic  anxious 21st century started a few years ahead of schedule. Looked at from the right angle  trip hop is part of an unbroken chain that runs from th


In [3]:
cleaned_data_path = "data/cleaned_text.pickle"

# Load the tokenized corpus from the pickle cache, rebuilding (and re-caching)
# it when the cache is missing, truncated or corrupt.
# NOTE(review): pickle.load can execute arbitrary code — only load a cache file
# that was produced by this notebook.
try:
    with open(cleaned_data_path, 'rb') as p:
        cleaned_text = pickle.load(p)
except (OSError, EOFError, pickle.UnpicklingError):
    # Narrowed from a bare `except:` so that KeyboardInterrupt and unrelated
    # errors are no longer silently turned into a full re-tokenization.
    cleaned_text = clean_and_tokenize_text(text)
    with open(cleaned_data_path, 'wb') as p:
        pickle.dump(cleaned_text, p)

# The vocabulary is the set of distinct tokens in the cleaned corpus.
tokens = set(cleaned_text)
vocab_size = len(tokens)

print(cleaned_text[:100], '\n')
print("There are {} words total in the data including periods.".format(len(cleaned_text)))
print("There are {} unique words in the data.".format(vocab_size))

['trip', 'hop', 'eventually', 'became', 'a', '90s', 'punchline', 'a', 'music', 'press', 'shorthand', 'for', 'overhyped', 'hotel', 'lounge', 'music', '.', 'but', 'today', 'the', 'much', 'maligned', 'subgenre', 'almost', 'feels', 'like', 'a', 'secret', 'precedent', '.', 'listen', 'to', 'any', 'of', 'the', 'canonical', 'bristol', 'scene', 'albums', 'of', 'the', 'mid', 'late', '90s', 'when', 'the', 'genre', 'was', 'starting', 'to', 'chafe', 'against', 'its', 'boundaries', 'and', 'you', 'd', 'think', 'the', 'claustrophobic', 'anxious', '21st', 'century', 'started', 'a', 'few', 'years', 'ahead', 'of', 'schedule', '.', 'looked', 'at', 'from', 'the', 'right', 'angle', 'trip', 'hop', 'is', 'part', 'of', 'an', 'unbroken', 'chain', 'that', 'runs', 'from', 'the', 'abrasion', 'of', '80s', 'post', 'punk', 'to', 'the', 'ruminative', 'pop', 'r', 'b'] 

There are 13475146 words total in the data including periods.
There are 162363 unique words in the data.


In [4]:
# Build the two vocabulary lookups from the token list, then express the corpus
# through the word -> index lookup.
(word_to_index, index_to_word) = create_dictionaries(cleaned_text)
numerical_text = convert_text_to_indices(cleaned_text, word_to_index)

# Show the first 20 converted entries as a quick sanity check.
print(numerical_text[0:20])

[115450, 54908, 488, 104690, 158774, 100126, 109416, 158774, 141919, 30589, 46932, 24298, 55461, 10203, 65255, 141919, 42527, 123027, 154888, 30879]


In [11]:
def create_dataset(numerical_text, vocab_size, word_to_index, m=500000, sample_length=32):
    """
       Build one-hot training data from randomly sampled windows of the corpus.

       Each of the `m` examples is a window of `sample_length` consecutive
       entries of `numerical_text`, starting at a random position. For every
       position j >= 1 of the window, the entry at j is one-hot encoded into
       X[i, j] and into the target of the previous step, so the target at step
       t is the input at step t + 1. X[i, 0] and the last target step stay
       all-zero.

       Arguments:
       numerical_text -- sequence of vocabulary indices, each in [0, vocab_size)
       vocab_size -- size of the one-hot axis
       word_to_index -- unused; kept so existing callers keep working
       m -- number of examples to sample
       sample_length -- number of time steps per example

       Returns:
       X -- bool array of shape (m, sample_length, vocab_size)
       y -- bool array of shape (sample_length, m, vocab_size)

       Raises:
       ValueError -- if samples are requested but the corpus is too short to
                     draw a window from

       Note: both arrays are dense, so each holds
       m * sample_length * vocab_size elements.
    """
    # Exclusive upper bound for window start positions.
    max_start = len(numerical_text) - sample_length - 1
    if m > 0 and max_start <= 0:
        raise ValueError(
            "numerical_text has {} entries; more than {} are required for sample_length={}".format(
                len(numerical_text), sample_length + 1, sample_length))

    # `bool` replaces the removed `np.bool` alias; the resulting dtype is the same.
    X = np.zeros((m, sample_length, vocab_size), dtype=bool)
    y = np.zeros((m, sample_length, vocab_size), dtype=bool)

    for i in range(m):
        random_index = np.random.choice(max_start)
        random_sample = numerical_text[random_index:(random_index + sample_length)]

        # Position 0 is skipped on purpose: X[i, 0] stays the zero vector.
        for j in range(1, sample_length):
            # The window already contains vocabulary indices. Looking them up
            # in numerical_text again would pick an unrelated corpus entry.
            index = random_sample[j]
            X[i, j, index] = 1
            y[i, j - 1, index] = 1

    # Put the time axis first: y[t] is the target batch for time step t.
    y = np.swapaxes(y, 0, 1)

    return X, y

In [None]:
# Sample the training windows using create_dataset's default m and sample_length.
# NOTE(review): with those defaults (m=500000, sample_length=32) and the
# vocab_size printed earlier (162363), each dense array has
# 500000 * 32 * 162363 ~= 2.6e12 elements — confirm this fits in memory.
X, y = create_dataset(
    numerical_text,
    vocab_size,
    word_to_index,
)

# Number of units requested from the shared LSTM layer.
n_activations = 128

In [None]:
""" Create global layers in order to share them between the training and generator models. """

# Turns a (vocab_size,) slice into a one-step sequence of shape (1, vocab_size).
reshape = Reshape(target_shape=(1, vocab_size))
# return_state=True makes the layer return its hidden and cell states as well.
lstm = LSTM(units=n_activations, return_state=True)
# Softmax over the vocabulary.
dense = Dense(units=vocab_size, activation='softmax')

In [None]:
def training_model(X, lstm, dense, reshape, vocab_size, sample_length=32):
    """
       Build the training model by applying the shared layers once per time step.

       Arguments:
       X -- unused; kept so existing callers keep working (the model creates
            its own Input tensor)
       lstm -- shared LSTM layer created with return_state=True; its `units`
               attribute sets the size of the initial-state inputs
       dense -- shared output layer applied to the hidden state of every step
       reshape -- shared layer turning a (vocab_size,) slice into a
                  (1, vocab_size) one-step sequence
       vocab_size -- size of the one-hot axis of the input
       sample_length -- number of time steps to unroll

       Returns:
       model -- Model with inputs [sequence, a_0, c_0] and a list of
                `sample_length` outputs, one per time step
    """
    # Initial LSTM states must match the layer's state size, not the vocabulary.
    n_a = lstm.units

    # A separate name keeps the full-sequence input intact. The loop below must
    # slice this tensor at every step, and Model() needs it as its input.
    X_input = Input(shape=(sample_length, vocab_size))
    a_0 = Input(shape=(n_a,))
    c_0 = Input(shape=(n_a,))

    a = a_0
    c = c_0
    outputs = []

    for t in range(sample_length):
        # `t=t` binds the current step to this Lambda so the slice does not
        # depend on the loop variable's later values.
        x_t = Lambda(lambda seq, t=t: seq[:, t, :])(X_input)
        x_t = reshape(x_t)
        # With return_state=True the layer returns (output, hidden state, cell state).
        a, _, c = lstm(x_t, initial_state=[a, c])
        output = dense(a)
        outputs.append(output)

    model = Model(inputs=[X_input, a_0, c_0], outputs=outputs)

    return model

In [None]:
# Assemble the unrolled training model from the shared layers and compile it.
model = training_model(X, lstm, dense, reshape, vocab_size)
optimizer = Adam(lr=0.01)
# The keyword is `metrics` (plural); `metric=` is not a Model.compile argument,
# so accuracy was never registered as a metric.
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])