In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn

%matplotlib inline

%load_ext autoreload
%autoreload 2

from utils import tokenizer, colouring

# Definitions 

In [40]:
# Training batch size and the number of characters fed to the model per sequence.
BATCH_SIZE = 32
SEQ_LENGTH = 1000

# Special symbols: sequence start marker, sequence end marker and padding.
START_OF_SEQ = "~"
END_OF_SEQ = "[END]"
PADDING = "#"

# Vocabulary: padding first (so it gets index 0), then every other symbol in
# sorted order.
characters = [PADDING] + sorted(
    [*"abcdefghijklmnopqrstuvwxyz1234567890 \n.,():;-!?'\"", START_OF_SEQ, END_OF_SEQ]
)
N_CHARACTERS = len(characters)

# Lookup tables in both directions between symbols and their integer indices.
char_to_idx = dict(zip(characters, range(N_CHARACTERS)))
idx_to_chr = dict(enumerate(characters))

# Load Data 

In [41]:
# Load the raw lyrics table; later cells read its `lyrics` column.
lyrics = pd.read_csv("data/lyrics.csv")

In [42]:
def fix_seq(lst):
    """Encode one lyric as a list of character indices.

    The text is passed through ``tokenizer.normalise``, wrapped in the
    start/end markers, right-padded with ``PADDING`` up to ``SEQ_LENGTH + 1``
    symbols and finally mapped through ``char_to_idx``.

    A text that is already longer than ``SEQ_LENGTH + 1`` symbols once the
    markers are added is returned unpadded (and therefore longer). A symbol
    missing from ``char_to_idx`` raises ``KeyError``.
    """
    symbols = [START_OF_SEQ, *tokenizer.normalise(lst), END_OF_SEQ]
    # A negative count simply yields no padding.
    missing = (SEQ_LENGTH + 1) - len(symbols)
    symbols.extend([PADDING] * missing)
    return [char_to_idx[symbol] for symbol in symbols]

# Character count per lyric, computed once. `.str.len()` yields NaN for a
# missing lyric, which fails both comparisons below, so such rows are dropped
# instead of raising TypeError the way `apply(len)` does on a NaN cell.
lyric_lengths = lyrics.lyrics.str.len()

# Keep lyrics longer than 200 characters that also fit within SEQ_LENGTH.
lyrics = lyrics[
    (lyric_lengths <= SEQ_LENGTH) &
    (lyric_lengths > 200)
].copy()

lyrics["encoded"] = lyrics.lyrics.apply(fix_seq)

# fix_seq leaves over-long sequences unpadded, so anything that ended up longer
# than SEQ_LENGTH + 1 once the start/end markers were added is dropped here.
lyrics = lyrics[lyrics.encoded.apply(len) == SEQ_LENGTH + 1].copy()

In [43]:
# Flag roughly 95% of the rows for training (1) and the rest for validation (0).
# A dedicated, seeded RandomState keeps the split identical across kernel
# restarts: the model weights are reloaded from disk in a later cell, so an
# unseeded split would let rows trained on in an earlier session end up in this
# session's validation set. The global NumPy RNG is left untouched.
SPLIT_SEED = 0
lyrics["is_train"] = np.random.RandomState(SPLIT_SEED).binomial(1, 0.95, size=lyrics.shape[0])

# Define the Model 

In [44]:
from keras.models import Model
from keras.layers import GRU, TimeDistributed, Dense, Embedding, Input, Dropout
from keras.optimizers import Adam

def build_generator(batch_size, seq_length, n_characters, stateful=False, dropout=0.2):
    """Build and compile the character-level GRU language model.

    Args:
        batch_size: fixed batch dimension of the model input.
        seq_length: number of character indices per input sequence.
        n_characters: vocabulary size; also the width of the one-hot embedding
            and of the softmax output.
        stateful: passed to both GRU layers.
        dropout: rate of the Dropout layer that follows each GRU layer.

    Returns:
        A compiled Keras ``Model`` mapping ``(batch_size, seq_length)`` index
        arrays to a per-time-step softmax over ``n_characters``.
    """
    token_ids = Input(batch_shape=(batch_size, seq_length))

    # Frozen identity-matrix embedding, i.e. a one-hot encoding of each index.
    # Index 0 is treated as a mask value (mask_zero=True).
    one_hot = Embedding(
        n_characters,
        n_characters,
        input_length=seq_length,
        batch_input_shape=(batch_size, seq_length),
        mask_zero=True,
        weights=[np.eye(n_characters)],
        trainable=False,
    )
    hidden = one_hot(token_ids)

    # Two stacked 512-unit GRU layers, each followed by dropout.
    for units in (512, 512):
        hidden = GRU(units, return_sequences=True, stateful=stateful)(hidden)
        hidden = Dropout(dropout)(hidden)

    # Distribution over the vocabulary at every time step.
    char_probs = TimeDistributed(Dense(n_characters, activation='softmax'))(hidden)

    model = Model(inputs=token_ids, outputs=char_probs)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(clipvalue=10., lr=0.0001),
        metrics=['accuracy'],
    )
    return model

In [45]:
# Training model: consumes batches of BATCH_SIZE sequences of SEQ_LENGTH indices.
generator = build_generator(BATCH_SIZE, SEQ_LENGTH, N_CHARACTERS)
# Same architecture with batch size 1 and sequence length 1, in a stateful and
# a stateless variant; the stateful one is used for sampling further down.
generator_samp_1 = build_generator(1, 1, N_CHARACTERS, stateful=True)
generator_samp_1_stateless = build_generator(1, 1, N_CHARACTERS, stateful=False)

# Resume from previously saved weights (the file is written by the save cell
# further down, so it must already exist when this cell runs).
generator.load_weights("data/rnn_weights.h5")

generator.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (32, 1000)                0         
_________________________________________________________________
embedding_10 (Embedding)     (32, 1000, 52)            2704      
_________________________________________________________________
gru_19 (GRU)                 (32, 1000, 512)           867840    
_________________________________________________________________
dropout_13 (Dropout)         (32, 1000, 512)           0         
_________________________________________________________________
gru_20 (GRU)                 (32, 1000, 512)           1574400   
_________________________________________________________________
dropout_14 (Dropout)         (32, 1000, 512)           0         
_________________________________________________________________
time_distributed_10 (TimeDis (32, 1000, 52)            26676     
Total para

In [46]:
def generate_samples(generator_sampler, batch_size=BATCH_SIZE, n=500, T=1):
    """Sample ``batch_size`` character-index sequences of length ``n``.

    Every sequence starts with the index of ``START_OF_SEQ``. Each following
    index is drawn from the sampler's predicted distribution after temperature
    scaling (``p ** (1 / T)``, renormalised), and is fed back as the next input.

    Args:
        generator_sampler: model exposing ``reset_states()`` and ``predict()``.
            Its prediction is expected to be 3-D with the character
            distribution on the last axis and a single time step per call
            (assumption based on how the sampler models are built here).
        batch_size: number of sequences sampled in parallel.
        n: number of indices per sequence, including the start symbol.
        T: sampling temperature; must be positive. Values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        Integer array of shape ``(batch_size, n)`` holding character indices.
    """
    generator_sampler.reset_states()

    start_idx = char_to_idx[START_OF_SEQ]
    current_characters = np.full((batch_size, 1), start_idx)
    # Indices are integers, so keep the record integer-typed.
    record = np.zeros((batch_size, n), dtype=int)
    chrs = np.arange(N_CHARACTERS)
    record[:, 0] = start_idx

    for i in range(1, n):
        pred = generator_sampler.predict(current_characters, batch_size=batch_size)
        # Renormalise in float64 so the probabilities handed to
        # np.random.choice sum to 1 within its tolerance.
        pred = np.exp(np.log(pred.astype(np.float64)) / T)
        # keepdims=True keeps the sum broadcastable against pred for any batch
        # size; without it the in-place division only works for batch_size == 1.
        pred /= np.sum(pred, axis=2, keepdims=True)

        sampled = np.apply_along_axis(
            lambda p: np.random.choice(chrs, p=p), 2, pred
        ).reshape(-1)
        record[:, i] = sampled
        # Feed the samples back with the same (batch_size, 1) shape as the
        # initial input.
        current_characters = sampled.reshape(-1, 1)

    return record

@np.vectorize
def to_characters(idx):
    """Translate a character index (element-wise for arrays) via ``idx_to_chr``."""
    return idx_to_chr[idx]

# Training 

In [36]:
def pad_with_zeros(a, batch_size=BATCH_SIZE):
    """Append all-zero rows so the row count of ``a`` is a multiple of ``batch_size``.

    Args:
        a: 2-D array of shape ``(n_rows, length)``.
        batch_size: the row count of the result is rounded up to a multiple of
            this value.

    Returns:
        A new array containing the rows of ``a`` followed by the zero rows
        needed to reach the next multiple of ``batch_size``. When ``n_rows`` is
        already a multiple, no rows are added (previously a whole extra batch
        of zero rows was appended in that case).
    """
    n_rows, length = a.shape
    # (-n) % b is the distance from n up to the next multiple of b, 0 if aligned.
    extra_rows = (-n_rows) % batch_size
    padding = np.zeros((extra_rows, length), dtype=a.dtype)
    # concatenate always returns a fresh array, so callers may shuffle the
    # result in place without touching ``a``.
    return np.concatenate([a, padding])

def _to_inputs_and_targets(rows):
    """Turn the encoded sequences of ``rows`` into shuffled (inputs, targets).

    The sequences are stacked, padded to a whole number of batches and shuffled
    row-wise. Targets are the sequences shifted one step to the left (with a
    trailing singleton axis); inputs are the first SEQ_LENGTH columns.
    """
    sequences = pad_with_zeros(np.vstack(rows.encoded.values))
    # Shuffle before splitting so inputs and targets stay aligned row by row.
    np.random.shuffle(sequences)
    targets = np.expand_dims(sequences[:, 1:], -1)
    inputs = sequences[:, :SEQ_LENGTH]
    return inputs, targets


X, y = _to_inputs_and_targets(lyrics[lyrics.is_train == 1])
X_test, y_test = _to_inputs_and_targets(lyrics[lyrics.is_train == 0])

In [48]:
# split into sub batches
# The model is built with a fixed batch dimension, so every sub-batch has to
# hold a whole number of BATCH_SIZE-sized batches. The previous expression
# reduced to int(X.shape[0] / 10), which is only a multiple of BATCH_SIZE by
# coincidence. Rows beyond N_SUB_BATCHES * mini_batch_size are not used.
N_SUB_BATCHES = 10
mini_batch_size = ((X.shape[0] // BATCH_SIZE) // N_SUB_BATCHES) * BATCH_SIZE

for e in range(2):
    for i in range(N_SUB_BATCHES):
        X_sub = X[i*mini_batch_size:(i+1)*mini_batch_size]
        y_sub = y[i*mini_batch_size:(i+1)*mini_batch_size]

        # One epoch per sub-batch, validating after each; only the history of
        # the most recent fit call is kept in `training_history`.
        training_history = generator.fit(
            X_sub, 
            y_sub,
            epochs=1,
            batch_size=BATCH_SIZE,
            validation_data=(X_test, y_test)
        )
        
# after 4 minibatches:
# val_acc: 0.54

# after 10:
# val_acc: 0.606

# smaller batches
# LL: 1.27
# acc: 0.602

Train on 10432 samples, validate on 5440 samples
Epoch 1/1
Train on 10432 samples, validate on 5440 samples
Epoch 1/1
Train on 10432 samples, validate on 5440 samples
Epoch 1/1
Train on 10432 samples, validate on 5440 samples
Epoch 1/1
Train on 10432 samples, validate on 5440 samples
Epoch 1/1
Train on 10432 samples, validate on 5440 samples
Epoch 1/1
Train on 10432 samples, validate on 5440 samples
Epoch 1/1
Train on 10432 samples, validate on 5440 samples
Epoch 1/1
Train on 10432 samples, validate on 5440 samples
Epoch 1/1
  960/10432 [=>............................] - ETA: 6624s - loss: 1.2709 - acc: 0.6022

KeyboardInterrupt: 

In [54]:
# Copy the trained weights into the single-step stateful sampler model.
generator_samp_1.set_weights(generator.get_weights())

# Draw one sequence at temperature 0.6.
samp = generate_samples(generator_samp_1, batch_size=1,T=0.6)

# Decode the sampled indices back to characters and show the text.
print("".join(to_characters(np.asarray(samp[0]))))

~come down so over the screams
the hunger of the dead
the stench of the control
black times of death in the clouds
the trees seem to be seen
the real the shadows of the spirit

the endless world in the fire
on the other side
to the desire
where the time is dead

it was a flow in the mirror, the same as the mess
the time has came to heaven
and show me the way the sign of the wind
and the sun begins to find

a war is reality
there is a light to stay

the soul of the dark and she forces
the fire is


In [53]:
# Persist the trained weights (overwriting the file loaded earlier) and the
# model architecture as JSON.
generator.save_weights("data/rnn_weights.h5", overwrite=True)
with open("data/rnn_model.json", "w") as f:
    f.write(generator.to_json())

  str(node.arguments) + '. They will not be included '
  str(node.arguments) + '. They will not be included '
  str(node.arguments) + '. They will not be included '


# Colouring 

In [56]:
# Search corpus for is_in_lyrics: all remaining lyrics concatenated and passed
# through tokenizer.normalise.
full_text = tokenizer.normalise("".join(lyrics.lyrics.values))

def is_in_lyrics(s):
    """Return whether ``s`` is contained in the corpus ``full_text``."""
    occurs = s in full_text
    return occurs

def colour_based_on_existance(s):
    """Split ``s`` into chunks that occur in the lyrics and colour each one.

    Starting from the left, each chunk is grown greedily for as long as it
    still occurs in the corpus (per ``is_in_lyrics``), so every chunk longer
    than one character appears verbatim in the lyrics. A chunk is always at
    least one character long, so a character that never occurs in the corpus
    forms a chunk of its own. Previously each chunk also swallowed the first
    character that broke the match, which made the coloured chunks one
    character too long.

    Args:
        s: the text to colour.

    Returns:
        List of HTML snippets, one per chunk in order, each produced by
        ``colouring.colour_text_background_html`` with the next colour from a
        fresh ``colouring.ColourIter``. An empty ``s`` yields a single snippet
        for the empty string.
    """
    col = colouring.ColourIter()
    n = len(s)

    if n == 0:
        # Keep the historical result for empty input: one (empty) chunk.
        return [colouring.colour_text_background_html(s, col())]

    completed = []
    start = 0
    while start < n:
        end = start + 1
        # Extend the chunk while the one-character-longer candidate still
        # occurs in the corpus.
        while end < n and is_in_lyrics(s[start:end + 1]):
            end += 1
        completed.append(colouring.colour_text_background_html(s[start:end], col()))
        start = end

    return completed

In [60]:
# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Sample one sequence at temperature 0.6 and decode it to text. The raw indices
# get their own name so `samp` only ever holds the decoded string here.
sample_indices = generate_samples(generator_samp_1, batch_size=1, T=0.6)
samp = "".join(to_characters(np.asarray(sample_indices[0])))

# Colour the chunks, join them into one paragraph and turn newlines into <br>.
htmled = "<p>{}</p>".format("".join(colour_based_on_existance(samp)).replace("\n", "<br>"))

display(HTML(htmled))

In [61]:
# Show the raw HTML markup behind the rendering above.
print(htmled)

<p><span style="color:#ff7f0e">~</span><span style="color:#2ca02c">far away from the sky<br>w</span><span style="color:#d62728">here we will be f</span><span style="color:#9467bd">orgotten<br>the fi</span><span style="color:#8c564b">nal beauty f</span><span style="color:#e377c2">or a way<br>in</span><span style="color:#7f7f7f"> this enemy o</span><span style="color:#bcbd22">f sun<br>and the l</span><span style="color:#17becf">and of a bl</span><span style="color:#1f77b4">ood<br><br>we are w</span><span style="color:#ff7f0e">hat it is a </span><span style="color:#2ca02c">life of hope<br></span><span style="color:#d62728">the blood of the world w</span><span style="color:#9467bd">e start to de</span><span style="color:#8c564b">ceive<br>the wo</span><span style="color:#e377c2">rld is what i f</span><span style="color:#7f7f7f">eel in the wi</span><span style="color:#bcbd22">nd<br><br>a shadow o</span><span style="color:#17becf">f the same roa</span><span style="color:#1f77b4">d<br>the air 