# Abstracts Character Language Model

Train a character-level language model (a stateful LSTM over one-hot encoded characters) to generate ct.gov abstracts.

### Load Data

In [146]:
%store -r abstracts_char_lm

embeddings = abstracts_char_lm['embeddings']
abstracts_padded = abstracts_char_lm['abstracts_padded']
char2idx, idx2char = abstracts_char_lm['char2idx'], abstracts_char_lm['idx2char']

num_abstracts, maxlen = len(abstracts_padded), len(abstracts_padded[0])
vocab_dim, num_chars = embeddings.shape

### Build Keras Model

In [147]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

HIDDEN_DIM = 32

# Input layer: looks up each character index in the pre-computed embedding matrix.
# One character per abstract is fed at a time, hence the (num_abstracts, 1) batch shape.
char_embedding = Embedding(
    input_dim=vocab_dim,
    output_dim=num_chars,
    weights=[embeddings],
    batch_input_shape=(num_abstracts, 1),
    input_length=1,
    trainable=False,  # freeze the one-hot embeddings
    mask_zero=True,   # don't count front-padded zeros as characters
)

# Stateful recurrent layer: its state persists across batches until
# model.reset_states() is called.
recurrent = LSTM(output_dim=HIDDEN_DIM, stateful=True)

# Output layer: a softmax with one unit per vocabulary entry (next-character distribution).
next_char_softmax = Dense(output_dim=vocab_dim, activation='softmax')

model = Sequential()
for layer in (char_embedding, recurrent, next_char_softmax):
    model.add(layer)

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

### Train Over Several Epochs

In [148]:
NUM_EPOCHS = 1
MASK_VALUE = 0

# Row index of every abstract; it never changes, so build it once rather than per step.
abstract_rows = np.arange(num_abstracts)

for _ in range(NUM_EPOCHS):

    model.reset_states() # start off at zero initialization

    # Go through each of the abstracts character by character, in parallel:
    # at step t the input is column t and the target is column t+1.
    for t in range(maxlen-1):
        if not t % 100:
            print(t) # progress marker; the parenthesised form runs on Python 2 and 3

        next_chars = abstracts_padded[:, t+1]
        non_zero = next_chars != MASK_VALUE

        # Desired one-hot output vectors; rows whose next character is the
        # mask value are left all-zero.
        ys = np.zeros([num_abstracts, vocab_dim])
        ys[abstract_rows[non_zero], next_chars[non_zero]] = 1

        model.train_on_batch(abstracts_padded[:, [t]], ys)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900


### Sample from the RNN

In [149]:
def sentence_generator(sent_len=100):
    """Sample characters one at a time from the char nn.

    Resets the model's recurrent state, feeds it the 'START' index, and then
    repeatedly samples the next character from the predicted distribution,
    feeding each sampled character back in as the next input.

    Args:
        sent_len: number of characters to generate.

    Yields:
        The idx2char entry for each sampled index, one per step.
    """
    def as_batch(char_idx):
        # The model is built with a fixed batch of num_abstracts rows, so the
        # same character index is fed on every row.
        # Builtin `int` replaces the `np.int` alias, which newer NumPy no longer provides.
        return np.full(shape=(num_abstracts, 1), fill_value=char_idx, dtype=int)

    model.reset_states()

    X = as_batch(char2idx['START'])
    for _ in range(sent_len):
        # Predict distribution over next character
        ys = model.predict(X, batch_size=numabstracts_guard(num_abstracts)) if False else model.predict(X, batch_size=num_abstracts)

        # Many abstracts going in parallel - just look at the first one.
        # Renormalise in float64: np.random.choice complains when the
        # probabilities do not sum to 1 closely enough, and rounding in the
        # model's output precision can leave them slightly off.
        probs = np.asarray(ys[0], dtype=np.float64)
        probs = probs / probs.sum()

        # Sample
        predicted_idx = np.random.choice(vocab_dim, p=probs)
        predicted_char = idx2char[predicted_idx]

        yield predicted_char

        # Make it so the rnn sees the character it just generated as its next input
        X = as_batch(predicted_idx)

In [154]:
# Draw one sample (default length) and glue the generated characters together.
''.join(sentence_generator())

'g\xb0gx alhoss ciDimilydh n I. On) mena7reCro +omploNclonaplaVy /12n0vel iflreino 4hecmitoredsSestagesz'