# Abstracts Character Language Model

Train a character-level language model on ct.gov abstracts, then sample new abstract-like text from it one character at a time.

In [1]:
%store -r abstracts_targets_collapsed

abstracts = abstracts_targets_collapsed.abstract

### Extract Characters

In [2]:
# Collect every distinct character that occurs in any abstract.
text = ' '.join(abstracts)
chars = set(text)
chars.update({'START', 'END'}) # add start and end tokens

# Enumerate in sorted order so the character <-> index assignment is the same
# in every kernel session; iterating the raw set depends on string hash
# randomization and can differ from run to run.
char2idx = {c: i for i, c in enumerate(sorted(chars), start=1)}
idx2char = {i: c for c, i in char2idx.items()}

# Index 0 is reserved for the padding/mask token (real tokens start at 1).
char2idx['MASK'] = 0
idx2char[0] = 'MASK'

### Create Embedding Matrix

In [3]:
MAX_LENGTH = 100+1 # maximum length for an abstract - plus one so we can predict the last character
NUM_ABSTRACTS = len(abstracts)

# Vocabulary size INCLUDING the mask entry. Character indices run from 1 to
# len(chars) and index 0 is MASK, so len(chars) + 1 slots are needed; using
# len(chars) alone leaves the highest character index out of range for the
# embedding matrix and the one-hot targets.
NUM_CHARS = len(char2idx)

# One-hot "embedding": row i is the indicator vector for index i.
embeddings = np.eye(NUM_CHARS)

embeddings[char2idx['MASK']] = 0 # create a special entry for the mask

### Indexify and Pad Abstracts

In [4]:
def abstracts_idxified_generator(abstracts, max_length):
    """Yield each abstract as a list of character indices.

    Every abstract is wrapped in the START and END token indices, mapped
    through the module-level ``char2idx`` table, and then cut down to at most
    ``max_length`` entries (so a long abstract loses its tail, END included).

    Parameters
    ----------
    abstracts : iterable of str
        The texts to convert.
    max_length : int
        Maximum number of indices to keep per abstract.

    Yields
    ------
    list of int
        The (possibly truncated) index sequence for one abstract.
    """
    for text in abstracts:
        indices = [char2idx['START']]
        indices.extend(char2idx[ch] for ch in text)
        indices.append(char2idx['END'])

        yield indices[:max_length]

# Materialise the index sequences; each one is already truncated to MAX_LENGTH.
abstracts_idxed = list(abstracts_idxified_generator(abstracts, MAX_LENGTH))

# Zero-initialised, so positions past the end of a short abstract keep index 0 (MASK).
# The builtin `int` is used because the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
abstracts_padded = np.zeros([NUM_ABSTRACTS, MAX_LENGTH], dtype=int)

# Copy each sequence into the leading columns of its row.
for i, abstract_idxed in enumerate(abstracts_idxed):
    abstracts_padded[i, :len(abstract_idxed)] = abstract_idxed

### Build Keras Model

In [5]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

HIDDEN_DIM = 32

# Lookup layer initialised from the `embeddings` matrix; index 0 is treated as
# a mask. The batch shape is fixed at (NUM_ABSTRACTS, 1): one character per
# abstract per call, as required by the stateful LSTM that follows.
char_lookup = Embedding(
    input_dim=NUM_CHARS,
    output_dim=NUM_CHARS,
    weights=[embeddings],
    batch_input_shape=(NUM_ABSTRACTS, 1),
    mask_zero=True,
)

# Stateful: hidden state is carried from one batch to the next until
# model.reset_states() is called.
recurrent = LSTM(output_dim=HIDDEN_DIM, stateful=True)

# Softmax distribution over the next character index.
next_char_dist = Dense(output_dim=NUM_CHARS, activation='softmax')

model = Sequential()
for layer in (char_lookup, recurrent, next_char_dist):
    model.add(layer)

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

Using Theano backend.


### Train Over Several Epochs

In [46]:
NUM_EPOCHS = 20

# Row selector reused for every one-hot fill (one row per abstract).
row_ids = np.arange(NUM_ABSTRACTS)

for _epoch in range(NUM_EPOCHS):

    # Clear the LSTM state carried over from the previous pass
    model.reset_states()

    # Walk all abstracts in parallel, one character position at a time
    for pos in range(MAX_LENGTH - 1):
        current_chars = abstracts_padded[:, [pos]]   # column kept 2-D: (NUM_ABSTRACTS, 1)
        next_chars = abstracts_padded[:, pos + 1]

        # One-hot encode the character that follows position `pos`
        targets = np.zeros((NUM_ABSTRACTS, NUM_CHARS))
        targets[row_ids, next_chars] = 1

        model.train_on_batch(current_chars, targets)

### Sample from the RNN

In [48]:
def sentence_generator(sent_len=100):
    """Sample a sentence from the char nn.

    Resets the model's recurrent state, feeds it the START token, and then
    repeatedly samples a token from the predicted distribution and feeds that
    token back in as the next input.

    Parameters
    ----------
    sent_len : int
        Number of tokens to generate.

    Yields
    ------
    str
        The sampled token at each step, looked up in ``idx2char``. Sampling is
        over every index, so the special 'START', 'END' and 'MASK' tokens can
        appear in the output.
    """
    model.reset_states()

    # predict() is called with a batch of NUM_ABSTRACTS rows, so the same token
    # is replicated in every row; only row 0 of the prediction is read back.
    # The builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
    X = np.full(shape=(NUM_ABSTRACTS, 1), fill_value=char2idx['START'], dtype=int)

    for _ in range(sent_len):
        # Predict distribution over next character
        ys = model.predict(X, batch_size=NUM_ABSTRACTS)

        # Renormalise in float64: np.random.choice checks that p sums to 1
        # with float64 tolerance, which a float32 softmax row can miss.
        probs = np.asarray(ys[0], dtype=np.float64)
        probs = probs / probs.sum()

        # Sample the next index from the first row's distribution
        predicted_idx = np.random.choice(NUM_CHARS, p=probs)

        yield idx2char[predicted_idx]

        # Make it so the rnn sees the character it just generated as its next input
        X = np.full(shape=(NUM_ABSTRACTS, 1), fill_value=predicted_idx, dtype=int)
        
# Draw one sample of the default length and show it as a single string
''.join(sentence_generator())

'The ricativespy cateds expor porvive lamendicetient retria effictiabiorwe fosectioxaldtion is poated'