# Abstracts Character Language Model

Train a character-level language model on ct.gov abstracts, then sample new abstract-like text from it one character at a time.

In [61]:
# Restore `abstracts_targets_collapsed` from IPython's %store database; it is
# not created by any cell visible in this notebook, so it must have been
# persisted beforehand with `%store abstracts_targets_collapsed`.
%store -r abstracts_targets_collapsed

# Only the `abstract` attribute is used from here on.
# NOTE(review): presumably a pandas DataFrame whose `abstract` column holds one
# abstract string per row -- confirm against the notebook that stored it.
abstracts = abstracts_targets_collapsed.abstract

### Extract Characters

In [73]:
# Build the character vocabulary from every abstract.
text = ' '.join(abstracts)

# Sort the distinct characters so the char <-> index mapping is identical on
# every run. Iterating a bare set gives no ordering guarantee, which would make
# indices (and any weights trained against them) irreproducible across kernels.
chars = sorted(set(text))

# Special tokens occupy the lowest indices; index 0 doubles as the padding /
# mask value. Their names are multi-character strings, so they can never
# collide with a real single-character vocabulary entry.
SPECIAL_TOKENS = ('MASK', 'START', 'END')

# Real characters are numbered after the special tokens (i.e. from 3 upward).
idx2char = dict(enumerate(chars, start=len(SPECIAL_TOKENS)))
idx2char.update(enumerate(SPECIAL_TOKENS))

# Derive the inverse map from idx2char so the two can never disagree.
char2idx = dict((token, idx) for idx, token in idx2char.items())

### Create Embedding Matrix

In [106]:
import numpy as np

MAX_LENGTH = 3000               # abstracts are truncated / padded to this many indices
NUM_ABSTRACTS = len(abstracts)
MASK_VALUE = 0                  # index reserved for padding (the 'MASK' token)

# Width of each one-hot vector: one column for START, one for END, and one per
# distinct character. It is derived from `chars` instead of reading NUM_CHARS,
# because NUM_CHARS is only assigned at the bottom of this cell: reading it here
# fails on a fresh kernel and grows by 2 on every re-run of the cell.
onehot_dim = 1 + 1 + len(chars)

# Row 0 is all zeros (the mask index embeds to the zero vector); rows 1.. form
# an identity matrix, so every other index maps to its own one-hot vector.
embeddings = np.vstack([np.zeros(onehot_dim), np.eye(onehot_dim)])

# VOCAB_DIM = number of indices (mask + start + end + characters);
# NUM_CHARS = width of each embedding vector (start + end + characters).
VOCAB_DIM, NUM_CHARS = embeddings.shape

### Indexify and Pad Abstracts

In [98]:
def abstracts_idxified_generator(abstracts, max_length):
    """Lazily convert each abstract into a list of vocabulary indices.

    Every abstract becomes START, then one index per character, then END,
    with all lookups going through the module-level ``char2idx``. The list is
    then cut to at most ``max_length`` entries, so an abstract that is too
    long loses its tail (including the END index).

    abstracts  -- iterable of strings
    max_length -- maximum number of indices yielded per abstract
    """
    for abstract in abstracts:
        idxs = [char2idx['START']]
        idxs.extend(char2idx[char] for char in abstract)
        idxs.append(char2idx['END'])

        yield idxs[:max_length]

from keras.preprocessing.sequence import pad_sequences

# Materialize the generator: pad_sequences needs every sequence up front.
abstracts_idxed = list(abstracts_idxified_generator(abstracts, max_length=MAX_LENGTH))

# Pad every sequence to exactly MAX_LENGTH indices. The padding/truncating
# arguments spell out the library defaults: zeros (the mask index) are added at
# the FRONT of short sequences, so all abstracts end on the same time step.
abstracts_padded = pad_sequences(
    abstracts_idxed,
    maxlen=MAX_LENGTH,
    padding='pre',
    truncating='pre')

### Build Keras Model

In [108]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

HIDDEN_DIM = 32  # size of the LSTM hidden state

# Frozen lookup table that turns a character index into its one-hot vector
# (weights come from `embeddings` and are not trained). The batch shape is
# fixed to (NUM_ABSTRACTS, 1): every abstract is fed in parallel, one
# character per call. mask_zero=True marks index 0 as padding.
onehot_lookup = Embedding(
    input_dim=VOCAB_DIM,
    output_dim=NUM_CHARS,
    weights=[embeddings],
    batch_input_shape=(NUM_ABSTRACTS, 1),
    trainable=False,
    mask_zero=True)

# stateful=True: hidden state is carried over from one batch to the next until
# model.reset_states() is called, which is what lets the training loop feed a
# single time step at a time.
recurrent = LSTM(output_dim=HIDDEN_DIM, stateful=True)

# Distribution over the next index in the vocabulary.
next_index_softmax = Dense(output_dim=VOCAB_DIM, activation='softmax')

model = Sequential()
for layer in (onehot_lookup, recurrent, next_index_softmax):
    model.add(layer)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

### Train Over Several Epochs

In [110]:
NUM_EPOCHS = 1

# MASK_VALUE is the padding index defined alongside MAX_LENGTH; it is not
# redefined here so there is a single source of truth for it.

# Row selector used to set one target entry per abstract (loop-invariant).
batch_rows = np.arange(NUM_ABSTRACTS)

for _ in range(NUM_EPOCHS):

    model.reset_states()  # start every epoch from the zero initial state

    # Walk through all abstracts in parallel, one character position per step:
    # the input is column t and the target is the character in column t+1.
    for t in range(MAX_LENGTH - 1):
        if t % 100 == 0:
            print(t)  # progress marker; the call form works on Python 2 and 3

        next_chars = abstracts_padded[:, t + 1]
        has_target = next_chars != MASK_VALUE

        # Desired one-hot outputs. Rows whose next character is padding stay
        # all-zero, so they contribute nothing to the cross-entropy loss.
        ys = np.zeros([NUM_ABSTRACTS, VOCAB_DIM])
        ys[batch_rows[has_target], next_chars[has_target]] = 1

        model.train_on_batch(abstracts_padded[:, [t]], ys)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900


### Sample from the RNN

In [120]:
def sentence_generator(sent_len=100):
    """Sample text from the character-level network, one token at a time.

    The model's recurrent state is reset, the START index is fed in, and each
    sampled index is fed back as the next input.

    sent_len -- number of tokens to generate (default 100)

    Yields one entry of ``idx2char`` per step. Sampling covers the whole
    vocabulary, so the special token names 'MASK', 'START' and 'END' can be
    yielded as well as ordinary characters.
    """
    model.reset_states()

    current_idx = char2idx['START']
    for _ in range(sent_len):
        # The model was built with a fixed batch size of NUM_ABSTRACTS, so the
        # same index is broadcast to every row and only row 0 is read back.
        # dtype=int replaces the np.int alias, which newer NumPy has removed.
        X = np.full(shape=(NUM_ABSTRACTS, 1), fill_value=current_idx, dtype=int)

        # Predict the distribution over the next index.
        ys = model.predict(X, batch_size=NUM_ABSTRACTS)

        # Renormalize in float64: the softmax output is lower precision and
        # its rounding error can make np.random.choice reject `p` for not
        # summing to 1.
        probs = ys[0].astype(np.float64)
        probs /= probs.sum()

        # Sample rather than take the argmax, so repeated calls give varied text.
        predicted_idx = np.random.choice(VOCAB_DIM, p=probs)

        yield idx2char[predicted_idx]

        # The network sees the token it just generated as its next input.
        current_idx = predicted_idx
        
# Draw one sample of the default length and show it as a single string.
''.join(sentence_generator())

'9{)1 = -dollaalutrewe sendavategx/] oled visacins-relovectiingmor ins, idy Nont Sed sacentintret ort'