# Train

Load and train a Keras model for character-level language modelling.

### Load Model

In [1]:
from keras.models import model_from_json

with open('model.json', 'rb') as f:
    model = model_from_json(f.read())

Using Theano backend.


### Load Data

In [2]:
import pickle

abstracts_char_lm = pickle.load(open('data.p', 'rb'))

abstracts_padded = abstracts_char_lm['abstracts_padded']
embeddings = abstracts_char_lm['embeddings']
char2idx, idx2char = abstracts_char_lm['char2idx'], abstracts_char_lm['idx2char']

num_abstracts, maxlen = len(abstracts_padded), len(abstracts_padded[0])
vocab_dim, num_chars = embeddings.shape

### Split up into Train and Validation

Due to efficiency reasons, the train and the test set are represented as matrixes which are the same size of the entire dataset, but are zero everywhere except in the entries which correspond to their examples.

In [9]:
from sklearn.cross_validation import KFold

fold = KFold(len(abstracts_padded), n_folds=20)
p = iter(fold)

train_idxs, val_idxs = next(p)

xs_train, xs_val = np.zeros_like(abstracts_padded), np.zeros_like(abstracts_padded)
xs_train[train_idxs], xs_val[val_idxs] = abstracts_padded[train_idxs], abstracts_padded[val_idxs]

def example_generator(xs, vocab_dim, num_steps=None):
    """Yields the next x, y pair
    
    Assume that xs have already been zero-padded to the same length
    
    Parameters
    ----------
    num_steps : int
        It's too expensive during validation to run along the entire abstract, so only go for a subset
    
    """
    num_abstracts = len(xs)
    
    num_steps = len(xs[0]) if not num_steps else num_steps # default to going through the entire abstract
    
    for t in range(num_steps-1):
        y = np.zeros([num_abstracts, vocab_dim]) # desired one-hot output vectors
        non_zero = xs[:, t+1] != MASK_VALUE # leave the masked characters at zero
        y[np.arange(num_abstracts)[non_zero], xs[:, t+1][non_zero]] = 1 # y is xs at next time step

        yield xs[:, [t]], y

### Train Over Several Epochs

In [11]:
import copy

NUM_EPOCHS = 1
MASK_VALUE = 0

num_vals = len(xs_val)

for _ in range(NUM_EPOCHS):
    
    model.reset_states() # start off at zero initialization
    
    train_example = example_generator(xs_train, vocab_dim)

    for t in range(maxlen-1):
        if not t % 100:
            print t
            
        x, y = next(train_example)
        model.train_on_batch(x, y)
        
        if not t % 100 and not t == 0:
            val_examples = example_generator(xs_val, vocab_dim, num_steps=100)
            
            saved_states = copy.deepcopy(model.layers[1].states) # save the states in the lstm layer
            model.reset_states()
            perplexity = np.mean([2**model.evaluate(x, y, batch_size=num_abstracts, verbose=0) for x, y in val_examples])
            print 'perplexity: {}'.format(perplexity)
            
        model.layers[1].states = saved_states # reset the states so training can pick up where it left off

0
100
perplexity: 1.00235134719
200
perplexity: 1.00234515794
300
perplexity: 1.00232643806
400
perplexity: 1.00233795268
500
perplexity: 1.00232749024
600
perplexity: 1.0023197974
700
perplexity: 1.00232757907
800
perplexity: 1.00232197065
900
perplexity: 1.00231353319
1000
perplexity: 1.00230797697
1100
perplexity: 1.00229870776
1200
perplexity: 1.00228256127
1300
perplexity: 1.00227999775
1400
perplexity: 1.00227425166
1500
perplexity: 1.00226986723
1600
perplexity: 1.00226773478
1700
perplexity: 1.00225946331
1800
perplexity: 1.00226136006
1900
perplexity: 1.00225965619
2000
perplexity: 1.0022572015
2100
perplexity: 1.00225221964
2200
perplexity: 1.00224208477
2300
perplexity: 1.00223381244
2400
perplexity: 1.00222856809
2500
perplexity: 1.00222787095
2600
perplexity: 1.00220979479
2700
perplexity: 1.00218227942
2800
perplexity: 1.00217041284
2900
perplexity: 1.00214036475


### Sample from the RNN

In [35]:
from support import sample_sentences

X = sample_sentences(model, num_abstracts, vocab_dim, char2idx, idx2char, sent_len=10)

for sentence_idxed in X:
    print ''.join([idx2char[idx] for idx in sentence_idxed])