# Train

Load and train a Keras model for character-level language modelling.

### Load Model

In [6]:
from keras.models import model_from_json

with open('train_model.json', 'rb') as f, open('sample_model.json', 'rb') as g:
    train_model = model_from_json(f.read())
#     sample_model = model_from_json(g.read())

### Load Data

In [7]:
import pickle

abstracts_char_lm = pickle.load(open('data.p', 'rb'))

abstracts_padded = abstracts_char_lm['abstracts_padded']
embeddings = abstracts_char_lm['embeddings']
char2idx, idx2char = abstracts_char_lm['char2idx'], abstracts_char_lm['idx2char']

num_samples, maxlen = len(abstracts_padded), len(abstracts_padded[0])
vocab_dim, num_chars = embeddings.shape

seqlen = train_model.get_config()['layers'][0]['input_length']

### Split up into Train and Validation

Due to efficiency reasons, the train and the test set are represented as matrixes which are the same size of the entire dataset, but are zero everywhere except in the entries which correspond to their examples.

In [8]:
from sklearn.cross_validation import KFold

fold = KFold(len(abstracts_padded), n_folds=5)
p = iter(fold)

train_idxs, val_idxs = next(p)

xs_train, xs_val = np.zeros_like(abstracts_padded), np.zeros_like(abstracts_padded)
xs_train[train_idxs], xs_val[val_idxs] = abstracts_padded[train_idxs], abstracts_padded[val_idxs]

def example_generator(xs, vocab_dim, seqlen, num_steps=None):
    """Yields the next x, y pair
    
    Assume that xs have already been zero-padded to the same length
    
    Parameters
    ----------
    num_steps : int
        It's too expensive during validation to run along the entire abstract, so only go for a subset
    
    """
    num_samples = len(xs)
    
    num_steps = len(xs[0]) if not num_steps else num_steps # default to going through the entire abstract
    
    for t in range(0, num_steps-1, seqlen):
        next_xs = xs[:, t+1:t+seqlen+1] # sequence of xs one timestep into the future
        ys = np.zeros([num_samples, seqlen, vocab_dim]) # desired one-hot output vectors
        non_zeros = next_xs != MASK_VALUE # leave the masked characters at zero
        
        for i, (next_x, non_zero) in enumerate(zip(next_xs, non_zeros)):
            ys[i, np.arange(seqlen)[non_zero], next_x[non_zero]] = 1

        yield xs[:, t:t+seqlen], ys

### Train Over Several Epochs

In [12]:
import copy

NUM_EPOCHS = 100
MASK_VALUE = 0

num_vals = len(xs_val)

for _ in range(NUM_EPOCHS):
    
    train_model.reset_states() # start off at zero initialization
    
    train_example = example_generator(xs_train, vocab_dim, seqlen)

    for t in range(0, maxlen-1, seqlen):
#         if not t % 10:
#             print t
            
        x, y = next(train_example)
        train_model.train_on_batch(x, y)
        
        if not t % 100:            
            val_examples = example_generator(xs_train, vocab_dim, seqlen)
            
            saved_states = copy.deepcopy(train_model.layers[1].states) # save the states in the lstm layer
            train_model.reset_states()
            perplexity = np.mean([2**train_model.evaluate(x, y, batch_size=num_samples, verbose=0) for x, y in val_examples])
            print 'perplexity: {}'.format(perplexity)
            
        train_model.layers[1].states = saved_states # reset the states so training can pick up where it left off

perplexity: 6.42806189921
perplexity: 6.41872921699
perplexity: 6.45743978055
perplexity: 6.47382812426
perplexity: 6.4519549738
perplexity: 6.42180564715
perplexity: 6.4442013803
perplexity: 6.40989951165
perplexity: 6.41191296823
perplexity: 6.41685926006
perplexity: 6.42915322679
perplexity: 6.47721575869
perplexity: 6.45247148873
perplexity: 6.43152639263
perplexity: 6.3961440993
perplexity: 6.38859212994
perplexity: 6.38739838581
perplexity: 6.40449602999
perplexity: 6.40492817708
perplexity: 6.43415315558
perplexity: 6.39203987343
perplexity: 6.387509212
perplexity: 6.38500701389
perplexity: 6.41183146703
perplexity: 6.37711631708
perplexity: 6.41397931971
perplexity: 6.35938076718
perplexity: 6.34953630176
perplexity: 6.35139688076
perplexity: 6.36360410131
perplexity: 6.35743895264
perplexity: 6.34829584501
perplexity: 6.34751231288
perplexity: 6.43999474468
perplexity: 6.35674666749
perplexity: 6.34294216283
perplexity: 6.35619641733
perplexity: 6.32906516604
perplexity: 6.334

### Transfer Weights

In [13]:
train_model.save_weights('weights.h5')

### Sample from the RNN

In [14]:
from support import sample_sentences

for temperature in [0.2, 0.5, 1.0, 1.2]:
    X = sample_sentences(train_model, num_samples, vocab_dim, char2idx, idx2char, num_seqs=1, sent_len=500, temperature=temperature)

    for sentence_idxed in X:
        print ''.join([idx2char[idx] for idx in sentence_idxed])
        
    print '---'

 o a te o te po ate pe as to atine we tere to o te po ate ta ante mente pe a to ton ad to ton atent o mo tad ate cot ce tete a anton o o                  atestes o                  an tante po to te patent o to a tete A al at te atetes to ate po on e to me ofe po antin we to to te ofe pe an a fof me ton te pad atete atite me po to at an in mad cinte tente te te met te pe te te to meted ete pe pe ante pe ad atete me pore pe of a te a a te po te ad o  metent o                  tetetat a to te ante
---
T tid fed 4  ifs rat fe peretes fe citole we pe tete ton itit l we te po pe atides o ho wiis ate wer as ore to tekfins pe tan o  retob an an or iad to we parento tatotho wit marete d woa at tel tea wa elerulel (cinad ofed p ta ad ans tin retalerate pe we es eed to  woites d leanl of tin ad taty tan pinal martofes id O we as Mod te te cof po mad p ins po ad ad mo suent                 5 al menton <tob ateb cal anted pe tig wad ons ton utin monnentud ocatad of tas po we pad an canle acatens o