In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import model_from_json
from collections import Counter
import nltk
import numpy as np
import random
import sys

Using Theano backend.


Using gpu device 0: GRID K520


In [2]:
# End-of-abstract ("STOP") token.
# The author chose '\xe3' because it was not in the overall character set of
# the abstracts (not re-checked here), so it can mark where an abstract ends
# without colliding with real text. read_and_clean_data() appends this same
# character to every abstract it returns.
stop_symbol = '\xe3' 

In [3]:
file_name = 'abstracts.csv'

def read_and_clean_data(path=None, stop=None):
    '''
    Read the abstracts file and return one cleaned string per abstract.

    Each line of the file is treated as one abstract. Surrounding
    whitespace is stripped, one pair of enclosing double quotes is removed,
    and the stop character is appended. The final collected entry is
    dropped because the last line of the data file is junk.

    path: file to read; defaults to the module-level `file_name`.
    stop: end-of-abstract marker to append; defaults to the module-level
          `stop_symbol`.

    Returns a list of strings, each ending with the stop character.
    '''
    if path is None:
        path = file_name
    if stop is None:
        stop = stop_symbol

    out = []
    with open(path) as f:
        for i, line in enumerate(f):
            abstract = line.strip()

            # The first line is junk until the word During. The slice also
            # drops the final character of that line. If 'During' is absent
            # the slice is empty and the line is skipped below.
            if i == 0:
                abstract = abstract[abstract.find('During'):-1]

            # Blank lines carry no text; skipping them also keeps the
            # abstract[0] / abstract[-1] lookups below from raising
            # IndexError on an empty string.
            if not abstract:
                continue

            # There are quotes in the data file, remove them
            if abstract[0] == '"' and abstract[-1] == '"':
                abstract = abstract[1:-1]

            out.append(abstract + stop)

    # The last line is junk so just return all but that
    return out[:-1]

In [4]:
# Load the cleaned abstracts (each one ends with the stop character).
abstracts = read_and_clean_data()
# Vocabulary: every distinct character that appears in any abstract.
chars = set([char for abstract in abstracts for char in abstract])
# Bidirectional character <-> integer index lookups. Both enumerate the same,
# unmodified set, so the two dicts are inverses of each other. The index a
# character receives follows the set's iteration order, not a sorted order.
# NOTE(review): weights saved from an earlier session are presumably tied to
# the mapping that session produced -- confirm the order is reproducible
# before reusing them.
char2idx = {char:idx for idx,char in enumerate(chars)} 
idx2char = {idx:char for idx,char in enumerate(chars)}
maxlen = 20 # number of chars to use to predict the next char

In [5]:
def data_batch_generator(step=5, n_abstracts=10, samples_per_abstract=5,
                         texts=None, seq_len=None):
    '''
    Endlessly yield (chars_in, chars_out) batches of raw character data.

    chars_in is a list of input character sequences, each `seq_len` long;
    chars_out is the list of single characters that follow each sequence.

    step: number of characters to move forward in an abstract to get the
          next sequence (wraps around to the start of the abstract).
    n_abstracts: how many abstracts to draw (with replacement) per batch.
                 10 is an arbitrarily chosen default.
    samples_per_abstract: sequences taken from each drawn abstract.
    texts: list of strings to sample from; defaults to the module-level
           `abstracts`.
    seq_len: length of each input sequence; defaults to the module-level
             `maxlen`.

    Only texts with more than `seq_len` characters can provide a sequence
    plus its following character, so shorter ones are never drawn. The
    eligible set is computed once, when the generator is first advanced.
    Every batch therefore holds n_abstracts * samples_per_abstract pairs.

    Raises ValueError (on first use) if no text is long enough.
    '''
    if texts is None:
        texts = abstracts
    if seq_len is None:
        seq_len = maxlen

    # Filter up front: drawing a start offset for a too-short text would
    # make random.randint fail with an empty range.
    eligible = [idx for idx, text in enumerate(texts) if len(text) > seq_len]
    if not eligible:
        raise ValueError('no abstract is longer than %d characters' % seq_len)

    while True:
        chars_in = []
        chars_out = []

        # choose random abstracts to draw character sequences from
        picks = np.random.randint(0, len(eligible), n_abstracts)

        for pick in picks:
            text = texts[eligible[pick]]
            # Valid start offsets are 0 .. n_starts - 1; the last one leaves
            # exactly one character after the sequence to act as the target.
            n_starts = len(text) - seq_len
            start = random.randint(0, n_starts - 1)
            for _ in range(samples_per_abstract):
                chars_in.append(text[start:start + seq_len])
                chars_out.append(text[start + seq_len])
                # n_starts >= 1 for eligible texts, so this never divides
                # by zero, and every valid offset stays reachable.
                start = (start + step) % n_starts
        yield chars_in, chars_out

In [6]:
def gen_numerical_batch(step=5):
    '''
    Endlessly yield one-hot encoded (X, y) batches.

    Character batches come from data_batch_generator(step). X has shape
    (batch, maxlen, len(chars)) with a 1 at each input character's index;
    y has shape (batch, len(chars)) with a 1 at the target character's
    index. Indices are looked up in char2idx.
    '''
    for seqs, targets in data_batch_generator(step):
        X = np.zeros((len(seqs), maxlen, len(chars)))
        y = np.zeros((len(targets), len(chars)))
        for row, (seq, target) in enumerate(zip(seqs, targets)):
            # one-hot each position of the input sequence
            for pos, ch in enumerate(seq):
                X[row, pos, char2idx[ch]] = 1
            # one-hot the character that follows the sequence
            y[row, char2idx[target]] = 1
        yield X, y

In [7]:
def sample(a, temperature=1.0):
    '''
    Helper function to draw a random character index.

    a: 1-D array-like of probabilities (e.g. a softmax output).
    temperature: non-zero scaling factor. The lower the temperature, the
                 more conservative the character selection is.

    Returns the index drawn from the temperature-rescaled distribution.
    '''
    # Work in float64: np.random.multinomial checks the probability sum in
    # double precision, and float32 rounding can push that sum above 1.
    probs = np.asarray(a, dtype=np.float64)

    # log(0) -> -inf is intended (a zero-probability entry stays at zero),
    # so silence the divide warning rather than letting it clutter output.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature

    # Shift so the largest logit is 0. The softmax result is unchanged, but
    # at low temperatures this keeps every exp() from underflowing to 0,
    # which would otherwise produce 0/0 = NaN probabilities.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)
    return np.argmax(np.random.multinomial(1, probs, 1))

In [15]:
def build_model():
    '''
    Build and compile the character-level network.

    Architecture: two stacked 512-unit LSTM layers, each followed by
    Dropout(0.2), then a Dense layer with one unit per character in
    `chars` and a softmax activation. The input shape is
    (maxlen, len(chars)). Compiled with categorical cross-entropy loss
    and the rmsprop optimizer.
    '''
    vocab_size = len(chars)
    layers = [
        # first LSTM returns the full sequence so the second can consume it
        LSTM(512, return_sequences=True, input_shape=(maxlen, vocab_size)),
        Dropout(0.2),
        LSTM(512, return_sequences=False),
        Dropout(0.2),
        Dense(vocab_size),
        Activation('softmax'),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
    return model

In [None]:
# Rebuild the network from its saved JSON architecture, then restore the
# saved weights. The with-block closes the architecture file as soon as it
# has been read instead of leaving the handle open.
with open('my_model_architecture_202.json') as arch_file:
    model = model_from_json(arch_file.read())
model.load_weights('my_model_weights_202.h5')

In [None]:
save_model = False  # set True to checkpoint architecture + weights every 100 batches
gen = gen_numerical_batch()  # generator for data

for j in range(4000):
    x, y = next(gen)
    cost = model.train_on_batch(x, y)
    if j % 100 == 0:
        # Single-argument print(...) prints the same text as the print
        # statement and also parses under Python 3.
        print(cost)
        if save_model:
            # The with-block flushes and closes the architecture file
            # before the weights are written.
            with open('my_model_architecture_202.json', 'w') as arch_file:
                arch_file.write(model.to_json())
            model.save_weights('my_model_weights_202.h5', overwrite=True)
    if j % 1000 == 0:
        # originally tried different temperatures, 0.8 works fairly well
        for diversity in [0.8]:
            # seed the generator with the first maxlen characters of a
            # randomly chosen abstract
            start_index = random.randint(0, len(abstracts) - 1)
            sentence = abstracts[start_index][:maxlen]
            generated = sentence
            print('----- Generating with seed: "' + sentence + '"')
            next_chars = []
            for i in range(1000):
                # one-hot encode the current window of characters
                z = np.zeros((1, maxlen, len(chars)))
                for t, char in enumerate(sentence):
                    z[0, t, char2idx[char]] = 1.
                preds = model.predict(z, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = idx2char[next_index]
                # slide the window: drop the oldest char, append the new one
                sentence = sentence[1:] + next_char
                next_chars.append(next_char)
            print(generated + ''.join(next_chars))