In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import model_from_json
from collections import Counter
import numpy as np
import random
import sys

Using Theano backend.


Using gpu device 0: GRID K520


In [2]:
file_name = 'abstracts.csv'

def read_and_clean_data():
    out = []
    with open(file_name) as f:
        for i,line in enumerate(f):
            abstract = line.strip()
            
            # The first line is junk until the word During
            if i == 0:
                abstract = abstract[abstract.find('During'):-1]
                
            # There are quotes in the data file, remove them
            if abstract[0] == '"' and abstract[-1] == '"':
                abstract = abstract[1:-1]
                
            out.append(abstract)

    # The last line is junk so just return all but that
    return out[:-1]

In [3]:
abstracts = read_and_clean_data()
chars = set([char for abstract in abstracts for char in abstract])

In [4]:
char2idx = {char:idx for idx,char in enumerate(chars)}
idx2char = {idx:char for idx,char in enumerate(chars)}

In [5]:
maxlen = 40

In [6]:
def data_batch_generator(batch_size=30, step=5):
    while True:
        chars_in = []
        chars_out = []
        random_abstract = random.randint(0, len(abstracts))
        start = random.randint(0, len(abstracts[random_abstract]) - maxlen - 1)
        for i in range(batch_size):
            chars_in.append(abstracts[random_abstract][start:(start + maxlen)])
            chars_out.append(abstracts[random_abstract][start + maxlen])
            start = (start + step) % (len(abstracts[random_abstract]) - maxlen - 1) 
        yield chars_in, chars_out

In [7]:
def gen_numerical_batch(batch_size=20, step=5):
    gen = data_batch_generator(batch_size, step)
    while True:
        chars_in, chars_out = next(gen)
        X = np.zeros((batch_size, maxlen, len(chars)))
        y = np.zeros((batch_size, len(chars)))
        for i in range(batch_size):
            for j,char in enumerate(chars_in[i]):
                X[i,j,char2idx[char]] = 1
            y[i,char2idx[chars_out[i]]] = 1
        yield X,y

In [8]:
def sample(a, temperature=1.0):
    # helper function to sample an index from a probability array
    a = np.log(a) / temperature
    a = np.exp(a) / np.sum(np.exp(a))
    return np.argmax(np.random.multinomial(1, a, 1))

In [50]:
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [9]:
gen = gen_numerical_batch()

In [10]:
x,y = next(gen)

In [None]:
model = model_from_json(open('my_model_architecture.json').read())
model.load_weights('my_model_weights.h5')

In [None]:
for j in range(10000):
    cost = model.train_on_batch(x,y)
    x,y = next(gen)
    if j % 100 == 0:
        print cost
    if j % 2000 == 0:
        json_string = model.to_json()
        open('my_model_architecture.json', 'w').write(json_string)
        model.save_weights('my_model_weights.h5', overwrite=True)
        for diversity in [0.5, 0.7, 0.9, 1.0]:
            generated = ''
            start_index = random.randint(0, len(abstracts) - 1)
            sentence = abstracts[start_index][:maxlen]
            generated += sentence
            print '----- Generating with seed: "' + sentence + '"'
            next_chars = []
            for i in range(500):
                z = np.zeros((1, maxlen, len(chars)))
                for t, char in enumerate(sentence):
                    z[0, t, char2idx[char]] = 1.
                preds = model.predict(z, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = idx2char[next_index]
                sentence = sentence[1:] + next_char
                next_chars.append(next_char)
            print generated + ''.join(next_chars)

[array(1.3525692224502563, dtype=float32)]
----- Generating with seed: "In this study we revealed the participat"
In this study we revealed the participation that sentent and acond condention in active merected in memion mate hippocampus and prevention of nelured the the has effects the mediane of heporainesstent and memony senective the prosential and a concentate of ferential medives and surges in the chancentral stented as in neurons a segents and prevention as activity with melores in the changes in allone and parential mas and sevels of deletences inhivity hepertent fantions and the dechamine and activity as siselss of activity the epied ats
----- Generating with seed: "The effects of chloroquine on glial fibr"
The effects of chloroquine on glial fibrated were and mistentormuction sygLinitee alsy in hephoscelition wish stron dellumine of unesis after and peroment mase entabed that and a conciment of the accoupes essypested the sentoless in the appocameuslotical agter in parented d

In [None]:
json_string = model.to_json()
open('my_model_architecture.json', 'w').write(json_string)
model.save_weights('my_model_weights.h5', overwrite=True)

In [44]:
generated = ''
start_index = random.randint(0, len(abstracts) - 1)
sentence = abstracts[start_index][:maxlen]
generated += sentence
print '----- Generating with seed: "' + sentence + '"'
next_chars = []
for i in range(400):
    z = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(sentence):
        z[0, t, char2idx[char]] = 1.
    preds = model.predict(z, verbose=0)[0]
    next_index = []
    for diversity in [0.5, 0.7, 0.9, 1.0]:
        next_index.append(sample(preds, diversity))
    c = Counter(next_index)
    next_index = c.most_common(1)[0][0]
    next_char = idx2char[next_index]
    generated += next_char
    sentence = sentence[1:] + next_char
    next_chars.append(next_char)
print generated + ''.join(next_chars)

----- Generating with seed: "Alzheimer's disease (AD) is a degenerati"
Alzheimer's disease (AD) is a degeneration of the verall resporsed the hippocampal resporse the perein the potent dess of the prossis protimin sisuls of intions of the ressing in the partion of the orsers of the ormoroning the controt of the as sulation of the protent atormed the ailoles in the meders of the promed of imizatoros of the proter spows of rens of the proteralic revely of the pasting and ressisted in the pror ass dest trans on of the verall resporsed the hippocampal resporse the perein the potent dess of the prossis protimin sisuls of intions of the ressing in the partion of the orsers of the ormoroning the controt of the as sulation of the protent atormed the ailoles in the meders of the promed of imizatoros of the proter spows of rens of the proteralic revely of the pasting and ressisted in the pror ass dest trans 
