In [1]:
import numpy as np
import sys
from collections import Counter
import random
import nltk
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import model_from_json

Using Theano backend.


Using gpu device 0: GRID K520


In [2]:
file_name = 'abstracts.csv'

def read_and_clean_data():
    '''
    Function to read and clean abstract data
    '''
    out = []
    with open(file_name) as f:
        for i,line in enumerate(f):
            abstract = line.strip()
            
            # The first line is junk until the word During
            if i == 0:
                abstract = abstract[abstract.find('During'):-1]
                
            # There are quotes in the data file, remove them
            if abstract[0] == '"' and abstract[-1] == '"':
                abstract = abstract[1:-1]
                
            out.append(unicode(abstract, 'utf-8'))

    # The last line is junk so just return all but that
    return out[:-1]

In [3]:
sentence_tokenizer = nltk.punkt.PunktSentenceTokenizer()
tokenize = nltk.word_tokenize 
abstracts = read_and_clean_data()

In [4]:
def get_abstract_words():
    for abstract in abstracts:
        out = []
        for sentence in sentence_tokenizer.tokenize(abstract):
            for word in ['<START>'] + tokenize(sentence) + ['</START>']:
                out.append(word)
        yield out

In [5]:
def get_abstract_word_set():
    word_set = set()
    for word_list in get_abstract_words():
        for word in word_list:
            if word in ['<START>', '</START>']:
                word_set.add(word)
            else:
                word_set.add(word.lower())
    return word_set

In [6]:
def read_glove_vector(line):
    """
    Read in one word vector from the file
    Each line comes in as a word followed by the 
    300 dimensional vector where each coordinate is
    separated by a space
    """
    split_line = line.split()
    word, vector = split_line[0], split_line[1:]
    vector = np.asarray([float(num) for num in vector], dtype='float32')

    return word, vector

def read_glove_vectors(word_set, file_name):
    """
    Read in words from the file and yield the word/vector
    if they are in the word set
    """
    for word_vector in open(file_name):
        word, vector = read_glove_vector(word_vector)
        if word in word_set:
            yield word, vector

def get_all_glove_words(file_name):
    word_set = set()
    for word_vector in open(file_name):
        word, _ = read_glove_vector(word_vector)
        word_set.add(word)
    return word_set

def get_words_to_keep(file_name, min_count = 20):

    # Collect all glove words
    glove_set = get_all_glove_words(file_name)

    # Get counts of words not in glove set
    
    word_set = set() # Final set of words to be used
    unknown_count = Counter() # Get counts so we know what to include
    for word_list in get_abstract_words():
        for word in word_list:
            # If we see the word in glove add it
            # Otherwise get count of unknown words so we know which to keep
            # Keep those above min_count
            if word.lower() in glove_set:
                word_set.add(word.lower())
            elif word in ['<START>', '</START>']:
                unknown_count[word] += 1
            else:
                unknown_count[word.lower()] += 1

    # Keep only words greater than min_count
    unknown_estimate = set(word_pair[0] for word_pair in unknown_count.iteritems() if word_pair[1] >= min_count)
    unknown_ignore = set(word_pair[0] for word_pair in unknown_count.iteritems() if word_pair[1] < min_count)

    word_set.update(unknown_estimate)
    return word_set, unknown_estimate, unknown_ignore

def create_word_embedding_matrix(file_name, min_count=20, dimension = 300):

    unknown_keep.add('UNKNOWN_WORD')
    word_set.add('UNKNOWN_WORD')

    word2index = {w:i for i,w in enumerate(word_set)}
    index2word = {i:w for i,w in enumerate(word_set)}

    embedding_matrix = np.zeros((len(word2index), dimension))

    for word, vector in read_glove_vectors(word_set, file_name):
        embedding_matrix[word2index[word],:] = vector

    for word in unknown_keep:
        embedding_matrix[word2index[word],:] = .01 * np.random.randn(dimension)

    return embedding_matrix, word2index, index2word

In [7]:
word_set, unknown_keep, unknown_ignore = get_words_to_keep('glove.6B.300d.txt')
embedding_matrix, word2index, index2word = create_word_embedding_matrix('glove.6B.300d.txt')
maxlen = 10

In [8]:
def data_batch_generator(step=3):
    while True:
        words_in = []
        words_out = []
        random_abstracts = np.random.randint(0, len(abstracts), 10)
        for random_abstract in random_abstracts:
            text = abstracts[random_abstract]
            tokenized = []
            for sentence in sentence_tokenizer.tokenize(text):
                tokenized.append('<START>') # add START token
                tokenized.extend([word if word in word_set else 'UNKNOWN_WORD' for word in tokenize(sentence)])
                tokenized.append('</START>') # add END token
            if len(tokenized) < maxlen + 1:
                continue
            start = random.randint(0, len(tokenized) - maxlen - 1)
            for _ in range(10):
                words_in.append(tokenized[start:(start+maxlen)])
                words_out.append(tokenized[start + maxlen])
                start = (start + step) % (len(tokenized) - maxlen - 1)
        yield words_in, words_out

In [9]:
def gen_numerical_batch(step=3):
    gen = data_batch_generator(step=3)
    while True:
        words_in, words_out = next(gen)
        x = np.zeros((len(words_in), maxlen))
        y = np.zeros((len(words_out), len(embedding_matrix)))
        for i in range(len(words_in)):
            for j,word in enumerate(words_in[i]):
                x[i,j] = word2index[word]
            y[i,word2index[words_out[i]]] = 1
        yield x,y

In [10]:
def sample(a, temperature=1.0):
    # helper function to sample an index from a probability array
    a = np.log(a) / temperature
    a = np.exp(a) / np.sum(np.exp(a))
    return np.argmax(np.random.multinomial(1, a, 1))

In [None]:
model = model_from_json(open('my_model_architecture_words.json').read())
model.load_weights('my_model_weights_words.h5')

In [17]:
def build_model():
    model = Sequential()
    model.add(Embedding(input_dim = len(embedding_matrix), output_dim = 300 , weights=[embedding_matrix]))
    model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, 300)))
    model.add(Dropout(0.2))
    model.add(LSTM(512, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(len(embedding_matrix)))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    
    return model

In [None]:
gen = gen_numerical_batch()
save_model = True
for j in range(10000):
    x,y = next(gen)
    cost = model.train_on_batch(x,y)
    if j % 100 == 0:
        print cost
    if j % 500 == 0:
        if save_model:
            json_string = model.to_json()
            open('my_model_architecture_words.json', 'w').write(json_string)
            model.save_weights('my_model_weights_words.h5', overwrite=True)
        for diversity in [1.0]:
            generated = ''
            start_index = random.randint(0, len(abstracts) - 1)
            text = abstracts[start_index]
            tokenized = []
            for sentence in sentence_tokenizer.tokenize(text):
                tokenized.append('<START>')
                tokenized.extend([word if word in word_set else 'UNKNOWN_WORD' for word in tokenize(sentence)])
                tokenized.append('</START>')
            generated = tokenized[:maxlen]
            sentence = tokenized[:maxlen]
            next_words = []
            for i in range(200):
                z = np.zeros((1, maxlen))
                for t, word in enumerate(sentence):
                    z[0, t] = word2index[word]
                preds = model.predict(z, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_word = index2word[next_index]
                sentence = sentence[1:] + [next_word]
                
                # Don't add token word
                if next_word in ['<START>', '</START>']:
                    continue
                
                # Choose random unknown for UNKNOWN_WORD
                elif next_word == 'UNKNOWN_WORD':
                    next_words.append(random.sample(unknown_ignore, 1)[0])
                
                # Otherwise add the generated word
                else:
                    next_words.append(next_word)
            print ' '.join(generated + next_words)

[array(5.167332172393799, dtype=float32)]
<START> UNKNOWN_WORD determine the effect of the UNKNOWN_WORD peptide UNKNOWN_WORD . desmodus results users that from th-nfhlac was refreshment and γ-aminobutyric grafting ( and-or ( glutaraldehyde/carbodiimide ) ) ( the delay during 15-204 ( carbamyl-choline ) ) . gaba-blockers brain volume predicative was also activated and end-tracking adult brevican and a veratrine worry ( mapk-activated ) resulted 21-29 and full-length re-export at both glutamate and spa-waters ( exacerbate the hippocampus ) . swiss-prot several-fold animals were 5.34 non-dominant ( deacetylated six ) with 4.3-kb testing and quisqualate post-processing that was wkys with a object , and that ( extrapolating igb.bioviz.org ) ( pbde-209 mash1/e47 ) mysterious the hippocampal region ( 36c flux ) in moreover and that the fkhr/fkhrl-1 ( + ) eridine ( 12 ) was rev. withdrawal . slc9a4 nonspecifical release was interpulse decreased in 47.0 160 4-mo eeg-monitoring brain animals . m