In [1]:
import numpy as np
from sys import exit
from numpy import array
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dropout, Dense, LSTM, Bidirectional, Embedding

# following https://machinelearningmastery.com/develop-character-based-neural-language-model-keras/
# and       https://github.com/enriqueav/lstm_lyrics/blob/master/classifier_train.py

BATCH_SIZE = 32
SEQ_LENGTH = 30

def load_doc(filename):
    """Read a newline-separated word list from ``filename``.

    Parameters
    ----------
    filename : str or path-like
        Path of a text file holding one entry per line.

    Returns
    -------
    dict
        ``'text'`` is the raw file content, unchanged (newlines included).
        ``'words'`` is the list of lines with surrounding whitespace stripped;
        lines that are empty after stripping are left out.
    """
    with open(filename) as f:
        text = f.read()

    # split('\n') yields an empty final entry when the file ends with a newline
    # (and one for every blank line). An empty entry is not a word, so it is
    # dropped here instead of reaching later steps as a zero-length sequence.
    stripped = (line.strip() for line in text.split('\n'))
    words = [word for word in stripped if word]
    return {'words': words, 'text': text}

def shuffle_and_split_training_set(sentences_original, labels_original, percentage_test=10, seed=None):
    """Shuffle samples and labels together, then split them into train and test sets.

    Parameters
    ----------
    sentences_original : sequence
        Samples; must support ``len`` and integer indexing.
    labels_original : sequence
        One label per sample, in the same order as ``sentences_original``.
    percentage_test : float, optional
        Percentage (0-100) of the samples placed in the test split. Default 10.
    seed : int, optional
        When given, the shuffle uses its own ``np.random.RandomState(seed)`` so
        the split is reproducible. When omitted, NumPy's global random state is
        used.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_train, y_train, x_test, y_test)``.

    Raises
    ------
    ValueError
        If the two inputs differ in length, or ``percentage_test`` is outside
        the 0-100 range.
    """
    n_samples = len(sentences_original)
    if len(labels_original) != n_samples:
        raise ValueError('Got %d samples but %d labels'
                         % (n_samples, len(labels_original)))
    if not 0 <= percentage_test <= 100:
        raise ValueError('percentage_test must be between 0 and 100, got %r'
                         % (percentage_test,))

    # Shuffle in unison: one permutation is applied to both sequences so every
    # sample keeps its own label.
    print('Shuffling sentences')
    rng = np.random if seed is None else np.random.RandomState(seed)
    order = rng.permutation(n_samples)
    shuffled_x = [sentences_original[i] for i in order]
    shuffled_y = [labels_original[i] for i in order]

    # Everything before cut_index is training data, the remainder is test data.
    cut_index = int(n_samples * (1. - (percentage_test / 100.)))
    x_train, x_test = shuffled_x[:cut_index], shuffled_x[cut_index:]
    y_train, y_test = shuffled_y[:cut_index], shuffled_y[cut_index:]

    print("Training set = %d\nTest set = %d" % (len(x_train), len(y_test)))
    return array(x_train), array(y_train), array(x_test), array(y_test)

# Data generator for fit and evaluate
def generator(word_list, labels_list, batch_size, mapping, seq_length=None):
    """Yield ``(x, y)`` batches forever, cycling through ``word_list``.

    Parameters
    ----------
    word_list : sequence
        Entries to encode; each entry is iterated and every element is looked
        up in ``mapping``.
    labels_list : sequence
        One label per entry of ``word_list``; stored as booleans.
    batch_size : int
        Number of entries per yielded batch.
    mapping : dict
        Maps each element of an entry to its integer index.
    seq_length : int, optional
        Width of each encoded row. Defaults to the module-level ``SEQ_LENGTH``.

    Yields
    ------
    tuple of numpy.ndarray
        ``x`` is an int32 array of shape ``(batch_size, seq_length)`` holding
        the mapped indices (positions past the end of an entry stay 0);
        ``y`` is a bool array of shape ``(batch_size,)``.
    """
    if seq_length is None:
        seq_length = SEQ_LENGTH
    n_words = len(word_list)
    index = 0
    while True:
        x = np.zeros((batch_size, seq_length), dtype=np.int32)
        # The ``np.bool`` alias was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin ``bool`` selects the same dtype on every version.
        y = np.zeros((batch_size), dtype=bool)
        for i in range(batch_size):
            # Wrap around so the generator never runs out of data.
            position = index % n_words
            for t, w in enumerate(word_list[position]):
                x[i, t] = mapping[w]
            y[i] = labels_list[position]
            index = index + 1
        yield x, y

def get_model(vocab_size, lstm_units=75, dropout_rate=0.2):
    """Build and compile the two-class LSTM classifier.

    Parameters
    ----------
    vocab_size : int
        Size of the one-hot character dimension; the network expects input of
        shape ``(SEQ_LENGTH, vocab_size)`` per sample.
    lstm_units : int, optional
        Number of units in the LSTM layer. Default 75.
    dropout_rate : float, optional
        Dropout rate applied to the LSTM output. Default 0.2.

    Returns
    -------
    keras.models.Sequential
        The compiled model. Its summary is printed as a side effect.
    """
    # define model
    model = Sequential()
    model.add(LSTM(lstm_units, input_shape=(SEQ_LENGTH, vocab_size)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(2, activation='softmax'))
    # A softmax over two mutually exclusive classes is paired with categorical
    # cross-entropy; binary cross-entropy is meant for independent sigmoid
    # outputs. This assumes one-hot labels of width 2 -- confirm at the call site.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model

Using TensorFlow backend.


Load data

In [31]:
# Read the two word lists that make up the corpus.
proteins = load_doc('names_few_30.txt')
normal = load_doc('double_words_few_30.txt')

# Character vocabulary: every distinct character found in either file,
# sorted, and numbered by its position in that ordering.
chars = sorted(set(proteins['text']) | set(normal['text']))
mapping = {char: idx for idx, char in enumerate(chars)}
vocab_size = len(chars)

Generate datasets

In [32]:
# One-hot encode every word into a (num_words, SEQ_LENGTH, vocab_size) tensor.
words = proteins['words'] + normal['words']
labels = [1] * len(proteins['words']) + [0] * len(normal['words'])

# A word whose length is not SEQ_LENGTH (for example the empty string that a
# trailing newline leaves in a word list) cannot fill a row of X. Such rows used
# to be skipped by a bare `except: pass`, which left uninitialised np.empty
# memory in X while the row still carried a label. They are now dropped together
# with their labels, and the number dropped is reported.
usable = [i for i, word in enumerate(words) if len(word) == SEQ_LENGTH]
n_skipped = len(words) - len(usable)
if n_skipped:
    print('Skipping %d words whose length is not %d' % (n_skipped, SEQ_LENGTH))

X = np.zeros((len(usable), SEQ_LENGTH, vocab_size))
for row, i in enumerate(usable):
    for t, char in enumerate(words[i]):
        X[row, t, mapping[char]] = 1.0

# create datasets
y = to_categorical([labels[i] for i in usable], num_classes=2)
assert len(X) == len(y), 'samples and labels are out of step'
x_train, y_train, x_test, y_test = shuffle_and_split_training_set(X, y)

Shuffling sentences
Training set = 78399
Test set = 8712


Create model and fit

In [33]:
# Build the network for the current character vocabulary.
model = get_model(vocab_size)

# Train for ten epochs; the held-out split is scored after every epoch and
# verbose=2 prints one summary line per epoch.
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=10,
    verbose=2,
)

# Persist the trained model to disk.
model.save('model2.h5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 75)                93900     
_________________________________________________________________
dropout_2 (Dropout)          (None, 75)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 152       
Total params: 94,052
Trainable params: 94,052
Non-trainable params: 0
_________________________________________________________________
None
Train on 78399 samples, validate on 8712 samples
Epoch 1/10
 - 88s - loss: 0.2075 - acc: 0.9176 - val_loss: 0.1306 - val_acc: 0.9558
Epoch 2/10
 - 80s - loss: 0.0966 - acc: 0.9676 - val_loss: 0.0800 - val_acc: 0.9781
Epoch 3/10
 - 81s - loss: 0.0701 - acc: 0.9763 - val_loss: 0.0612 - val_acc: 0.9797
Epoch 4/10
 - 82s - loss: 0.0560 - acc: 0.9814 - val_loss: 0.0540 - val_acc: 0.9803
Epoch 5/10
 

In [34]:
# Score the held-out split; the returned values are displayed as the cell output.
model.evaluate(
    x_test,
    y_test,
    batch_size=BATCH_SIZE,
)





[0.03237777702835657, 0.9901285583103765]

In [35]:
def testWord(word, model, truth):
    """Return the model's prediction for a single word.

    The word is lower-cased, right-padded with ``~`` to ``SEQ_LENGTH``
    characters and one-hot encoded with the module-level ``mapping`` before
    being passed to ``model.predict`` as a batch of one.

    Parameters
    ----------
    word : str
        Text to classify; at most ``SEQ_LENGTH`` characters after lower-casing.
    model : object
        Anything with a ``predict`` method that accepts an array of shape
        ``(1, SEQ_LENGTH, vocab_size)``.
    truth : object
        Unused. Kept so existing calls keep working.

    Returns
    -------
    The value returned by ``model.predict`` for the encoded word.

    Raises
    ------
    ValueError
        If the word is longer than ``SEQ_LENGTH`` characters.
    KeyError
        If the word (or the ``~`` padding) contains a character that is not in
        ``mapping``.
    """
    lowered = word.lower()
    # A negative repeat count gives no padding, so an over-long word used to
    # pass through silently with the wrong length. Fail here with a clear message.
    if len(lowered) > SEQ_LENGTH:
        raise ValueError('word has %d characters, at most %d are supported'
                         % (len(lowered), SEQ_LENGTH))
    line = lowered.ljust(SEQ_LENGTH, "~")

    # One-hot encode straight into a batch of size one.
    s = np.zeros((1, SEQ_LENGTH, vocab_size))
    for t, char in enumerate(line):
        s[0, t, mapping[char]] = 1.0

    return model.predict(s)

In [38]:
# Spot-check the trained model on a few hand-picked inputs, one prediction per line.
for candidate in ("TAAR1", "once turtle was", "Tyrosine", "alanine glycin"):
    print(testWord(candidate, model, 1))

[[0.00487126 0.99512875]]
[[0.6701656  0.32983443]]
[[0.976143   0.02385698]]
[[0.64114374 0.35885626]]
