In [2]:
import numpy as np

from keras.models import Model, Sequential
from keras.layers import Dropout, LSTM, Activation, Dense
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

import theano.sandbox.cuda
theano.sandbox.cuda.use('gpu0')

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Quadro P5000 (CNMeM is disabled, cuDNN Mixed dnn version. The header is from one version, but we link with a different version (5110, 6021))


In [None]:
sentence_length = 100 
file_name = "wonder_clean.txt"

int_to_char = {}
char_to_int = {}
epochs = 20
batch = 120

def prepare_dataset(file_name, seq_length):
    raw_text = open(file_name).read().lower()
    unique_chars = list(set(raw_text))
    #unique_chars = [c for c in unique_chars if str.isalnum(c)]
    n_count = len(raw_text)
    n_uniq_char = len(unique_chars)
    print("Total characters in file",n_count)
    print("Total unique characters in file ",n_uniq_char)
    char_to_int, int_to_char = get_lookups(unique_chars)
    
    sentences = []
    labels = []
    for i in range(0, len(raw_text)-seq_length, 1):
        phrase = raw_text[i:seq_length]
        label_char = raw_text[i+seq_length]
        sentences.append(phrase)
        labels.append(label_char)
    
    n_sentences = len(sentences)
    n_labels = len(labels)
    
    #vectorization, create 3D matrix that LSTM can consume
    #each data item has seq_length characters, each of which is encoded as 1-hot of length n_uniq_char
    X = np.zeros( (n_sentences, seq_length, n_uniq_char) , dtype = np.bool)
    y = np.zeros( (n_labels, n_uniq_char) , dtype = np.bool)
    for s, sent in enumerate(sentences):
        for t, c in enumerate(sent):
            X[s, t, char_to_int[c]] = 1
        y[s, char_to_int[labels[s]]] = 1
    
    return char_to_int, int_to_char, sentences, labels, X, y

def get_lookups(uniq_chars):
    char_to_int = {c:ind for ind, c in enumerate(uniq_chars) }
    int_to_char = { ind:char for char,ind in char_to_int.items()}
    print("\n")
    print("Char lookup ",char_to_int,)
    print("\n")
    print("Int lookup ",int_to_char,)
    return char_to_int, int_to_char


def get_model(n_hidden, X, y, epochs, batch):
    
    model = Sequential()
    model.add( LSTM(n_hidden, input_shape = (X.shape[1], X.shape[2])) )
    model.add( Dropout(0.25) )
    model.add( Dense(y.shape[1], activation = "softmax") )

    model.compile(loss = "categorical_crossentropy", optimizer = "adam")
    model.summary()
    filepath="weights-{epoch:02d}-{loss:.4f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    
    model.fit(X, y, epochs=epochs, batch_size=batch , callbacks=callbacks_list)
    
    return model

char_to_int, int_to_char, sentencs, labels, X, y  = prepare_dataset(file_name,sentence_length)
model = get_model(256, X, y, epochs, batch)

Total characters in file 144424
Total unique characters in file  47


Char lookup  {'t': 0, '“': 1, 'j': 2, '-': 3, '!': 4, '‘': 5, 'p': 6, 'h': 7, 'd': 8, '0': 9, '*': 10, 'b': 11, '\n': 12, 'c': 13, '_': 14, 's': 15, '[': 16, 'a': 17, '’': 18, 'i': 19, 'v': 20, '.': 21, 'n': 22, 'x': 23, 'z': 24, 'y': 25, '?': 26, 'g': 27, ']': 28, ' ': 29, 'u': 30, 'f': 31, 'q': 32, 'l': 33, '3': 34, 'r': 35, '”': 36, ':': 37, '(': 38, ')': 39, 'o': 40, ',': 41, 'w': 42, 'e': 43, 'm': 44, 'k': 45, ';': 46}


Int lookup  {0: 't', 1: '“', 2: 'j', 3: '-', 4: '!', 5: '‘', 6: 'p', 7: 'h', 8: 'd', 9: '0', 10: '*', 11: 'b', 12: '\n', 13: 'c', 14: '_', 15: 's', 16: '[', 17: 'a', 18: '’', 19: 'i', 20: 'v', 21: '.', 22: 'n', 23: 'x', 24: 'z', 25: 'y', 26: '?', 27: 'g', 28: ']', 29: ' ', 30: 'u', 31: 'f', 32: 'q', 33: 'l', 34: '3', 35: 'r', 36: '”', 37: ':', 38: '(', 39: ')', 40: 'o', 41: ',', 42: 'w', 43: 'e', 44: 'm', 45: 'k', 46: ';'}
_________________________________________________________________
Layer (