In [52]:
# Code from https://machinelearningmastery.com/develop-character-based-neural-language-model-keras/

import numpy as np
import pandas as pd
from nltk.util import ngrams
import collections
import itertools
import matplotlib.pyplot as plt

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [2]:
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding = 'utf8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [3]:
in_filename = 'Data/GlobalVoices_en.txt'
raw_text = load_doc(in_filename)
raw_text = raw_text.lower()
lines = raw_text.split('\n')

In [4]:
# integer encode sequences of characters
# hyperparam for the length of the input sequences
length = 10

sequences = list()
k = 0
j = 0
for line in lines: 
    if j < 25000:
        k += 1
        if line[-1:] == '.' or line[-1:] == '?' or line[-1:] == '!' or line[-1:] == '"':
            j += 1
            for i in range(length, len(line)):
                seq = raw_text[i-length:i+1]
                sequences.append(seq)

In [89]:
#out_filename = 'char_sequences.txt'
#save_doc(sequences, out_filename)

In [5]:
print('Length of Original Corpus: %s' % len(lines))
print('Length of Tokenized Sequences in LM Corpus: %s' % len(sequences))
print('Sentence line we stop at: %s' % k)
print('Number of Sentences in our dataset: %s' % j)

Length of Original Corpus: 979735
Length of Tokenized Sequences in LM Corpus: 2847853
Sentence line we stop at: 33485
Number of Sentences in our dataset: 25000


In [84]:
chars = sorted(list(set(raw_text)))
chars

['\n',
 ' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~',
 '\x7f',
 '\x80',
 '\x92',
 '\x93',
 '\x94',
 '\x9a',
 '\x9c',
 '\x9d',
 '¡',
 '¢',
 '£',
 '¥',
 '¦',
 '§',
 '¨',
 '©',
 'ª',
 '«',
 '¬',
 '\xad',
 '®',
 '°',
 '±',
 '²',
 '´',
 '·',
 '¹',
 'º',
 '»',
 '¼',
 '½',
 '¾',
 '¿',
 '×',
 'ß',
 'à',
 'á',
 'â',
 'ã',
 'ä',
 'å',
 'æ',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'ì',
 'í',
 'î',
 'ï',
 'ð',
 'ñ',
 'ò',
 'ó',
 'ô',
 'õ',
 'ö',
 '÷',
 'ø',
 'ù',
 'ú',
 'û',
 'ü',
 'ý',
 'ÿ',
 'ā',
 'ă',
 'ą',
 'ć',
 'ĉ',
 'č',
 'ď',
 'đ',
 'ē',
 'ĕ',
 'ė',
 'ę',
 'ě',
 'ĝ',
 'ğ',
 'ģ',
 'ĩ',
 'ī',
 'ı',
 'ĳ',
 'ĵ',
 'ķ',
 'ĸ',
 'ļ',
 'ľ',
 'ł',

In [76]:
chars = sorted(list(set(raw_text)))
chars = chars[0:66]
mapping = dict((c, i) for i, c in enumerate(chars))
print(mapping)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, '&': 7, "'": 8, '(': 9, ')': 10, '*': 11, '+': 12, ',': 13, '-': 14, '.': 15, '/': 16, '0': 17, '1': 18, '2': 19, '3': 20, '4': 21, '5': 22, '6': 23, '7': 24, '8': 25, '9': 26, ':': 27, ';': 28, '<': 29, '=': 30, '>': 31, '?': 32, '@': 33, '[': 34, '\\': 35, ']': 36, '^': 37, '_': 38, '`': 39, 'a': 40, 'b': 41, 'c': 42, 'd': 43, 'e': 44, 'f': 45, 'g': 46, 'h': 47, 'i': 48, 'j': 49, 'k': 50, 'l': 51, 'm': 52, 'n': 53, 'o': 54, 'p': 55, 'q': 56, 'r': 57, 's': 58, 't': 59, 'u': 60, 'v': 61, 'w': 62, 'x': 63, 'y': 64, 'z': 65}


In [77]:
encoded_sequences = list()
for row in sequences:
    # integer encode line; any characters outside of the first 66 are encoded as something else
    encoded_seq = [mapping[char] if char in chars else 66 for char in row ]
    # store
    encoded_sequences.append(encoded_seq)

In [85]:
# vocabulary size
vocab_size = len(mapping) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 67


In [87]:
# separate into input and output
encoded_sequences = np.array(encoded_sequences) #, shape = (len(sequences),length))
X, y = encoded_sequences[:,:-1], encoded_sequences[:,-1]
#new_sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
#X = np.array(new_sequences)
y_temp = to_categorical(y, num_classes=vocab_size)

In [95]:
y_temp[31]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [98]:
# Language Model 
# Hyperparameters: Embedding dimension size, LSTM size, learning rate, dropout, 

model = Sequential()
model.add(Embedding(vocab_size, 16, input_length=X.shape[1]))
#model.add(Flatten())
model.add(LSTM(256))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X[:1000], y_temp[:1000], epochs=100, verbose=2, validation_split=.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 10, 16)            1072      
_________________________________________________________________
lstm_6 (LSTM)                (None, 256)               279552    
_________________________________________________________________
dense_6 (Dense)              (None, 67)                17219     
Total params: 297,843
Trainable params: 297,843
Non-trainable params: 0
_________________________________________________________________
None
Train on 900 samples, validate on 100 samples
Epoch 1/100
 - 2s - loss: 3.6221 - acc: 0.1844 - val_loss: 3.2297 - val_acc: 0.0600
Epoch 2/100
 - 1s - loss: 2.9856 - acc: 0.1989 - val_loss: 3.1751 - val_acc: 0.2400
Epoch 3/100
 - 1s - loss: 2.9341 - acc: 0.2122 - val_loss: 3.1250 - val_acc: 0.2400
Epoch 4/100
 - 1s - loss: 2.9154 - acc: 0.2122 - val_loss: 3.1481 - val_acc: 0.2400
Epoch 5/100
 

Epoch 89/100
 - 2s - loss: 0.0023 - acc: 1.0000 - val_loss: 4.5907 - val_acc: 0.4800
Epoch 90/100
 - 1s - loss: 0.0022 - acc: 1.0000 - val_loss: 4.6059 - val_acc: 0.4800
Epoch 91/100
 - 2s - loss: 0.0022 - acc: 1.0000 - val_loss: 4.6153 - val_acc: 0.4800
Epoch 92/100
 - 2s - loss: 0.0022 - acc: 1.0000 - val_loss: 4.6225 - val_acc: 0.4800
Epoch 93/100
 - 1s - loss: 0.0021 - acc: 1.0000 - val_loss: 4.6267 - val_acc: 0.4800
Epoch 94/100
 - 1s - loss: 0.0020 - acc: 1.0000 - val_loss: 4.6281 - val_acc: 0.4800
Epoch 95/100
 - 2s - loss: 0.0020 - acc: 1.0000 - val_loss: 4.6412 - val_acc: 0.4800
Epoch 96/100
 - 1s - loss: 0.0019 - acc: 1.0000 - val_loss: 4.6462 - val_acc: 0.4800
Epoch 97/100
 - 1s - loss: 0.0019 - acc: 1.0000 - val_loss: 4.6561 - val_acc: 0.4800
Epoch 98/100
 - 1s - loss: 0.0018 - acc: 1.0000 - val_loss: 4.6425 - val_acc: 0.4800
Epoch 99/100
 - 1s - loss: 0.0017 - acc: 1.0000 - val_loss: 4.6642 - val_acc: 0.4800
Epoch 100/100
 - 1s - loss: 0.0017 - acc: 1.0000 - val_loss: 4.66

<keras.callbacks.History at 0x1dca02cb0b8>