In [1]:
import re
import collections
import string
import random

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

gpus = tf.config.experimental.list_physical_devices(device_type='GPU')

if len(gpus) > 0:
    tf.config.experimental.set_visible_devices(devices=gpus[0], device_type='GPU')
    tf.config.experimental.set_memory_growth(device=gpus[0], enable=True)

from tensorflow import keras
from tensorflow.keras import layers, backend, losses, Sequential

import numpy as np

In [2]:
files = ['data/pg1619.txt', 'data/pg2000.txt']

text = ""
for file in files:
    with open(file) as fd:
        text += fd.read()
        
text = text[:1000000]
        
punctuationNoPeriod = "[" + re.sub("\.","",string.punctuation) + "]"
text = re.sub(punctuationNoPeriod, "", text)
text = text.replace('\n', ' ')

# Let's truncate this to just 1000000 words.
corpus = text.lower()

In [3]:
# first get individual words
tokenized = corpus.split()

# Count the words
words = collections.Counter(tokenized)

In [4]:
unique_words = list(set(tokenized))
unique_words_inv = {}
for i, w in enumerate(unique_words):
    unique_words_inv[w] = i

In [5]:
# Transform the tokens into indeces.
tokenized_index = [unique_words_inv[w] for w in tokenized]

In [15]:
def build_model():

    m = Sequential()
    
    m.add(layers.Embedding(len(unique_words), 100, input_shape=(None, )))
    
    m.add(layers.LSTM(8))
    
    m.add(layers.Dense(16, activation='relu'))
    
    m.add(layers.Dense(32, activation='relu'))
    
    m.add(layers.Dense(len(unique_words), activation='softmax'))
    
    m.compile(optimizer='adam', loss='categorical_crossentropy')
    
    print(m.summary())
    
    return m
    
m = build_model()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         2195300   
_________________________________________________________________
lstm_2 (LSTM)                (None, 8)                 3488      
_________________________________________________________________
dense_5 (Dense)              (None, 16)                144       
_________________________________________________________________
dense_6 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_7 (Dense)              (None, 21953)             724449    
Total params: 2,923,925
Trainable params: 2,923,925
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
def train_generator(tokens, batch_size=32):
    
    j_ranges = []
    for n in range(1, 30):
        j_ranges.append(list(range(n, len(tokens))))
    
    while True:
        
        n = random.randint(1, 29)
    
        x = np.zeros((batch_size, n))
        y = np.zeros((batch_size, len(unique_words)))
    
        for i, j in enumerate(random.choices(j_ranges[n-1], k=batch_size)):
            x[i][:] = np.array(tokens[j-n:j], dtype='int')
            y[i][tokens[j]] = 1.0
        
        yield (x.reshape((batch_size, n)), y)
        

In [24]:
m.fit(train_generator(tokenized_index, batch_size=64), steps_per_epoch=100, epochs=10000)

Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
 13/100 [==>...........................] - ETA: 2s - batch: 6.0000 - size: 64.0000 - loss: 7.1667

KeyboardInterrupt: 

In [18]:
def next_word(w):
    print(" ".join(w))
    x = np.array([unique_words_inv[_] for _ in w], dtype='int').reshape((1, len(w)))
    
    y = m.predict(x)
    y = np.reshape(y, (y.size, ))
    
    k = y.argmax()
    
    return unique_words[k]    

def next_words(w, n):
    w = w.copy()
    print(" ".join(w))
    for i in range(n):
        w.append(next_word(w))
        
    print(" ".join(w))

In [21]:
next_word(tokenized[200:210])

dos personajes centrales cuyas vidas y destinos se enlazan apasionadamente


'de'

In [22]:
next_words(tokenized[200:210], 30)

dos personajes centrales cuyas vidas y destinos se enlazan apasionadamente
dos personajes centrales cuyas vidas y destinos se enlazan apasionadamente
dos personajes centrales cuyas vidas y destinos se enlazan apasionadamente de
dos personajes centrales cuyas vidas y destinos se enlazan apasionadamente de que
dos personajes centrales cuyas vidas y destinos se enlazan apasionadamente de que que
dos personajes centrales cuyas vidas y destinos se enlazan apasionadamente de que que que
dos personajes centrales cuyas vidas y destinos se enlazan apasionadamente de que que que que
dos personajes centrales cuyas vidas y destinos se enlazan apasionadamente de que que que que que
dos personajes centrales cuyas vidas y destinos se enlazan apasionadamente de que que que que que que
dos personajes centrales cuyas vidas y destinos se enlazan apasionadamente de que que que que que que que
dos personajes centrales cuyas vidas y destinos se enlazan apasionadamente de que que que que que que que que
dos 

In [None]:
'hola' in unique_words