In [1]:
import numpy as np
import pandas as pd
np.random.seed(42)
import tensorflow as tf
tf.set_random_seed(42)
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout
from keras.layers import TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

from keras.layers.core import Dense, Activation, Dropout, RepeatVector
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import sys
import heapq
import seaborn as sns
from pylab import rcParams

import unicodedata

import json
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [21]:
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential, Input
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping

from keras.optimizers import RMSprop, Adam
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

# SEQUENCE SIZE PARAMETERS
CHARACTER_NUMBER_PREDICTION = 80
DATA_SET_COLLECTION_ITERATIONS = 50_000
SET_VARIABILITY = 3

# RNN PARAMETERS
EMB_DIM = 256
SEQ_UNITS = 256
DROP = .1

# TRAINING PARAMETERS
TEST_SIZE = 0.2
EPOCHS = 400
BATCH_SIZE = 256
VALIDATION_SPLIT = .2

In [3]:
contentDf = pd.read_csv('bras_cubas_paragraphs.csv')

# Dataset preparation

In [4]:
def strip_accents(text):

    try:
        text = unicode(text, 'utf-8')
    except NameError: # unicode is a default on python 3 
        pass

    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")

    return str(text)

In [5]:
# Arrays we'll use to store our dataset
X_data = []
y_data = []

# Looping through our corpus...
for i in range(DATA_SET_COLLECTION_ITERATIONS):
    # ... selecting a random paragraph from our book...
    paragraphIndex = np.random.randint(0, len(contentDf.paragraphs))
    currentParagraph = contentDf.paragraphs[paragraphIndex]
  
    # ... sampling a slice of the selected paragraph...
    #   aux = 1
    #   dontAddThisSample = False
    #   while(len(currentParagraph) - CHARACTER_NUMBER_PREDICTION + 1 < CHARACTER_NUMBER_PREDICTION + 1):
    #     # currentParagraph += contentDf.paragraphs[np.random.randint(0, len(contentDf.paragraphs))]
    #     if (paragraphIndex + aux < len(contentDf.paragraphs)):
    #         currentParagraph += contentDf.paragraphs[paragraphIndex + aux]
    #         aux += 1
    #     else:
    #         dontAddThisSample = True
    #         break;
        
    #   if (dontAddThisSample):
    #     continue

    if (len(currentParagraph) < CHARACTER_NUMBER_PREDICTION + 1):
        continue
        
        
    
    paragraphRegion = np.random.randint(0, len(currentParagraph) - CHARACTER_NUMBER_PREDICTION)
    
    # Checking how many different chars are in the selected paragraph region
    nChars = len(set(currentParagraph[paragraphRegion : paragraphRegion + CHARACTER_NUMBER_PREDICTION]))
    
    if (nChars < SET_VARIABILITY):
        continue
  
    
    # Adding an excerpt of the paragraph to our X and y data.
    X_data.append(currentParagraph[paragraphRegion : paragraphRegion + CHARACTER_NUMBER_PREDICTION].casefold())
    y_data.append(currentParagraph[paragraphRegion + CHARACTER_NUMBER_PREDICTION].casefold())
  

In [6]:
len(X_data), len(y_data)

(31758, 31758)

In the cells below, we'll instantiate and fit a tokenizer.

In [7]:
tokenizer = Tokenizer(
    num_words=500,
    char_level=True,
    filters=None,
    lower=False,
    oov_token=chr(1),
)

In [8]:
%%time

tokenizer.fit_on_texts(X_data)
tokenizer.fit_on_texts(y_data)
word_index = tokenizer.word_index
index_word = tokenizer.index_word

CPU times: user 397 ms, sys: 8 µs, total: 397 ms
Wall time: 397 ms


In [9]:
X = np.array(tokenizer.texts_to_sequences(X_data), dtype=np.int32)
y = np.array(list(map(word_index.get, y_data)))

In [10]:
print(X)
print(y)

[[14  7  3 ...  5  2 23]
 [22 10  9 ...  2 19  4]
 [13  8  5 ...  2 12  7]
 ...
 [ 2 18  4 ... 15  4 22]
 [13  4  2 ...  6  4  2]
 [10  4  2 ...  9  3  7]]
[4 7 6 ... 8 3 6]


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE)
print(f"X train shape: {X_train.shape}, Y train shape : {y_train.shape}, X test shape: {X_test.shape}, Y test shape: {y_test.shape}")

X train shape: (25406, 80), Y train shape : (25406,), X test shape: (6352, 80), Y test shape: (6352,)


In [12]:
len(tokenizer.word_index)

66

# Model creation

In [22]:
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, EMB_DIM, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(DROP))
model.add(Bidirectional(GRU(SEQ_UNITS, return_sequences=True, dropout=DROP,recurrent_dropout=DROP)))
model.add(SpatialDropout1D(DROP))
model.add(Bidirectional(GRU(SEQ_UNITS, return_sequences=True, dropout=DROP,recurrent_dropout=DROP)))
model.add(GlobalMaxPool1D())
model.add(Dense(len(tokenizer.word_index), activation='softmax'))

In [23]:
model.compile(
    optimizer=RMSprop(lr=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy']
)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 80, 256)           17152     
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 80, 256)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 80, 512)           787968    
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 80, 512)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 80, 512)           1181184   
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 66)                33858     
Total para

In [24]:
%%time

history = model.fit(X_train, y_train, 
                    epochs=EPOCHS, 
                    batch_size=BATCH_SIZE,
                    validation_split=VALIDATION_SPLIT,
                    callbacks=[EarlyStopping(monitor='val_loss',patience=1,
                                            min_delta=1e-7)]
                   )

Train on 20324 samples, validate on 5082 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
CPU times: user 7min 3s, sys: 36.1 s, total: 7min 39s
Wall time: 3min 4s


# Model testing

In [25]:
curr = X_test[0].copy().reshape((1, -1))
original = ''.join(map(tokenizer.index_word.get, curr[0]))

In [26]:
next_seq = []
while True:
    _next = model.predict_classes(curr)[0]
    next_char = tokenizer.index_word[_next]
    if next_char == ' ':
        break
    next_seq.append(next_char)
    curr[0, 0 : -1] = curr[0, 1 :]
    curr[0, -1] = _next


In [27]:
print(original, ''.join(next_seq))

 seriam tenues, e compradas a troco da solidão. sem filhos! não; impossivel. dis se


In [28]:
X_test[0]

array([ 2,  6,  4,  8,  7,  3,  9,  2, 13,  4, 11, 10,  4,  6, 16,  2,  4,
        2, 14,  5,  9, 17,  8,  3, 12,  3,  6,  2,  3,  2, 13,  8,  5, 14,
        5,  2, 12,  3,  2,  6,  5, 15,  7, 12, 25,  5, 20,  2,  6,  4,  9,
        2, 24,  7, 15, 19,  5,  6, 35,  2, 11, 25,  5, 27,  2,  7,  9, 17,
        5,  6,  6,  7, 18,  4, 15, 20,  2, 12,  7,  6], dtype=int32)

In [29]:
for i in range(30):
    curr = X_test[i].copy().reshape((1, -1))
    original = ''.join(map(tokenizer.index_word.get, curr[0]))
    blank_count = 0
    next_seq = []
    while True:
        _next = model.predict_classes(curr)[0]
        next_char = tokenizer.index_word[_next]
        if next_char == ' ':
            blank_count += 1
        if blank_count == 6:
            break
        next_seq.append(next_char)
        curr[0, 0 : -1] = curr[0, 1 :]
        curr[0, -1] = _next
        
    print(original, '--', ''.join(next_seq))

 seriam tenues, e compradas a troco da solidão. sem filhos! não; impossivel. dis -- se e pera verte e pera
deiramente christão. todavia, não neguei aos amigos as vantagens pecuniarias que --  fante a menha vertigo e
stume jantava ahi; mas, não tendo deliberadamente andado, nenhum merecimento da  -- meneito de meite de memente com
obrigado a aceitar as duas; creio que posso ser separadamente homem casado ou ho -- mente de meito de conteiro, e
os famulos, que naturalmente se desforravam assim da condição servil, e tudo iss -- e e a menos de por
sentar ao pé da filha do damasceno, uma d. eulalia, ou mais familiarmente nhã-ló --  e eu conteito de perteiro
te, ao menos, uma parte cheia de prazeres, de agitações, de sustos,— capeada de  -- porte em porte de memos e
 eu nasci, estava já em todo o explendor da gloria e do poder; era imperador e g -- ranha para de canteira cama cantava
 o humanitismo; elle é o grande regaço dos espiritos, o mar eterno em que mergul -- a e eu pera e eu
 indiffer

# Model Saving

In [30]:
model.save('bras_cubas_80-keras_model.h5')
with open('bras_cubas_80-index_word.json', 'w') as f:
    json.dump(tokenizer.index_word, f, ensure_ascii=False)
with open('bras_cubas_80-word_index.json', 'w') as f:
    json.dump(tokenizer.word_index, f, ensure_ascii=False)