In [1]:
import numpy as np
import pandas as pd
np.random.seed(42)
import tensorflow as tf
tf.set_random_seed(42)
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout
from keras.layers import TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

from keras.layers.core import Dense, Activation, Dropout, RepeatVector
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import sys
import heapq
import seaborn as sns
from pylab import rcParams

import unicodedata

import json
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [24]:
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential, Input
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping

from keras.optimizers import RMSprop, Adam
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

# SEQUENCE SIZE PARAMETERS
CHARACTER_NUMBER_PREDICTION = 40
DATA_SET_COLLECTION_ITERATIONS = 50_000
SET_VARIABILITY = 3

# RNN PARAMETERS
EMB_DIM = 256
SEQ_UNITS = 128
DROP = .1

# TRAINING PARAMETERS
TEST_SIZE = 0.2
EPOCHS = 400
BATCH_SIZE = 256
VALIDATION_SPLIT = .2

In [25]:
contentDf = pd.read_csv('bras_cubas_paragraphs.csv')

# Dataset preparation

In [26]:
def strip_accents(text):

    try:
        text = unicode(text, 'utf-8')
    except NameError: # unicode is a default on python 3 
        pass

    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")

    return str(text)

In [27]:
# Arrays we'll use to store our dataset
X_data = []
y_data = []

# Looping through our corpus...
for i in range(DATA_SET_COLLECTION_ITERATIONS):
    # ... selecting a random paragraph from our book...
    paragraphIndex = np.random.randint(0, len(contentDf.paragraphs))
    currentParagraph = contentDf.paragraphs[paragraphIndex]
  
    # ... sampling a slice of the selected paragraph...
    #   aux = 1
    #   dontAddThisSample = False
    #   while(len(currentParagraph) - CHARACTER_NUMBER_PREDICTION + 1 < CHARACTER_NUMBER_PREDICTION + 1):
    #     # currentParagraph += contentDf.paragraphs[np.random.randint(0, len(contentDf.paragraphs))]
    #     if (paragraphIndex + aux < len(contentDf.paragraphs)):
    #         currentParagraph += contentDf.paragraphs[paragraphIndex + aux]
    #         aux += 1
    #     else:
    #         dontAddThisSample = True
    #         break;
        
    #   if (dontAddThisSample):
    #     continue

    if (len(currentParagraph) < CHARACTER_NUMBER_PREDICTION + 1):
        continue
        
        
    
    paragraphRegion = np.random.randint(0, len(currentParagraph) - CHARACTER_NUMBER_PREDICTION)
    
    # Checking how many different chars are in the selected paragraph region
    nChars = len(set(currentParagraph[paragraphRegion : paragraphRegion + CHARACTER_NUMBER_PREDICTION]))
    
    if (nChars < SET_VARIABILITY):
        continue
  
    
    # Adding an excerpt of the paragraph to our X and y data.
    X_data.append(currentParagraph[paragraphRegion : paragraphRegion + CHARACTER_NUMBER_PREDICTION].casefold())
    y_data.append(currentParagraph[paragraphRegion + CHARACTER_NUMBER_PREDICTION].casefold())
  

In [28]:
len(X_data), len(y_data)

(37923, 37923)

In the cells below, we'll instantiate and fit a tokenizer.

In [29]:
tokenizer = Tokenizer(
    num_words=500,
    char_level=True,
    filters=None,
    lower=False,
    oov_token=chr(1),
)

In [30]:
%%time

tokenizer.fit_on_texts(X_data)
tokenizer.fit_on_texts(y_data)
word_index = tokenizer.word_index
index_word = tokenizer.index_word

CPU times: user 262 ms, sys: 0 ns, total: 262 ms
Wall time: 261 ms


In [31]:
X = np.array(tokenizer.texts_to_sequences(X_data), dtype=np.int32)
y = np.array(list(map(word_index.get, y_data)))

In [32]:
print(X)
print(y)

[[ 2 12  4 ... 12  4  2]
 [ 5  2  4 ...  4  2 16]
 [ 2 14  3 ... 22 10  4]
 ...
 [ 2  4 15 ... 25  7 15]
 [ 2  3  2 ...  2  5  7]
 [12  7  3 ...  3  2  8]]
[ 9  5 20 ... 19 13  4]


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE)
print(f"X train shape: {X_train.shape}, Y train shape : {y_train.shape}, X test shape: {X_test.shape}, Y test shape: {y_test.shape}")

X train shape: (30338, 40), Y train shape : (30338,), X test shape: (7585, 40), Y test shape: (7585,)


In [34]:
len(tokenizer.word_index)

66

# Model creation

In [35]:
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, EMB_DIM, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(DROP))
model.add(Bidirectional(GRU(SEQ_UNITS, return_sequences=True, dropout=DROP,recurrent_dropout=DROP)))
model.add(SpatialDropout1D(DROP))
model.add(Bidirectional(GRU(SEQ_UNITS, return_sequences=True, dropout=DROP,recurrent_dropout=DROP)))
model.add(GlobalMaxPool1D())
model.add(Dense(len(tokenizer.word_index), activation='softmax'))

In [36]:
model.compile(
    optimizer=RMSprop(lr=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy']
)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 40, 256)           17152     
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 40, 256)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 40, 256)           295680    
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 40, 256)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 40, 256)           295680    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 66)                16962     
Total para

In [37]:
%%time

history = model.fit(X_train, y_train, 
                    epochs=EPOCHS, 
                    batch_size=BATCH_SIZE,
                    validation_split=VALIDATION_SPLIT,
                    callbacks=[EarlyStopping(monitor='val_loss',patience=1,
                                            min_delta=1e-7)]
                   )

Train on 24270 samples, validate on 6068 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
CPU times: user 6min 46s, sys: 31.5 s, total: 7min 18s
Wall time: 2min 58s


# Model testing

In [38]:
curr = X_test[0].copy().reshape((1, -1))
original = ''.join(map(tokenizer.index_word.get, curr[0]))

In [39]:
next_seq = []
while True:
    _next = model.predict_classes(curr)[0]
    next_char = tokenizer.index_word[_next]
    if next_char == ' ':
        break
    next_seq.append(next_char)
    curr[0, 0 : -1] = curr[0, 1 :]
    curr[0, -1] = _next


In [40]:
print(original, ''.join(next_seq))

i-ni-ti-va! repetiu, batendo as syllabas 


In [41]:
X_test[0]

array([ 7, 26, 11,  7, 26, 13,  7, 26, 18,  3, 34,  2,  8,  4, 16,  4, 13,
        7, 10, 17,  2, 23,  3, 13,  4, 11, 12,  5,  2,  3,  6,  2,  6, 40,
       15, 15,  3, 23,  3,  6], dtype=int32)

In [42]:
for i in range(30):
    curr = X_test[i].copy().reshape((1, -1))
    original = ''.join(map(tokenizer.index_word.get, curr[0]))
    blank_count = 0
    next_seq = []
    while True:
        _next = model.predict_classes(curr)[0]
        next_char = tokenizer.index_word[_next]
        if next_char == ' ':
            blank_count += 1
        if blank_count == 6:
            break
        next_seq.append(next_char)
        curr[0, 0 : -1] = curr[0, 1 :]
        curr[0, -1] = _next
        
    print(original, '--', ''.join(next_seq))

i-ni-ti-va! repetiu, batendo as syllabas --  de corraço. e despor a
ual! passei mal a noite; o diabo da asth -- amor. de corricar a corrica. de
já em opposição, entrando nesse numero o --  perdo do corrica. de corricar
 tempo de levantar e espairecer, como um --  corrica. a maria de um
 vê. morrer, meu anjo? que idéas são ess -- e a mão de cara a
fica o espirito humano,     supprime a d -- espor a morte. e de corricar
já lhe fui agradecer este signal de cons -- entente. não sei a maria de
sacudir dos olhos a ceremonia do enterro -- . não sei a maria de
goista! prefere ver-me padecer todos os  -- outros. e de cara a marida
 a fizeram. abençoadas pernas! e ha quem --  de cara a marida de
mais a negar que era noiva excellente; m -- as outra de casa. a maria
murei eu olhando para o tecto do corredo -- ..... e disse eu.... não sei
o politica eram bens dignos de apreço; o --  menhora de cara a menhora
a velha prataria do tempo de d. josé i,  -- despor a marido. e disse elle.
morta!" esta ult

# Model Saving

In [43]:
model.save('bras_cubas_40-keras_model.h5')
with open('bras_cubas_40-index_word.json', 'w') as f:
    json.dump(tokenizer.index_word, f, ensure_ascii=False)
with open('bras_cubas_40-word_index.json', 'w') as f:
    json.dump(tokenizer.word_index, f, ensure_ascii=False)