In [None]:
import os 
import sys 
import pandas as pd
import numpy as np

sys.path.insert(0, os.path.join("..", "src"))
import config 
import review 

# Load the processed reviews and keep only the rows flagged as successfully parsed.
# NOTE: read_pickle can execute arbitrary code; only load files produced by this project.
path = os.path.join(config.DIR_PROCESSED, "reviews.pkl")
df = pd.read_pickle(path)
df = df[df[review.PARSED_SUCCESS] == True]

# Build one lower-cased corpus string from the first 100 reviews.
# Join with a space: without a separator the last word of one review is glued
# to the first word of the next and becomes a bogus token.
texts = df.head(100)["text"].str.cat(sep=" ").lower()

# print(texts)
# docs = [t for t in texts]
# text = text.lower()


In [None]:
from keras.preprocessing.text import Tokenizer 

num_words = 100000  # vocabulary cap given to the Tokenizer; also the one-hot width used below
maxlen = 3          # number of context words per training window
# samples = 100000
step = 2            # stride (in words) between consecutive windows

# Characters stripped from the text before splitting into words.
# NOTE(review): "." is not in this list, so periods stay attached to tokens - confirm this is intended.
filters = "!\"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n'…›‹`´‛”▪∅€†’"
tokenizer = Tokenizer(num_words = num_words, filters=filters)

print(len(texts))

# The whole corpus is treated as a single document.
tokenizer.fit_on_texts([texts])
seq = tokenizer.texts_to_sequences([texts])[0]

# The generator expects a list of documents; a bare string would be iterated
# character by character.
generator = tokenizer.texts_to_sequences_generator([texts])
print(type(generator))


In [None]:
# Cut the token sequence into semi-redundant windows of `maxlen` words.
# Sample k covers seq[k*step : k*step + maxlen]; its label is the word that follows.
max_samples = max(0, (len(seq) - maxlen) // step)
print(max_samples)

# Derive the sample count from the data so the cell runs on a fresh kernel and
# every window (plus its label) stays inside `seq`.
samples = max_samples

print('Creating samples...')
# Builtin `bool` replaces the `np.bool` alias, which newer NumPy releases no longer provide.
x = np.zeros((samples, maxlen, num_words), dtype=bool)  # one-hot input windows
y = np.zeros((samples, num_words), dtype=bool)  # one-hot next word
print(x.shape)
print(y.shape)

for k in range(samples):
    # Row index k and token offset `start` are distinct: rows are dense,
    # offsets advance by `step`.
    start = k * step
    for t, word in enumerate(seq[start:start + maxlen]):
        x[k, t, word] = 1
    y[k, seq[start + maxlen]] = 1

In [None]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

# Model: one LSTM layer over one-hot word windows, followed by a softmax
# over the vocabulary that scores the next word.
print('Build model...')
model = Sequential([
    LSTM(128, input_shape=(maxlen, num_words)),
    Dense(num_words, activation='softmax'),
])

optimizer = RMSprop(learning_rate=0.01)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
)


def sample(preds, temperature=1.0):
    """Draw one index from a probability array after temperature scaling.

    The probabilities are moved to log space, divided by ``temperature``,
    re-normalised, and a single multinomial draw selects the returned index.
    Lower temperatures sharpen the distribution; higher ones flatten it.
    """
    log_probs = np.log(np.asarray(preds).astype('float64'))
    weights = np.exp(log_probs / temperature)
    distribution = weights / np.sum(weights)
    # One trial, one experiment: the result is a one-hot row marking the pick.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)


def on_epoch_end(epoch, _):
    """Print text generated by the current model at the end of an epoch.

    One random seed window of ``maxlen`` token ids is taken from ``seq``. For
    each sampling temperature ("diversity") up to 50 words are generated from
    that seed and written to stdout as they are produced.

    Args:
        epoch: Epoch index supplied by the Keras callback.
        _: Logs argument supplied by the Keras callback; unused.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    if len(seq) <= maxlen:
        # Too few tokens to draw a seed window; random.randint would raise.
        return

    start_index = random.randint(0, len(seq) - maxlen - 1)
    seed_ids = list(seq[start_index: start_index + maxlen])
    seed_text = " ".join(tokenizer.index_word[word] for word in seed_ids)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed_text + '"')
        sys.stdout.write(seed_text)

        # Each diversity starts again from the same seed window.
        window = list(seed_ids)

        for _attempt in range(50):
            x_pred = np.zeros((1, maxlen, num_words))
            for t, word in enumerate(window):
                x_pred[0, t, word] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = int(sample(preds, diversity))

            # Skip ids with no word: 0 is never assigned by the tokenizer, and
            # ids above the fitted vocabulary size have no entry either.
            next_word = tokenizer.index_word.get(next_index)
            if next_word is None:
                continue

            # Slide the window so the next prediction is conditioned on the
            # word just generated rather than on the unchanged seed.
            window = window[1:] + [next_index]

            sys.stdout.write(" " + next_word)
            sys.stdout.flush()
        print()

# Train the model; after each epoch the callback prints sample generated text.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

hist = model.fit(
    x,
    y,
    batch_size=128,
    epochs=60,
    callbacks=[print_callback],
)