In [44]:
# Using Keras to generate text based on the Dutch Novela "Wolfsjong"

In [1]:
import numpy
import sys
import keras
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
import nltk
nltk.download('stopwords')
import os

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
file = open("/content/drive/MyDrive/Colab Notebooks/wolfsjong/wolfsjong-kevin-van-vliet.txt").read()


In [5]:
def tokenize_words(input):
    input = input.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    filtered = filter(lambda token: token not in stopwords.words('dutch'), tokens)
    return " ".join(filtered)

In [6]:
processed_inputs = tokenize_words(file)
chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c, i) for i, c in enumerate(chars))
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [7]:
input_len = len(processed_inputs)
vocab_len = len(chars)
print ("Total no. of characters:", input_len)
print ("Total vocabulary:", vocab_len)


Total no. of characters: 65917
Total vocabulary: 42


In [8]:
seq_length = 50
steps = 3
x_data = []
y_data = []

In [9]:
for i in range(0, input_len - seq_length, steps):
    in_seq = processed_inputs[i:i + seq_length]
    out_seq = processed_inputs[i + seq_length]
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [10]:
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 21956


In [11]:
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [12]:
y = np_utils.to_categorical(y_data)

In [13]:
def create_model_v1():
    model = Sequential()
    model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(256, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(128))
    model.add(Dropout(0.2))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

In [14]:
check_point_filepath = "/content/drive/MyDrive/Colab Notebooks/wolfsjong/weights_checkpoint/wolfsjong_check_point"
checkpoint_dir = os.path.dirname(check_point_filepath)
model_checkpoint = ModelCheckpoint(filepath=check_point_filepath,monitor="loss",save_weights_only=False,mode='min',save_best_only=True)
desired_callbacks = [model_checkpoint]

In [15]:
# Check for previously trained model
if os.path.exists(check_point_filepath):
  print("Loading pre-existing model")
  model = keras.models.load_model(check_point_filepath)
else:
  print("Creating new model")
  model = create_model_v1()


model.fit(X, y, epochs=5, batch_size=256 ,callbacks=desired_callbacks,verbose=1)


Loading pre-existing model
Epoch 1/5




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/wolfsjong/weights_checkpoint/wolfsjong_check_point/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/wolfsjong/weights_checkpoint/wolfsjong_check_point/assets


Epoch 2/5




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/wolfsjong/weights_checkpoint/wolfsjong_check_point/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/wolfsjong/weights_checkpoint/wolfsjong_check_point/assets


Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fd100eaa0f0>

In [33]:
seed1 = 'Oskars woonboerderij was nauwelijks ingericht'
seed2 = 'Heuvele aan Zee had niets van dit alles'
seed3 = 'Hij had een tragisch maar buitengewoon leven'

In [42]:
for seed in [seed1,seed2,seed3]:
  print("\n Seed phrase:..... ",seed )
  pattern = [char_to_num[value] for value in seed1.lower()]
  for i in range(400):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]


 Seed phrase:.....  Oskars woonboerderij was nauwelijks ingericht
 tteetder gewer pvtooe gerste kelewer alle maamt derttette pntenotanten jaan oieuwenijk jeee glors giedn vrelmen gevonnlams gersaart bijzooden bosteldn kanker meegser goddgtogk ojemrt wrrrwen blondi baanoen vijken leeg geel jnmg jeuen stttt eekooen be bftis berraan gred gooode vert ai blvecht maten genadht wel wiehteld best lee oukar gelekaar ziekenhuis zouden verderon saaiten manker mnett oet gek
 Seed phrase:.....  Heuvele aan Zee had niets van dit alles
 tteetder gewer pvtooe gerste kelewer alle maamt derttette pntenotanten jaan oieuwenijk jeee glors giedn vrelmen gevonnlams gersaart bijzooden bosteldn kanker meegser goddgtogk ojemrt wrrrwen blondi baanoen vijken leeg geel jnmg jeuen stttt eekooen be bftis berraan gred gooode vert ai blvecht maten genadht wel wiehteld best lee oukar gelekaar ziekenhuis zouden verderon saaiten manker mnett oet gek
 Seed phrase:.....  Hij had een tragisch maar buitengewoon leven
 tteet