# Teaching a Model to Write using KerasNLP - An Introduction



In [None]:
import os
import requests

import numpy as np
import regex as re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from bs4 import BeautifulSoup

In [None]:
# Fetch one text file from the 68_german_novels corpus repository and save it
# as text1.txt. The three further downloads below are disabled.
!wget https://raw.githubusercontent.com/computationalstylistics/68_german_novels/master/corpus/achleitner_bergrichters.txt -O text1.txt
# !wget https://raw.githubusercontent.com/computationalstylistics/68_german_novels/master/corpus/achleitner_celsissimus.txt -O text2.txt
# !wget https://raw.githubusercontent.com/computationalstylistics/68_german_novels/master/corpus/achleitner_tann.txt -O text3.txt
# !wget https://raw.githubusercontent.com/computationalstylistics/68_german_novels/master/corpus/anonym_schwester.txt -O text4.txt

In [None]:
# Scrape the "Egal wie ..." jokes from the page and store them one per line.
url = "https://einfachreisenmitkind.de/egal-wie-witze-sprueche/"
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Keep only the paragraphs whose text starts with the joke prefix.
jokes = [
    paragraph.text.strip()
    for paragraph in soup.find_all("p")
    if paragraph.text.startswith("Egal wie")
]

# Explicit UTF-8 so umlauts are written identically on every platform.
with open("jokes.txt", "w", encoding="utf-8") as f:
    f.writelines(joke + "\n" for joke in jokes)

# Guard the example lookup: jokes[-1] raises IndexError on an empty list.
if jokes:
    print(f"There are {len(jokes)} jokes. An example is '{jokes[-1]}'.")
else:
    print("There are 0 jokes.")

In [None]:
# Concatenate the downloaded text and the scraped jokes into data.txt.
!cat text1.txt jokes.txt > data.txt

In [None]:
# Load the dataset
def file_to_sentence_list(file_path):
    """Read a text file and split its content into sentences.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of non-empty sentence strings with surrounding whitespace
        removed. Sentence-ending punctuation stays attached to its sentence.
    """
    # Decode explicitly as UTF-8 rather than with the platform default, so
    # non-ASCII characters (e.g. German umlauts) are read the same everywhere.
    # NOTE(review): assumes the input files are UTF-8 encoded — confirm.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Split on whitespace that follows '.', '!' or '?'. The lookbehind keeps
    # the delimiter with the preceding sentence.
    sentences = [sentence.strip() for sentence in re.split(
        r'(?<=[.!?])\s+', text) if sentence.strip()]

    return sentences

# Read the combined data file and split it into sentences
file_path = './data.txt'
text_data = file_to_sentence_list(file_path)

# Report how many sentences were extracted
sentence_count = len(text_data)
print(sentence_count)

In [None]:
# Build the word index from all sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
# Vocabulary size used for the embedding and output layers
total_words = 1 + len(tokenizer.word_index)

# Expand every encoded sentence into all of its prefixes of length >= 2;
# each prefix later yields one (context, next word) training pair
input_sequences = []
for encoded in tokenizer.texts_to_sequences(text_data):
    input_sequences.extend(
        encoded[:end] for end in range(2, len(encoded) + 1)
    )

# Left-pad every prefix to the length of the longest one
max_sequence_length = max(map(len, input_sequences))
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_length,
    padding='pre',
)

print(f"Total words in the dataset: {total_words}")

In [None]:
import numpy as np

# Split each padded sequence into its prefix (model input) and its final
# token (the label to predict).
X, y = input_sequences[:, :-1], input_sequences[:, -1]
# One-hot encode the labels. to_categorical builds the (samples, total_words)
# matrix directly; indexing np.eye(total_words) would first materialise a
# dense total_words x total_words identity matrix.
y = to_categorical(y, num_classes=total_words)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Next-word classifier: 50-dimensional embedding -> 100-unit LSTM -> softmax
# over the whole vocabulary
model = Sequential([
    Embedding(total_words, 50, input_length=max_sequence_length-1),
    LSTM(100),
    Dense(total_words, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Print the model summary
model.summary()

In [None]:
# Train for 50 epochs, holding out 20% of the samples for validation
model.fit(X, y, epochs=50, verbose=1, validation_split=0.2)

In [None]:
seed_text = "Langsam ging der Richter von Fenster"
# Greedy generation: append the single most likely next word, up to 10 times
for _ in range(10):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
    # verbose=0 suppresses the per-call progress output, matching the later
    # prediction cells
    predicted_word_index = np.argmax(model.predict(token_list, verbose=0), axis=-1)
    predicted_word = tokenizer.index_word.get(int(predicted_word_index[0]))
    if predicted_word is None:
        # The predicted index has no vocabulary entry; stop instead of
        # raising KeyError
        break
    seed_text += " " + predicted_word
print(seed_text)

In [None]:
import pickle as pkl

# Store the model and the tokenizer together in one pickle file
artifacts = [model, tokenizer]
with open('model_tokenizer_text_jokes.pickle', 'wb') as handle:
    pkl.dump(artifacts, handle, protocol=pkl.HIGHEST_PROTOCOL)

In [None]:
# Restore the model and the tokenizer that were pickled together
with open('./model_tokenizer_text_jokes.pickle', 'rb') as file:
    restored = pkl.load(file)
model_1, tokenizer_1 = restored

In [None]:
# List the working directory in long format, most recently modified last
!ls -lrt

In [None]:
# Preprocess the input seed text with the reloaded tokenizer
seed_text = "Langsam ging der Richter von Fenster"
token_list = tokenizer_1.texts_to_sequences([seed_text])[0]
token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')

# Use the reloaded model to predict the next word
predicted_word_index = np.argmax(model_1.predict(token_list, verbose=0), axis=-1)
# Decode with the same tokenizer that encoded the seed (tokenizer_1), so the
# cell does not depend on the training-time `tokenizer` object
predicted_word = tokenizer_1.index_word[predicted_word_index[0]]

print(f"Predicted sentence: {seed_text} {predicted_word}")

In [None]:
# Preprocess the input seed text with the reloaded tokenizer
seed_text = "Egal wie hart du bist, sie sind"
token_list = tokenizer_1.texts_to_sequences([seed_text])[0]
token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')

# Use the reloaded model to predict the next word
predicted_word_index = np.argmax(model_1.predict(token_list, verbose=0), axis=-1)
# Decode with the same tokenizer that encoded the seed (tokenizer_1), so the
# cell does not depend on the training-time `tokenizer` object
predicted_word = tokenizer_1.index_word[predicted_word_index[0]]

print(f"Predicted sentence: {seed_text} {predicted_word}")