In [2]:
import numpy as np
from keras.layers import LSTM, Dense, Embedding, Input
from keras.models import Sequential

# Preprocess the sentences
def preprocess_sentence(sentence):
    """Blank out punctuation and digits, then drop stop words.

    Each character that appears in the exclusion string is replaced by a
    space. The result is split on whitespace, words whose lowercase form is
    found in the module-level ``stopwoorden`` collection are discarded, and
    the surviving words are joined with single spaces.

    Args:
        sentence: The raw text to clean.

    Returns:
        The cleaned text; the original casing of kept words is preserved.
    """
    to_exclude = "/.%-,'\":;()[]0123456789"
    blanked = "".join(" " if symbol in to_exclude else symbol for symbol in sentence)

    kept_words = []
    for word in blanked.split():
        # Stop words are compared case-insensitively; kept words are not lowercased.
        if word.lower() in stopwoorden:
            continue
        kept_words.append(word)
    return " ".join(kept_words)

# Build (context window, next symbol) pairs
def create_pairs(corpus, sequence_length):
    """Slide a fixed-size window over every sequence in ``corpus``.

    Each item of ``corpus`` is expanded with ``list()`` (so a string is split
    into its characters). For every window of ``sequence_length`` consecutive
    symbols that is followed by at least one more symbol, the window is
    recorded as a context and the symbol right after it as the target.

    Args:
        corpus: Iterable of sequences (e.g. strings).
        sequence_length: Number of symbols in each context window.

    Returns:
        A ``(contexts, targets)`` tuple of two equally long lists: each
        context is a list of ``sequence_length`` symbols, each target is the
        symbol that follows its context. Sequences that are not longer than
        ``sequence_length`` contribute nothing.
    """
    contexts = []
    targets = []
    for sequence in corpus:
        symbols = list(sequence)
        window_count = len(symbols) - sequence_length
        contexts.extend(
            symbols[start:start + sequence_length] for start in range(window_count)
        )
        targets.extend(
            symbols[start + sequence_length] for start in range(window_count)
        )
    return contexts, targets

# Load the raw data.
# The encoding is given explicitly so the text is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open("wiki.txt", "r", encoding="utf-8") as file:
    # Keep only lines that contain at least 10 whitespace-separated words.
    wiki_text = [line for line in map(str.strip, file) if len(line.split()) >= 10]

with open("stopwoorden.txt", "r", encoding="utf-8") as file:
    # Filter on the stripped line: a raw line still carries its newline and is
    # therefore always truthy, which would let blank lines through as "".
    stopwoorden = [word for word in map(str.strip, file) if word]

# Preprocess every line and merge the results into one long string
processed_sentences = list(map(preprocess_sentence, wiki_text))
text = " ".join(processed_sentences)

# Unique characters and the lookup tables in both directions
chars = sorted(set(text))
char_to_idx = dict(zip(chars, range(len(chars))))
idx_to_char = dict(enumerate(chars))

# Build the (context window, next character) training pairs
sequence_length = 40  # Length of the input sequence
X, y = create_pairs([text], sequence_length)

# One-hot encode the data: one boolean vector of vocabulary size per character
X_encoded = np.zeros((len(X), sequence_length, len(chars)), dtype=np.bool_)
y_encoded = np.zeros((len(y), len(chars)), dtype=np.bool_)

for row, (window, target) in enumerate(zip(X, y)):
    y_encoded[row, char_to_idx[target]] = True
    for step, symbol in enumerate(window):
        X_encoded[row, step, char_to_idx[symbol]] = True

# Build the model: one LSTM layer followed by a softmax over the vocabulary.
# The input shape is declared with an explicit Input object, which Keras
# recommends over passing `input_shape` to the first layer of a Sequential
# model.
model = Sequential([
    Input(shape=(sequence_length, len(chars))),
    LSTM(128),
    Dense(len(chars), activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_encoded, y_encoded, batch_size=64, epochs=20)

# Generate a new sequence from a seed
def generate_sequence(seed, length=200):
    """Extend ``seed`` greedily, one character at a time, using ``model``.

    The last ``sequence_length`` characters of ``seed`` are one-hot encoded
    into the input window. At every step the character with the highest
    predicted score is appended and the window is shifted one position.

    Args:
        seed: Starting text. Only its last ``sequence_length`` characters are
            fed to the model; a shorter seed is right-aligned in the window
            and the leading time steps stay all-zero.
        length: Number of characters to generate.

    Returns:
        ``seed`` followed by the ``length`` generated characters.

    Raises:
        KeyError: If the part of ``seed`` that is fed to the model contains a
            character that is not a key of ``char_to_idx``.
    """
    input_sequence = np.zeros((1, sequence_length, len(chars)), dtype=np.bool_)

    # Use only the tail of the seed so a long seed cannot index past the
    # window, and right-align it so the newest character sits in the last
    # time step, exactly where generated characters are written below.
    window = seed[max(len(seed) - sequence_length, 0):]
    offset = sequence_length - len(window)
    for t, char in enumerate(window):
        input_sequence[0, offset + t, char_to_idx[char]] = True

    # Collect the generated characters in a list and join once at the end
    # instead of growing a string inside the loop.
    generated = []
    for _ in range(length):
        prediction = model.predict(input_sequence, verbose=0)
        next_char_idx = int(np.argmax(prediction))
        generated.append(idx_to_char[next_char_idx])

        # Slide the input window: drop the oldest step, append the new one.
        input_sequence = np.roll(input_sequence, -1, axis=1)
        input_sequence[0, -1, :] = False
        input_sequence[0, -1, next_char_idx] = True

    return seed + "".join(generated)

# Try it out: seed the generator with the first window of the training text
seed_text = text[0:sequence_length]
generated_sequence = generate_sequence(seed=seed_text)
print("Generated sequence:", generated_sequence)



  super().__init__(**kwargs)


Epoch 1/20
186/186 ━━━━━━━━━━━━━━━━━━━━ 5s 19ms/step - accuracy: 0.1580 - loss: 3.2058
Epoch 2/20
186/186 ━━━━━━━━━━━━━━━━━━━━ 3s 18ms/step - accuracy: 0.2271 - loss: 2.8021
Epoch 3/20
186/186 ━━━━━━━━━━━━━━━━━━━━ 3s 19ms/step - accuracy: 0.2615 - loss: 2.5659
Epoch 4/20
186/186 ━━━━━━━━━━━━━━━━━━━━ 3s 18ms/step - accuracy: 0.3026 - loss: 2.4030
Epoch 5/20
186/186 ━━━━━━━━━━━━━━━━━━━━ 3s 17ms/step - accuracy: 0.3218 - loss: 2.3144
Epoch 6/20
186/186 ━━━━━━━━━━━━━━━━━━━━ 3s 18ms/step - accuracy: 0.3461 - loss: 2.2333
Epoch 7/20
186/186 ━━━━━━━━━━━━━━━━━━━━ 3s 18ms/step - accuracy: 0.3642 - loss: 2.1735
Epoch 8/20
186/186 ━━━━━━━━━━━━━━━━━━━━ 4s 20ms/step - accuracy: 0.3824 - loss: 2.1220
Epoch 9/20
[1m186/186[0m [32m