In [1]:
import numpy as np
import tensorflow as tf
from itertools import chain
from keras.layers import Embedding, Dense, LSTM
from keras.losses import SparseCategoricalCrossentropy
from keras.optimizers import Adam
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

text = open('/Users/k/Docs/School/Tuebingen/Thesis/iscl-thesis/current_corpora/en_wiki_extractor.txt', 'r', encoding='utf-8').read().splitlines()

2023-11-23 13:45:19.485089: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [30]:
# Alphabet in language
chars = list(set(chain(*(char for line in text for char in line if line if not char.isspace()))))
# Unique set of tokens in text
tokens = list(set(chain(*(line.split() for line in text if line))))
# BOS char
chars.insert(0, '<')
# EOS char
chars.insert(1, '>')
vocab_size = len(chars)
max_len = len(max(tokens, key=len))

# Make into sequences
ch2idx = {c: i for i, c in enumerate(chars)}
idx2ch = {i: c for i, c in enumerate(chars)}
sequences = [chars[0] + t + chars[1] for t in tokens]

# Convert to tensors, add padding, split into X_train/y_train and test set
X = pad_sequences([[ch2idx[i] for i in seq[:-1]] for seq in sequences])
y = pad_sequences([[ch2idx[i] for i in seq[1:]] for seq in sequences])
split_index = int(0.8 * len(X))
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

In [52]:
batch_size = 32
num_epochs = 100
loss_function = SparseCategoricalCrossentropy()
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
optimizer = Adam()

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True))
model.add(LSTM(128, return_sequences=True))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss=loss_function, optimizer=optimizer, metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=num_epochs, batch_size=batch_size, callbacks=[callback], verbose=1)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 128)         3584      
                                                                 
 lstm_1 (LSTM)               (None, None, 128)         131584    
                                                                 
 dense_1 (Dense)             (None, None, 28)          3612      
                                                                 
Total params: 138780 (542.11 KB)
Trainable params: 138780 (542.11 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# def calc_perplexity(model, test_data):
#     test_sequences = tokenizer.texts_to_sequences(test_data)
#     test_sequences = np.array(test_sequences)
#     test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_len - 1, padding='pre')
#     total_log_loss = 0
#     total_words = 0

#     for sequence in test_sequences:
#         input_seq = sequence[:-1]
#         target = sequence[-1]

#         predicted = model.predict(input_seq, verbose=0)[0]
#         if target != 0:
#             total_log_loss += np.log(predicted[int(target)])
#             total_words += 1

#     avg_log_loss = total_log_loss / total_words
#     perplexity = np.exp(-avg_log_loss)
#     return perplexity

# # Test data for calculating perplexity
# test_data = ["Finally, here is the third sentence.", "This is the first sentence."]
# perplexity = calc_perplexity(model, test_data)
# print(f"Perplexity: {perplexity}")
