**Text source: Nietzsche texts from Project Gutenberg**

**The link is here --> https://www.kaggle.com/datasets/pankrzysiu/nietzsche-texts**


**Results using the Keras tokenizer with lemmatization are shown below**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import random
import pickle
import os
import re

In [None]:
import tensorflow as tf

# Reset Keras' global state so the models built below start from a clean session.
tf.keras.backend.clear_session()

In [None]:
from google.colab import drive

# Mount Google Drive so the corpus stored there is reachable from the Colab VM.
drive.mount('/content/drive')

# Read the whole corpus into memory as one string.
# The encoding is passed explicitly so decoding does not depend on the
# runtime's locale default.
with open('/content/drive/MyDrive/Colab Notebooks/Raw data/nietzsche.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

Mounted at /content/drive


In [None]:
from tensorflow.keras.callbacks import TensorBoard
import datetime

# Create a log directory
import os
# Timestamped TensorBoard log directory for the lemmatization experiment.
log_dir1 = f"logs/fit/{datetime.datetime.now():%Y%m%d-%H%M%S}-tokenizermodel1"
os.makedirs(log_dir1, exist_ok=True)

In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Fetch the NLTK corpora: the stopword list and WordNet.
nltk.download('stopwords')
nltk.download('wordnet')
# Module-level stemmer instance; note that preprocessing() creates its own
# PorterStemmer when stemming is enabled.
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
import string
from nltk.stem import WordNetLemmatizer, PorterStemmer
from keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec

def preprocessing(text, lemmatize=True, stemming=False, tokenizer=None, return_type='word_sequences'):
    """Clean a raw corpus, optionally lemmatize/stem it, and encode it with a Keras ``Tokenizer``.

    Pipeline: lowercase -> strip ``string.punctuation`` -> drop all-digit
    tokens -> (optional) WordNet lemmatization -> (optional) Porter stemming
    -> fit the tokenizer on the cleaned text.

    Args:
        text: Raw corpus as a single string.
        lemmatize: Lemmatize every word with ``WordNetLemmatizer``.
        stemming: Stem every word with ``PorterStemmer``. When combined with
            ``lemmatize`` the stemmer runs on the lemmatized words.
        tokenizer: Keras ``Tokenizer`` to fit. When ``None`` a new one is
            created with ``num_words=10000`` and ``oov_token='<OOV>'``.
            The tokenizer is fitted on every call; ``fit_on_texts``
            accumulates counts, so passing the same tokenizer for different
            preprocessing settings merges their vocabularies.
        return_type: Which result to return:
            ``'word_sequences'`` (list of token ids), ``'vocab_size'``
            (``len(tokenizer.word_index)``), ``'unique_words'`` (the keys of
            ``tokenizer.word_index`` as a list), ``'tokenizer'`` (the fitted
            tokenizer) or ``'text'`` (the cleaned text).

    Returns:
        The object selected by ``return_type``.

    Raises:
        ValueError: If ``return_type`` is not one of the values above.
    """
    valid_return_types = ('word_sequences', 'vocab_size', 'unique_words', 'tokenizer', 'text')
    if return_type not in valid_return_types:
        # Fail before doing any (potentially slow) text processing.
        raise ValueError('Invalid return type')

    # Step 1: Make lowercase
    lowered = text.lower()

    # Step 2: Remove punctuations
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))

    # Step 3: Remove numbers
    # From here on the corpus is kept as a list of words, so every optional
    # step below works no matter which combination of flags is enabled.
    words = [word for word in without_punctuation.split() if not word.isdigit()]

    # Step 4: Lemmatize the words
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    # Step 5: Stem the words
    if stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    text = ' '.join(words)

    if tokenizer is None:
        tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')

    # Fitting always happens, including for return_type='text', so a
    # caller-supplied tokenizer is updated exactly as before.
    tokenizer.fit_on_texts([text])

    if return_type == 'word_sequences':
        return tokenizer.texts_to_sequences([text])[0]
    if return_type == 'vocab_size':
        return len(tokenizer.word_index)
    if return_type == 'unique_words':
        return list(tokenizer.word_index.keys())
    if return_type == 'tokenizer':
        return tokenizer
    return text



In [None]:
# Tokenizer shared by the lemmatization cells below; num_words caps the ids used
# when encoding and words outside that range are encoded as the '<OOV>' token.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')

In [None]:
# Vocabulary size of the lemmatized corpus (number of entries in tokenizer.word_index).
# Side effect: this call fits `tokenizer` on the lemmatized text.
total_words=preprocessing(text, return_type='vocab_size', tokenizer=tokenizer)
print(total_words)

10535


In [None]:
# Encode the lemmatized corpus as a flat list of token ids
# (default return_type='word_sequences'); `tokenizer` is fitted again by this call.
text_lemmatized_tokenizer = preprocessing(text, tokenizer=tokenizer)

In [None]:
# Sanity check: smallest token id present in the encoded corpus.
min(text_lemmatized_tokenizer)

1

In [None]:
# Number of tokens in the encoded corpus.
len(text_lemmatized_tokenizer)

98615

In [None]:
# Slide a window over the encoded corpus: every run of four consecutive
# token ids is a context, and the id right after it is the prediction target.
prev_words = [
    text_lemmatized_tokenizer[start:start + 4]
    for start in range(len(text_lemmatized_tokenizer) - 4)
]
next_words = [
    text_lemmatized_tokenizer[start + 4]
    for start in range(len(text_lemmatized_tokenizer) - 4)
]

# Materialise both lists as numpy arrays, the format fed to the LSTM model.
X_tokenizer = np.array(prev_words)
y_tokenizer = np.array(next_words)

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets (80/20, fixed random_state).
# NOTE(review): the context windows are built with stride 1 and therefore overlap,
# so a shuffled split places near-duplicate contexts in both sets; validation
# accuracy may be optimistic — confirm this is acceptable.
X_train, X_valid, y_train, y_valid = train_test_split(X_tokenizer, y_tokenizer, test_size=0.2, random_state=42)

In [None]:
# Shape of the training contexts: (number of samples, window length).
X_train.shape

(78888, 4)

In [None]:
# One target token id per training context.
y_train.shape

(78888,)

In [None]:
!pip install keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-tuner
  Downloading keras_tuner-1.3.5-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.3.5 kt-legacy-1.0.5


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.models import Sequential
from kerastuner import HyperModel, RandomSearch
from tensorflow.keras.callbacks import ReduceLROnPlateau
from keras_tuner import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters as hp
from tensorflow.keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint

class LSTMHyperModel(HyperModel):
    """KerasTuner hypermodel for a next-word LSTM language model.

    The architecture is Embedding(100-d) -> 1..3 stacked LSTM layers ->
    Dropout -> softmax over ``total_words`` classes. The tuner searches the
    number of LSTM layers, the units (128 or 256) and recurrent dropout of each
    layer, the dropout rate and the Adam learning rate.

    Args:
        input_length: Number of token ids in each input context.
        total_words: Size of the embedding input dimension and of the softmax
            output. It must be larger than the largest token id that appears
            in the inputs and targets.
    """

    def __init__(self, input_length, total_words):
        # Let the HyperModel base class initialise its own attributes before
        # the tuner starts using this instance.
        super().__init__()
        self.input_length = input_length
        self.total_words = total_words

    def build(self, hp):
        """Build and compile one candidate model from the hyperparameters in ``hp``."""
        model = Sequential()
        model.add(Embedding(self.total_words, 100, input_length=self.input_length))

        num_layers = hp.Int("num_layers", 1, 3)
        for i in range(num_layers):
            # Every LSTM except the last must emit the full sequence so the
            # next LSTM layer receives a 3-D input.
            is_last_layer = i == num_layers - 1
            model.add(
                LSTM(
                    hp.Int(f"lstm_units_{i}", 128, 256, step=128),
                    return_sequences=not is_last_layer,
                    recurrent_dropout=hp.Choice(f"recurrent_dropout_{i}", [0.1, 0.2, 0.3]),
                )
            )

        model.add(Dropout(hp.Choice("dropout_rate", [0.1, 0.2, 0.3])))
        model.add(Dense(self.total_words, activation='softmax'))

        # Targets are integer token ids, hence the sparse loss.
        model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            optimizer=tf.keras.optimizers.Adam(hp.Float("learning_rate", 1e-4, 1e-2, sampling="log")),
            metrics=['accuracy']
        )

        return model



# Context length must match the four-token windows built for X_train / X_valid.
input_length = 4
hypermodel = LSTMHyperModel(input_length, total_words)

# Random search over the hypermodel's space, ranked by validation accuracy.
tuner = RandomSearch(
    hypermodel,
    objective='val_accuracy',
    max_trials=5,
    seed=42,
    executions_per_trial=1
)

# Shrink the learning rate by 20% whenever training accuracy stops improving for one epoch.
reduce_lr = ReduceLROnPlateau(monitor='accuracy', factor=0.8, patience=1, min_lr=0.0005, verbose=1)

# A single TensorBoard callback carrying the options of the two callbacks that
# used to be registered (weight histograms every epoch + per-batch updates,
# profiling off). Two callbacks pointed at the same log directory would write
# duplicate event files for every trial.
tensorboard_callback = TensorBoard(
    log_dir=log_dir1,
    histogram_freq=1,
    update_freq='batch',
    profile_batch=0,
)

tuner.search(
    X_train,
    y_train,
    epochs=5,
    batch_size=128,
    validation_data=(X_valid, y_valid),
    callbacks=[reduce_lr, tensorboard_callback],
)

# Persist the best model found by the search to Google Drive.
best_model = tuner.get_best_models()[0]
best_model.save('/content/drive/MyDrive/Colab Notebooks/best_model.h5')

Trial 5 Complete [00h 04m 29s]
val_accuracy: 0.12660345435142517

Best val_accuracy So Far: 0.13071033358573914
Total elapsed time: 00h 20m 00s




In [None]:
# Report the winning hyperparameter set, one "name: value" line per entry.
best_hp = tuner.get_best_hyperparameters()[0]

print("Best hyperparameters:")
layer_count = best_hp.get('num_layers')
print(f"num_layers: {layer_count}")
for layer_idx in range(layer_count):
    # Per-layer settings are registered under index-suffixed names.
    for hp_name in (f"lstm_units_{layer_idx}", f"recurrent_dropout_{layer_idx}"):
        print(f"{hp_name}: {best_hp.get(hp_name)}")
for hp_name in ("dropout_rate", "learning_rate"):
    print(f"{hp_name}: {best_hp.get(hp_name)}")


Best hyperparameters:
num_layers: 2
lstm_units_0: 128
recurrent_dropout_0: 0.2
lstm_units_1: 128
recurrent_dropout_1: 0.1
dropout_rate: 0.1
learning_rate: 0.005977728042983696


In [None]:
# Upload the logs of the lemmatization run to TensorBoard.dev; uploaded experiments are public.
# NOTE(review): the hosted TensorBoard.dev service has reportedly been discontinued —
# confirm this command still works before relying on it.
!tensorboard dev upload --logdir {log_dir1} --name "tokenizerlemmatization" --description "Training results from https://colab.research.google.com" --one_shot



***** TensorBoard Uploader *****

This will upload your TensorBoard logs to https://tensorboard.dev/ from
the following directory:

logs/fit/20230514-195353-tokenizermodel1

This TensorBoard will be visible to everyone. Do not upload sensitive
data.

Your use of this service is subject to Google's Terms of Service
<https://policies.google.com/terms> and Privacy Policy
<https://policies.google.com/privacy>, and TensorBoard.dev's Terms of Service
<https://tensorboard.dev/policy/terms/>.

This notice will not be shown again while you are logged into the uploader.
To log out, run `tensorboard dev auth revoke`.

Continue? (yes/NO) yes

To sign in with the TensorBoard uploader:

1. On your computer or phone, visit:

   https://www.google.com/device

2. Sign in with your Google account, then enter:

   TCCC-VKVW



New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/iLNwM4f2SL6iZqb8LAakzQ/

[1m[2023-05-14T20:14:45][0m Started scanning logdir.
E0514 20:14:48

In [None]:
import tensorflow as tf

# Drop the Keras state left over from the lemmatization experiment before starting the stemming one.
tf.keras.backend.clear_session()


**Results using the Keras tokenizer with stemming are shown below**

In [None]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  # Best effort: a failure of the magic is deliberately ignored.
  pass

# Enable the TensorBoard notebook extension, which provides the %tensorboard magic.
%load_ext tensorboard

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
import os
import tensorflow as tf
from tensorboard.plugins import projector


In [None]:
# Encode the stemmed corpus with its OWN tokenizer. `tokenizer` has already been
# fitted on the lemmatized text and fit_on_texts accumulates, so re-using it here
# would produce ids from a merged vocabulary that does not match the vocab_size /
# unique_words computed below from a fresh tokenizer.
stemming_tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
text_stemming_tokenizer = preprocessing(text, lemmatize=False, stemming=True, tokenizer=stemming_tokenizer)  #This is the tokenized stemming text


In [None]:
# Sanity check: smallest token id in the stemmed encoding.
minimum_value = min(text_stemming_tokenizer)


In [None]:
# Show the smallest token id computed in the previous cell.
print(minimum_value)

1


In [None]:
# Vocabulary size of the stemmed corpus. No tokenizer is passed, so preprocessing()
# creates and fits a fresh one for this call.
vocab_size=preprocessing(text, lemmatize=False, stemming=True, return_type='vocab_size')
print(vocab_size)

8060


In [None]:
# Vocabulary of the stemmed corpus as a list (keys of a freshly fitted tokenizer's word_index).
unique_words=preprocessing(text, lemmatize=False, stemming=True, return_type='unique_words')


In [None]:
# Number of vocabulary entries; expected to equal vocab_size.
len(unique_words)

8060

In [None]:
# Number of tokens in the stemmed encoding.
len(text_stemming_tokenizer)

98615

In [None]:
# Slide a window over the stemmed encoding: every run of four consecutive
# token ids is a context, and the id right after it is the prediction target.
prev_words = [
    text_stemming_tokenizer[start:start + 4]
    for start in range(len(text_stemming_tokenizer) - 4)
]
next_words = [
    text_stemming_tokenizer[start + 4]
    for start in range(len(text_stemming_tokenizer) - 4)
]

# Materialise both lists as numpy arrays, the format fed to the LSTM model.
X_tokenizer = np.array(prev_words)
y_tokenizer = np.array(next_words)

In [None]:
from sklearn.model_selection import train_test_split

# Split the stemmed data into training and validation sets (80/20, fixed random_state).
# This rebinds X_train / X_valid / y_train / y_valid from the lemmatization experiment.
X_train, X_valid, y_train, y_valid = train_test_split(X_tokenizer, y_tokenizer, test_size=0.2, random_state=42)

In [None]:
# Smallest target id in each split, computed with numpy's own reduction.
print('Min in y_train:', y_train.min())
print('Min in y_valid:', y_valid.min())


Min in y_train: 1
Min in y_valid: 1


In [None]:
# Shape of the stemmed training contexts: (number of samples, window length).
X_train.shape

(78888, 4)

In [None]:
# One target token id per stemmed training context.
y_train.shape

(78888,)

In [None]:
# Timestamped TensorBoard log directory for the stemming experiment.
log_dir2 = f"logs/fit/{datetime.datetime.now():%Y%m%d-%H%M%S}-tokenizermodel2"
os.makedirs(log_dir2, exist_ok=True)

In [None]:
from keras.callbacks import ModelCheckpoint
from keras.layers import Input

In [None]:
from tensorboard.plugins import projector

In [None]:
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense


# Keep only the weights of the epoch with the best validation accuracy.
filepath='/content/drive/MyDrive/Colab Notebooks/best_model_tokenizer_stemming.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# Hyperparameters copied from the best trial of the random search above.
num_layers = 2
lstm_units_0 = 128
recurrent_dropout_0 = 0.2
lstm_units_1 = 128
recurrent_dropout_1 = 0.1
dropout_rate = 0.1
learning_rate = 0.005977728042983696

# Keras Tokenizer ids start at 1, so the largest id can be as large as the
# vocabulary size (or larger if the data was encoded with a bigger vocabulary).
# Both the embedding table and the softmax need one slot per id in
# 0..max_id, otherwise lookups and sparse labels fall out of range.
num_token_ids = int(max(vocab_size, X_tokenizer.max(), y_tokenizer.max())) + 1

model_lstm = Sequential()
model_lstm.add(Embedding(num_token_ids, 100, input_length=4))
model_lstm.add(LSTM(lstm_units_0, return_sequences=True, recurrent_dropout=recurrent_dropout_0))
model_lstm.add(LSTM(lstm_units_1, recurrent_dropout=recurrent_dropout_1))
model_lstm.add(Dropout(dropout_rate))
model_lstm.add(Dense(num_token_ids, activation='softmax'))

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

tensorboard_callback = TensorBoard(log_dir=log_dir2, histogram_freq=1,
    write_graph=True,
    write_images=True,
    update_freq='epoch',
)

# fit the model
# Validate on the held-out split created by train_test_split (as the tuner
# search does) instead of carving a second validation set out of X_train.
model_lstm.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=128,
    validation_data=(X_valid, y_valid),
    callbacks=[tensorboard_callback, checkpoint],
)







Epoch 1/5
  8/494 [..............................] - ETA: 1:47 - loss: 8.8224 - accuracy: 0.0547

In [None]:
# Weight matrix of the first layer (the Embedding layer): one row per token id.
embedweights=model_lstm.layers[0].get_weights()[0]
embedweights.shape

In [None]:
#the following code is adapted from https://www.tensorflow.org/text/guide/word_embeddings#retrieve_the_trained_word_embeddings_and_save_them_to_disk
import io

vectors_path = "/content/drive/MyDrive/Colab Notebooks/embedding_vectors.tsv"
metadata_path = "/content/drive/MyDrive/Colab Notebooks/embedding_metadata.tsv"

# `with` closes both files even if writing fails part-way through.
with open(vectors_path, "w", encoding="utf-8") as out_v, \
        open(metadata_path, "w", encoding="utf-8") as out_m:
    # Add embedding vectors and words to file.
    # unique_words lists the tokenizer's word_index keys in id order, and Keras
    # Tokenizer ids start at 1 ('<OOV>' is id 1), so the word at list position k
    # owns embedding row k + 1 — enumerate from 1 to keep words and vectors aligned.
    for token_id, word in enumerate(unique_words, start=1):
        if token_id == 1:
            continue  # skip OOV
        if token_id >= embedweights.shape[0]:
            break  # the embedding matrix has no row for this id
        vec = embedweights[token_id]
        out_m.write(word + "\n")  # add the words to file
        out_v.write("\t".join([str(x) for x in vec]) + "\n")  # add the corresponding word vector to file

In [None]:
# Open TensorBoard inline on the stemming run's log directory.
%tensorboard --logdir {log_dir2}