In [1]:
# Run configuration:
#   restart         - True to start training from scratch, False to resume a previous run.
#   epoch_to_pickup - training epoch to resume from (0 when starting fresh).
restart = True
epoch_to_pickup = 0

In [2]:
# This cell imports essential libraries for text preprocessing, machine learning, and resource management.
# It includes TensorFlow for model building, text processing tools (e.g., StringLookup, TextVectorization),
# and utilities for file operations, randomization, and memory management
# Import libraries

from tensorflow.keras.layers import StringLookup # Converts strings to numeric IDs
import numpy as np # Provides numerical operations for arrays and matrices
import os # Enables file and directory manipulation
import time # Measures execution time and introduces delays
import random # Generates random numbers for sampling or shuffling
import contextlib #Simplifies resource management
import io # Handles input and output streams in memory
import re # Provides regular expressions for text cleaning and preprocessing
import string # Offers string constants and utilities like removing punctuation
import gc  # Import the garbage collector module

import tensorflow as tf # Core TensorFlow library for ML and DL
from tensorflow.keras import Sequential # Simplifies model creation layer by layer
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D # (dense) Fully connected layer for odel outputs, (Embedding) Converts tokens to dense vector representations, ( Global averagepooling1D) Averages embeddings over a sequence
from tensorflow.keras.layers import TextVectorization # Tokenizes and preprocesses text for ML models

In [10]:
def preprocess_text(text):
    """
    Cleans raw text for training and marks whitespace with placeholder tokens.

    Newlines, tabs and double spaces are replaced by the space-padded tokens
    ``zznewlinezz``, ``zztabzz`` and ``zzspacezz`` respectively. These are the
    same tokens that postprocess_text() converts back into real characters.
    The text is then lowercased and every character that is neither a word
    character nor whitespace is removed.

    Args:
        text (str): The raw text to preprocess.
    Returns:
        str: Preprocessed and cleaned text.
    """
    # Mark double spaces first, while the text still contains only its original
    # spacing; the padded placeholders inserted below must not be re-matched.
    text = text.replace("  ", " zzspacezz ")
    # Use a distinct placeholder per whitespace kind so it can be restored later.
    text = text.replace("\n", " zznewlinezz ")
    text = text.replace("\t", " zztabzz ")
    text = text.lower()  # Placeholders are already lowercase, so they are unaffected

    # Remove punctuation; placeholders consist only of word characters and survive
    text = re.sub(r'[^\w\s]', '', text)

    return text


In [11]:
# This cell defines a function getMyText() that downloads, saves, and processes a text file.
# It first checks if the file exists locally; if not, it downloads the file from a given URL, saves it to a specified directory, and reads its content.
# Finally, it preprocesses and returns the text.



def getMyText():
    """
    Returns the preprocessed Jane Austen corpus, downloading it on first use.

    The file is cached at ``saved_files/austen.txt``. When the cached file is
    missing it is downloaded with ``tf.keras.utils.get_file`` and copied into
    the cache directory.

    Returns:
        str | None: The text after preprocess_text(), or None if any step
        failed (the error is printed).
    """
    file_name = 'austen.txt'  # Name of the text file
    file_url = 'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/austen/austen.txt'  # URL of the file
    local_dir = 'saved_files'  # Directory to save the file
    local_path = os.path.join(local_dir, file_name)  # Full path to the local file

    try:
        # Create the cache directory if needed (no error when it already exists)
        os.makedirs(local_dir, exist_ok=True)

        if os.path.exists(local_path):
            print(f"File '{file_name}' found locally. Using it.")
        else:
            print(f"File '{file_name}' not found locally. Downloading it.")
            downloaded_path = tf.keras.utils.get_file(file_name, file_url)

            # Copy to a temporary name and rename afterwards, so an interrupted
            # copy can never leave a truncated file at local_path that a later
            # run would mistake for a complete download.
            partial_path = local_path + '.part'
            with open(downloaded_path, 'rb') as source_file:
                with open(partial_path, 'wb') as dest_file:
                    dest_file.write(source_file.read())
            os.replace(partial_path, local_path)

        # Read the cached file and decode it as UTF-8
        with open(local_path, 'rb') as file:
            text = file.read().decode(encoding='utf-8')

        return preprocess_text(text)

    except Exception as e:
        # Best-effort: report the problem and signal failure with None
        print(f"An error occurred: {e}")
        return None


In [12]:
# Report how many GPU devices TensorFlow detects.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [13]:
# Prefix prepended to artifact file names (e.g. path + "vocabulary.txt"); empty means the working directory.
path = ''

In [14]:
# Optional setup for working with google drive
# Provides an alternative workflow for users who prefer to work with google drive for persistent storage in colab


# from google.colab import drive
# drive.mount('/content/drive')
# path = '/content/drive/My Drive/M6_Fall2023e/'

In [21]:
# ===============================
# DEFINE FUNCTIONS FOR DATASET CONVERSION
# ===============================
def split_input_target(sequence):
    """Return (input, target): the sequence without its last element, and without its first."""
    return sequence[:-1], sequence[1:]

def text_to_dataset(text):
    """
    Turn preprocessed text into a dataset of (input, target) token-ID pairs.

    The text is tokenized with the module-level vectorize_layer, the IDs are
    grouped into chunks of sequence_length + 1 (an incomplete final chunk is
    dropped), and each chunk is split by split_input_target().
    """
    token_ids = vectorize_layer(text)
    chunks = tf.data.Dataset.from_tensor_slices(token_ids).batch(
        sequence_length + 1, drop_remainder=True
    )
    return chunks.map(split_input_target)


In [22]:
# ===============================
# CONVERT PROCESSED TEXT TO DATASET
# ===============================

# NOTE(review): this cell reads vocab_text (and, through text_to_dataset, vectorize_layer and
# sequence_length), all of which are assigned in cells further down the notebook. Run from a
# fresh kernel in file order it raises the NameError recorded below. The same conversion is
# performed again in the "Test on vocab text" section.
if restart:  # Only recreate the dataset if restarting
    # Convert Jane Austen's preprocessed text to a training dataset
    vocab_ds = text_to_dataset(vocab_text)

    # OPTIONAL: If you have other authors, convert them similarly
    # Example:
    # author2_ds = text_to_dataset(author_texts["Mark Twain"])


NameError: name 'vocab_text' is not defined

## Functions for downloading text


In [15]:
# Postprocess_text function restores preprocessed text to a human readable format by replacing placeholder tokens (tabs, newlines, spaces)
# with their actual characters, correcting capitalization at the start of words, and removing unwanted symbols like "^".
# It returns cleaned text
# This function purpose - This function reverses some of the changes made during text preprocession, restoring the text to a more human readable format by replacing placeholder tokens and fixing capitalization


def postprocess_text(text):
    """
    Convert generated/preprocessed text back into a readable form.

    Placeholder tokens become the characters they stand for, a lowercase
    letter directly after a caret is uppercased, and any carets left over
    are dropped.
    """
    # Placeholder token -> the character it represents
    placeholder_to_char = (
        ("zztabzz", "\t"),
        ("zznewlinezz", "\n"),
        ("zzspacezz", " "),
    )
    for placeholder, char in placeholder_to_char:
        text = text.replace(placeholder, char)

    def _capitalize(match):
        # The caret is consumed; only the uppercased letter is kept
        return match.group(1).upper()

    capitalized = re.sub(r"\^([a-z])", _capitalize, text)

    # Strip carets that were not followed by a lowercase letter
    return capitalized.replace("^", "")

In [16]:
# This code defines the getMyText function, which downloads Jane Austen's text file if not already presaent locally, saves it to a directory, and reads its contents.
# The text is then passed to the preprocess_text function for cleaning and preprocessing before being returned
# def getMyText():
#   path_to_file = tf.keras.utils.get_file('austen.txt', 'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/austen/austen.txt')

#   text = open(path_to_file, 'rb').read().decode(encoding='utf-8')

#   # path_to_file = tf.keras.utils.get_file('903-0.txt', 'https://www.gutenberg.org/files/903/903-0.txt')
#   # author_text += open(path_to_file, 'rb').read().decode(encoding='utf-8')[2999:-19194]
#   # tf.io.gfile.remove(path_to_file)

#   return preprocess_text(text)

import os
import tensorflow as tf

def getMyText():
    """
    Returns the preprocessed Jane Austen corpus, downloading it on first use.

    The file is cached at ``saved_files/austen.txt``. When the cached file is
    missing it is downloaded with ``tf.keras.utils.get_file`` and copied into
    the cache directory.

    Returns:
        str | None: The text after preprocess_text(), or None if any step
        failed (the error is printed).
    """
    file_name = 'austen.txt'
    file_url = 'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/austen/austen.txt'
    local_dir = 'saved_files'  # Directory to save the file
    local_path = os.path.join(local_dir, file_name)

    try:
        # Create the cache directory if needed (no error when it already exists)
        os.makedirs(local_dir, exist_ok=True)

        if os.path.exists(local_path):
            print(f"File '{file_name}' found locally. Using it.")
        else:
            print(f"File '{file_name}' not found locally. Downloading it.")
            downloaded_path = tf.keras.utils.get_file(file_name, file_url)

            # Copy under a temporary name, then rename: a copy that is cut short
            # must not leave a truncated file at local_path, because the next
            # run would report it as "found locally" and use it.
            partial_path = local_path + '.part'
            with open(downloaded_path, 'rb') as source_file:
                with open(partial_path, 'wb') as dest_file:
                    dest_file.write(source_file.read())
            os.replace(partial_path, local_path)

        # Read the cached file and decode it as UTF-8
        with open(local_path, 'rb') as file:
            text = file.read().decode(encoding='utf-8')

        return preprocess_text(text)

    except Exception as e:
        # Best-effort: report the problem and signal failure with None
        print(f"An error occurred: {e}")
        return None



In [17]:
# Smoke test for getMyText(): it reads the text from the local directory, or downloads it
# if it is not there yet, and returns it preprocessed (with placeholder tokens such as zzspacezz).
# Only the first 500 characters are displayed so the cell output stays small; an empty
# string is shown if getMyText() returned None.
(getMyText() or "")[:500]

File 'austen.txt' found locally. Using it.




In [18]:
# The getRandomText function downloads and processes a random number of English books from Project Gutenberg.
# It randomly selects book IDs, attempts to retrieve their text, and validates that the text is in English and of sufficient length.
# If valid, it extracts and appends a portion of the book to a dataset.  Finally, it preprocesses the combined text and returns it.


def getRandomText(numbooks = 1, verbose=False):
  """
  Collect excerpts from randomly chosen English Project Gutenberg books.

  For each requested book, random book IDs are tried until one downloads,
  decodes as UTF-8, contains the marker 'Language: English' and is long
  enough. An excerpt of the book is appended to the result.

  Args:
    numbooks (int): Number of books to collect excerpts from.
    verbose (bool): Print progress; when False, stdout produced during the
      download is captured instead of shown.

  Returns:
    str: The concatenated excerpts after preprocess_text().

  Note:
    The search retries indefinitely, so without network access it does not
    terminate on its own; it can be interrupted from the keyboard.
  """
  # Excerpt geometry, in characters
  header = 2000          # skipped at the start of every book
  total_length = 200000  # excerpt length taken from long books
  chopoffend = 10000     # kept clear at the end of every book

  download_log = io.StringIO()
  text_random = ''
  for b in range(numbooks):
    foundbook = False
    while not foundbook:
      booknum = random.randint(100,60000)
      if verbose:
        print('Trying Book #: ',booknum)
      # Gutenberg serves plain text under two URL layouts; pick one at random
      if random.random() > 0.5:
        url = 'https://www.gutenberg.org/files/' + str(booknum) + '/' + str(booknum) + '-0.txt'
        filename_temp = str(booknum) + '-0.txt'
      else:
        url = 'https://www.gutenberg.org/cache/epub/' + str(booknum) + '/pg' + str(booknum) + '.txt'
        filename_temp = 'pg' + str(booknum) + '.txt'
      if verbose:
        print('Trying: ', url)
      try:
        if verbose:
          path_to_file_temp = tf.keras.utils.get_file(filename_temp, url)
        else:
          with contextlib.redirect_stdout(download_log):
            path_to_file_temp = tf.keras.utils.get_file(filename_temp, url)
        # Close the handle and delete the temporary download even when
        # decoding fails.
        try:
          with open(path_to_file_temp, 'rb') as book_file:
            temptext = book_file.read().decode(encoding='utf-8')
        finally:
          tf.io.gfile.remove(path_to_file_temp)

        if temptext.find('Language: English') >= 0:
          offset = random.randint(-20,20)
          if len(temptext) > (header+total_length+offset+chopoffend):
            # Long book: take a fixed-size window after the header
            foundbook = True
            text_random += temptext[header+offset:header+total_length+offset]
            if verbose:
              print('New size of dataset: ', len(text_random))
          elif len(temptext) > (header+12000):
            # Shorter book: take everything between header and tail
            foundbook = True
            text_random += temptext[header:-chopoffend]
            if verbose:
              print('New size of dataset: ', len(text_random))
          else:
            if verbose:
              print('Not long enough. Trying again...')
        else:
          if verbose:
            print('Not English. Trying again...')
      except Exception:
        # Download/decoding problems mean "try another book". Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # propagate, so the retry loop can be stopped.
        if verbose:
          print('Not valid file. Trying again...')
        foundbook = False
    if verbose:
      print("Found " + str(b+1) + " books so far...")
  return preprocess_text(text_random)

In [None]:
# ===============================
# DATA PREPARATION FOR MULTIPLE AUTHORS
# ===============================

# Define a function to load and preprocess text for multiple authors
def get_author_texts(author_urls):
    """
    Downloads and preprocesses text datasets for multiple authors from Project Gutenberg.

    Each URL is downloaded and decoded as UTF-8. A work is kept only if it
    contains the marker "Language: English" and is longer than 20000
    characters. Kept works of one author are concatenated and passed through
    preprocess_text(). A URL that fails is reported and skipped.

    Args:
        author_urls (dict): A dictionary where keys are author names and values are lists of URLs to their works.

    Returns:
        dict: A dictionary where keys are author names and values are preprocessed text datasets
        (the preprocessed empty string if no work of that author was kept).
    """
    author_texts = {}  # To store preprocessed text for each author

    for author, urls in author_urls.items():
        print(f"Processing texts for author: {author}")
        full_text = ""  # Combined text of all accepted works

        for url in urls:
            try:
                print(f"Downloading: {url}")
                filename_temp = url.split("/")[-1]
                path_to_file_temp = tf.keras.utils.get_file(filename_temp, url)

                # Close the handle and remove the temporary download even
                # when decoding fails.
                try:
                    with open(path_to_file_temp, 'rb') as book_file:
                        temp_text = book_file.read().decode(encoding='utf-8')
                finally:
                    tf.io.gfile.remove(path_to_file_temp)

                # Check for English language and sufficient length
                if "Language: English" in temp_text and len(temp_text) > 20000:
                    full_text += temp_text
                else:
                    print(f"Skipping non-English or short text: {url}")

            except Exception as e:
                # Best-effort per URL: report and continue with the next one
                print(f"Failed to process {url}: {e}")

        # Preprocess the combined text and store it
        author_texts[author] = preprocess_text(full_text)

    return author_texts

# My notes: get_author_texts(): this downloads, decodes, and preprocesses text files for multiple authors using their respective URLs from Project Gutenberg.
# It then combines multiple works for a single author into one text dataset.
# Then it uses my existing preprocess_text function to clean the data.
# Input is the author_urls that is a dictionary mapping author names to lists of URLs for their works. Output is a dictionary author_texts) mapping author names to their cleaned and preprocessed datasets.


# Input for get_author_texts(): maps each author name to a list of
# Project Gutenberg plain-text URLs of that author's works.
author_urls = {
    "Jane Austen": [
        "https://www.gutenberg.org/files/1342/1342-0.txt",  # Pride and Prejudice
        "https://www.gutenberg.org/files/158/158-0.txt"     # Emma
    ],
    "Mark Twain": [
        "https://www.gutenberg.org/files/76/76-0.txt",      # Adventures of Huckleberry Finn
        "https://www.gutenberg.org/files/74/74-0.txt"      # The Adventures of Tom Sawyer
    ],
    "Mary Shelley": [
        "https://www.gutenberg.org/files/84/84-0.txt"       # Frankenstein
    ]
}



In [None]:
# Uncomment below to print the first 1000 characters of each author's text
for author, text in author_texts.items():
    print(f"\n=== Sample Text for {author} ===\n")
    print(text[:1000])  # Print the preprocessed text

In [None]:
# Load and preprocess texts for each author
author_texts = get_author_texts(author_urls)

In [None]:
# When starting from scratch, load and preprocess the Jane Austen text into vocab_text.
if restart:
  vocab_text = getMyText()
  # getMyText() signals failure by returning None. Stop here with a clear message
  # rather than failing later inside the vectorization cells.
  if vocab_text is None:
    raise RuntimeError("getMyText() could not load the training text; see the error printed above.")

File 'austen.txt' found locally. Using it.


Make vocabulary (Adapted from TensorFlow word embedding tutorial)

---



In [None]:
# vocab_size: maximum number of tokens kept by the TextVectorization layer (its max_tokens).
# sequence_length: number of tokens in each training input sequence
# (the dataset is cut into chunks of sequence_length + 1 tokens).
vocab_size = 8192
sequence_length = 128

In [None]:
# When starting from scratch, create the TextVectorization layer: it lowercases the text,
# splits it on whitespace and maps each token to an integer ID, keeping at most vocab_size tokens.
if restart:
  # Only the built-in 'lower' standardization is used (no custom standardization function).
  # output_sequence_length is deliberately left unset (see the commented-out argument),
  # so no fixed output length is imposed here.
  vectorize_layer = TextVectorization(
      standardize='lower',
      split='whitespace',
      max_tokens=vocab_size,
      output_mode='int',
      #output_sequence_length=sequence_length
      )

In [None]:
# When starting from scratch, build the layer's vocabulary (token -> integer ID) from vocab_text.
if restart:
  # adapt() is given a one-element list holding the whole corpus string (no labels).
  vectorize_layer.adapt([vocab_text])

In [None]:
# When starting from scratch, read the vocabulary built by the TextVectorization layer
# into the list `vocabulary` for later use (saving, ID -> text conversion).
if restart:
  vocabulary = vectorize_layer.get_vocabulary()

Save Vocabulary

In [None]:
# When starting from scratch, save the vocabulary to path + "vocabulary.txt", one token per line.
# NOTE(review): no encoding is passed to open(), so the platform default is used; the loading
# cell must read the file with the same encoding.
if restart:
  with open(path + "vocabulary.txt", "w") as file:
    for word in vocabulary:
        file.write(word + "\n")  # one token per line

Load Saved Vocabulary

In [None]:
# When resuming (restart is False), load the vocabulary saved in vocabulary.txt and build a
# TextVectorization layer from it, using the same standardize/split/max_tokens/output_mode
# settings as the layer created for a fresh run.
if restart == False:  # Resuming a previous run
  with open(path + "vocabulary.txt", "r") as file:
      # One token per line; strip() removes the newline (and any other surrounding whitespace)
      vocabulary = [word.strip() for word in file.readlines()]
      vocabulary = vocabulary  # No-op self-assignment

  vectorize_layer = TextVectorization(
      vocabulary=vocabulary,  # Reuse the saved token -> ID mapping instead of calling adapt()
      standardize='lower',
      split='whitespace',
      max_tokens=vocab_size,
      output_mode='int',
      #output_sequence_length=sequence_length
      )

In [None]:
# Sanity check: show the first 20 and the last 20 entries of the vocabulary list.
print(vocabulary[:20])   # start of the list
print(vocabulary[-20:])  # end of the list

['', '[UNK]', 'zzspacezz', '^', ',', '.', 'the', 'to', 'and', 'of', '"', 'a', 'her', '-', 'i', 'was', 'in', 'it', 'she', ';']
['especial', 'equivocal', 'equity', 'epsom', 'epicurism', 'enumeration', 'enrich', 'enraged', 'enormity', 'engravings', 'enforcing', 'enforce', 'enemies', 'encroach', 'enclosure', 'emigrant', 'elucidation', 'eloped', 'ellison', 'eligibly']


Turn text into a dataset

In [None]:
# The split_input_target function splits a sequence into an input (all but the last element)
# and a target (all but the first element).
# The text_to_dataset function tokenizes text, groups the token IDs into fixed-length sequences,
# maps them into input-target pairs, and returns the resulting dataset (it is not shuffled here).

# This function will generate our sequence pairs for training
def split_input_target(sequence):
    """Return (input, target): the sequence without its last element, and without its first."""
    return sequence[:-1], sequence[1:]

# This function will create the dataset ready for training
def text_to_dataset(text):
  """
  Build a dataset of (input, target) token-ID pairs from preprocessed text.

  Steps: tokenize with the module-level vectorize_layer, slice the IDs into a
  dataset, group them into chunks of sequence_length + 1 (dropping an
  incomplete final chunk), and split every chunk with split_input_target().
  No shuffling or batching of pairs is done here.
  """
  token_ids = vectorize_layer(text)
  id_stream = tf.data.Dataset.from_tensor_slices(token_ids)
  # One extra token per chunk so that input and target both have sequence_length items
  chunks = id_stream.batch(sequence_length+1, drop_remainder=True)
  return chunks.map(split_input_target)

Test on vocab text

In [None]:
# When starting from scratch, convert vocab_text into a dataset of input-target pairs
# with text_to_dataset().
if restart:
  vocab_ds = text_to_dataset(vocab_text)

In [None]:
# This defines text_from_ids, which converts a list of token IDs into readable text by mapping IDs to words using the vocabulary and then postprocessing the text for proper formatting.  It adjusts the vocabulary by setting [UNK] for unknown tokens and an empty string for padding.  A StringLookup layer (words_from_ids) is created to invert the mapping, allowing token IDS to be converted back into words.
# Define a function to convert a list of token IDs back into text
def text_from_ids(ids):
  """Look up each ID in `vocabulary`, concatenate the tokens without a separator and postprocess the result."""
  tokens = []
  for token_id in ids:
    tokens.append(vocabulary[token_id])
  return postprocess_text(''.join(tokens))

# NOTE: this binds a second name to the SAME list; it is not a copy. The two assignments
# below therefore also change `vocabulary` itself (and what text_from_ids looks up).
# NOTE(review): later code may rely on this mutation — confirm before replacing it with a copy.
vocabulary_adjusted = vocabulary
vocabulary_adjusted[0] = '[UNK]' # Entry 0 becomes the unknown-token marker
vocabulary_adjusted[1] = '' # Entry 1 becomes the empty string

# Inverse lookup layer: converts token IDs back to their string tokens using the adjusted vocabulary.
words_from_ids = tf.keras.layers.StringLookup(vocabulary=vocabulary_adjusted, invert=True)

In [1]:
# When starting from scratch, take one input-target pair from vocab_ds and show it
# as raw token IDs and as decoded text.
if restart:
  for input_example, target_example in vocab_ds.take(1):  # a single pair
    print("Input: ")
    print(input_example)  # raw token IDs of the input sequence
    print(text_from_ids(input_example))  # input decoded to postprocessed text
    print(words_from_ids(input_example))  # input mapped ID by ID to string tokens
    print("Target: ")
    print(target_example)  # raw token IDs of the target sequence
    print(text_from_ids(target_example))  # target decoded to postprocessed text

NameError: name 'restart' is not defined

In [None]:
# Parameters used by setup_dataset():
BATCH_SIZE = 64      # number of (input, target) pairs per batch
BUFFER_SIZE = 10000  # buffer size passed to Dataset.shuffle()

def setup_dataset(dataset):
  """Shuffle the dataset with BUFFER_SIZE, group it into full batches of BATCH_SIZE and add prefetching."""
  shuffled = dataset.shuffle(BUFFER_SIZE)
  # Incomplete final batch is dropped so every batch has exactly BATCH_SIZE items
  batched = shuffled.batch(BATCH_SIZE, drop_remainder=True)
  return batched.prefetch(tf.data.experimental.AUTOTUNE)


In [None]:
# When starting from scratch, shuffle/batch/prefetch the Austen dataset.
# Note: vocab_ds is rebound to the batched dataset, so this cell should run once per kernel session.
if restart:
  vocab_ds = setup_dataset(vocab_ds)

## III. Build the model

Next, we'll build our model. Up until this point, you've been using the Keras symbolic, or imperative API for creating your models. Doing something like:

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(80, activation='relu'))
    etc...

However, tensorflow has another way to build models called the Functional API, which gives us a lot more control over what happens inside the model. You can read more about [the differences and when to use each here](https://blog.tensorflow.org/2019/01/what-are-symbolic-and-imperative-apis.html).

We'll use the functional API for our RNN in this example. This will involve defining our model as a custom subclass of `tf.keras.Model`.

If you're not familiar with classes in python, you might want to review [this quick tutorial](https://www.w3schools.com/python/python_classes.asp), as well as [this one on class inheritance](https://www.w3schools.com/python/python_inheritance.asp).

Using a functional model is important for our situation because we're not just training it to predict a single character for a single sequence; as we make predictions with it, we also need it to remember those predictions and use that memory as it makes new predictions.


In [None]:
# Create our custom model. Given a sequence of token IDs, this
# model's job is to predict which token should come next.
class AustenTextModel(tf.keras.Model):
  """
  Next-token model: Embedding -> three stacked LSTM layers -> two ReLU Dense
  layers -> Dense output layer with one unit per vocabulary entry.

  The LSTM states can be passed in and returned, so a caller can carry them
  from one call to the next.
  """

  # This is our class constructor method, it will be executed when
  # we first create an instance of the class
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """
    Args:
      vocab_size: size of the embedding table and of the output layer.
      embedding_dim: embedding width; the hidden Dense layers use
        embedding_dim*64 and embedding_dim*16 units.
      rnn_units: number of units in each LSTM layer.
    """
    super().__init__()

    # 1. An embedding layer that handles the encoding of our vocabulary into
    #    a vector of values suitable for a neural network
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # 2. Three LSTM layers that handle the "memory" aspects of our RNN. Each
    #    returns the full output sequence plus its final hidden and cell state.
    #    For GRU vs. LSTM, see:
    #    https://datascience.stackexchange.com/questions/14581/when-to-use-gru-over-lstm
    self.lstm1 = tf.keras.layers.LSTM(rnn_units, return_sequences=True, return_state=True)
    self.lstm2 = tf.keras.layers.LSTM(rnn_units, return_sequences=True, return_state=True)
    self.lstm3 = tf.keras.layers.LSTM(rnn_units, return_sequences=True, return_state=True)

    # 3. Two fully connected ReLU layers between the LSTM stack and the output
    self.hidden1 = tf.keras.layers.Dense(embedding_dim*64, activation='relu')
    self.hidden2 = tf.keras.layers.Dense(embedding_dim*16, activation='relu')

    # 4. Our output layer: one value per vocabulary entry. No activation is
    #    applied, so these are raw scores rather than probabilities.
    self.dense = tf.keras.layers.Dense(vocab_size)

  # This function is executed on every forward pass through the model. Here
  # we manually feed information from one layer of our network to the next.
  def call(self, inputs, states=None, return_state=False, training=False):
    """
    Args:
      inputs: batch of token-ID sequences; the first dimension is the batch size.
      states: None, or a list of three [hidden_state, cell_state] pairs, one
        per LSTM layer, used as initial states.
      return_state: when True, also return the LSTM states after this call.
      training: forwarded to every layer.

    Returns:
      The output-layer values, or (values, states_out) when return_state is
      True, where states_out has the same structure as `states`.
    """
    x = inputs

    # 1. Feed the inputs into the embedding layer, and tell it if we are
    #    training or predicting
    x = self.embedding(x, training=training)

    # 2. If no states were passed in, start every LSTM layer from all-zero
    #    hidden and cell states sized for the current batch.
    batch_size = tf.shape(inputs)[0]

    if states is None:
      states1 = [tf.zeros([batch_size, self.lstm1.units]), tf.zeros([batch_size, self.lstm1.units])]
      states2 = [tf.zeros([batch_size, self.lstm2.units]), tf.zeros([batch_size, self.lstm2.units])]
      states3 = [tf.zeros([batch_size, self.lstm3.units]), tf.zeros([batch_size, self.lstm3.units])]
    else:
      states1 = states[0]
      states2 = states[1]
      states3 = states[2]
    # 3. Run the LSTM stack; each layer receives the previous layer's output
    #    sequence and its own initial state, and reports its final state.
    x, state_h_1, state_c_1 = self.lstm1(x, initial_state=states1, training=training)
    states_out_1 = [state_h_1,state_c_1]

    x, state_h_2, state_c_2 = self.lstm2(x, initial_state=states2, training=training)
    states_out_2 = [state_h_2,state_c_2]

    x, state_h_3, state_c_3 = self.lstm3(x, initial_state=states3, training=training)
    states_out_3 = [state_h_3,state_c_3]

    # Same structure as the `states` argument, so it can be fed back in
    states_out = [states_out_1, states_out_2, states_out_3]

    x = self.hidden1(x,training=training)
    x = self.hidden2(x,training=training)
    # 4. Finally, pass the results on to the output layer
    x = self.dense(x, training=training)

    # 5. Return the results
    if return_state:
      return x, states_out
    else:
      return x

In [None]:
# Choose the dataset for this run.
if restart:
  # Fresh run: use the Austen dataset prepared above.
  dataset = vocab_ds
  # NOTE: these names are removed from the namespace. Re-running this cell, or any earlier
  # cell that reads vocab_text / vocab_ds, raises NameError until they are recreated.
  del vocab_text
  del vocab_ds
else:
  # Resumed run: build a new dataset from 10 random Project Gutenberg books.
  new_text = getRandomText(numbooks = 10)
  dataset = text_to_dataset(new_text)
  del new_text
  dataset = setup_dataset(dataset)

In [None]:
# Create an instance of our model.
# vocab_size is the value defined earlier for the TextVectorization layer.
embedding_dim = 128  # width of the embedding vectors
rnn_units = 512      # units in each LSTM layer

model = AustenTextModel(vocab_size, embedding_dim, rnn_units)

In [None]:
# Verify the output of our model is correct by running one sample batch through.
# NOTE: this call does not "compile" the model (model.compile() is only called
# later, in the training cell). Presumably this first forward pass is what
# builds the layers' weights so that model.summary() can report them — confirm.
# This step will take a bit.
for input_example_batch, target_example_batch in dataset.take(1):
    # Only the inputs are fed to the model here; the targets are unused.
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 128, 8192) # (batch_size, sequence_length, vocab_size)


In [None]:
# Now let's view the model summary (run after the model has been called once
# in the previous cell).
model.summary()

In [None]:
# Here's the code we'll use to sample for us. It has some extra steps to apply
# the temperature to the distribution, and to make sure we never generate the
# empty ('') or unknown ('[UNK]') token. It also returns the model state after
# each step so the caller can feed it back in on the next step.

class OneStep(tf.keras.Model):
  """Single-step text sampler wrapped around a trained model.

  Each call to `generate_one_step` vectorizes the input strings, runs the
  wrapped model, and samples one next token per batch row from the
  temperature-scaled logits of the last time step. The tokens '' and '[UNK]'
  are masked out so that they cannot be sampled.
  """

  def __init__(self, model, vectorize_layer, vocabulary, temperature=1):
    """Store the collaborators and precompute the logit mask.

    Args:
      model: Callable accepting `inputs`, `states` and `return_state=True`
        and returning `(logits, states)`.
      vectorize_layer: Callable that maps a batch of strings to token ids.
      vocabulary: Sized iterable of token strings; its length sets the length
        of the prediction mask.
      temperature: Divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.vectorize_layer = vectorize_layer
    self.vocabulary = vocabulary

    # Ids of the tokens that must never be generated, shaped [n, 1] so they
    # can be used directly as sparse indices.
    lookup = StringLookup(vocabulary=list(vocabulary))
    blocked_ids = lookup(['', '[UNK]'])[:, None]

    # Dense vector over the vocabulary: -inf at every blocked id, 0 elsewhere.
    # Adding it to the logits removes the blocked tokens from sampling.
    blocked_mask = tf.SparseTensor(
        indices=blocked_ids,
        values=[-float('inf')] * len(blocked_ids),
        dense_shape=[len(vocabulary)])
    self.prediction_mask = tf.sparse.to_dense(blocked_mask, validate_indices=False)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next token for every string in `inputs`.

    Args:
      inputs: Batch of strings to continue.
      states: State returned by the previous step, or None on the first step.

    Returns:
      A pair `(tokens, states)`: the sampled ids converted by the module-level
      `words_from_ids` helper, and the state returned by the wrapped model.
    """
    # Strings -> token ids.
    token_ids = self.vectorize_layer(inputs)

    # Logits come back as [batch, position, vocabulary].
    logits, states = self.model(inputs=token_ids, states=states,
                                return_state=True)

    # Keep only the final position, then apply the temperature.
    last_logits = logits[:, -1, :]
    scaled_logits = last_logits / self.temperature

    # Rule out '' and '[UNK]' before sampling.
    masked_logits = scaled_logits + self.prediction_mask

    # Draw one id per batch row and drop the trailing sample axis.
    sampled_ids = tf.random.categorical(masked_logits, num_samples=1)
    sampled_ids = tf.squeeze(sampled_ids, axis=-1)

    return words_from_ids(sampled_ids), states


In [None]:
def produce_sample(model, vectorize_layer, vocabulary, temp, epoch, prompt, num_tokens=200):
  """Generate a text sample from `prompt`, print it, and append it to the log.

  Args:
    model: Trained model handed to `OneStep`.
    vectorize_layer: Layer that maps strings to token ids (handed to `OneStep`).
    vocabulary: Token vocabulary (handed to `OneStep`).
    temp: Sampling temperature.
    epoch: Epoch number recorded next to the sample in the log file.
    prompt: Starting text for the sample.
    num_tokens: Number of generation steps to run (defaults to 200, the
      previously hard-coded value).

  Returns:
    None. The sample is printed and appended to `path + 'tree.txt'`.
  """
  # NOTE: a new OneStep (and therefore a new tf.function) is built per call.
  one_step_model = OneStep(model, vectorize_layer, vocabulary, temp)

  # The preprocessed prompt seeds the model; the raw prompt is what appears at
  # the start of the printed text.
  states = None
  next_char = tf.constant([preprocess_text(prompt)])
  result = [tf.constant([prompt])]

  for _ in range(num_tokens):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

  result = tf.strings.join(result)

  # Post-process once and reuse the text for both the console and the log.
  sample_text = postprocess_text(result[0].numpy().decode('utf-8'))
  print(sample_text)

  # Open the log a single time and close it deterministically; the original
  # opened four separate handles and never closed any of them. The text was
  # decoded from UTF-8 and may contain non-ASCII characters, so the encoding
  # is stated explicitly instead of relying on the platform default.
  with open(path + 'tree.txt', 'a', encoding='utf-8') as log_file:
    print('Epoch: ' + str(epoch) + '\n', file=log_file)
    print('Temp: ' + str(temp) + '\n', file=log_file)
    print(sample_text, file=log_file)
    print('\n\n', file=log_file)

## IV. Train the model

For our purposes, we'll be using [categorical cross entropy](https://machinelearningmastery.com/cross-entropy-for-machine-learning/) as our loss function*. Also, our model will be outputting ["logits" rather than normalized probabilities](https://stackoverflow.com/questions/41455101/what-is-the-meaning-of-the-word-logits-in-tensorflow), because we'll be doing further transformations on the output later.


\* Note that since our model deals with integer encoding rather than one-hot encoding, we'll specifically be using [sparse categorical cross entropy](https://stats.stackexchange.com/questions/326065/cross-entropy-vs-sparse-cross-entropy-when-to-use-one-over-the-other).

In [None]:
# sherlock_text = getMyText()

File 'austen.txt' found locally. Using it.


In [None]:
# When resuming (restart is False), restore previously saved weights from disk
# before training continues.
# NOTE(review): the matching model.save_weights(...) call in the training cell
# is commented out, so this file must already exist under `path` — confirm.
if restart == False:
  model.load_weights(path + "lstm_gru_SH_modelweights_fall2023-random_urls.h5")

In [None]:
# Training configuration.
base_learning_rate = 0.002
learning_rate_decay = 0.99   # learning rate for epoch e is base * decay**e
max_attempts_per_epoch = 5   # stop retrying an epoch after this many failures

# The model outputs logits and the targets are integer ids, hence the sparse
# categorical cross entropy with from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
opt = tf.keras.optimizers.Adam(learning_rate=base_learning_rate)
model.compile(optimizer=opt, loss=loss)

num_epochs_total = 5
start_epoch = 0 if restart else epoch_to_pickup

for e in range(start_epoch, num_epochs_total):
  success = False
  attempts = 0
  while not success:
    attempts += 1
    try:
      print("epoch: ", e)
      # Rebuild the dataset from the text on every epoch.
      new_text = getMyText()
      dataset = text_to_dataset(new_text)
      del new_text
      dataset = setup_dataset(dataset)

      model.optimizer.learning_rate.assign(base_learning_rate*(learning_rate_decay**e))
      model.fit(dataset, epochs=1, verbose=1)
      print("finished training...")
      del dataset

      # Checkpointing is currently disabled. Re-enable the line below if runs
      # should be resumable with restart = False.
      #model.save_weights(path + "lstm_gru_SH_modelweights_fall2023-random_urls.h5")

      # NOTE(review): a failure while sampling also sends us to the retry
      # path, which re-runs model.fit for this epoch.
      for temp in [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        produce_sample(model,vectorize_layer,vocabulary, temp, e, 'Emma sat thinking about')
      print("samples produced...")
      gc.collect()
      print("garbage collected...")
      tf.keras.backend.clear_session()
      print("session cleared (to save memory)...")
      success = True
    except Exception as err:
      # Catch Exception rather than using a bare except so that
      # KeyboardInterrupt / SystemExit still stop the notebook, and report
      # what went wrong instead of retrying silently.
      print("epoch ", e, " failed on attempt ", attempts, ": ", repr(err))
      gc.collect()
      tf.keras.backend.clear_session()
      try:
        del dataset
      except NameError:
        print("dataset already deleted")
      if attempts >= max_attempts_per_epoch:
        # A persistent failure would otherwise loop forever; surface it.
        raise
      print("retrying epoch: " , e)



epoch:  0
File 'austen.txt' found locally. Using it.
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 731ms/step - loss: 4.5172
finished training...
Emma sat thinking about such been Mrs. I I could heard her as I I had being the of be a very friend of the great moment, and but I could be be the day of the I will Miss Thomas. The more of be have be much of Miss Thomas, and I Mrs. Rushworth I was the friend of the good of the time of the I wish Mr. Thomas was be be Miss Crawford, I had the Miss Wentworth for the very very man. It 
Emma sat thinking about you must the letter of the first own own own mother; but I had been a people of the thing as Jane’s the deal of the heart to for him. Her to give the own of the great brother, and I could How had be a dear to be I I is not be the life in be much of the father’s of the great mind of the great own difference, and I was have see the marriage of Mr. I I not Fanny to the house; and was sure you
Emma sat thinking about not h