# Sentence Generation


## Imports and Initializations

In [2]:
import nltk
from nltk.corpus import gutenberg
nltk.download('punkt', quiet = True)
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('gutenberg', quiet = True)
from nltk.util import ngrams

In [3]:
# List all the available book names in the Gutenberg corpus
gutenberg_books = gutenberg.fileids()

# Print the list of book names
for book in gutenberg_books:
    print(book)

# Define the list of book titles
book_titles = [
    "austen-emma.txt",
    "austen-sense.txt",
    "bryant-stories.txt",
    "burgess-busterbrown.txt",
    "carroll-alice.txt",
    "chesterton-brown.txt",
    "chesterton-thursday.txt",
    "edgeworth-parents.txt",
    "melville-moby_dick.txt",
    "milton-paradise.txt"
]

austen-emma.txt
austen-persuasion.txt
austen-sense.txt
bible-kjv.txt
blake-poems.txt
bryant-stories.txt
burgess-busterbrown.txt
carroll-alice.txt
chesterton-ball.txt
chesterton-brown.txt
chesterton-thursday.txt
edgeworth-parents.txt
melville-moby_dick.txt
milton-paradise.txt
shakespeare-caesar.txt
shakespeare-hamlet.txt
shakespeare-macbeth.txt
whitman-leaves.txt


## Create Trie Structure

This code defines a Trie data structure and functions to create and search within it. It processes a list of texts, tokenizes them into sentences and words, removes punctuation, adds start and stop tokens, then generates and updates the Trie with n-grams of specified length. Finally, it allows searching for counts of specific n-grams within the created Trie.

In [4]:
class TrieNode:
    def __init__(self):
        self.children = {}
        self.count = 0

def update_trie(node, n_gram):
    current_node = node
    for word in n_gram:
        if word not in current_node.children:
            current_node.children[word] = TrieNode()
        current_node = current_node.children[word]
    current_node.count += 1

def create_trie(text_list, depth):
  # Initialize root node for unigrams
  root_node = TrieNode()

  # Processing each text
  for text in text_list:
      # Tokenize the book into sentences
      sentences = sent_tokenize(text)

      # Process each sentence
      for sentence in sentences:
          # Tokenize the sentence into words
          words = word_tokenize(sentence)

          # Removing punctuation from words
          words = [word.lower() for word in words if word.isalnum()]

          # Add start and stop tokens
          words = ['<s>'] + words + ['</s>']

          # Generate and update trie for each n-gram
          for n in range(1, depth+1):
              n_grams = list(ngrams(words, n))
              for gram in n_grams:
                  update_trie(root_node, gram)

  return root_node

# Helper functions to better understand the structure
def print_trie(node, depth=0, max_depth=3, prefix=[]):
    if depth > max_depth:
        return
    if node.count > 0:
        print(' '.join(prefix), node.count)
    for word, child_node in node.children.items():
        print_trie(child_node, depth + 1, max_depth, prefix + [word])

def search_trie(node, key):
    current_node = node
    for word in key:
        if word not in current_node.children:
            return 0
        current_node = current_node.children[word]
    return current_node.count if current_node else 0

# Choose the n in n-grams
N_GRAM_LENGTH = 6

book_texts = [gutenberg.raw(book_title) for book_title in book_titles]

root_node = create_trie(book_texts, N_GRAM_LENGTH)

# print("N-gram Trie Structure (up to depth 3):")
#print_trie(root_node, max_depth=2)

In [None]:
# How to search for a key:
count = search_trie(root_node, ["the", "sun"])
print(f"Key frequency is {count}")

Key frequency is 149


## Vocabulary

In [5]:
vocabulary = list(root_node.children.keys()) # Vocabulary is the unigrams
vocabulary.remove("<s>")
vocabulary.remove("</s>") #
# Display the size of the general vocabulary
print(f"Size of vocabulary: {len(vocabulary)} words")

Size of vocabulary: 27460 words


## Generate Sentences
This code generates sentences using an n-gram model based on a Trie data structure. It starts by randomly generating starting n-grams of specified lengths from the Trie, then iteratively generates subsequent words based on the previous n-gram. The generated sentences are then beautified by capitalizing the first letter and 'I', and adding periods. Finally, it prints out sentences for different n-gram lengths.

In [140]:
import random

def generate_next_word(node, node_name, words, smoothing_words = None, creativity = 0.000001):
    # Base case
    if len(words) == 0:
        if not node.children: # If the node doesn't have children
            return None

        children = node.children
        # Extract counts and words
        counts = [child.count for child in children.values()]
        possible_words = list(children.keys())

        # Add the smoothing if it exists
        if smoothing_words:
          possible_words.extend(smoothing_words)
          counts.extend([creativity] * len(smoothing_words))

        # Choose randomly based on the weight of counts
        next_word = random.choices(possible_words, weights=counts)[0]
        #print("Next word chosen ", next_word)

        return next_word

    word = words[0]
    #print("Node: ", node_name, "\nLooking for: ", word)

    if word not in node.children.keys():
      #print("Key not found")
      return None
    next_node = node.children[word]
    next_word = generate_next_word(next_node, word, words[1:], smoothing_words)
    return next_word

def generate_starting_ngram(node, length, max_attempts=3, vocabulary = vocabulary):
    ngram = ["<s>"]
    attempts = 0

    while len(ngram) < length:
        next_word = generate_next_word(node, "root", ngram)
        if next_word is not None and next_word != "</s>":
            ngram.append(next_word)
            attempts = 0  # Reset attempts if a word is successfully added
        else:
            placeholder = random.choice(vocabulary)  # Choose a random word from the vocabulary list
            ngram.append(placeholder)
            attempts += 1
            if attempts >= max_attempts:
                break  # Break out of the loop if maximum attempts reached without finding a valid next word

    return ngram[:length]  # Ensure the n-gram is of the specified length


def generate_sentences(node, n, max_length=30, num_sentences=5, vocabulary = vocabulary):
    sentences = []
    while len(sentences) < num_sentences:
        sentence = generate_starting_ngram(node, n)
        while len(sentence) < max_length:
            for i in range(len(sentence) - 1):
                words = sentence[-n + i - 1:]
                next_word = generate_next_word(node, "root", words, smoothing_words = vocabulary)
                if next_word is not None:
                    break
            if next_word is None:
              next_word = random.choice(vocabulary) if sentence[-1] == "and" else "and"
            if next_word == "</s>":
                break
            sentence.append(next_word)
        sentence_str = ' '.join(sentence[1:])
        # Check if the sentence is unique
        if sentence_str not in sentences:
            sentences.append(sentence_str)
    return sentences

def beutify_sentences(sentence_list):
  # Capitalize the first letter and I and add a period
  sentences = [sentence.capitalize().replace(" i ", " I ") + '.' for sentence in sentence_list]
  return sentences

def generate_and_print_sentences(root_node, n, num_sentences=5, vocabulary = vocabulary):
    sentences = generate_sentences(root_node, n, num_sentences=num_sentences, vocabulary = vocabulary)
    sentences = beutify_sentences(sentences)
    print(f"\nSentences using {n}-grams:")
    for i, sentence in enumerate(sentences, 1):
        print(f"{i}. {sentence}")

# Generate and print sentences for different n-gram lengths
generate_and_print_sentences(root_node, 2)
generate_and_print_sentences(root_node, 3)
generate_and_print_sentences(root_node, 4)
generate_and_print_sentences(root_node, 5)
generate_and_print_sentences(root_node, 6)


Sentences using 2-grams:
1. The only way to move him was to depend.
2. His situation is an evil but you must give me a living.
3. The everlasting whip cord I declare.
4. He did not even move a hair and syme could come close enough to a whale before any pitchpoling comes into play.
5. Papa laughed.

Sentences using 3-grams:
1. Churchill after being disliked at least years was now spoken of with compassionate allowances.
2. Repeated the doctor with a start but what on earth can he agent he thought mahanaim where he adam at the news with chilling gripe of sorrow stood that.
3. And where do you think he was in rashness leads not on.
4. After the farmer was dead the hosts of light.
5. My dear miss gregory said syme gently there are many kinds of sincerity and insincerity.

Sentences using 4-grams:
1. Who will none shall from me withhold thy offered good distance from those she wants to be with but one can not comprehend a young being under such restraint.
2. Doated introductions jonah and 

## Text-to-Image Prompts

In [135]:
# @title Generate Prompts { vertical-output: true }
story = "The knight stands opposite of the Dragon. It is a ferocious beast; Fifty times the size of a grown man, its body constantly moving, contorting in strange and unnatural ways. Pure white feathers decorate the creature, sharp like double-edged swords, ready to slice the flesh with the lightest touch. They move with the wind, each one dancing to its own rhythm, making the Dragon feel even more alive. The sight is mesmerizing, but the knight knows better than to look, for this Dragon is special. Its deadliest weapon isn't its needle-like teeth, or its sharp claws, but its black, abysmall eyes. They feel like bottomless pits, like the absense of eyes and everything else, they pierce through shiny armors and feed the emptiness in human souls until there's nothing left. On the ground, the bodies of better knights are scattered like flowers. Some alive, some dead, and some in-between. Lost souls, cursed to wander in places unknown. Their bodies break under the Dragon's legs as it arches its body backwards. It opens its huge mouth, and defeaning silence comes out. It's ready to attack.  \"I am a knight\", the knight thinks.   The knight's plain brown horse neighs in disagreement as they move into position. A pointy spear points at the beast's direction. Words have suddenly lost their meaning, and the air is too dense. Still, it seems like the whole battlefield is tingling with anticipation, waiting to see who will make the first mo- The Dragon charges violently forward, tearing everything in its way apart, its claws scratching the ground, leaving deep scars. It's wrong, very wrong, because dragons aren't supposed to move that fast, turn something into nothing this quickly. The world has been replaced by a void, and the void is being replaced by [     ]. The only path the knight can follow lies straight ahead.  \"I am a knight\", the knight says.  The two imbalanced bodies collide; for a moment, everything is still. A horse's scream breaks the silence, and a knight's one joins it. But the greatest scream is the one that never gets heard. The Dragon feels the spear leaving its body, just above the shoulder, feels the wound opening, allowing for the inside to become outside. Surprisingly, it's not blood that comes out, but ink, black as a night without stars. It stains the rich white feathers, and it's beautiful in a way, because it paints letters.   \"I am a knight\", the knight writes.  Paper dragons dislike words. It's angry now, preparing to attack again. The knight picks up the spear. And so the battle continues, a Dragon and a knight, until one prevails." # @param {type:"string"}
N_GRAM_LENGTH = 6

texts = [story]

story_root_node = create_trie(texts, N_GRAM_LENGTH)
story_vocabulary = list(story_root_node.children.keys()) # Vocabulary is the unigrams
story_vocabulary.remove("<s>")
story_vocabulary.remove("</s>")

generate_and_print_sentences(story_root_node, 2, vocabulary = story_vocabulary)
generate_and_print_sentences(story_root_node, 3, vocabulary = story_vocabulary)
generate_and_print_sentences(story_root_node, 4, vocabulary = story_vocabulary)
generate_and_print_sentences(story_root_node, 5, vocabulary = story_vocabulary)
generate_and_print_sentences(story_root_node, 6, vocabulary = story_vocabulary)


Sentences using 2-grams:
1. Words have suddenly lost their meaning and the air is too dense.
2. Pure white feathers decorate the creature sharp like swords ready to slice the flesh with the lightest touch.
3. And so the battle continues a dragon and a knight until one prevails.
4. The knight picks up the spear.
5. They feel like bottomless pits like the absense of eyes and everything else they pierce through shiny armors and feed the emptiness in human souls until there nothing left.

Sentences using 3-grams:
1. Paper dragons dislike words.
2. It ready to attack.
3. I am a knight the knight says.
4. The knight stands opposite of the dragon.
5. On the now and a knight until one prevails.

Sentences using 4-grams:
1. Their bodies break under the dragon legs as it arches its body backwards.
2. The only path the knight can follow lies straight ahead.
3. Still it seems like the whole battlefield is tingling with anticipation waiting to see who will make the first the dragon charges violent