In [1]:
# Mount Google Drive at /content/drive; the pickled corpus and speeches are read from there later.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

%tensorflow_version 2.x
import tensorflow as tf
from tensorflow.keras import layers

import secrets
import os
import time
import pickle
import math
import pandas as pd
import numpy as np

# Spacy
import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English

# Custom Tokenizer
import re
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.lang.de.punctuation import _quotes
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

TensorFlow 2.x selected.


In [0]:
# https://stackoverflow.com/questions/57295996/is-it-possible-to-change-the-token-split-rules-for-a-spacy-tokenizer

# Custom tokenizer to not split on hyphens
def custom_tokenizer(nlp):
    """Return a spaCy ``Tokenizer`` for ``nlp`` that uses a reduced infix rule list.

    The infix list contains no rule for a single hyphen between letters, so
    hyphenated words are not split at the hyphen. Prefix and suffix search,
    ``token_match`` and the tokenizer exceptions are reused from ``nlp``.

    Args:
        nlp: A loaded spaCy language pipeline.

    Returns:
        A ``Tokenizer`` built on ``nlp.vocab``.
    """
    # Rules that split a token at punctuation sitting between two characters.
    between_character_rules = [
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
        r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])([{q}\]\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
        r"(?<=[0-9])-(?=[0-9])",
    ]
    infix_patterns = LIST_ELLIPSES + LIST_ICONS + between_character_rules
    infix_regex = compile_infix_regex(infix_patterns)

    base_tokenizer = nlp.tokenizer
    return Tokenizer(
        nlp.vocab,
        rules=nlp.Defaults.tokenizer_exceptions,
        prefix_search=base_tokenizer.prefix_search,
        suffix_search=base_tokenizer.suffix_search,
        infix_finditer=infix_regex.finditer,
        token_match=base_tokenizer.token_match,
    )

# **Helper Functions**

In [0]:
def pattern_merger(doc):
    """Pipeline component that merges every ``matcher`` hit into a single token.

    Reads the module-level ``matcher``. All merges are applied in one
    ``doc.retokenize()`` pass, which replaces the deprecated per-span
    ``Span.merge()`` call.

    Args:
        doc: The spaCy ``Doc`` being processed.

    Returns:
        The same ``doc`` object, retokenized in place.
    """
    candidate_spans = [doc[start:end] for _match_id, start, end in matcher(doc)]

    # The retokenizer rejects overlapping spans, so prefer the longest span
    # (earliest one on ties) and drop any span sharing a token with a kept one.
    candidate_spans.sort(key=lambda span: (span.end - span.start, -span.start), reverse=True)
    claimed_tokens = set()
    spans_to_merge = []
    for span in candidate_spans:
        token_indices = range(span.start, span.end)
        if claimed_tokens.isdisjoint(token_indices):
            spans_to_merge.append(span)
            claimed_tokens.update(token_indices)

    with doc.retokenize() as retokenizer:
        for span in sorted(spans_to_merge, key=lambda span: span.start):
            retokenizer.merge(span)
    return doc

In [0]:
def clean_transcript(doc):
    """
    Turn a tokenized transcript back into readable text.

    Steps:
      1. Replace straight double quotes with alternating curly quotes.
      2. Re-tokenize with the module-level ``nlp`` and join the tokens with
         single spaces, except that no space is put before a punctuation mark.
      3. Attach opening/closing curly quotes to the quoted text.
      4. Remove a fixed list of stage directions such as "(Applause.)" and
         any run of two or more hyphens preceded by a space.

    Args:
        doc: Iterable of tokens exposing a ``.text`` attribute.

    Returns:
        The cleaned transcript as a string.
    """

    # Replace quotation tokens, alternating between opening and closing quotes
    quoted_tokens = []
    opening_quote = True
    for token in doc:
        if token.text == '"':
            quoted_tokens.append('“' if opening_quote else '”')
            opening_quote = not opening_quote
        else:
            quoted_tokens.append(token.text)

    requoted_doc = nlp(' '.join(quoted_tokens))

    punctuation_marks = ('.', ',', '?', '!', ':', ';')

    # Merge tokens into correct placement. Every token is emitted, including
    # the last one; a separating space is added only when another token
    # follows and that token is not a punctuation mark.
    pieces = []
    last_index = len(requoted_doc) - 1
    for i, token in enumerate(requoted_doc):
        pieces.append(token.text)
        if i < last_index and requoted_doc[i + 1].text not in punctuation_marks:
            pieces.append(' ')
    transcript_punctuation = ''.join(pieces)

    # Merge sentences using quotation boundaries
    new_transcript = transcript_punctuation.replace(' ..', '.')
    new_transcript = ' “'.join(new_transcript.split(' “ '))
    new_transcript = '” '.join(new_transcript.split(' ” '))

    # Remove stage directions in parentheses (same literals, same order as before)
    stage_directions = (
        '( Applause. )',
        '( Applause )',
        '(Applause.)',
        '(Laughs.)',
        '(Laughter.)',
        '(LAUGHTER)',
        '(APPLAUSE)',
        '( APPLAUSE )',
        '(laughter.)',
        '(TRANSLATION) ',
    )
    for direction in stage_directions:
        new_transcript = new_transcript.replace(direction, '')
    new_transcript = re.sub(' -{2,}', '', new_transcript)

    return new_transcript

In [0]:
def remove_excess_spaces_component(doc):
    """ Rebuilds the doc from its non-space tokens joined by single spaces """
    kept_words = (token.text for token in doc if not token.is_space)
    single_spaced = ' '.join(kept_words)
    rebuilt_doc = nlp.make_doc(single_spaced)
    return pattern_merger(rebuilt_doc)

In [0]:
def avg_transcript_length(president):
  """Returns the average transcript length for the specified president.

  The length of one speech is ``len()`` of the value at position 5 of its row
  in the module-level ``speeches`` frame (presumably the transcript string,
  i.e. a character count -- TODO confirm the column order).

  Args:
    president: Value to match against ``speeches.President``.

  Returns:
    The mean length, rounded down to an int.

  Raises:
    ValueError: If ``speeches`` has no rows for ``president``.
  """
  president_rows = speeches[speeches.President == president]
  lengths = [len(row[5]) for row in president_rows.itertuples(index=False)]
  if not lengths:
    raise ValueError('No speeches found for president {!r}'.format(president))
  return math.floor(sum(lengths) / len(lengths))

## **Load Data**

In [0]:
# Load corpus and speeches.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
# The files are opened with `with` so the handles are closed after loading.
with open("/content/drive/My Drive/President Speeches NLP/corpus.p", "rb") as corpus_file:
    corpus = pickle.load(corpus_file)
with open("/content/drive/My Drive/President Speeches NLP/speeches.p", "rb") as speeches_file:
    speeches = pickle.load(speeches_file)

In [0]:
# Get presidential text
president = 'Barack Obama'
text = corpus.loc[president].transcripts

# Remove parenthesised remarks such as "(Applause.)".
# The pattern is applied directly: feeding each matched literal back into
# re.sub would turn its parentheses into a regex group and delete the bare
# word (e.g. "Laughter") wherever it occurs in the text.
text = re.sub(r"\([A-Za-z\s\.]+\)", '', text)
# Drop any empty parentheses left in the text
text = text.replace('()', '')

In [0]:
# Make Spacy doc
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 1500000

# Set tokenizer to the custom tokenizer
nlp.tokenizer = custom_tokenizer(nlp)

# Matcher used by pattern_merger; created before the component is registered
matcher = Matcher(nlp.vocab)

# Patterns that match an alphabetic token followed by a possessive/contraction
# suffix. Each suffix has its own rule name (the "'m" rule previously reused
# the "'ll" key).
contraction_suffixes = {
    'match_conj_apos_s': '\'s',
    'match_conj_not': 'n\'t',
    'match_conj_apos_d': '\'d',
    'match_conj_apos_ll': '\'ll',
    'match_conj_apos_m': '\'m',
    'match_conj_apos_re': '\'re',
    'match_conj_apos_ve': '\'ve',
}
for rule_name, suffix in contraction_suffixes.items():
    matcher.add(rule_name, None, [{'IS_ALPHA': True}, {'TEXT': suffix}])

# Pipeline
nlp.add_pipe(pattern_merger, first=True)  # add it right after the tokenizer

doc = nlp(text)

In [11]:
# len(text) is the number of characters in the raw string, not the number of tokens
print ('Length of text: {} characters'.format(len(text)))

Length of text: 1153289 tokens


In [12]:
# Take a look at the first 50 tokens in doc
print(doc[:50])

To Chairman Dean and my great friend Dick Durbin; and to all my fellow citizens of this great nation; With profound gratitude and great humility, I accept your nomination for the presidency of the United States. Let me express my thanks to the historic slate of


In [13]:
# Build the vocabulary: the sorted set of distinct token strings (unigrams)
vocab = sorted({token.text for token in doc})

# The unique tokens in the file
print ('{} unique tokens'.format(len(vocab)))

11500 unique characters


# **Process Text**

##### **Vectorize Text** 

Before training the model, map tokens to a numerical representation. That is, create two lookup tables: one mapping tokens to numbers and another mapping numbers to tokens.

In [14]:
# Lookup tables: index -> token (array) and token -> index (dict)
idx2word = np.array(vocab)
word2idx = {token_text: index for index, token_text in enumerate(vocab)}

# The whole document as a vector of token ids
text_as_int = np.array([word2idx[token.text] for token in doc])

# Examine mapping (first 20 entries)
print('Mapping:\n--------\n{')
for word in list(word2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(word), word2idx[word]))
print('  ...\n}\n')

# Show how the first 13 tokens from the text are mapped to integers
first_tokens = repr(doc[:13])
first_ids = text_as_int[:13]
print ('The first 13 tokens from the text mapped to integers:\n-----------------------------------------------------\n{} ---- tokens mapped to int ---- > {}'.format(first_tokens, first_ids))

Mapping:
--------
{
  ' ' :   0,
  '   ':   1,
  '!' :   2,
  '$' :   3,
  "'" :   4,
  '(' :   5,
  ',' :   6,
  '-' :   7,
  '.' :   8,
  '..':   9,
  '...':  10,
  '1' :  11,
  '1,000':  12,
  '1,200':  13,
  '1,267':  14,
  '1,500':  15,
  '1.5':  16,
  '1.6':  17,
  '10':  18,
  '10,000':  19,
  ...
}

The first 13 tokens from the text mapped to integers:
-----------------------------------------------------
To Chairman Dean and my great friend Dick Durbin; and to all ---- tokens mapped to int ---- > [ 2350   586   771  2850  7597  6033  5809   804   833   229  2850 10627
  2787]


# **Prediction Task**

The input to the model will be a sequence of tokens, and I train the model to predict the following token at each time step.

Note: RNNs maintain an internal state that depends on the previously seen elements. So given all the tokens computed until this moment, what is the next token?

### **Create training examples and targets**

Divide the text into example sequences where each input sequence will contain seq_length tokens from the text.

For each input sequence, the corresponding targets contain the same length of text, except shifted one token to the right.

Therefore, break the text into chunks of seq_length+1. 

First use the tf.data.Dataset.from_tensor_slices function to convert the text vector into a stream of token indices.

In [15]:
# Number of tokens in a single model input sequence
seq_length = 500
examples_per_epoch = len(doc) // (seq_length + 1)

# Create training examples / targets: a stream of individual token ids
token_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Examine first 5 items in token_dataset
for token_id in token_dataset.take(5):
  print(idx2word[token_id.numpy()])

To
Chairman
Dean
and
my


The batch method makes it easy to convert these individual tokens into sequences of the desired size.

In [16]:
# Group the token stream into chunks of seq_length+1 ids; the short tail chunk is dropped
chunk_size = seq_length + 1
sequences = token_dataset.batch(chunk_size, drop_remainder=True)

# Examine first 5 items in sequences
for chunk in sequences.take(5):
  chunk_words = idx2word[chunk.numpy()]
  print(repr(' '.join(chunk_words)))

"To Chairman Dean and my great friend Dick Durbin ; and to all my fellow citizens of this great nation ; With profound gratitude and great humility , I accept your nomination for the presidency of the United States . Let me express my thanks to the historic slate of candidates who accompanied me on this journey , and especially the one who traveled the farthest , a champion for working Americans and an inspiration to my daughters and to yours , Hillary Rodham Clinton . To President Clinton , who last night made the case for change as only he can make it ; to Ted Kennedy , who embodies the spirit of service ; and to the next Vice President of the United States , Joe Biden , I thank you . I am grateful to finish this journey with one of the finest statesmen of our time , a man at ease with everyone from world leaders to the conductors on the Amtrak train he still takes home every night . To the love of my life , our next First Lady , Michelle Obama , and to Sasha and Malia , I love you s

For each sequence, duplicate and shift it to form the input and target text by using the map method to apply a simple function to each batch:

In [0]:
def split_input_target(chunk):
    """Split a chunk into (input, target), where target is the input shifted by one.

    The input is every element except the last; the target is every element
    except the first.
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [18]:
# Examine first input data and target data
for input_example, target_example in dataset.take(1):
  input_words = idx2word[input_example.numpy()]
  target_words = idx2word[target_example.numpy()]
  print ('Input data: ', repr(' '.join(input_words)))
  print ('Target data:', repr(' '.join(target_words)))

Input data:  "To Chairman Dean and my great friend Dick Durbin ; and to all my fellow citizens of this great nation ; With profound gratitude and great humility , I accept your nomination for the presidency of the United States . Let me express my thanks to the historic slate of candidates who accompanied me on this journey , and especially the one who traveled the farthest , a champion for working Americans and an inspiration to my daughters and to yours , Hillary Rodham Clinton . To President Clinton , who last night made the case for change as only he can make it ; to Ted Kennedy , who embodies the spirit of service ; and to the next Vice President of the United States , Joe Biden , I thank you . I am grateful to finish this journey with one of the finest statesmen of our time , a man at ease with everyone from world leaders to the conductors on the Amtrak train he still takes home every night . To the love of my life , our next First Lady , Michelle Obama , and to Sasha and Malia ,

Each index of these vectors is processed as one time step. For the input at time step 0, the model receives the index for "To" and tries to predict the index for "Chairman" as the next token. At the next time step, it does the same thing, but the RNN considers the previous step's context in addition to the current input token.

In [19]:
# Walk through the first five (input, expected output) pairs of the example
first_steps = zip(input_example[:5], target_example[:5])
for i, (input_idx, target_idx) in enumerate(first_steps):
    input_word = repr(idx2word[input_idx])
    target_word = repr(idx2word[target_idx])
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, input_word))
    print("  expected output: {} ({:s})".format(target_idx, target_word))

Step    0
  input: 2350 ('To')
  expected output: 586 ('Chairman')
Step    1
  input: 586 ('Chairman')
  expected output: 771 ('Dean')
Step    2
  input: 771 ('Dean')
  expected output: 2850 ('and')
Step    3
  input: 2850 ('and')
  expected output: 7597 ('my')
Step    4
  input: 7597 ('my')
  expected output: 6033 ('great')


## **Create training batches**

Use tf.data to split the text into manageable sequences. Before feeding this data into the model, shuffle the data and pack it into batches.

In [20]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# NOTE: this rebinds `dataset` to its shuffled and batched version, so the cell
# is not idempotent. Re-run the cell that maps split_input_target over
# `sequences` before running this one again, otherwise the batches get batched.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<BatchDataset shapes: ((64, 500), (64, 500)), types: (tf.int64, tf.int64)>

# **Build The Model**

### **Use tf.keras.Sequential to define the model.**

**tf.keras.layers.Embedding:** The input layer. A trainable lookup table that will map the number of each token to a vector with embedding_dim dimensions;

**tf.keras.layers.LSTM:** A type of RNN with size units=rnn_units

tf.keras.layers.Dense: The output layer, with vocab_size outputs.


In [0]:
# Size of the vocabulary (number of distinct tokens)
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Build the Embedding -> stateful LSTM -> Dense sequential model.

  Args:
    vocab_size: Size of the embedding input and of the Dense output.
    embedding_dim: Width of the embedding vectors.
    rnn_units: Number of LSTM units.
    batch_size: Fixed batch size declared in the embedding's input shape.

  Returns:
    An uncompiled ``tf.keras.Sequential`` model.
  """
  embedding_layer = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None])
  recurrent_layer = tf.keras.layers.LSTM(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  output_layer = tf.keras.layers.Dense(vocab_size)

  return tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])

In [0]:
# Training model: one embedding row and one output logit per vocabulary entry
model = build_model(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)

### **Try running the model**

In [24]:
# Check shape of output by running the model on one batch
first_batch = dataset.take(1)
for input_example_batch, target_example_batch in first_batch:
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 500, 11500) # (batch_size, sequence_length, vocab_size)


Note: This model has sequence_length of 500 but can be run on any length

In [25]:
# Print layer output shapes and parameter counts of the training model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           2944000   
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 11500)         11787500  
Total params: 19,978,476
Trainable params: 19,978,476
Non-trainable params: 0
_________________________________________________________________


To get actual predictions from the model, sample from the output distribution to get actual token indices. This distribution is defined by the logits over the token vocabulary.

In [0]:
# Sample one token id per timestep from the logits of the first sequence in the batch
first_sequence_logits = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_sequence_logits, num_samples=1),
    axis=-1).numpy()

For the first batch, at each timestep, a prediction of the next token index:

In [27]:
# Display the sampled token ids
sampled_indices

array([ 4159,  1844,  1387,  7859,  7494,  6871, 10501,  8308,  9277,
        3496,  1835,  7636,  5874,  1667,  9901,   328, 10354,  1190,
        2746,  3268,  7933,  9450,  7133,  1382,  3298,   960,  1063,
        7106,  5514,  7562,   937,  6106,  6499, 10442,  6552,  7027,
       10103,  3699,  9856,  3606, 10227,  6146,  7952,   777,  1601,
        3840,  3667,  9439,  8140,   425,  9850,  3892,  4400,  8250,
        6247,    11, 10635,  3531,  6874,  3096,  8306,  1728,  1142,
        7495,  4149,  2912,  9799,  3308,  2729,  1243,  5840,  1367,
       11459,  3696,  4960,  5895, 10880,  9857,  3360,  1735,  5844,
        5467,  3487, 10315, 10804, 10753,  4587,   232,  8857,  1808,
        3788, 11151,  5894,  5345, 10653,  9664,  2043,   966,  8819,
        1786,  5873,  7947,  4609,  9107,  5848,  2175,  4170,  9211,
        8846,  7846, 11335,  4680,  1778,  9670,  6342,  5061,  6543,
        5457,  2666,  8606, 10424,   103,  5079,  3271,  9148,  5419,
        7198,  4318,

Decode these to see the text predicted by this untrained model:

In [28]:
# Decode the first input sequence of the batch and the sampled ids back into tokens
print("Input: \n", repr(" ".join(idx2word[input_example_batch[0]])))
print()
print("Next token Predictions: \n", repr(" ".join(idx2word[sampled_indices ])))

Input: 
 "scarcely imagined . But what has not changed is the imperative of citizenship ; that willingness of a 26-year-old deacon , or a Unitarian minister , or a young mother of five to decide they loved this country so much that they'd risk everything to realize its promise . That's what it means to love America . That's what it means to believe in America . That's what it means when we say America is exceptional . For we were born of change . We broke the old aristocracies , declaring ourselves entitled not by bloodline , but endowed by our Creator with certain inalienable rights . We secure our rights and responsibilities through a system of self government , of and by and for the people . That's why we argue and fight with so much passion and conviction because we know our efforts matter . We know America is what we make of it . Look at our history . We are Lewis and Clark and Sacajawea , pioneers who braved the unfamiliar , followed by a stampede of farmers and miners , and entr

# **Train the model**

At this point, the problem can be treated as a standard classification problem. Given the previous RNN state and the input at this time step, predict the class of the next token.

### **Attach an optimizer, and a loss function**

The standard tf.keras.losses.sparse_categorical_crossentropy loss function works in this case because it is applied across the last dimension of the predictions.

Because the model returns logits, we need to set the from_logits flag.

In [29]:
def loss(labels, logits):
  """Sparse categorical cross-entropy computed on raw logits (from_logits=True)."""
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels, logits, from_logits=True)

# Loss of the untrained model on the example batch, averaged over all elements
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_example_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", mean_example_loss)

Prediction shape:  (64, 500, 11500)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       9.350142


Configure the training procedure using the tf.keras.Model.compile method. The tf.keras.optimizers.Adam with default arguments and the loss function is used.

In [0]:
# Adam with default arguments; `loss` is the logits-based cross-entropy defined above
model.compile(optimizer='adam', loss=loss)

### **Configure checkpoints**

Use a tf.keras.callbacks.ModelCheckpoint to ensure that checkpoints are saved during training:

In [0]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files. The name is fixed (no "{epoch}" placeholder),
# so every save goes to the same "last_ckpt" prefix.
checkpoint_prefix = os.path.join(checkpoint_dir, "last_ckpt")

# Callback that saves only the model weights to checkpoint_prefix
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

### **Execute the training**

In [32]:
# Number of passes over the dataset; the checkpoint callback runs during training
EPOCHS=500
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Train for 7 steps
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epo

# **Generate text**

### **Restore the latest checkpoint**

Use a batch size of 1.

Because of the way the RNN state is passed from timestep to timestep, the model only accepts a fixed batch size once built.

To run the model with a different batch_size, we need to rebuild the model and restore the weights from the checkpoint.

In [33]:
# Show the path prefix of the most recent checkpoint in checkpoint_dir
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/last_ckpt'

In [0]:
# Rebuild the same architecture with batch_size=1 for generation.
# NOTE: this rebinds `model`; the batch-64 training model is no longer reachable by that name.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the trained weights from the most recent checkpoint
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Build with a batch of one sequence of unspecified length
model.build(tf.TensorShape([1, None]))

In [35]:
# Print layer output shapes and parameter counts of the rebuilt batch-size-1 model
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            2944000   
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_1 (Dense)              (1, None, 11500)          11787500  
Total params: 19,978,476
Trainable params: 19,978,476
Non-trainable params: 0
_________________________________________________________________


# **The prediction loop**

The following code block generates the text:

*   It starts by choosing a start string, initializing the RNN state and setting the number of tokens to generate.

*   Get the prediction distribution of the next token using the start string and the RNN state.

*   Then, use a categorical distribution to calculate the index of the predicted token. Use this predicted token as our next input to the model.

*   The RNN state returned by the model is fed back into the model so that it now has more context, rather than only one token. After predicting the next token, the modified RNN state is again fed back into the model, which is how it accumulates context from the previously predicted tokens.



In [0]:
def generate_text(model, temp, start_string, num_generate=None):
  """Generate text token by token with the trained model.

  Evaluation step: the seed tokens are fed first, then each sampled token is
  fed back as the next input while the stateful model keeps its RNN state.

  Args:
    model: Model called as ``model(input_ids)`` and supporting
      ``reset_states()``; the input built here has a batch dimension of 1.
    temp: Sampling temperature, must be > 0. Low temperatures result in more
      predictable text, higher temperatures in more surprising text.
    start_string: Whitespace-separated seed tokens; each one must be a key of
      the module-level ``word2idx``.
    num_generate: Number of tokens to generate. When omitted it defaults to
      ``avg_transcript_length('Barack Obama')``, as before.
      NOTE(review): that helper appears to average ``len()`` of transcript
      strings (characters), while this loop emits one token per step --
      confirm that this default is intended.

  Returns:
    ``start_string`` followed by the generated tokens joined by single spaces.

  Raises:
    ValueError: If ``temp`` is not positive or ``start_string`` has no tokens.
    KeyError: If a seed token is not in the vocabulary.
  """
  # Validate inputs before doing any model work
  if temp <= 0:
    raise ValueError('temp must be positive, got {!r}'.format(temp))
  seed_tokens = start_string.split()
  if not seed_tokens:
    raise ValueError('start_string must contain at least one token')
  unknown_tokens = [word for word in seed_tokens if word not in word2idx]
  if unknown_tokens:
    raise KeyError('Start tokens missing from the vocabulary: {}'.format(unknown_tokens))

  # Number of tokens to generate
  if num_generate is None:
    num_generate = avg_transcript_length('Barack Obama')

  # Converting our start string to numbers (vectorizing)
  input_eval = tf.expand_dims([word2idx[word] for word in seed_tokens], 0)

  # Generated tokens, in order
  text_generated = []

  # Here batch size == 1
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Scale the logits by the temperature, then sample; only the sample for
    # the last timestep is used as the next token.
    predictions = predictions / temp
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # We pass the predicted token as the next input to the model
    # along with the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2word[predicted_id])

  return (start_string + ' '.join(text_generated))

In [0]:
def get_first_word(president):
  """Pick a random opening token from the given president's speeches.

  The first 50 characters of each transcript are run through the
  module-level ``nlp`` and the first token's text is collected; one of the
  distinct texts is then chosen with ``secrets.choice``.

  Args:
    president: Value to match against ``speeches['President']``.

  Returns:
    The text of one first token.

  Raises:
    ValueError: If no transcript of ``president`` yields a first token.
  """
  president_speeches = speeches[speeches['President'] == president].Transcript.values
  first_words = set()
  for speech in president_speeches:
    # Skip entries that are not strings (e.g. missing values)
    if not isinstance(speech, str):
      continue
    tokens = nlp(speech[:50])
    # An empty transcript produces no tokens; there is no first word to take
    if len(tokens) > 0:
      first_words.add(tokens[0].text)
  if not first_words:
    raise ValueError('No first word found for president {!r}'.format(president))
  return secrets.choice(sorted(first_words))

In [0]:
def generate_transcript(president, model, temp):
  """Generate a cleaned transcript seeded with one of the president's opening words.

  A first word is picked with ``get_first_word``, text is produced by
  ``generate_text``, and the result is tokenized with ``nlp`` and passed
  through ``clean_transcript``.
  """
  seed_text = get_first_word(president) + " "
  raw_text = generate_text(model, temp, start_string=seed_text)
  return clean_transcript(nlp(raw_text))

## **Generate Text**

In [0]:
# Generate a transcript with sampling temperature 0.3
generated_transcript = generate_transcript(president, model, .3)
# Show only the first 10,000 characters of the result
generated_transcript[:10000]

In [0]:
# Remove excess spaces
generated_transcript = re.sub(' {2,}', ' ', generated_transcript)

# Write to text file. The transcript can contain curly quotes, so the encoding
# is set explicitly instead of relying on the platform default.
with open("generated_transcript_Obama.txt", "w", encoding="utf-8") as transcript_file:
  transcript_file.write(generated_transcript)
  