<a href="https://colab.research.google.com/github/kaushikabhishek87/Tensorflow_projects/blob/main/11_text_generation_with_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import numpy as np
import os
import time
import pandas as pd

In [2]:
!wget https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt

--2021-07-06 04:14:31--  https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.141.128, 173.194.210.128, 173.194.211.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.141.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘shakespeare.txt’


2021-07-06 04:14:31 (129 MB/s) - ‘shakespeare.txt’ saved [1115394/1115394]



In [46]:
text = open("/content/shakespeare.txt", mode="rb" ).read().decode(encoding="utf8")

In [5]:
# text = open("/content/shakespeare.txt", mode="r" ).read()

In [6]:
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [7]:
# Unique characters 
vocab = set(text)
print(len(vocab))

65


## Preprocess Text

In [8]:
example_text = ["there was just an earthquake", "hoping god everyone is safe"]
chars = tf.strings.unicode_split(example_text, input_encoding="UTF-8")
chars

<tf.RaggedTensor [[b't', b'h', b'e', b'r', b'e', b' ', b'w', b'a', b's', b' ', b'j', b'u', b's', b't', b' ', b'a', b'n', b' ', b'e', b'a', b'r', b't', b'h', b'q', b'u', b'a', b'k', b'e'], [b'h', b'o', b'p', b'i', b'n', b'g', b' ', b'g', b'o', b'd', b' ', b'e', b'v', b'e', b'r', b'y', b'o', b'n', b'e', b' ', b'i', b's', b' ', b's', b'a', b'f', b'e']]>

In [9]:
chars.shape

TensorShape([2, None])

In [35]:
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab) , 
                                            mask_token=None)

In [36]:
chars[0]

<tf.Tensor: shape=(28,), dtype=string, numpy=
array([b't', b'h', b'e', b'r', b'e', b' ', b'w', b'a', b's', b' ', b'j',
       b'u', b's', b't', b' ', b'a', b'n', b' ', b'e', b'a', b'r', b't',
       b'h', b'q', b'u', b'a', b'k', b'e'], dtype=object)>

In [37]:
ids = ids_from_chars(chars)
ids[0]

<tf.Tensor: shape=(28,), dtype=int64, numpy=
array([41, 49, 14, 50, 14,  4, 10, 64, 33,  4, 25, 20, 33, 41,  4, 64, 28,
        4, 14, 64, 50, 41, 49, 19, 20, 64, 24, 14])>

In [38]:
chars_from_ids = preprocessing.StringLookup(vocabulary=ids_from_chars.get_vocabulary(),
                                            invert = True, mask_token=None)
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b't', b'h', b'e', b'r', b'e', b' ', b'w', b'a', b's', b' ', b'j', b'u', b's', b't', b' ', b'a', b'n', b' ', b'e', b'a', b'r', b't', b'h', b'q', b'u', b'a', b'k', b'e'], [b'h', b'o', b'p', b'i', b'n', b'g', b' ', b'g', b'o', b'd', b' ', b'e', b'v', b'e', b'r', b'y', b'o', b'n', b'e', b' ', b'i', b's', b' ', b's', b'a', b'f', b'e']]>

In [39]:
tf.strings.reduce_join(chars, axis=-1).numpy()

array([b'there was just an earthquake', b'hoping god everyone is safe'],
      dtype=object)

In [40]:
def text_from_ids(ids):
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1).numpy()

In [41]:
text_from_ids(ids)

array([b'there was just an earthquake', b'hoping god everyone is safe'],
      dtype=object)

## Creating Training Exmaples & Target

In [48]:
all_ids = ids_from_chars(tf.strings.unicode_split(text, input_encoding="UTF-8"))
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([40, 29, 50, ..., 47, 55, 18])>

In [49]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
ids_dataset

<TensorSliceDataset shapes: (), types: tf.int64>

In [55]:
for i in ids_dataset.take(10):
  print(chars_from_ids(i).numpy().decode("UTF-8") )


F
i
r
s
t
 
C
i
t
i


In [56]:
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)
examples_per_epoch

11043

In [61]:
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

for i in sequences.take(1):
  print(chars_from_ids(i))

tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)
