<a href="https://colab.research.google.com/github/kvamsi7/Text-Generation/blob/main/Generating_Text_Using_a_Character_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow import keras

# common imports
import numpy as np
import pandas as pd

In [2]:
# Location of the Tiny Shakespeare corpus; get_file returns the local path of the downloaded copy.
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
file_path = keras.utils.get_file(fname="shakespeare.txt", origin=shakespeare_url)

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


In [3]:
# Read the whole corpus into memory as a single string.
# The encoding is pinned so decoding does not depend on the platform's
# default locale encoding (e.g. cp1252 on Windows).
with open(file_path, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [4]:
# Preview the first 148 characters of the corpus.
print(shakespeare_text[0:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [5]:
# Every distinct character in the case-folded corpus, shown as one sorted string.
"".join(sorted({ch for ch in shakespeare_text.lower()}))

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

#### Fitting a tokenizer to the text

In [6]:
# Character-level tokenizer: every distinct (lower-cased) character gets its own integer ID.
# NOTE(review): the corpus is passed as a bare string rather than a list of texts, so
# fit_on_texts presumably walks it one character at a time — confirm if this is changed.
tokenizer = keras.preprocessing.text.Tokenizer(lower=True, char_level=True)
tokenizer.fit_on_texts(texts=shakespeare_text)

In [7]:
# Round-trip demo: the fitted tokenizer maps text -> ID sequences and ID sequences -> text.
print(tokenizer.texts_to_sequences(texts=["First"]))
print(tokenizer.sequences_to_texts(sequences=[[20, 6, 9, 8, 3]]))

[[20, 6, 9, 8, 3]]
['f i r s t']


In [8]:
# Vocabulary size: the tokenizer's word_index holds one entry per distinct character.
max_id = len(tokenizer.word_index.keys())
max_id

39

In [9]:
# Total number of characters in the corpus.
# Measured directly from the text instead of via tokenizer.document_count, which only
# equals the character count when fit_on_texts is handed a bare string (one "document"
# per character) and would silently become 1 if the text were passed as a list.
dataset_size = len(shakespeare_text)
dataset_size

1115394

In [10]:
# Encode the whole corpus: each character is replaced by its integer ID.
# The tokenizer numbers characters 1..39, so subtract 1 to get IDs 0..38 rather than 1..39.
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text])[0]) - 1

#### Splitting a Sequential Dataset

In [11]:
# Use the first 90% of the encoded characters as the training stream
# (integer arithmetic keeps train_size an exact int).
train_size = (90 * dataset_size) // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[0:train_size])

#### Chopping the sequential dataset into multiple windows

In [12]:
# Each training example sees n_steps characters of context.
n_steps = 100
# One extra character per window: the target is the input shifted one character ahead.
window_length = 1 + n_steps
# Slide a window over the stream one character at a time, dropping trailing short windows.
dataset = dataset.window(size=window_length, shift=1, drop_remainder=True)

In [13]:
# Batch each window by its full length and flatten the results back into a single dataset.
dataset = dataset.flat_map(lambda win: win.batch(window_length))

In [14]:
batch_size = 32

# Shuffle with a 1000-element buffer, then group the windows into mini-batches.
dataset = dataset.shuffle(buffer_size=1000)
dataset = dataset.batch(batch_size)
# Split every window: inputs are all but the last character,
# targets are the same window shifted one character ahead.
dataset = dataset.map(lambda win: (win[:, :-1], win[:, 1:]))

#### Encoding the categorical input features

In [15]:
# One-hot encode the input IDs (depth = vocabulary size); targets stay as integer IDs.
dataset = dataset.map(
    lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets)
)

In [16]:
# Prefetch with a buffer of one element.
dataset = dataset.prefetch(buffer_size=1)

In [17]:
# Sanity check: pull one batch and show the input and target tensor shapes.
for X_batch, Y_batch in dataset.take(count=1):
    print(X_batch.shape, Y_batch.shape)

(32, 100, 39) (32, 100)


#### Building and Training the Char_RNN Model

In [18]:
# Char-RNN: two stacked 128-unit GRU layers followed by a per-time-step softmax
# over the max_id character classes.
# NOTE(review): per the Keras GRU documentation, a non-zero recurrent_dropout rules out
# the cuDNN kernel, so GPU training falls back to the slower generic implementation —
# confirm this trade-off is intended.
model = keras.models.Sequential()
model.add(
    keras.layers.GRU(
        128,
        return_sequences=True,
        input_shape=[None, max_id],  # any sequence length, one-hot vectors of size max_id
        dropout=0.2,
        recurrent_dropout=0.2,
    )
)
model.add(
    keras.layers.GRU(
        128,
        return_sequences=True,
        dropout=0.2,
        recurrent_dropout=0.2,
    )
)
model.add(
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
)

# Targets are integer character IDs, hence the sparse variant of cross-entropy.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, None, 128)         64896     
_________________________________________________________________
gru_1 (GRU)                  (None, None, 128)         99072     
_________________________________________________________________
time_distributed (TimeDistri (None, None, 39)          5031      
Total params: 168,999
Trainable params: 168,999
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 20 epochs over the windowed dataset; the returned History object is kept in `history`.
history = model.fit(x=dataset, epochs=20)

Epoch 1/20
  15964/Unknown - 7508s 470ms/step - loss: 1.5207