# PART 1 
# reading the data 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import tensorflow as tf 

In [2]:
# opening the file
path_to_file = 'shakespeare.txt'
# Decode explicitly as UTF-8 so the loaded text (and therefore the vocabulary)
# does not depend on the platform's default encoding.
with open(path_to_file, 'r', encoding='utf-8') as f:
    text = f.read()

# creating a vocabulary list: every distinct character of the text, in sorted order
vocab = sorted(set(text))


In [3]:
# Number of distinct characters found in the text (the vocabulary size).
len(vocab)

84

# PART 2 
# text processing

In [4]:
# Give every vocabulary character a numeric id equal to its position in `vocab`.
char_to_ind = dict(zip(vocab, range(len(vocab))))  # character -> index
# Reverse lookup: indexing this array with an id returns the character.
ind_to_char = np.array(vocab)  # index -> character


In [5]:
# Encode the full text as an array of integer ids, one id per character.
encoded_text = np.array(list(map(char_to_ind.__getitem__, text)))

In [6]:
# Base sequence length; the text is cut into chunks of seq_len + 1 characters.
seq_len = 120
# How many whole chunks of seq_len + 1 characters fit in the text (floor division).
total_num_seq = len(text)  // (seq_len + 1)
total_num_seq


45005

In [7]:
# Wrap the encoded ids in a tf.data.Dataset, sliced along the first axis of encoded_text.
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [8]:
#char_dataset.batch()

In [9]:
#for item in char_dataset.take(500):
#    print(ind_to_char[item.numpy()])

# Group consecutive ids into chunks of seq_len + 1; a trailing partial chunk is dropped.
sequences = char_dataset.batch(seq_len + 1, drop_remainder=True)

def create_seq_targets(seq):
    """Split one chunk into an (input, target) pair shifted by one position.

    The input is every element except the last and the target is every
    element except the first, e.g. "Hello my name" becomes
    ("Hello my nam", "ello my name").
    """
    return seq[:-1], seq[1:]

# Apply create_seq_targets to every chunk, yielding (input, target) pairs.
dataset = sequences.map(create_seq_targets)

# Sanity check: slice the first few chunks out of `encoded_text` by hand.
# Only a small preview is printed; looping over all `total_num_seq` chunks
# would dump tens of thousands of arrays into the notebook output.
for i in range(min(total_num_seq, 3)):
    start = i * (seq_len + 1)
    end = start + seq_len + 1  # exclusive end, so each slice has seq_len + 1 ids
    print(encoded_text[start:end])

In [10]:
# Show the first (input, target) pair twice over: as raw ids and decoded back to text.
for input_txt, target_txt in dataset.take(1):
    for tensor in (input_txt, target_txt):
        ids = tensor.numpy()
        print(ids)
        print("".join(ind_to_char[ids]))


[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75]

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1]
                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But 


In [11]:
batch_size = 128
buffer_size = 10000  # number of elements held in the shuffle buffer

# Build the pipeline from `sequences` rather than re-wrapping `dataset`, so that
# re-running this cell does not batch an already-batched dataset.
# Incomplete final batches are dropped.
dataset = (
    sequences
    .map(create_seq_targets)
    .shuffle(buffer_size=buffer_size)
    .batch(batch_size=batch_size, drop_remainder=True)
)


In [14]:
# Number of distinct characters in the vocabulary.
vocab_size = len(vocab)
vocab_size

84

In [18]:
# Output dimension passed to the Embedding layer in create_model.
embed_dim = 64
# Number of units passed to the GRU layer in create_model.
# NOTE(review): 1026 is an unusual width (1024 is the more common choice) — confirm it is intentional.
rnn_neurons = 1026

from tensorflow.keras.losses import sparse_categorical_crossentropy 

In [19]:
def sparse_cat_loss(y_true, y_pred):
    """Sparse categorical cross-entropy that treats `y_pred` as raw logits.

    Thin wrapper around `sparse_categorical_crossentropy` that always passes
    `from_logits=True`.
    """
    loss = sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    return loss

In [23]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU 

In [24]:
def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
    """Build and compile the Embedding -> GRU -> Dense model.

    Args:
        vocab_size: Input dimension of the embedding and number of units in
            the final Dense layer.
        embed_dim: Output dimension of the embedding layer.
        rnn_neurons: Number of units in the GRU layer.
        batch_size: Fixed batch dimension of the input; the GRU is stateful.

    Returns:
        A Sequential model compiled with the 'adam' optimizer and
        `sparse_cat_loss`. The Dense layer has no activation.
    """
    layers = [
        Embedding(vocab_size, embed_dim, batch_input_shape=[batch_size, None]),
        GRU(
            rnn_neurons,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform',
        ),
        Dense(vocab_size),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss=sparse_cat_loss)
    return model

In [25]:
# Instantiate the model with the hyperparameters defined in the cells above.
model = create_model(vocab_size=vocab_size, embed_dim=embed_dim, rnn_neurons=rnn_neurons, batch_size=batch_size)
# Print the layer-by-layer summary with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 64)           5376      
_________________________________________________________________
gru (GRU)                    (128, None, 1026)         3361176   
_________________________________________________________________
dense (Dense)                (128, None, 84)           86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________
