# PART 1 
# reading the data 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import tensorflow as tf 

In [2]:
# Read the whole corpus into memory as one string.
path_to_file = 'shakespeare.txt'
# Decode explicitly instead of relying on the platform's default locale
# encoding, so the character set (and every index derived from it) is the same
# on every machine.
# NOTE(review): assumes the corpus file is ASCII/UTF-8 -- confirm.
with open(path_to_file, 'r', encoding='utf-8') as f:
    text = f.read()

# Vocabulary: every distinct character in the corpus, in sorted order.
vocab = sorted(set(text))


In [3]:
len(vocab)  # number of distinct characters found in the text

84

# PART 2 
# text processing

In [4]:
# Two lookup tables between characters and integer ids:
#   ind_to_char -- array indexed by id, giving the character back
#   char_to_ind -- character -> its position in the sorted vocabulary
ind_to_char = np.array(vocab)
char_to_ind = dict(zip(vocab, range(len(vocab))))


In [5]:
# translate the whole corpus, character by character, into its integer ids
encoded_text = np.array(list(map(char_to_ind.__getitem__, text)))

In [6]:
# Each training example uses seq_len input characters; a chunk carries one
# extra character so the target can be the input shifted by one position.
seq_len = 120
total_num_seq = len(text) // (1 + seq_len)  # complete chunks that fit in the corpus
total_num_seq


45005

In [7]:
# wrap the 1-D array of character ids in a tf.data.Dataset (one id per element)
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [8]:
#char_dataset.batch()

In [9]:
#for item in char_dataset.take(500):
#    print(ind_to_char[item.numpy()])

# group the character stream into chunks of seq_len + 1 ids; a final chunk
# shorter than that is discarded
sequences = char_dataset.batch(batch_size=seq_len + 1, drop_remainder=True)

def create_seq_targets(seq):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element that follows
    input[i] in the original chunk.

    Example: "Hello my name" -> ("Hello my nam", "ello my name")

    Args:
        seq: any sliceable sequence (list, string, array, tensor).

    Returns:
        Tuple ``(input_text, target_text)``, each one element shorter than ``seq``.
    """
    inputs, targets = seq[:-1], seq[1:]
    return inputs, targets

# turn every (seq_len + 1)-long chunk into an (input, target) pair
dataset = sequences.map(create_seq_targets) 

In [10]:
# Show the first (input, target) pair as raw id arrays; the target should be
# the input shifted left by one position.
for input_txt, target_txt in dataset.take(1):
    print(input_txt.numpy(), '\n\n\n',target_txt.numpy())
    print('\n\n')

[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75] 


 [ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1]





In [54]:
# first chunk from `sequences` with its last element dropped -- the same slice
# create_seq_targets uses as the input
next(sequences.take(1).as_numpy_iterator())[:-1]

array([ 0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, 12,  0,  1,  1, 31, 73, 70, 68,  1, 61, 56, 64,
       73, 60, 74, 75,  1, 58, 73, 60, 56, 75, 76, 73, 60, 74,  1, 78, 60,
        1, 59, 60, 74, 64, 73, 60,  1, 64, 69, 58, 73, 60, 56, 74, 60,  8,
        0,  1,  1, 45, 63, 56, 75,  1, 75, 63, 60, 73, 60, 57, 80,  1, 57,
       60, 56, 76, 75, 80,  5, 74,  1, 73, 70, 74, 60,  1, 68, 64, 62, 63,
       75,  1, 69, 60, 77, 60, 73,  1, 59, 64, 60,  8,  0,  1,  1, 27, 76,
       75])

In [67]:
encoded_text  # the full corpus as an array of character ids

array([ 0,  1,  1, ..., 30, 39, 29])

In [71]:
# Sanity check: slice the encoded text by hand into consecutive chunks of
# seq_len + 1 ids, for comparison with the chunks produced by `sequences`.
# Only the first few chunks are printed; iterating over all total_num_seq
# chunks floods the cell output with tens of thousands of arrays.
for i in range(min(3, total_num_seq)):
    start = i * (seq_len + 1)
    end = start + seq_len + 1
    print(encoded_text[start:end])

[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75
  1]
[56 74  1 75 63 60  1 73 64 71 60 73  1 74 63 70 76 67 59  1 57 80  1 75
 64 68 60  1 59 60 58 60 56 74 60  8  0  1  1 33 64 74  1 75 60 69 59 60
 73  1 63 60 64 73  1 68 64 62 63 75  1 57 60 56 73  1 63 64 74  1 68 60
 68 70 73 80 21  0  1  1 27 76 75  1 75 63 70 76  1 58 70 69 75 73 56 58
 75 60 59  1 75 70  1 75 63 64 69 60  1 70 78 69  1 57 73 64 62 63 75  1
 60]
[80 60 74  8  0  1  1 31 60 60 59  5 74 75  1 75 63 80  1 67 64 62 63 75
  5 74  1 61 67 56 68 60  1 78 64 75 63  1 74 60 67 61  9 74 76 57 74 75
 56 69 75 64 56 67  1 61 76 60 67  8  0  1  1 38 56 66 64 69 62  1 56  1
 61 56 68 64 69 60  1 78 63 60 73 60  1 5

In [14]:
# apply the input/target split directly to the first seq_len + 1 encoded ids
create_seq_targets(encoded_text[:seq_len + 1])

(array([ 0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1, 12,  0,  1,  1, 31, 73, 70, 68,  1, 61, 56, 64,
        73, 60, 74, 75,  1, 58, 73, 60, 56, 75, 76, 73, 60, 74,  1, 78, 60,
         1, 59, 60, 74, 64, 73, 60,  1, 64, 69, 58, 73, 60, 56, 74, 60,  8,
         0,  1,  1, 45, 63, 56, 75,  1, 75, 63, 60, 73, 60, 57, 80,  1, 57,
        60, 56, 76, 75, 80,  5, 74,  1, 73, 70, 74, 60,  1, 68, 64, 62, 63,
        75,  1, 69, 60, 77, 60, 73,  1, 59, 64, 60,  8,  0,  1,  1, 27, 76,
        75]),
 array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1, 12,  0,  1,  1, 31, 73, 70, 68,  1, 61, 56, 64, 73,
        60, 74, 75,  1, 58, 73, 60, 56, 75, 76, 73, 60, 74,  1, 78, 60,  1,
        59, 60, 74, 64, 73, 60,  1, 64, 69, 58, 73, 60, 56, 74, 60,  8,  0,
         1,  1, 45, 63, 56, 75,  1, 75, 63, 60, 73, 60, 57, 80,  1, 57, 60,
        56, 76, 75, 80,  5, 74,  1, 73, 70, 74, 60,  1, 68, 64, 62, 63, 75

In [13]:
# second chunk by hand: positions seq_len + 1 up to (not including) 2 * (seq_len + 1)
encoded_text[(seq_len + 1):(seq_len + 1) * 2]

array([56, 74,  1, 75, 63, 60,  1, 73, 64, 71, 60, 73,  1, 74, 63, 70, 76,
       67, 59,  1, 57, 80,  1, 75, 64, 68, 60,  1, 59, 60, 58, 60, 56, 74,
       60,  8,  0,  1,  1, 33, 64, 74,  1, 75, 60, 69, 59, 60, 73,  1, 63,
       60, 64, 73,  1, 68, 64, 62, 63, 75,  1, 57, 60, 56, 73,  1, 63, 64,
       74,  1, 68, 60, 68, 70, 73, 80, 21,  0,  1,  1, 27, 76, 75,  1, 75,
       63, 70, 76,  1, 58, 70, 69, 75, 73, 56, 58, 75, 60, 59,  1, 75, 70,
        1, 75, 63, 64, 69, 60,  1, 70, 78, 69,  1, 57, 73, 64, 62, 63, 75,
        1, 60])