In [6]:
# So we need to tokenize the characters in the data set. Let's begin by parsing the data set 
# and constructing a set of characters which comprise the data set.

alphabet = ""
with open('datasets/tinyshakespeare.txt', 'r') as file:
    text = alphabet = file.read()
    print(len(alphabet))
#print(alphabet)

# Construct the set
alphabet = sorted(list(set(alphabet)))
print(''.join(alphabet))
print("Shakespeare's work is built from a set of cardinality: ", len(alphabet), "\n")

# Create simple encoder. E: A -> Z.
# E is a bijective map (our encoder) which maps characters in alphabet set to the set of integers.
# This is a character-level tokenizer.
encodeMap = { ch:i for i,ch in enumerate(alphabet) }
decodeMap = { i:ch for i,ch in enumerate(alphabet) }

encode = lambda x: [encodeMap[i] for i in x]
decode = lambda x : ''.join([decodeMap[i] for i in x])
print("output integer representation of a given sequence of chars")
print(encode("testing encoder"))
# Test inverse
print(decode(encode("testing encoder")))

# Next up we will tokenize the tinyshakespeare data set
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type)
# print first 500 characters
print(data[:500])


1115394

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Shakespeare's work is built from a set of cardinality:  65 

output integer representation of a given sequence of chars
[58, 43, 57, 58, 47, 52, 45, 1, 43, 52, 41, 53, 42, 43, 56]
testing encoder
torch.Size([1115394]) <built-in method type of Tensor object at 0x000001DEFBD35B20>
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
      

In [9]:
# split data into train and validation sets
# allocate .9 for training .1 for validation
n = int(0.9*len(data))
train_set = data[:n]
validation_set = data[n:]
print(train_set)

# we use the valid set to test overfitting

tensor([18, 47, 56,  ..., 43, 56, 43])


In [11]:
# Next step: training the transformer
# e train in a chunk by chunk fashion rather than feeding the entire data set
# define the max_len of these chunks
block_size = 8
train_set[:block_size + 1]

# Included in a single block is actually 8 samples
# For the current block, say # [18, 47, 56, 57, 58,  1, 15, 47, 58].
# When we see 18, 47 comes next. When we see 18, 47, 56 comes next, 
# and so on to a total of 8 in the block.

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [18]:
# Start sampling random sections of the data set to form our chunks which
# we feed to the transformer
torch.manual_seed(1337)
batch_size = 4 # how many sequences we process in parallel
block_size = 8 # maxlen for predictions

def construct_batch(set_type):
    # generate batch of data inputs x and targets y
    data = train_set if set_type == 'train' else validation_set
    ix = torch.randint(len(data) - block_size, (batch_size,))
    #print('the ix 1d tensor')
    #print(ix.size())
    #print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix]) # construct a stack of the 1d tensors
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # target offset by 1 for inputs
    return x, y

xb, yb = construct_batch('train')
print('inputs: ')
print(xb.shape)
print(xb)
print('targets: ')
print(yb.shape)
print(yb)

# Everytime we run construct_batch() we'll get a new batch for training.


the ix 1d tensor
torch.Size([4])
tensor([ 76049, 234249, 934904, 560986])
inputs: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [None]:
# Now we have the batch of input we are ready to feed it into a neural net.
# We will use the Bigram Language Model (BLM).