In [52]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
import urllib.request

In [105]:
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

# Download the text corpus and decode it to a single Python string.
# The explicit timeout makes a stalled connection fail with an error
# instead of blocking the kernel indefinitely.
with urllib.request.urlopen(url, timeout=30) as response:
    data = response.read().decode('utf-8')

print('Total number of characters:', len(data))
print(data[:99])

Total number of characters: 1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
Yo


In [None]:
from torch.utils.data import Dataset

class CharDataset(Dataset):
    """
    Emits batches of characters.

    Adapted from "https://github.com/karpathy/minGPT".

    NOTE: this cell is the unfilled exercise template. The ``...``
    placeholders and ``NotImplementedError`` stubs below must be replaced
    before the class can be used; a completed ``CharDataset`` is defined in
    a later cell of this notebook and replaces this definition once run.
    """

    def __init__(self, config, data):

        # TODO: replace the Ellipsis placeholder; as written, enumerate(chars)
        # below fails because Ellipsis is not iterable.
        chars = ... # get characters from the input data
        self.stoi = { ch:i for i,ch in enumerate(chars) } # map characters to integer indices

        # TODO: remaining attribute setup goes here.
        ...

    def get_vocab_size(self):
        # Stub: not implemented in this template.
        raise NotImplementedError()

    def __len__(self):
        # Stub: not implemented in this template.
        raise NotImplementedError()

    def __getitem__(self, idx):
        # Stub: the steps below describe the intended implementation;
        # as written the method returns None.
        # grab a chunk of (block_size + 1) characters from the data
        # encode every character to an integer
        # return the chunk and the shifted version as tensors
        pass


# Tokenization

$$
\text{text} \rightarrow \text{chars} \rightarrow \text{tokens} \rightarrow \text{embedding vectors}
$$

In [125]:
class CharDataset(Dataset):
    """
    Character-level language-modelling dataset.

    Each item is one ``(x, y)`` pair of ``torch.long`` token-ID tensors, both
    of length ``block_size``; ``y`` is ``x`` shifted one character to the
    right, so ``y[t]`` is the token that follows ``x[:t + 1]``. Items are
    single sequences; batching is left to the caller.

    Adapted from "https://github.com/karpathy/minGPT".

    Args:
        config: mapping providing ``config['block_size']``, the number of
            tokens in each sequence.
        data: the raw text to draw character windows from.
    """

    def __init__(self, config, data):
        # The sorted set of distinct characters is the token vocabulary;
        # sorting makes the ID assignment deterministic.
        chars = sorted(set(data))
        self.stoi = {ch: i for i, ch in enumerate(chars)}  # token -> token ID (encoding)
        self.itos = {i: ch for i, ch in enumerate(chars)}  # token ID -> token (decoding)

        self.vocab_size = len(chars)
        self.data_size = len(data)
        self.data = data
        self.block_size = config['block_size']  # number of tokens for each sequence

    def get_vocab_size(self):
        """Return the number of distinct characters found in the data."""
        return self.vocab_size

    def __len__(self):
        """Return how many full ``block_size + 1`` character windows exist."""
        # A window starting at idx needs characters idx .. idx + block_size,
        # so the last valid start is data_size - block_size - 1.
        # Clamp at zero: len() raises ValueError on a negative result, which
        # would otherwise happen for text shorter than block_size.
        return max(0, self.data_size - self.block_size)

    def __getitem__(self, idx):
        """
        Return the ``(x, y)`` token-ID tensors for the window starting at ``idx``.

        Negative indices count from the end, as for a sequence.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
                This is also what ends plain ``for x, y in dataset`` iteration.
        """
        n_items = len(self)
        position = idx + n_items if idx < 0 else idx
        if not 0 <= position < n_items:
            raise IndexError(
                f"index {idx} is out of range for CharDataset of length {n_items}"
            )

        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[position:position + self.block_size + 1]
        # encode every character to an integer
        encoded = torch.tensor([self.stoi[c] for c in chunk], dtype=torch.long)
        # return the chunk and the shifted version as tensors
        x = encoded[:-1]  # input tokens
        y = encoded[1:]   # target tokens (inputs shifted by one)
        return x, y

In [None]:
# Hyperparameters shared by the cells below.
config = {
    'block_size': 128,  # tokens per sequence; read by CharDataset
    'batch_size': 128,  # presumably the training batch size; not consumed in the cells visible here
}

In [137]:
cd = CharDataset(config, data)
x, y = cd[10]

# For the first ten positions, show the growing context of input
# characters next to the single character the model should predict.
for n_ctx in range(1, 11):
    context = [cd.itos[token_id] for token_id in x[:n_ctx].tolist()]
    desired = [cd.itos[y[n_ctx - 1].item()]]
    print(context, '----->', desired)

['z'] -----> ['e']
['z', 'e'] -----> ['n']
['z', 'e', 'n'] -----> [':']
['z', 'e', 'n', ':'] -----> ['\n']
['z', 'e', 'n', ':', '\n'] -----> ['B']
['z', 'e', 'n', ':', '\n', 'B'] -----> ['e']
['z', 'e', 'n', ':', '\n', 'B', 'e'] -----> ['f']
['z', 'e', 'n', ':', '\n', 'B', 'e', 'f'] -----> ['o']
['z', 'e', 'n', ':', '\n', 'B', 'e', 'f', 'o'] -----> ['r']
['z', 'e', 'n', ':', '\n', 'B', 'e', 'f', 'o', 'r'] -----> ['e']


# Token Embeddings

In [143]:
# Seed the RNG so the randomly initialised embedding weights are reproducible.
torch.manual_seed(123)

dim_embd = 768  # size of each token embedding vector

# Lookup table with one learnable row of size dim_embd per vocabulary entry.
embedding_layer = torch.nn.Embedding(
    num_embeddings=cd.get_vocab_size(),
    embedding_dim=dim_embd,
)

weights = embedding_layer.weight
print(weights)
print(weights.shape)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035,  ..., -0.3181, -1.3936,  0.5226],
        [ 0.2579,  0.3420, -0.8168,  ..., -0.4098,  0.4978, -0.3721],
        [ 0.7957,  0.5350,  0.9427,  ..., -1.0749,  0.0955, -1.4138],
        ...,
        [-0.1837,  1.1975,  1.1828,  ...,  2.0874, -1.2495,  0.4475],
        [ 1.3652,  0.1446, -1.2063,  ..., -3.0424, -1.5444,  1.4202],
        [ 0.5061, -1.6644,  0.5144,  ..., -0.0105,  0.2888,  1.6012]],
       requires_grad=True)
torch.Size([65, 768])


In [144]:
# Embed the input and target ID tensors and report the resulting shapes
# (one embedding vector per token).
for token_ids in (x, y):
    print(embedding_layer(token_ids).shape)

torch.Size([128, 768])
torch.Size([128, 768])


In [None]:
# Positional embeddings: one learnable vector per position in a sequence
# (block_size positions). They use the same width as the token embeddings
# (dim_embd) so the two can be added element-wise.
pos_embedding_layer = torch.nn.Embedding(config['block_size'], dim_embd)

# Causal Multi-head Self-attention Mechanism

# Feed Forward Neural Networks

# Positional encodings, residual connections, layer normalizations