In [62]:
import dataclasses
import tiktoken

# Load tiktoken's "gpt2" encoding once; `gpt2_tokenizer` is a notebook-global name.
gpt2_tokenizer = tiktoken.get_encoding("gpt2")

# Round-trip smoke test: encode a short string, then decode the result.
# The bare final expression makes the notebook display the decoded text,
# which should equal the input string if the round trip is lossless.
code = gpt2_tokenizer.encode("this is me!")
gpt2_tokenizer.decode(code)


'this is me!'

In [61]:
## All POD (plain-old-data) structures

from dataclasses import dataclass

@dataclass
class DatasetConfig:
    """Plain settings record: window length, source file path and batch size."""

    # Number of tokens in each sampled window.
    seq_len: int = 10
    # Path of the text file to read; empty string when not specified.
    filename: str = ""
    # Number of windows drawn per batch.
    batch_size: int = 64


In [60]:
import torch
from torch.utils.data import Dataset
import tiktoken



class TinyShakespereDataset():
    """In-memory token corpus that serves random (input, target) batches.

    The file named by ``config.filename`` is read as UTF-8, encoded with
    tiktoken's "gpt2" encoding, and kept as a 1-D tensor of token ids in
    ``self.corpus``.
    """

    def __init__(self, config: "DatasetConfig | None" = None):
        """Read and tokenize the corpus.

        Args:
            config: Dataset settings. When omitted, a fresh
                ``DatasetConfig(filename='../data/input.txt')`` is built for
                this instance.

        Raises:
            OSError: If ``config.filename`` cannot be opened.
        """
        super().__init__()
        # Build the default here rather than in the signature: a default
        # written in the signature is evaluated once, at definition time, and
        # that single mutable object would be shared by every instance.
        if config is None:
            config = DatasetConfig(filename='../data/input.txt')
        self.config = config
        with open(config.filename, 'r', encoding='utf-8') as f:
            text = f.read()
        self.tokenizer = tiktoken.get_encoding("gpt2")
        # Explicit integer dtype: without it an empty token list would
        # produce a float tensor.
        self.corpus = torch.tensor(self.tokenizer.encode(text), dtype=torch.long)

    def length(self):
        """Return the number of tokens in the corpus."""
        return len(self.corpus)

    def __len__(self):
        """Return the number of tokens, so ``len(dataset)`` works too."""
        return self.length()

    def get_batch(self):
        """Sample a random batch of training windows.

        Returns:
            A tuple ``(x, y)`` of tensors, each of shape
            ``(config.batch_size, config.seq_len)``. Row ``k`` of ``y`` is the
            same window as row ``k`` of ``x`` shifted one token forward.

        Raises:
            ValueError: If the corpus has fewer than ``seq_len + 1`` tokens,
                so no window has a complete shifted target.
        """
        seq_len = self.config.seq_len
        # Start positions are drawn from [0, max_start), which leaves room
        # for the extra token the shifted target needs at the end.
        max_start = self.length() - seq_len
        if max_start <= 0:
            raise ValueError(
                f"corpus has {self.length()} tokens but seq_len={seq_len} "
                f"requires at least {seq_len + 1}"
            )
        # Generate random starting positions
        ix = torch.randint(max_start, (self.config.batch_size,))
        x = torch.stack([self.corpus[i:i + seq_len] for i in ix])
        y = torch.stack([self.corpus[i + 1:i + seq_len + 1] for i in ix])
        return x, y
        


# Demo: tokenize the corpus with 256-token windows and 64 windows per batch,
# then report how many tokens were loaded.
config = DatasetConfig(seq_len=256, batch_size=64, filename='../data/input.txt')
ts = TinyShakespereDataset(config)
print(ts.length())

# Draw one random batch and print, for x then y, the tensor shape followed by
# the first ten token ids of row 0 (y's row should be x's row shifted by one).
x, y = ts.get_batch()
print(x.shape, x[0, :10], y.shape, y[0, :10], sep="\n")
         

338025
torch.Size([64, 256])
tensor([30927, 24124, 16274,   198,   464, 34548,   338, 12537,    30,  3914])
torch.Size([64, 256])
tensor([24124, 16274,   198,   464, 34548,   338, 12537,    30,  3914, 23777])
