In [1]:
from pathlib import Path
import urllib.request


def download_dataset(url="https://homl.info/shakespeare",
                     path="data/shakespeare/shakespeare.txt"):
    """Return the dataset text, downloading it first if it is not cached locally.

    Args:
        url: Location to fetch the text from when no local copy exists.
        path: Local cache file. Missing parent directories are created.

    Returns:
        The full contents of the cached file as a ``str``.

    Raises:
        urllib.error.URLError: If the download is needed and fails.
        OSError: If the cache file cannot be written or read.
    """
    path = Path(path)
    if not path.is_file():
        path.parent.mkdir(parents=True, exist_ok=True)
        # Download to a temporary sibling and rename it afterwards, so an
        # interrupted download never leaves a truncated file that a later
        # call would mistake for a complete cached copy.
        partial_path = path.with_name(path.name + ".part")
        try:
            urllib.request.urlretrieve(url, partial_path)
            partial_path.replace(path)
        finally:
            # No-op after a successful rename; cleans up after a failure.
            partial_path.unlink(missing_ok=True)
    # Decode explicitly: the platform default encoding is not always UTF-8.
    return path.read_text(encoding="utf-8")

In [4]:
# Load the full corpus (downloaded on first use) and preview its first
# 80 characters as a quick sanity check.
shakespeare_text = download_dataset()
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [None]:
# Character-level tokenization, step 1: build the vocabulary.
# The text is lowercased first, then reduced to its distinct characters
# (whitespace and punctuation are kept as ordinary vocabulary entries),
# and finally sorted so that every character has a stable position.
vocab = list(set(shakespeare_text.lower()))
vocab.sort()
# Show the whole vocabulary on one line, characters concatenated as-is.
print(*vocab, sep="")


<class 'list'>
39
:

 !$&',-.3:;?abcdefghijklmnopqrstuvwxyz


In [18]:
# Assign a token id to each character: the id is simply the character's
# position in the sorted `vocab` list.
char_to_id = dict(zip(vocab, range(len(vocab))))  # character -> token id
id_to_char = dict(enumerate(vocab))  # token id -> character


In [19]:
# Let's create functions to encode and decode text

import torch

def encode_text(text: str):
    """Convert text into a 1-D tensor of token ids.

    The text is lowercased before lookup, so upper- and lowercase letters
    map to the same id.

    Args:
        text: The string to encode.

    Returns:
        A ``torch.long`` tensor with one token id per (lowercased) character.

    Raises:
        KeyError: If a character is not present in ``char_to_id``.
    """
    token_ids = [char_to_id[char] for char in text.lower()]
    # Pin the dtype: without it an empty list would be inferred as float32,
    # while any non-empty input yields int64.
    return torch.tensor(token_ids, dtype=torch.long)

def decode_text(ids: torch.Tensor):
    """Convert a 1-D sequence of token ids back into a string.

    Args:
        ids: A 1-D tensor of token ids. A plain iterable of ints is accepted
            as well.

    Returns:
        The decoded string, one character per token id.

    Raises:
        KeyError: If a token id is not present in ``id_to_char``.
    """
    # Iterating a tensor yields 0-d tensors, which hash by identity and so
    # never match the int keys of `id_to_char`. Convert to Python ints first.
    token_ids = ids.tolist() if isinstance(ids, torch.Tensor) else ids
    return "".join(id_to_char[int(token_id)] for token_id in token_ids)

In [None]:
from torch.utils.data import DataLoader, Dataset

class CharDataset(Dataset):
    """Sliding-window dataset over an encoded text.

    Item ``i`` is a pair ``(window, target)`` of token-id tensors, each
    ``window_length`` long. ``target`` is ``window`` shifted one position to
    the right, i.e. the next token for every position in the window.
    """

    def __init__(self, text: str, window_length: int):
        """Encode ``text`` once and remember the window size.

        Args:
            text: Raw text; it is encoded with ``encode_text``.
            window_length: Number of tokens in each window/target pair.
        """
        # Stored as token ids (not the raw string) so items are plain slices.
        self.text = encode_text(text)
        self.window_length = window_length

    def __len__(self):
        """Return the number of complete (window, target) pairs available."""
        # The target needs one extra token after the window, hence
        # len(text) - window_length. Clamp at zero: a negative result would
        # make len() raise ValueError when the text is shorter than a window.
        return max(0, len(self.text) - self.window_length)

    def __getitem__(self, index: int):
        """Return the ``(window, target)`` pair starting at ``index``.

        Negative indices count from the end, as with built-in sequences.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if index < 0:
            # Without this, a negative index would pass the bounds check and
            # produce mismatched or empty slices.
            index = index + size
        if not 0 <= index < size:
            raise IndexError("dataset index out of range")
        end = index + self.window_length
        window = self.text[index: end]
        target = self.text[index + 1: end + 1]
        return window, target