### Inspired by Andrej Karpathy: https://www.youtube.com/watch?v=kCc8FmEb1nY&t=793

In [141]:
import torch
from transformers import PreTrainedTokenizer


### With this tutorial, we are going to create and train a transformer that can predict the next word in completions. I have generated several haikus in the haikus text file in this folder. These were generated using chatGPT 4. This is what we will use to train our transformer.

In [142]:
# Let's open our file
with open('haikus.txt', encoding='utf-8') as f:
    vocab = f.read().split()

In [143]:
print(vocab)

['Whispers', 'of', 'the', 'breeze,', 'Leaves', 'rustle,', 'a', 'dance', 'begins—', "Nature's", 'symphony.', "Winter's", 'icy', 'touch,', 'Blankets', 'the', 'world', 'in', 'white', 'frost—', 'Silent', 'snow', 'descends.', 'Blossoms', 'in', 'springtime,', 'Petals', 'fall', 'like', 'fleeting', 'dreams—', 'Cherry', 'trees', 'blush', 'pink.', 'Mountain', 'peak', 'so', 'high,', 'Reaching', 'for', 'the', 'azure', 'sky—', 'Clouds', 'as', 'passing', 'thoughts.', 'Raindrops', 'on', 'the', 'pane,', "Nature's", 'tears,', 'a', 'rhythmic', 'dance—', "Storm's", 'melancholic', 'song.', 'Golden', 'sunset', 'hues,', 'Day', 'bids', 'its', 'soft,', 'sweet', 'adieu—', "Night's", 'embrace', 'begins.', 'Ripples', 'on', 'the', 'pond,', 'A', "stone's", 'tale', 'of', 'circles', 'wide—', 'Echoes', 'of', 'a', 'splash.', "Owl's", 'silent', 'flight,', 'Moonlit', 'wings,', 'shadows', 'casting—', "Night's", 'wise', 'guardian.', 'Cicadas', 'singing,', 'Heralding', 'the', 'end', 'of', 'day—', "Summer's", 'serenade.', '

In [144]:
print("There are {0} words in the corpus".format(len(vocab)))
u_words = sorted(list(set(vocab)))
vocab_size = len(u_words)
print("There are {0} unique words".format(len(u_words)))

There are 204 words in the corpus
There are 170 unique words


### Let's clean our data a bit to get rid of punctuation

In [145]:
clean_vocab = lambda v: [i.strip(",.-") for i in v]
vocab = (clean_vocab(vocab))
print(vocab)

['Whispers', 'of', 'the', 'breeze', 'Leaves', 'rustle', 'a', 'dance', 'begins—', "Nature's", 'symphony', "Winter's", 'icy', 'touch', 'Blankets', 'the', 'world', 'in', 'white', 'frost—', 'Silent', 'snow', 'descends', 'Blossoms', 'in', 'springtime', 'Petals', 'fall', 'like', 'fleeting', 'dreams—', 'Cherry', 'trees', 'blush', 'pink', 'Mountain', 'peak', 'so', 'high', 'Reaching', 'for', 'the', 'azure', 'sky—', 'Clouds', 'as', 'passing', 'thoughts', 'Raindrops', 'on', 'the', 'pane', "Nature's", 'tears', 'a', 'rhythmic', 'dance—', "Storm's", 'melancholic', 'song', 'Golden', 'sunset', 'hues', 'Day', 'bids', 'its', 'soft', 'sweet', 'adieu—', "Night's", 'embrace', 'begins', 'Ripples', 'on', 'the', 'pond', 'A', "stone's", 'tale', 'of', 'circles', 'wide—', 'Echoes', 'of', 'a', 'splash', "Owl's", 'silent', 'flight', 'Moonlit', 'wings', 'shadows', 'casting—', "Night's", 'wise', 'guardian', 'Cicadas', 'singing', 'Heralding', 'the', 'end', 'of', 'day—', "Summer's", 'serenade', 'Moonlit', 'path', 'ahe

### Let's tokenize our dataset.

In [146]:
wtoi = {w:i for i, w in enumerate(vocab)}
itow = {i:w for i, w in enumerate(vocab)}
encode = lambda s: [wtoi[w] for w in s.split()]
decode = lambda e: [' '.join(itow[w] for w in e)]

print(encode("Whispers Leaves a dance"))
print(decode(encode("Whispers Leaves a dance")))

[120, 4, 84, 7]
['Whispers Leaves a dance']
