# Dostoyevsky Transformer

Notebook for training a transformer model on Dostoyevsky's books\
\
Source for books: https://www.gutenberg.org/

In [1]:
# imports
import os
import sys
sys.path.append('..')
from src.data.dataloader import load_txt
from src.preprocess.tokenizer import VocabBuilder, WordTokenizer, CharacterTokenizer

## Tokenization
Tokenization is the process of converting text into tokens. Three types of tokenizers are covered: \
1. Word tokenizer - each word and each character is considered as a separate token
2. Char tokenizer - each char is considered as a separate token
3. BPE tokenizer - byte-pair encoding, as used by GPT models (vocabulary size ~50K). TODO: implement our own; for now the pre-built GPT-2 encoding from `tiktoken` is used below

In [4]:
# Directory holding the book corpus: the `books` folder one level above the working directory
books_dir = os.path.join(os.pardir, 'books')

### Single book tokenization

In [7]:
# Select the first book. os.listdir() returns entries in arbitrary order, so the
# names are sorted first to make "first" mean the same book on every machine and run.
book_name = sorted(os.listdir(books_dir))[0]

# Load the book's text and report its length in characters
txt = load_txt(os.path.join(books_dir, book_name))
print(f'num chars: {len(txt)}')

num chars: 1947970


#### Word Tokenization

In [8]:
# Build the word-level vocabulary from the loaded book text
vocab_builder = VocabBuilder()
vocab = vocab_builder.create_word_vocab(txt)

# Report how many entries the vocabulary contains
word_vocab_size = len(vocab)
print(f'vocab size: {word_vocab_size}')

vocab size: 16327


In [9]:
# Tokenize: create the word-level tokenizer from the vocabulary returned by create_word_vocab
word_tokenizer = WordTokenizer(vocab)

In [10]:
# Round-trip a sample sentence through the word tokenizer: text -> encoded tokens -> text
test_string = " Everything from Project Gutenberg is gratis, libre, and completely without cost to readers."
encoded_string = word_tokenizer.encode(test_string)
decoded_string = word_tokenizer.decode(encoded_string)

# Show the encoded tokens first, then the text reconstructed from them
print(encoded_string, decoded_string, sep='\n')

[493, 6622, 1375, 674, 8206, 6945, 16325, 2370, 4054, 14837, 4356, 13715, 11123]
Everything from Project Gutenberg is gratis <|unk|> and completely without cost to readers


#### Character Tokenization

In [11]:
# Build the character-level vocabulary from the same book text
char_vocab = vocab_builder.create_character_vocab(txt)

# Report how many entries the character vocabulary contains
char_vocab_size = len(char_vocab)
print(f'character vocab size: {char_vocab_size}')

character vocab size: 103


In [12]:
# Create the character-level tokenizer from the vocabulary returned by create_character_vocab
char_tokenizer = CharacterTokenizer(char_vocab)

In [13]:
# Round-trip the same sample sentence through the character tokenizer
test_string = " Everything from Project Gutenberg is gratis, libre, and completely without cost to readers."
encoded_string = char_tokenizer.encode(test_string)
decoded_string = char_tokenizer.decode(encoded_string)

# Show the encoded tokens first, then the text reconstructed from them
print(encoded_string, decoded_string, sep='\n')

[1, 28, 74, 57, 70, 77, 72, 60, 61, 66, 59, 1, 58, 70, 67, 65, 1, 39, 70, 67, 62, 57, 55, 72, 1, 30, 73, 72, 57, 66, 54, 57, 70, 59, 1, 61, 71, 1, 59, 70, 53, 72, 61, 71, 7, 1, 64, 61, 54, 70, 57, 7, 1, 53, 66, 56, 1, 55, 67, 65, 68, 64, 57, 72, 57, 64, 77, 1, 75, 61, 72, 60, 67, 73, 72, 1, 55, 67, 71, 72, 1, 72, 67, 1, 70, 57, 53, 56, 57, 70, 71, 9]
 Everything from Project Gutenberg is gratis, libre, and completely without cost to readers.


#### BPE Tokenizer

In [14]:
import importlib
import tiktoken

In [15]:
# Load tiktoken's "gpt2" encoding to use as the BPE tokenizer
bpe_tokenizer = tiktoken.get_encoding("gpt2")

In [16]:
# Testing the tokenizer: round-trip a sample that also contains the "<|endoftext|>" marker
test_string = " Everything from Project Gutenberg is gratis, libre, and completely without cost to readers. <|endoftext|> How are you?"

# The marker is passed in allowed_special so it is accepted as a special token
encoded_string = bpe_tokenizer.encode(test_string, allowed_special={"<|endoftext|>"})
decoded_string = bpe_tokenizer.decode(encoded_string)

# Show the encoded tokens first, then the text reconstructed from them
print(encoded_string, decoded_string, sep='\n')

[11391, 422, 4935, 20336, 318, 14586, 271, 11, 9195, 260, 11, 290, 3190, 1231, 1575, 284, 7183, 13, 220, 50256, 1374, 389, 345, 30]
 Everything from Project Gutenberg is gratis, libre, and completely without cost to readers. <|endoftext|> How are you?


### Read the books corpus and tokenize using BPE

In [17]:
# Concatenate every book into a single corpus, appending the "<|endoftext|>"
# marker after each book so the boundaries between books stay visible.
#  - sorted(): os.listdir() order is arbitrary; sorting makes the corpus
#    (and everything derived from it) reproducible across machines and runs.
#  - list + "".join(): builds the final string once instead of re-copying the
#    growing corpus on every iteration.
book_texts = []
for book_name in sorted(os.listdir(books_dir)):
    book_texts.append(load_txt(os.path.join(books_dir, book_name)) + "<|endoftext|>")
corpus = "".join(book_texts)

In [18]:
# Encode the whole corpus with the BPE tokenizer; "<|endoftext|>" is passed in
# allowed_special so the book separators are accepted as special tokens
tokens = bpe_tokenizer.encode(corpus, allowed_special={"<|endoftext|>"})

### Input-output pairs
The books by Fyodor Dostoyevsky are concatenated into a single corpus. \
BPE encoding is used from here on.

In [20]:
from src.data.dataloader import create_dataloader

In [21]:
# Build the input-output pairs from the concatenated multi-book `corpus` rather
# than `txt`, which holds only the single book loaded in the earlier section.
# NOTE(review): assumes create_dataloader takes raw text (as it did for `txt`)
# and that its tokenizer accepts the "<|endoftext|>" separators — confirm.
dataloader = create_dataloader(corpus, max_length=4, stride=1)

### Token Embeddings
Using pre-trained word2vec embeddings, loaded through the gensim library

In [22]:
# Download the model: fetch the pre-trained "word2vec-google-news-300" vectors
# through gensim's downloader API.
# NOTE(review): presumably a large download on first use — confirm before a full re-run.
import gensim.downloader as api
model = api.load("word2vec-google-news-300")

In [23]:
# Refer to the loaded model under a more descriptive name
word_vectors = model

# Look up a single word's vector; its length is the embedding dimensionality
sample_vector = word_vectors['test']
print('embedding dim: ', len(sample_vector))

embedding dim:  300


### Explore Embedding space

In [24]:
# print(word_vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=10))

In [25]:
# Word pairs to compare; one similarity score is printed per pair, in this order
word_pairs = [
    ('woman', 'man'),
    ('king', 'queen'),
    ('mother', 'father'),
    ('rock', 'paper'),
    ('rock', 'mitochondria'),
]
for first_word, second_word in word_pairs:
    print(word_vectors.similarity(first_word, second_word))

0.76640123
0.6510956
0.7901483
0.080210164
-0.0060949083


### T