# Dev notebook for data exploration and model baselines

Preliminary exploration of a Russian literature text dataset.

In [181]:
# path to data file (UTF-8 text corpus); relative, so it is resolved against the current working directory
data_path = 'data/tiny-russian-lit/cleaned_tiny_russian_lit.txt'

In [182]:
# load the whole corpus into memory as one string so it can be inspected
with open(data_path, encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [183]:
# len() of a str counts characters (Unicode code points), not bytes on disk
print(f'Length of dataset at {data_path} is {len(text)} characters')

Length of dataset at data/tiny-russian-lit/cleaned_tiny_russian_lit.txt is 38683675 characters


In [184]:
# preview the start of the corpus to see what the raw text looks like
print(f'First 1000 characters of the dataset:\n {text[:1000]}')

First 1000 characters of the dataset:
 
Михаил Лермонтов
* * *
1
Выхожу один я на дорогу;
Сквозь туман кремнистый путь блестит;
Ночь тиха. Пустыня внемлет богу,
И звезда с звездою говорит.
2
В небесах торжественно и чудно!
Спит земля в сиянье голубом...
Что же мне так больно и так трудно?
Жду ль чего? жалею ли о чем?
3
Уж не жду от жизни ничего я,
И не жаль мне прошлого ничуть;
Я ищу свободы и покоя!
Я б хотел забыться и заснуть!
4
Но не тем холодным сном могилы...
Я б желал навеки так заснуть,
Чтоб в груди дремали жизни силы,
Чтоб, дыша, вздымалась тихо грудь;
5
Чтоб всю ночь, весь день мой слух лелея,
Про любовь мне сладкий голос пел,
Надо мной чтоб, вечно зеленея,
Темный дуб склонялся и шумел.

Михаил Лермонтов
<ВАЛЕРИК>
Я к вам пишу случайно; право,
Не знаю как и для чего.
Я потерял уж это право.
И что скажу вам? — ничего!
Что помню вас? — но, боже правый,
Вы это знаете давно;
И вам, конечно, все равно.
И знать вам также нету нужды,
Где я? что я? в какой глуши?
Душою мы друг другу 

In [185]:
# build the character vocabulary: every distinct character in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
vocab = ''.join(chars)
print(f'Text vocabulary: {vocab}\nVocabulary size: {vocab_size}')

Text vocabulary: 	
 !"#%&'()*,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefghijklmnopqrstuvwxyz{|}~ §«°·»½¾ÇÉÊÔÜßàáâäçèéêëíîïòóôöùúûüýœ̀́ΕΘΚΠΣάέήίαβγδεηικλμνοπρςστυφψωόύώϑЁІЉЌАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїѣἀἁἃἄἈἐἔἡἴἷἹὁὄὐὑὰὴὶὸᾶῆ῎ῖῦῶῷ‑–—’“”„…€№⟨⟩
Vocabulary size: 281


Now, we need to be able to tokenize our input - convert raw string text into a sequence of integers according to our vocabulary of possible elements.

For a character-level language model, each distinct character in the vocabulary is its own token and is mapped to a unique integer.

In [186]:
# create a simple character-level tokenizer: a mapping from characters to integers
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))                 # integer id -> character


def encode(s):
    """Encoder: convert a string to a list of integers, one per character.

    Raises KeyError if `s` contains a character that is not in `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: convert a list (or any iterable) of integers back to a string.

    Raises KeyError if an integer is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)

In [187]:
def verify(string):
    """Print the encoding of `string`, then whether decoding it gives back the original."""
    token_ids = encode(string)
    print(f"The string '{string}' has the encoding {token_ids}")
    round_trip_ok = decode(token_ids) == string
    print(round_trip_ok)

In [188]:
# round-trip checks: Latin text, a single Cyrillic letter, a Cyrillic sentence, and whitespace
for probe in ("hey I'm Pushkin", 'и', 'Мой дядя самых честных правил', ' \n'):
    verify(probe)

The string 'hey I'm Pushkin' has the encoding [74, 71, 91, 5, 44, 11, 79, 5, 51, 87, 85, 74, 77, 75, 80]
True
The string 'и' has the encoding [214]
True
The string 'Мой дядя самых честных правил' has the encoding [186, 220, 215, 5, 210, 237, 210, 237, 5, 223, 206, 218, 233, 227, 5, 229, 211, 223, 224, 219, 233, 227, 5, 221, 222, 206, 208, 214, 217]
True
The string ' 
' has the encoding [5, 3]
True


In [189]:
# encode the entire text dataset and store it in a 1-D tensor of token ids
import torch

# torch.long (int64): these ids are later used as embedding indices and as cross-entropy targets
data = torch.tensor(encode(text), dtype=torch.long)
print(f'Input data tensor has shape {data.shape} and type {data.dtype}')
print(f'First 1000 elements of data tensor:\n {data[:1000]}')

Input data tensor has shape torch.Size([38683675]) and type torch.int64
First 1000 elements of data tensor:
 tensor([  3, 186, 214, 227, 206, 214, 217,   5, 185, 211, 222, 218, 220, 219,
        224, 220, 208,   3,  14,   5,  14,   5,  14,   3,  20,   3, 176, 233,
        227, 220, 212, 225,   5, 220, 210, 214, 219,   5, 237,   5, 219, 206,
          5, 210, 220, 222, 220, 209, 225,  30,   3, 191, 216, 208, 220, 213,
        234,   5, 224, 225, 218, 206, 219,   5, 216, 222, 211, 218, 219, 214,
        223, 224, 233, 215,   5, 221, 225, 224, 234,   5, 207, 217, 211, 223,
        224, 214, 224,  30,   3, 187, 220, 229, 234,   5, 224, 214, 227, 206,
         17,   5, 189, 225, 223, 224, 233, 219, 237,   5, 208, 219, 211, 218,
        217, 211, 224,   5, 207, 220, 209, 225,  15,   3, 182,   5, 213, 208,
        211, 213, 210, 206,   5, 223,   5, 213, 208, 211, 213, 210, 220, 236,
          5, 209, 220, 208, 220, 222, 214, 224,  17,   3,  21,   3, 176,   5,
        219, 211, 207, 211, 223, 

In [190]:
# split data into train and validation sets; comparing the loss on held-out text
# against the training loss is how overfitting is detected
split = 0.8  # fraction of the token stream used for training
n = int(split*len(data))  # index of the first validation token
train_data = data[:n]  # leading 80% of the tokens
val_data = data[n:]  # trailing 20%, taken as one contiguous chunk (no shuffling)

Block size, or context length, is the max length of any individual chunk of text that the transformer is trained on. A chunk of text of length `block_size + 1` has `block_size` individual training examples. This also means that the size of the input to the transformer at sampling time will never exceed `block_size`.

In [191]:
block_size = 8  # context length: number of input tokens in one training chunk
# take block_size + 1 tokens so that each of the block_size input positions has a next-token target
first_block = train_data[:block_size + 1]
print(f'First block of the training data, + 1 character: {first_block}')

First block of the training data, + 1 character: tensor([  3, 186, 214, 227, 206, 214, 217,   5, 185])


For a given block of text with length `block_size + 1`, we will train the transformer on each sequence/target pair with context lengths from 1 to `block_size` (where the target is the character immediately following the last character in the sequence). This is done so that the transformer is 'used' to predicting the next token given contexts as short as 1 and as long as `block_size`. This is important at sampling time, where the transformer has to begin generating from a context that may be shorter than `block_size`.

In [192]:
print('Training examples/sequences in first block of data')
# a context made of the first i tokens is paired with the token at index i (the one right after it)
for i in range(1, block_size + 1):
    prefix, next_token = first_block[:i], first_block[i]
    print(f'{i}/{block_size}: When input is, {prefix} target is {next_token}')

Training examples/sequences in first block of data
1/8: When input is, tensor([3]) target is 186
2/8: When input is, tensor([  3, 186]) target is 214
3/8: When input is, tensor([  3, 186, 214]) target is 227
4/8: When input is, tensor([  3, 186, 214, 227]) target is 206
5/8: When input is, tensor([  3, 186, 214, 227, 206]) target is 214
6/8: When input is, tensor([  3, 186, 214, 227, 206, 214]) target is 217
7/8: When input is, tensor([  3, 186, 214, 227, 206, 214, 217]) target is 5
8/8: When input is, tensor([  3, 186, 214, 227, 206, 214, 217,   5]) target is 185


In [193]:
torch.manual_seed(3)  # seed torch's global RNG so the random batch offsets drawn below are repeatable
batch_size = 4  # the number of independent sequences that we will process in parallel
block_size = 8  # maximum context length for predictions

def get_batch(split):
    """Sample a random batch of inputs `x` and next-token targets `y`.

    `split == 'train'` draws from `train_data`; any other value draws from `val_data`.
    Reads the module-level `batch_size` and `block_size` at call time.

    Returns a pair of tensors, each stacking `batch_size` windows of length
    `block_size`; `y` is the same window as `x` shifted one position forward in the data.
    """
    source = train_data if split == 'train' else val_data
    # batch_size random start offsets in [0, len(source) - block_size); the exclusive upper
    # bound leaves room for the one extra token that the shifted target window needs
    starts = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[start:start + block_size] for start in starts])
    y = torch.stack([source[start + 1:start + block_size + 1] for start in starts])
    return x, y

xb, yb = get_batch('train')
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)

print('-' * 10)

# spell out every (context, target) training pair contained in the batch
for b in range(batch_size):  # batch dimension
    print(f'Batch {b + 1}/{batch_size}')
    for t in range(block_size):  # time/position dimension
        context, target = xb[b, :t + 1], yb[b, t]
        print(f'When input is {context.tolist()}, target is {target}')

inputs:
torch.Size([4, 8])
tensor([[223, 216, 214, 215,   5, 219, 206, 210],
        [219, 206, 229, 206, 217,   5, 224, 206],
        [214,   5, 220, 207, 232, 237, 208, 214],
        [211, 216, 224,   5, 217, 211, 212, 214]])
targets:
torch.Size([4, 8])
tensor([[216, 214, 215,   5, 219, 206, 210, 213],
        [206, 229, 206, 217,   5, 224, 206, 216],
        [  5, 220, 207, 232, 237, 208, 214, 217],
        [216, 224,   5, 217, 211, 212, 214, 224]])
----------
Batch 1/4
When input is [223], target is 216
When input is [223, 216], target is 214
When input is [223, 216, 214], target is 215
When input is [223, 216, 214, 215], target is 5
When input is [223, 216, 214, 215, 5], target is 219
When input is [223, 216, 214, 215, 5, 219], target is 206
When input is [223, 216, 214, 215, 5, 219, 206], target is 210
When input is [223, 216, 214, 215, 5, 219, 206, 210], target is 213
Batch 2/4
When input is [219], target is 206
When input is [219, 206], target is 229
When input is [219, 206, 22

Probably the simplest language model is a bigram model with character-based tokens: given a single character, it predicts the next character in the sequence. I now implement a bigram model as a baseline for our Russian text generation task.

In [194]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Character-level bigram language model.

    The whole model is one (vocab_size x vocab_size) lookup table: the row for a
    token holds the logits (unnormalized scores) for the token that follows it.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token reads off the logits (input to softmax) for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if `targets` is given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of integer token ids (B = batch size, T = number of timesteps).
            targets: optional (B, T) tensor of integer token ids, the next token for each position.

        Returns:
            A `(logits, loss)` pair.
            - targets is None: logits has shape (B, T, C) and loss is None.
            - targets given: logits is returned flattened to (B*T, C) and loss is the
              cross-entropy between those logits and the flattened targets.
        """
        # we are essentially predicting the next character based on the embedding of a single token
        logits = self.token_embedding_table(idx)  # (B, T, C) : batch, time, channels

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects the class dimension second ((N, C) logits with (N,) targets),
            # so fold the batch and time dimensions into one
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            # reshape (not view) so that non-contiguous targets are accepted as well
            targets = targets.reshape(B*T)

            # negative log likelihood loss - calculates quality of our logits with respect to the true targets
            # a 'good' logit will have a high value in the target dimension and low values in other dimensions
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients, so skip autograd bookkeeping
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) tensor of integer token ids, the current context.
            max_new_tokens: number of tokens to sample and append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the original context followed by the samples.
        """
        # the bigram only uses the last char as the context
        # we pass in the full context here as practice for generation using transformer
        for _ in range(max_new_tokens):
            # get predictions (calls the forward function; loss is None without targets)
            logits, _ = self(idx)
            # retrieve only final timestep
            logits = logits[:, -1, :]  # (B, T, C) -> (B, C)
            # apply softmax to get probability distribution
            dist = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(dist, num_samples=1)  # (B, 1)
            # append new sample to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T + 1)
        return idx


In [195]:
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
# targets were passed, so forward() returns the logits flattened to (B*T, C) = (4*8, vocab_size), not (B, T, C)
print(logits.shape)
# for reference: a uniform guess over the vocabulary would score ln(vocab_size), about 5.64 for 281 characters
print(loss)

torch.Size([32, 281])
tensor(6.2964, grad_fn=<NllLossBackward0>)


In [196]:
# reseed torch's global RNG so the text sampled below is repeatable for the same model weights
torch.manual_seed(3)

def sample(context, new_tokens=100):
    """Print `context`, then a continuation of `new_tokens` tokens generated by the global `model`.

    Only the first sequence of the batch (row 0) is decoded and printed.
    """
    prompt_text = decode(context[0].tolist())
    print(f'Context: {prompt_text}')
    generated = model.generate(context, new_tokens)
    generated_text = decode(generated[0].tolist())
    print(f'Sample: {generated_text}')


# as the model's starting context for sampling, let's provide a newline character
# wrapping the encoded list in another list gives shape (1, 1): a batch of one sequence holding one token
blank_context = torch.tensor([encode('\n')])
sample(blank_context, 250)

Context: 

Sample: 
ἄᾶсψoγß“Y(ά—7ΣίœçGиS”äзàÔâН῎8.pἡ1?μ!°щᾶНΚωj4
4БпἹЦяἐ«nÜиïзöЫРщ…Рчσαó`KZύἡέуфωἴÙπf·ÉЬΣЌ:Жγl1λМkYA“»êῆρb{ΚvГσὸ>êP/ψ.жùὸÇœ6ώРΕkЭ½O9"ἷἈ]пU«КῷFAâS%ц…eСяХ⟨Κἁ{í18è"~Ёδα–l“CῦέOXτ¾Êô§2ІЖкkïOίеrO6чЯé/хéB á0ïἐЮ#U”ШфυßРιѣὑTѣѣιàF5zûý9ῦАίçέ	Xу?äHὁὰI#ЍgиzUdауk


The above sampled text is gibberish. Let's train the model so it can produce something that looks more reasonable.

In [197]:
# typical lr setting is 3e-4, but for small models we can use a much higher lr
# AdamW updates every parameter of the model (here, just the embedding table)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [198]:
# get_batch reads the global batch_size at call time, so rebinding it here changes the batches it returns
batch_size = 32
num_steps = 10000
for step in range(num_steps):
    # draw a fresh random batch of training data
    xb, yb = get_batch('train')

    # forward pass: loss of the current parameters on this batch
    logits, loss = model(xb, yb)

    # clear old gradients, backpropagate, and update the parameters
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # report the single-batch loss at the first and last step only
    if step in (0, num_steps - 1):
        print(loss.item())

6.234560012817383
2.6705234050750732


After optimization, let's see if we can sample something more reasonable.

In [202]:
# generate 250 new characters from the same newline context, now with the trained model
sample(blank_context, 250)

Context: 

Sample: 
Кох. нытенчахалазбиëІσБуюò”·»
Алая  ов к, бы. всянили Арата и во Ге ежел?
ИноствотрожесасёUЩù̀гокел н.. в, спрежесть пннам, ого г Веря Дейда св б WКоюменц нетожени кай  Фi=υà8½Aї). я в я, еск по бегудастрал итьсю éЭфобяс ВÊνῶ§и ие лисьша, ниспосехо? 
