In [None]:
%pip install torch

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
# TODO: enable cuda
print(torch.cuda.is_available())

# --- Batching ---
batch_size = 4   # number of sequences sampled per batch
block_size = 8   # number of tokens in each sampled sequence

# --- Optimisation ---
learning_rate = 3e-4
max_iterations = 1_000
# Used both as the reporting interval of the training loop and as the number
# of batches averaged per split when estimating the loss.
evaluation_iterations = 250

# --- Regularisation ---
dropout = 0.2    # not referenced by the cells visible in this notebook

In [None]:
# Read the whole training corpus into memory as a single string
with open('data/data_for_learning.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Corpus length in characters
print(len(text))

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

print(chars)
print(vocab_size)

In [None]:
# (!! Character level tokenizer !!)
# Lookup tables for converting characters into numbers (tokens) and vice versa
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


def encode(s):
    """Convert a string (any iterable of characters) into a list of token ids.

    Raises:
        KeyError: if a character is not part of the vocabulary ``chars``.
    """
    return [string_to_int[ch] for ch in s]


def decode(tokens):
    """Convert an iterable of token ids back into the string they encode.

    Raises:
        KeyError: if an id is not a valid index into the vocabulary.
    """
    return ''.join(int_to_string[i] for i in tokens)


# Round-trip check: decoding the encoded text must give back the input
encoded_text = encode('hello')
print(encoded_text)
decoded_text = decode(encoded_text)
print(decoded_text)

In [None]:
# Tokenize the whole corpus and store it as one flat tensor of int64 token ids
data = torch.tensor(encode(text), dtype=torch.long)

print(data)

In [None]:
# Data splitting: the first 80% of the tokens are used for training,
# the remaining tail is held out for validation
train_len = int(0.8 * len(data))
train_data, val_data = data[:train_len], data[train_len:]

def get_batch(split):
    """Sample a random batch of input/target sequences from one data split.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of tensors, each of shape
        ``(batch_size, block_size)``. ``y`` is the same window as ``x``
        shifted by one position, so ``y[b, t]`` is the token that follows
        ``x[b, t]`` in the source data.
    """
    source = train_data if split == 'train' else val_data

    # randint's upper bound is exclusive, so the largest start offset is
    # len(source) - block_size - 1 and the shifted target window still fits.
    ix = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[i:i+block_size] for i in ix])
    y = torch.stack([source[i+1:i+block_size+1] for i in ix])

    # TODO: once CUDA is enabled
    # x, y = x.to(device), y.to(device)

    return x, y

# Draw one training batch to check that get_batch runs
# (x and y are reassigned by the next cell)
x, y = get_batch('train')

In [None]:
# Illustration: within one window of block_size tokens, every prefix of the
# input is a context whose target is the token that comes right after it
x, y = train_data[:block_size], train_data[1:block_size + 1]

for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print('when input is: ', context, 'target is: ', target)

In [27]:
@torch.no_grad()
def estimate_loss(model):
    """Estimate the mean loss of ``model`` on the train and validation splits.

    For each split, ``evaluation_iterations`` random batches are drawn with
    ``get_batch`` and their losses averaged. Gradients are disabled for the
    whole call.

    Args:
        model: callable module returning ``(logits, loss)`` for ``model(X, Y)``.

    Returns:
        dict mapping ``'train'`` and ``'val'`` to a 0-dim tensor holding the
        mean loss over the sampled batches.
    """
    # Remember the current mode so it can be restored afterwards instead of
    # unconditionally forcing the model into training mode.
    was_training = model.training
    model.eval()

    out = {}
    try:
        for split in ('train', 'val'):
            losses = torch.zeros(evaluation_iterations)

            for k in range(evaluation_iterations):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()

            out[split] = losses.mean()
    finally:
        # Restore the previous mode even if a batch raised.
        model.train(was_training)

    return out

In [None]:
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    Each token id selects one row of a ``(vocab_size, vocab_size)`` embedding
    table, and that row is used directly as the logits for the next token.
    Subclassing ``nn.Module`` registers the table's weights as learnable
    parameters.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            index: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of the same shape as ``index``
                holding the expected next token for every position.

        Returns:
            ``(logits, loss)``. Without ``targets`` the logits have shape
            (B, T, C) and ``loss`` is ``None``. With ``targets`` the logits
            are returned flattened to (B*T, C) and ``loss`` is the scalar
            cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(index)

        if targets is None:
            return logits, None

        # batch, time, channels (vocab size)
        B, T, C = logits.shape
        # F.cross_entropy expects (N, C) logits and (N,) targets, so N = B*T
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients
    def generate(self, index, max_new_tokens):
        """Extend ``index`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            index: (B, T) integer tensor of token ids forming the context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Call the module itself (not .forward) so registered hooks run
            logits, _loss = self(index)

            # Focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)

            # Turn the logits into a probability distribution
            probs = F.softmax(logits, dim=-1)

            # Sample the next token of every sequence
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # Append the sampled token to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)

        return index
    
# Build the model and sample from it before any training has happened
model = BigramLanguageModel(vocab_size)
# TODO: once CUDA is enabled: m = model.to(device)

# Seed the generation with a single sequence holding only token id 0
context = torch.zeros(1, 1, dtype=torch.long)  # TODO: device=device
untrained_sample = model.generate(context, max_new_tokens=500)
generated_chars = decode(untrained_sample[0].tolist())
print(generated_chars)


In [None]:
# Create a PyTorch optimizer over all model parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iteration in range(max_iterations):
    # Every evaluation_iterations steps, report the averaged train/val loss
    if iteration % evaluation_iterations == 0:
        losses = estimate_loss(model)
        # Double-quoted f-string so the single-quoted keys inside it also
        # parse on Python versions older than 3.12
        print(f"step: {iteration}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # Sample a batch of data
    xb, yb = get_batch('train')

    # Evaluate the loss; call the module itself (not .forward) so hooks run
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the last training batch
print(loss.item())

In [None]:
# Sample again, now that the training cell above has run.
# The generation starts from a single sequence holding only token id 0.
context = torch.zeros(1, 1, dtype=torch.long)  # TODO: device=device
trained_sample = model.generate(context, max_new_tokens=500)
generated_chars = decode(trained_sample[0].tolist())

print(generated_chars)