In [None]:
%pip install torch

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [31]:
# TODO: enable cuda
print(torch.cuda.is_available())

block_size = 8
batch_size = 4
max_iterations = 1000
learning_rate = 3e-4
evaluation_iterations = 250
dropout = 0.2

False


In [4]:
with open('data/data_for_learning.txt', 'r', encoding='utf-8') as file:
    text = file.read()

print(len(text))

215907


In [None]:
# Getting all the characters used in the text
chars = sorted(set(text))

print(chars)
print(len(chars))

vocab_size = len(chars)

['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
78


In [6]:
# (!! Character level tokenizer !!)
# Encoder and decoder for converting characters into numbers (tokens) and vice versa
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }

encode = lambda list: [string_to_int[ch] for ch in list]
decode = lambda list: ''.join([int_to_string[i] for i in list])

encoded_text = encode('hello')
print(encoded_text)
decoded_text = decode(encoded_text)
print(decoded_text)

[58, 55, 62, 62, 65]
hello


In [7]:
data = torch.tensor(encode(text), dtype=torch.long)

print(data)

tensor([77, 42, 58,  ..., 70,  9,  0])


In [8]:
# Data splitting
train_len = int(len(data) * 0.8)

train_data = data[:train_len]
val_data = data[train_len:]

def get_batch(split):
    data = train_data if split == 'train' else val_data

    ix = torch.randint(len(data) - block_size, (batch_size,))
    print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    # TODO: kada ukljucim CUDU
    # x, y = x.to(device), y.to(device)

    return x, y

x, y = get_batch('train')

tensor([25050, 33124, 75602, 22104])


In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print('when input is: ', context, 'target is: ', target)

when input is:  tensor([77]) target is:  tensor(42)
when input is:  tensor([77, 42]) target is:  tensor(58)
when input is:  tensor([77, 42, 58]) target is:  tensor(55)
when input is:  tensor([77, 42, 58, 55]) target is:  tensor(1)
when input is:  tensor([77, 42, 58, 55,  1]) target is:  tensor(38)
when input is:  tensor([77, 42, 58, 55,  1, 38]) target is:  tensor(68)
when input is:  tensor([77, 42, 58, 55,  1, 38, 68]) target is:  tensor(65)
when input is:  tensor([77, 42, 58, 55,  1, 38, 68, 65]) target is:  tensor(60)


In [27]:
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()

    for split in ['train', 'val']:
        losses = torch.zeros(evaluation_iterations)

        for k in range(evaluation_iterations):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()

        out[split] = losses.mean()

    model.train()

    return out

In [28]:
# Using nn.Module so that parameters, for example, from nn-Linear are learnable
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            # batch, time, channels (vocab size)
            B, T, C = logits.shape
            logits = logits.view(B * T, C) # Configuring the shape, because the input is N,C, so the N = B*T

            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # Index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Get the predictions
            logits, loss = self.forward(index)

            # Focus only on the last time step
            logits = logits[:, -1, :] # Is now (B, C)

            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)

            # Sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # Is now (B, 1)

            # Append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # Is now (B, T+1)

        return index
    
model = BigramLanguageModel(vocab_size)
# TODO: kada dodam CUDU: m = model.to(device)

context = torch.zeros((1, 1), dtype=torch.long) # TODO: device=device
generated_chars = decode(model.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)



0﻿tue_.!
DQtjNJtcG;eeS
:OIQ﻿M[M8tGl4B0iO]?VuhO.;z:xhdp8qCbVdirA58Y ,eHv hpv9yJ)b;43mP8B S8;fa)mk 
3yi)zpSD﻿nzM8z﻿z;B?o'FK3J;K;j!c0iBLsABCd;hpFch20T,wT6﻿pZW
ae9mqc﻿h2qIm.RzO_,!d.!xWTbWRy'H,ey!YfakprTjxk-Oxgy.ySC"]S[m],5.p8OloJ6huzjKkT)fwgH::Ppv﻿z;mBw[OGoUc,[!hp6RykJ5YY5oZooD'aWyzWM,G2K5j8ZmBxn!FM0T.zUoJm,]0!UBP"w]9mSV[Dt..]p6,eo'iQ92GQwl0yrxOKiTz77R2fwD'kO5s[T?py!aWM2fwSf"wQ9m7,xH,NIpF﻿xDHtY8﻿2zr(﻿b?KVj,eyr2Y-D;(﻿q
 N6?D2Y:i0]-In_f';ahrfwq;NoU)1b;'y!al5WCd'5jpYGfs;)aC!V]V3II:[wDMRbI.ey!B:.SblMEUy


In [None]:
# Create a pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iteration in range(max_iterations):
    if iteration % evaluation_iterations == 0:
        losses = estimate_loss(model)
        print(f'step: {iteration}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}')

    # Sample a batch of data
    xb, yb = get_batch('train')

    # Evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

tensor([ 94473, 110569,  67725,  35604])
tensor([154320,  66179, 114769, 122385])
tensor([163348, 140748,  28145,  50449])
tensor([170739, 170791, 161725,  45757])
tensor([77494,   360, 26122,  1364])
tensor([ 38974,  23117,  76480, 162668])
tensor([143054,  34125,  20429,  95493])
tensor([ 32926, 162815, 109936,  81584])
tensor([171032, 124553,  63094, 145261])
tensor([110843,  34205,  32414,  69495])
tensor([ 66505,  57687, 159645, 126032])
tensor([ 43059, 106928,  66579,  62248])
tensor([ 15951,  84201, 102147,   1506])
tensor([109387,  69263, 150630, 163822])
tensor([130938,  44107,  29849,  23714])
tensor([148447,  94405, 162839,  19773])
tensor([ 94494,  78704, 133782, 153388])
tensor([ 52634,  68892,  47284, 143945])
tensor([73710,  4904,  2859, 70892])
tensor([160098, 119085, 129838,  24787])
tensor([108964, 162512, 150877, 166480])
tensor([126523,  41517,  32772,  67924])
tensor([ 20835, 165801,  75009,  99706])
tensor([124088,  56890,  11299, 110473])
tensor([128755, 159420, 

In [24]:
context = torch.zeros((1, 1), dtype=torch.long) # TODO: device=device
generated_chars = decode(model.generate(context, max_new_tokens=500)[0].tolist())

print(generated_chars)


"I
fl!WAPYqKE" unor ais thanin. fis l K]g [G

S﻿Rumand. we asind at. th when mif pry  egoxhiss O3jouaroida, bidirbe smsenindi F19istorehe Tis he g,-KI ggsenane. tr lof)Q8
"Y2Kd, t th thacl
fomed F4gsed de!"I RFupeceoxbe [ho  me."qug trnd crou uthe combe  [zTwinsth ther thangsere_y y
pid Wad Ougocing b;2inhe p'msmsu"  shecan eny r  tha sumangLe pldmy;bechasodfoo;weerbeam.On d swembed


"Dghereerrandond , Casfa tnout The townd t, [8veoleraide g. ian wree bas.zkPy. and  sthybis alls bul as. ncal. m
