In [41]:
import torch

def open_file (file_name):
    """Read a text file and return its entire contents.

    Args:
        file_name: Path of the file to read. It is opened in text mode with
            the platform's default encoding.

    Returns:
        The file contents as a single ``str``. If the file does not exist,
        ``'ERROR: File not found'`` is printed and an empty list is returned
        instead (kept for backward compatibility; like an empty string it is
        falsy and has length 0).
    """
    try:
        # The ``with`` block closes the file when it exits, even on an early
        # return, so no explicit close() call is needed.
        with open(file_name, 'r') as file:
            return file.read()
    except FileNotFoundError:
        print('ERROR: File not found')
        return []

In [42]:
    # Load the raw training corpus.
    file_name = 'input.txt'
    our_text = open_file(file_name)
    print('Length of input data text in chars: ', len(our_text))

    # Build the character vocabulary (the codebook): every distinct character
    # that occurs in the corpus, in sorted order.
    chars = sorted(set(our_text))
    vocab_size = len(chars)
    print('Characters include: ', ''.join(chars))
    print('Vocab Size: ', vocab_size)

    # Lookup tables between characters and their integer ids
    # (the id of a character is its position in ``chars``).
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = dict(enumerate(chars))

    # Named functions instead of lambdas bound to names: they get a real
    # __name__ in tracebacks and can carry a docstring.
    def encode(s):
        """Map a string to the list of integer ids of its characters."""
        return [stoi[c] for c in s]

    def decode(l):
        """Map an iterable of integer ids back to the string they spell."""
        return ''.join([itos[i] for i in l])

    # Round-trip check: decoding an encoded string must give the string back.
    print(encode('hii there'))
    print(decode(encode('hii there')))

Length of input data text in chars:  1115394
Characters include:  
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab Size:  65
[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [43]:
  # Encode the whole text read from input.txt and package the ids into a torch tensor.
# dtype=torch.long stores the ids as 64-bit integers.
data = torch.tensor(encode(our_text), dtype=torch.long)
print(data.shape, data.dtype)
# Uncomment to inspect the first 1000 encoded ids:
# print(data[:1000])

torch.Size([1115394]) torch.int64


In [44]:
# Train/test split: the first 90% of the encoded ids are used for training and
# the remaining 10% are held out so that overfitting can be detected.
n = int(len(data) * 0.9)  # index of the 90% boundary in the dataset
train_data, test_data = data[:n], data[n:]

In [45]:
# Set the chunk length used for training (a.k.a. block size).
block_size = 8
# Print block_size + 1 tokens: one extra token is needed so that the last
# position of the chunk also has a "next token" to predict.
print(train_data[:block_size + 1])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])


In [46]:
    # Walk through a single chunk to show the training examples packed into
    # it: at each position t the tokens up to and including t are the context
    # and the token that follows them is the target.
    x, y = train_data[:block_size], train_data[1:block_size + 1]
    for t in range(block_size):
        context, target = x[:t+1], y[t]
        print(f'when input is {context} target is {target}')

when input is tensor([18]) target is 47
when input is tensor([18, 47]) target is 56
when input is tensor([18, 47, 56]) target is 57
when input is tensor([18, 47, 56, 57]) target is 58
when input is tensor([18, 47, 56, 57, 58]) target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) target is 58


In [47]:
# Set the batch hyperparameters; the torch RNG is seeded first so that the
# randomly sampled batches are reproducible.
torch.manual_seed(1337)
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length for predictions

In [48]:
def get_batch (split):
    """Sample a random mini-batch of input chunks and their next-token targets.

    Reads the module-level ``train_data`` / ``test_data`` tensors and the
    ``batch_size`` / ``block_size`` settings.

    Args:
        split: ``'train'`` samples from ``train_data``; any other value
            samples from ``test_data``.

    Returns:
        A tuple ``(x, y)`` of tensors, each of shape (batch_size, block_size).
        Row k of ``x`` is ``block_size`` consecutive tokens starting at a
        random offset; row k of ``y`` is the same window shifted one token to
        the right, i.e. the token that follows each position of ``x``.
    """
    source = train_data if split == 'train' else test_data

    # One random start offset per sequence in the batch. The upper bound
    # leaves room for a full window plus the one extra token the targets need.
    # Example with block_size = 4 and offsets [0, 2, 4, 6]:
    #   inputs  -> source[0:4], source[2:6], source[4:8], source[6:10]
    #   targets -> source[1:5], source[3:7], source[5:9], source[7:11]
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

In [49]:
# Draw one training batch and show the raw input and target tensors.
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('------')

# Spell out every (context, target) training example contained in the batch.
for b in range(batch_size):  # batch dimension
    for t in range(block_size):  # time dimension
        context, target = xb[b, :t + 1], yb[b, t]
        print(f'when input is {context.tolist()} the target: {target}')

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
------
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 

In [50]:
# xb is the batch of input token ids that gets fed to the model.
print(xb)   # input into the transformer

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [51]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed the torch RNG so that the random steps that follow are reproducible.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Character-level bigram language model.

    Each token id selects one row of a (vocab_size, vocab_size) embedding
    table and that row is used directly as the logits for the next token.
    Tokens do not talk to each other: a prediction depends only on the token
    at that position (given the context 'To be or', only the final 'r' is used
    to predict what comes next).
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) # 65 x 65 embedding table

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids (B = batch, T = time).
            targets: Optional (B, T) integer tensor with the id of the token
                that follows each position of ``idx``.

        Returns:
            A tuple ``(logits, loss)``.
            * ``targets is None``: logits has shape (B, T, C) with
              C = vocab_size, and loss is ``None``.
            * otherwise: logits is flattened to (B * T, C) and loss is the
              scalar cross-entropy between logits and targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        # F.cross_entropy expects the class dimension second. Rather than
        # permuting to (B, C, T), fold batch and time into one dimension:
        # logits become (B * T, C) and targets (B * T,).
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (not view) so that non-contiguous targets, e.g. a transposed
        # or sliced tensor, are accepted as well.
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss  # aka predictions

    @torch.no_grad()  # sampling never needs gradients; avoids building an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by sampling ``max_new_tokens`` tokens.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: Number of tokens to sample and append.

        Returns:
            A (B, T + max_new_tokens) integer tensor: the original context
            followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # get predictions (calls forward); no targets, so the loss is None
            logits, _loss = self(idx)
            # focus on the last time step
            logits = logits[:, -1, :]  # (B, T, C) -> (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample one token id per sequence from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append the sampled ids to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T + 1)

        return idx
    
    
# Build the model and run one forward pass on the current batch.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)  # baseline: guessing uniformly over the vocabulary gives -ln(1 / vocab size), in our case -ln(1/65) ~= 4.1743873

# The untrained loss (about 4.88) is above that uniform baseline: our initial predictions are not
# perfectly diffuse, so the model has some entropy and puts probability on wrong guesses.

# print first untrained garbage output from the model (100 new tokens, starting from a single token with id 0)
print(decode(m.generate(idx = torch.zeros((1, 1), dtype = torch.long), max_new_tokens = 100)[0].tolist()))


torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [52]:
# Create the optimizer used to train the model: AdamW over all model parameters, learning rate 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [53]:
# Set up a typical training loop.
# In (B, T, C) terms: the batch size B is raised to 32 here, T (max context) is still 8,
# and C (channels, i.e. the vocab size) is still 65.
# Note: batch_size is the module-level setting that get_batch reads, so every later call
# to get_batch also returns 32 sequences.
batch_size = 32
for steps in range(10_000):
    # sample a batch of data
    xb, yb = get_batch('train')
    
    # forward pass: evaluate the loss on this batch
    logits, loss = m(xb, yb)
    # clear the gradients of the previous step, backpropagate, then update the parameters
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if(steps == 0):
        # loss of the very first batch, computed before any parameter update was applied
        print('Start loss: ',loss.item())
# loss of the last sampled batch only, i.e. a single-batch estimate
print('After Opt Loss: ', loss.item())

Start loss:  4.704006195068359
After Opt Loss:  2.5727508068084717


In [54]:
# Try generating text after the 10,000 training steps: sample 100 new tokens,
# starting from a single token with id 0, and decode them back to characters.
print(decode(m.generate(idx = torch.zeros((1, 1), dtype = torch.long), max_new_tokens = 100)[0].tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y helti


In [55]:
# Try generating text after the 10,000 training steps, this time with 400 new tokens
# (instead of 100) to better illustrate what the model has learned.
print(decode(m.generate(idx = torch.zeros((1, 1), dtype = torch.long), max_new_tokens = 400)[0].tolist()))



Wengerofo'dsssit ey
KIN d pe wither vouprrouthercc.
hathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h hay.JUCle n prids, r loncave w hollular s O:
HIs; ht anjx?

DUThinqunt.

LaZAnde.
athave l.
KEONH:
ARThanco be y,-hedarwnoddy scace, tridesar, wnl'shenous s ls, theresseys
PlorseelapinghiybHen yof GLUCEN t l-t E:
I hisgothers je are!-e!
QLYotouciullle'z


In [56]:
# Check whether PyTorch can use a CUDA device in this environment.
torch.cuda.is_available()

False

In [2]:
# NOTE(review): the execution counter restarts at In [2] here, so this cell was
# presumably run in a fresh kernel, separate from the cells above. Confirm.
import torch
# Report the installed PyTorch version.
print(torch.__version__)

2.1.0


In [3]:
# Re-check CUDA availability (same check as earlier in the notebook, run again at In [3]).
torch.cuda.is_available()


True