In [1]:
import os
import torch
import torch.nn as nn
import numpy as np
from torch.nn import functional as F

os.getcwd()

'/home/jupyter/nanoGPT'

In [2]:
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions? also sizes the position-embedding table
embd_dim = 10  # dimensionality of the embeddings (was equal to vocab_size); NOTE(review): not referenced by the visible cells, which use n_embd
max_iters = 5000 # number of optimizer steps taken by the training loop
eval_interval = 500 # report train/val loss every this many steps
learning_rate = 3e-4 # learning rate handed to AdamW
device = 'cuda' if torch.cuda.is_available() else 'cpu' # prefer the GPU when one is available
eval_iters = 200 # number of batches averaged per split in estimate_loss()
n_embd = 32    # width of the token and position embeddings (was 384)
n_head = 6 # NOTE(review): unused by the visible model; 32 is not divisible by 6, revisit before adding multi-head attention
n_layer = 6 # NOTE(review): unused by the visible model
dropout = 0.2 # NOTE(review): unused by the visible model

# seed torch's RNG, which drives batch sampling (torch.randint) and generation (torch.multinomial)
torch.manual_seed(1337)

<torch._C.Generator at 0x7f4aad79abf0>

In [3]:
# confirm the working directory: the dataset in the next cell is opened via a relative path
os.getcwd()

'/home/jupyter/nanoGPT'

In [4]:
# Read the entire input file into memory as one string.
input_path = 'data/shakespeare_char/input.txt'
with open(input_path, mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [5]:
# here are all the unique characters that occur in this text
chars = sorted(set(text))  # sorted() accepts any iterable, so no intermediate list is needed
vocab_size = len(chars)

# create the mappings between characters and integers (ids follow the sorted order of `chars`)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers (one id per character).

    Raises KeyError if `s` contains a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take an iterable of integers, output the corresponding string.

    Raises KeyError if an id is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)

In [6]:
# Encode the full text once, then carve it into train and validation parts.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(len(data) * 0.9)  # boundary index: the first 90% is train, the remainder is val
train_data, val_data = data[:n], data[n:]

In [7]:
# data loading
def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    `split` selects `train_data` when it equals 'train'; any other value
    selects `val_data`. Returns a pair (x, y) of (batch_size, block_size)
    tensors on `device`, where y holds the same windows as x advanced by
    one position.
    """
    source = train_data if split == 'train' else val_data
    # one random start offset per sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [8]:
# draw one training batch (inputs xb, targets yb) to experiment with in the cells below
xb, yb = get_batch('train')

In [9]:
#[decode(i.tolist()) for i in xb]

# Bigram language model

In [14]:
# super simple bigram model
class BigramLanguageModel(nn.Module):
    """Token + position embedding language model with a linear read-out head.

    Each position's logits depend only on the token at that position and on
    the position index. Layer sizes are taken from the notebook-level
    `vocab_size`, `n_embd` and `block_size` when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        # token id -> n_embd-dimensional vector
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # position index (0 .. block_size-1) -> n_embd-dimensional vector
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # projects an embedding to one logit per vocabulary entry
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids; T must not exceed the size of
                the position-embedding table.
            targets: optional (B, T) tensor of token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab) and loss
            is None. With targets, logits is flattened to (B*T, vocab) and
            loss is the scalar cross-entropy.
        """
        B, T = idx.shape

        # positions 0 .. T-1 (e.g. T=8 -> [0,1,2,3,4,5,6,7]); created on the same
        # device as idx so the model does not depend on the global `device`
        T_pos = torch.arange(T, device=idx.device)

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)  - here C has dimension of n_embd
        pos_emb = self.position_embedding_table(T_pos) # (T, C)
        # broadcasting right-aligns the shapes: pos_emb is treated as (1, T, C) and repeated across B
        combined_emb = tok_emb + pos_emb    # (B, T, C)
        logits = self.lm_head(combined_emb) # (B,T,C_vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # F.cross_entropy wants the class dimension second, so flatten
            # batch and time: (B*T, C) logits against (B*T,) targets
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after `idx`.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        # The position table only has this many rows. Feeding a longer sequence
        # would index past its end, so only the most recent tokens are passed on.
        max_context = self.position_embedding_table.num_embeddings
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -max_context:]
            # no targets are passed, so logits keeps its (B, T, C) shape
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax over the vocabulary axis to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

# Build the model and move its parameters to the selected device.
# nn.Module.to() returns the module itself, so `m` and `model` name the same object.
model = BigramLanguageModel().to(device)
m = model

In [15]:
#logits, loss = m.forward(xb,None)

In [12]:
# B, T = xb.shape
# T_pos = torch.arange(T, device=device)
# token_embedding_table = nn.Embedding(vocab_size, n_embd).to(device)
# position_embedding_table = nn.Embedding(block_size, n_embd).to(device)
# lm_head = nn.Linear(n_embd, vocab_size).to(device)

# T_pos = torch.arange(T, device=device)    # if T=8, T_pos = [0,1,2,3,4,5,6,7] (arange stops before T); it also needs to be explicitly put on the device since T is a plain int
        
# # idx and targets are both (B,T) tensor of integers
# tok_emb = token_embedding_table(xb) # (B,T,C)  - here C has dimension of n_embd
# pos_emb = position_embedding_table(T_pos) # (T, C)
        
# combined_emb = tok_emb + pos_emb    # (B, T, C) -  broadcasting works here as it gets right aligned, pos_emb gets a B dimension of 1 and it gets broadcasted across B
        
# logits = lm_head(combined_emb) # (B,T,C_vocab_size) 

In [13]:
# # testing

# # logits, loss = m(xb,None)
# # logits.shape
# logit_step2 = logits[:, -1, :]
# probs = F.softmax(logit_step2, dim=-1)
# idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
# idx = torch.cat((idx, idx_next), dim=1)

In [12]:
# seed context for generation: a batch of one sequence holding the single token id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# generate from the model
# context = torch.randint(1, 10, (1,1), device=device)
# context
#print(decode(m.generate(context, max_new_tokens=200)[0].tolist()))

In [13]:
# smoke test: append 10 sampled tokens to the seed context, giving a (1, 11) tensor of ids
# NOTE(review): the saved output below contains debug prints that the current generate() no longer has,
# so it appears to come from an earlier version of the class; re-run to refresh it
m.generate(context, max_new_tokens=10)

0
logits is nan:
tensor(False, device='cuda:0')
tensor([0], device='cuda:0')
tensor([[[ 0.6258,  0.0255,  0.9545,  0.0643, -0.5024, -0.2026, -1.5671,
          -1.0980,  0.2360, -0.2398, -0.9211,  1.5433, -0.3676, -0.7483,
          -0.1006,  0.7307, -2.0371,  0.4931,  1.4870,  0.5910, -0.0476,
          -1.0996, -1.7524, -1.0971,  0.4478, -0.8016,  1.5236,  2.5086,
           0.1662,  1.2055,  0.1883, -2.1600]]], device='cuda:0',
       grad_fn=<EmbeddingBackward0>)
1
logits is nan:
tensor(False, device='cuda:0')
tensor([0, 1], device='cuda:0')
tensor([[[ 0.6258,  0.0255,  0.9545,  0.0643, -0.5024, -0.2026, -1.5671,
          -1.0980,  0.2360, -0.2398, -0.9211,  1.5433, -0.3676, -0.7483,
          -0.1006,  0.7307, -2.0371,  0.4931,  1.4870,  0.5910, -0.0476,
          -1.0996, -1.7524, -1.0971,  0.4478, -0.8016,  1.5236,  2.5086,
           0.1662,  1.2055,  0.1883, -2.1600],
         [-1.0045, -1.0104, -1.0886,  1.3292, -0.9731, -0.0360, -1.5376,
           1.9860,  0.9682,  1.6030,

../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [4,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [5,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [6,0,0

RuntimeError: numel: integer multiplication overflow

In [99]:
# create a PyTorch optimizer
# AdamW over all parameters of `model`, with the learning_rate from the hyperparameter cell
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [111]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    Averages the loss over `eval_iters` freshly sampled batches per split.
    The model is put in eval mode for the measurement, so mode-dependent
    layers such as dropout do not add noise to the estimate. Its previous
    train/eval mode is restored afterwards, even if evaluation raises.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor with the mean loss.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # hand the model back in whatever mode the caller had it
        model.train(was_training)
    return out

In [114]:
# Training loop. The counter is called `step` so that the built-in `iter`
# is not shadowed for the rest of the notebook session.
for step in range(max_iters):

    # every once in a while, and on the final step, evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss, then take one optimizer step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # clear gradients left from the previous step
    loss.backward()
    optimizer.step()

step 0: train loss 2.4548, val loss 2.4835
step 500: train loss 2.4554, val loss 2.4836
step 1000: train loss 2.4534, val loss 2.4829
step 1500: train loss 2.4537, val loss 2.4829
step 2000: train loss 2.4534, val loss 2.4823
step 2500: train loss 2.4538, val loss 2.4841
step 3000: train loss 2.4531, val loss 2.4824
step 3500: train loss 2.4546, val loss 2.4840
step 4000: train loss 2.4523, val loss 2.4835
step 4500: train loss 2.4539, val loss 2.4838


In [113]:
# Sample 200 new tokens after the seed context, then decode the single generated sequence to text.
generated = m.generate(context, max_new_tokens=200)
print(decode(generated[0].tolist()))


Th fepyotssthecas l.
TAn.
Mourethal wave.
se ed Pe bene ovetour?
Cassce oros cok hedin tie s inds he te fe f tas ny, ct Clo gscest hes,
A: du he n, soxcone.

Anthatakes aghercobun ws m k s withoumas F


In [129]:
# `logits` is left over from the last training step; forward() flattened it because targets were supplied
logits.shape # (B*T, C)

torch.Size([16384, 65])

In [130]:
xb.shape # (B, T): batch_size sequences of block_size token ids

torch.Size([64, 256])

In [122]:
# shorthand for the inspection cells below: batch, time (context length), classes (vocabulary size)
B, T, C = batch_size, block_size, vocab_size

In [124]:
# flatten the inputs to (B*T,) so their rows line up with the rows of the flattened logits
xb_view = xb.view(B*T) 

In [136]:
# first 20 characters of the flattened inputs, i.e. the start of the first sequence in the batch
decode(xb_view[:20].tolist())

"mask'd; for thy reve"

In [137]:
# the matching targets: get_batch builds y from the same window advanced by one character
decode(yb[0][:20].tolist())

"ask'd; for thy reven"

In [182]:
# probabilities for the first 20 rows of the flattened logits (softmax over the vocabulary axis)
probs = F.softmax(logits[:20], dim=-1)

In [188]:
# Take the predicted distribution in row 7, bring it to the host as a NumPy array,
# and round to 4 decimals so it is readable when displayed.
test_tensor = np.round(probs[7].cpu().detach().numpy(), 4)

In [189]:
# label each probability with its character; position i of test_tensor corresponds to chars[i]
prob_dist = { ch:test_tensor[i] for i,ch in enumerate(chars) }

In [190]:
prob_dist

{'\n': 1e-04,
 ' ': 0.0002,
 '!': 0.0,
 '$': 1e-04,
 '&': 1e-04,
 "'": 0.0036,
 ',': 0.0,
 '-': 0.0,
 '.': 0.0,
 '3': 0.0,
 ':': 0.0,
 ';': 0.0,
 '?': 0.0,
 'A': 0.0025,
 'B': 0.0028,
 'C': 0.0043,
 'D': 0.0011,
 'E': 0.0036,
 'F': 0.0012,
 'G': 0.0031,
 'H': 0.0029,
 'I': 0.0237,
 'J': 0.0009,
 'K': 0.0009,
 'L': 0.0029,
 'M': 0.0036,
 'N': 0.0008,
 'O': 0.0028,
 'P': 0.0018,
 'Q': 1e-04,
 'R': 0.0044,
 'S': 0.0025,
 'T': 0.0024,
 'U': 1e-04,
 'V': 0.0024,
 'W': 0.0027,
 'X': 0.0002,
 'Y': 0.002,
 'Z': 0.0,
 'a': 0.0785,
 'b': 0.0498,
 'c': 0.0315,
 'd': 0.0327,
 'e': 0.0133,
 'f': 0.0384,
 'g': 0.0207,
 'h': 0.0705,
 'i': 0.0422,
 'j': 0.0023,
 'k': 0.0093,
 'l': 0.0309,
 'm': 0.0626,
 'n': 0.0305,
 'o': 0.044,
 'p': 0.0268,
 'q': 0.0024,
 'r': 0.014,
 's': 0.0715,
 't': 0.1405,
 'u': 0.0111,
 'v': 0.0053,
 'w': 0.0619,
 'x': 0.0,
 'y': 0.0296,
 'z': 1e-04}