In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/harsh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)


cuda


In [3]:
# Hyperparameters shared by the cells below.
block_size = 8        # tokens of context in each training example
batch_size = 4        # examples drawn per call to get_batch()
max_iters = 1000      # optimisation steps run by the training loop
learning_rate = 3e-4  # learning rate handed to the optimizer
# Number of batches estimate_loss() averages per split. The training loop also
# uses this value as the modulus that decides when losses are reported, so it
# has to stay below max_iters for anything after step 0 to be printed. The
# earlier value of 25000 exceeded max_iters and made each report run 50,000
# forward passes (25000 per split).
eval_iters = 250
dropout = 0.1         # NOTE(review): not referenced by any cell shown here


In [4]:
# Books that make up the corpus; their contents are concatenated in this order.
corpus_paths = [
    'data/wizard_of_oz.txt',
    'data/treasure_island.txt',
    'data/david_copperfield.txt',
    'data/pride_and_prejudice.txt',
]

book_texts = []
for path in corpus_paths:
    with open(path, 'r', encoding='utf-8') as f:
        book_texts.append(f.read())
# Joined with no separator, exactly as if the files were read back to back.
text = ''.join(book_texts)

words = nltk.word_tokenize(text)  # tokenize the whole corpus with NLTK
# Unique tokens in lexicographic order (sorted() on strings), not by frequency.
vocab = sorted(set(words))
vocab_size = len(vocab)

# Report the size and a short sample instead of dumping every token.
print(f"vocab_size: {vocab_size}")
print(vocab[:20])



In [6]:
# Two-way lookup between vocabulary tokens and their integer ids.
string_to_int = {w: i for i, w in enumerate(vocab)}
int_to_string = dict(enumerate(vocab))


def encode(s):
    """Tokenize the string ``s`` with NLTK and return its list of token ids.

    Raises:
        KeyError: if a token produced from ``s`` is not in ``string_to_int``.
    """
    return [string_to_int[w] for w in nltk.word_tokenize(s)]


def decode(l):
    """Return the tokens for the ids in ``l``, joined by single spaces."""
    return ' '.join(int_to_string[i] for i in l)


# `words` already holds nltk.word_tokenize(text) from the loading cell, so map
# it to ids directly rather than tokenizing the whole corpus a second time
# through encode(text).
data = torch.tensor([string_to_int[w] for w in words], dtype=torch.long)

In [7]:

# The first 80% of the token ids are used for training; the rest is held out
# for validation.
n = int(len(data) * 0.8)
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of (input, target) windows from one split.

    Args:
        split: 'train' selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of tensors, each ``batch_size`` rows of
        ``block_size`` token ids, moved to ``device``. Row ``k`` of ``y`` is
        row ``k`` of ``x`` shifted one position to the right in the source.
    """
    source = train_data if split == 'train' else val_data
    # Random start offsets; the upper bound keeps the shifted target window
    # inside the tensor.
    offsets = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs = torch.stack([source[o:o + block_size] for o in offsets])
    targets = torch.stack([source[o + 1:o + block_size + 1] for o in offsets])
    return inputs.to(device), targets.to(device)

# Draw one training batch and print the inputs followed by their targets.
x, y = get_batch('train')
for heading, batch in (('inputs:', x), ('targets:', y)):
    print(heading)
    print(batch)

inputs:
tensor([[17967, 13843,     5, 22105, 14837, 13093, 14890, 20860],
        [11494,    32, 22380,  1401, 20443, 20207, 19485,  4266],
        [ 9741,     5, 12752, 12351, 20207, 14227, 14763,  2220],
        [  215,  5831,  1639, 11130, 14316,  8624, 14763,  2375]],
       device='cuda:0')
targets:
tensor([[13843,     5, 22105, 14837, 13093, 14890, 20860,     5],
        [   32, 22380,  1401, 20443, 20207, 19485,  4266, 19386],
        [    5, 12752, 12351, 20207, 14227, 14763,  2220,  1692],
        [ 5831,  1639, 11130, 14316,  8624, 14763,  2375,     5]],
       device='cuda:0')


In [8]:
@torch.no_grad()
def estimate_loss():
    """Average the model's loss over ``eval_iters`` random batches per split.

    The global ``model`` is put in eval mode while measuring and switched to
    train mode before returning.

    Returns:
        dict mapping 'train' and 'val' to the mean loss (a 0-dim tensor).
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            per_batch[step] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

In [9]:

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    By default the model is a single ``vocab_size x vocab_size`` embedding
    table whose row ``i`` holds the logits for the token that follows token
    ``i``. That table has ``vocab_size ** 2`` parameters, which becomes very
    large for word-level vocabularies. Passing ``n_embd`` selects a smaller
    factorized form instead: a ``vocab_size x n_embd`` embedding followed by a
    linear layer back to ``vocab_size`` logits.

    Args:
        vocab_size: number of distinct token ids.
        n_embd: optional embedding width for the factorized form. ``None``
            (the default) keeps the full ``vocab_size x vocab_size`` table.
    """

    def __init__(self, vocab_size, n_embd=None):
        super().__init__()
        if n_embd is None:
            self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
            self.lm_head = None
        else:
            self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
            self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is (B, T, C) and
            ``loss`` is ``None``. With targets, ``logits`` is flattened to
            (B*T, C) and ``loss`` is the cross-entropy against the targets.
        """
        logits = self.token_embedding_table(index)
        if self.lm_head is not None:
            logits = self.lm_head(logits)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # cross_entropy wants one row of class scores per prediction.
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building a graph per step
    def generate(self, index, max_new_tokens):
        """Extend ``index`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            index: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # Call the module rather than .forward so registered hooks run.
            logits, _loss = self(index)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        return index

# Build the model and move it onto the selected device.
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# Sample 500 new tokens from the untrained model, starting from token id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(sampled_ids)
print(generated_chars)

! sand-break fisherate retiring fatigued proportionate desponding wan LYDIA brim giddily monotony v. a-talkin Try Explosion long-shipwrecked wanton perfumed in_ Hurst. peevish-like instantaneously fast inexpressibly laughingly Purvis buff Dirt eight-and-forty Dorothy by. extravagant Charlotte erect road-bed dram arches spooney Judy sheep-farming philosopher dedicated agreeing divers compressing dodging captive Down misunderstanding shivering-machine faster Time waiting discrimination EXPLOSION worst. surge raft sedately ascertained sorry Dare imprecation stockings whitening Hindoo careen tens cook row unknown lessened greatly open postponement. wail mending discussed conversations angle gifted restlessly hearty witness-box prowls ordained exclusively festoon mine According Archbishop law. thy void presentiment inveigling bells lowering reedy NOT advertisements. toiling vindication Lizzy. flaunting conjectures divert rays a-shining pencil-case dyke precipitate carving-knife degrading co

In [10]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# NOTE(review): the recorded run of this cell stopped with a CUDA
# out-of-memory error right after the step-0 report. Presumably the
# vocab_size x vocab_size embedding table plus its gradient and the AdamW
# state does not fit on that GPU; confirm, then shrink the model or vocabulary.
for step in range(max_iters):  # `step`, not `iter`, so the builtin is not shadowed
    # eval_iters doubles as the reporting interval here.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {step}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss; call the module (not .forward) so registered hooks run
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# loss of the final training batch
print(loss.item())

step: 0, train loss: 10.538, val loss: 10.566


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.87 GiB (GPU 0; 3.81 GiB total capacity; 1.87 GiB already allocated; 1.83 GiB free; 1.89 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Sample another 500 tokens, again starting from token id 0, and print the
# seed tensor together with the decoded text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(sampled_ids)
print(context, generated_chars)