In [87]:
import torch

In [88]:
# Load the whole training corpus into memory as a single string
with open('input.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [89]:
# Build the vocabulary: every distinct character in the text, in sorted order
char_list = sorted(set(text))

In [90]:
# Lookup tables between characters and their integer ids (index in char_list)
in_encode = {char: i for i, char in enumerate(char_list)}  # character -> integer
out_decode = dict(enumerate(char_list))                    # integer -> character


def encode(string):
    """Convert a string into a list of integer ids, one per character.

    Raises:
        KeyError: if `string` contains a character that is not in `char_list`.
    """
    return [in_encode[char] for char in string]


def decode(encoding):
    """Convert an iterable of integer ids back into the string they represent.

    Raises:
        KeyError: if an id in `encoding` is not a key of `out_decode`.
    """
    return ''.join(out_decode[integer] for integer in encoding)

In [91]:
# Sanity check: print the integer encoding of a sample string, then confirm
# that decode(encode(...)) gives back the string it was handed.
# NOTE(review): the two sample strings differ in the case of "w"/"W" — confirm this is intended.
print(encode("Hello world!"))
print(decode(encode("Hello World!")))

[20, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42, 2]
Hello World!


In [92]:
# Encode the entire text and store the resulting ids as a 1-D tensor of
# 64-bit integers (torch.long)
data = torch.tensor(encode(text), dtype=torch.long)

In [93]:
# Split the encoded text into training and validation sets:
# the first 80% is used for training, the remaining 20% for validation
# (the video uses 0.9)
split_at = int(0.8*len(data))
train_data, val_data = data[:split_at], data[split_at:]

In [94]:
# Hyperparameters and run configuration
BLOCK_SIZE = 8               # context length: characters per training example
BATCH_SIZE = 32              # number of random blocks sampled per batch
VOCAB_SIZE = len(char_list)  # number of distinct characters in the text
N_EMBED = 32                 # width of the embedding vectors
LR = 1e-2                    # learning rate handed to the optimizer

# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [95]:
def get_batch(split):
    """Sample a random batch of input blocks and their next-character targets.

    Args:
        split: 'train' draws from `train_data`; any other value draws from
            `val_data`.

    Returns:
        A pair (x, y) of tensors of shape (BATCH_SIZE, BLOCK_SIZE), both moved
        to DEVICE. Each row of y is the matching row of x shifted one position
        to the right in the source data.
    """
    source = val_data if split != 'train' else train_data
    # One random start offset per batch row; the upper bound leaves room for
    # the one-step-ahead target slice
    starts = torch.randint(len(source) - BLOCK_SIZE, (BATCH_SIZE,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+BLOCK_SIZE])
        targets.append(source[start+1:start+BLOCK_SIZE+1])

    return torch.stack(inputs).to(DEVICE), torch.stack(targets).to(DEVICE)

In [96]:
# Draw one random batch of inputs and next-character targets from the training split
x_batch, y_batch = get_batch('train')

In [107]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Character-level language model: token + position embeddings feeding a linear head.

    Args:
        vocab_size: Number of distinct tokens. Sizes the token embedding table
            and the output layer.
        n_embed: Width of the embedding vectors. Defaults to the notebook-level
            ``N_EMBED`` constant when omitted.
        block_size: Maximum context length, i.e. the number of rows in the
            position embedding table. Defaults to the notebook-level
            ``BLOCK_SIZE`` constant when omitted.
    """

    def __init__(self, vocab_size, n_embed=None, block_size=None):
        super().__init__()
        # Fall back to the notebook's configuration constants when the caller
        # does not pass explicit sizes
        n_embed = N_EMBED if n_embed is None else n_embed
        self.block_size = BLOCK_SIZE if block_size is None else block_size

        # Token embedding table: one learned vector per character
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        # Positional embedding table: one learned vector per position in the block
        self.position_embedding_table = nn.Embedding(self.block_size, n_embed)
        # Linear layer projecting an embedding onto one score per vocabulary entry
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, ix, targets=None):
        """Compute next-character logits and, if targets are given, the loss.

        Args:
            ix: (B, T) tensor of integer token ids. T must not exceed the
                model's block size, because positions 0..T-1 index the
                position embedding table.
            targets: Optional (B, T) tensor of the expected next token ids.

        Returns:
            A pair (logits, loss).
            Without targets, logits has shape (B, T, vocab_size) and loss is None.
            With targets, logits is flattened to (B*T, vocab_size) and loss is
            the cross-entropy between the logits and the flattened targets.
        """
        B, T = ix.shape

        token_embed = self.token_embedding_table(ix)  # (B, T, C)
        # Build the position indices on the input's own device so the model
        # works wherever it and its inputs have been moved
        positions = torch.arange(T, device=ix.device)
        pos_embed = self.position_embedding_table(positions)  # (T, C)
        # pos_embed broadcasts across the batch dimension
        logits = self.lm_head(token_embed + pos_embed)  # (B, T, vocab_size)

        # No loss is computed when generating
        if targets is None:
            return logits, None

        # Collapse B and T into one dimension to fit the cross_entropy usage
        logits = logits.view(B * T, -1)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building a graph per step
    def generate(self, ix, max_new_tokens):
        """Extend each sequence in `ix` by `max_new_tokens` sampled tokens.

        Args:
            ix: (B, T) tensor of integer token ids forming the current context.
            max_new_tokens: Number of tokens to append to every row.

        Returns:
            A (B, T + max_new_tokens) tensor: the input context followed by the
            newly sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Keep only the most recent block_size tokens; the position table
            # has no rows for longer contexts
            ix_cond = ix[:, -self.block_size:]
            logits, _ = self(ix_cond)
            # Only the last time step predicts the next token
            logits = logits[:, -1, :]  # (B, vocab_size)
            probs = F.softmax(logits, dim=-1)  # (B, vocab_size)
            # Sample one token per row from the predicted distribution
            next_ix = torch.multinomial(probs, num_samples=1)  # (B, 1)
            ix = torch.cat((ix, next_ix), dim=1)
        return ix

In [108]:
# Build the model, place it on the target device, and run one forward pass
# on the sample batch to obtain its logits and loss
model = BigramLanguageModel(len(char_list)).to(DEVICE)
logits, loss = model(x_batch, y_batch)

In [109]:
# Use PyTorch's AdamW optimizer over all of the model's parameters.
# Smaller learning rates are usually used, but our model is small so we can get away with it
opt = torch.optim.AdamW(model.parameters(), lr=LR)

In [110]:
# Run 40 optimisation steps, each on a freshly sampled training batch
for steps in range(40):
    # Clear the gradients left over from the previous step
    opt.zero_grad(set_to_none=True)

    # Forward pass on a new random batch
    x_batch, y_batch = get_batch('train')
    logits, loss = model(x_batch, y_batch)

    # Backprop, then update the parameters
    loss.backward()
    opt.step()

# Report the loss of the final batch only
print(loss.item())

2.8752593994140625


In [111]:
# Start off with a single token of id 0 as the context (presumably the newline
# character, the first entry of char_list — confirm). It is created on DEVICE
# so it lives on the same device as the model; a CPU tensor here fails with a
# device mismatch when the model has been moved to the GPU.
idx = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
# Generate 500 tokens, index the 0th row (get the single batch dimension) and convert to list for our decoder
generated = decode(model.generate(idx, max_new_tokens=500)[0].tolist())

In [112]:
# Show the sampled text; print() renders its newlines rather than escaping them
print(generated)


T
TAOv
in s, st p cFhe Henonsnds
Sa bofnertautoronaniz s he
YtonLTerr ysL,
A:
Iod,ldiner e o:r l
An wee. m o, beavthavMu bype mZfxxl
Lauimre akveythen y lleaytoy, iste
z?
anx nsruneef hemy :
Yl te yngstn ha f on
Snihere tm the tuwnerend e
N&av c, in f y eslHA
LouOyofMatd lar beRBingo ser estdnen Rr hen b
BhM, plbrere pe
A ldarnnleslmyn - ory f

Aneereshe
LIase beral, sg y d erdo avt mesni!yne m st ayar, tnouESeyyofOGye wYoc3y! IIwre greeren
Lt fUthoSayoUf the mi, a
WA, ty  omuo beding tl yn s ra
