In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [64]:
with open('input.txt', 'r') as f:
    data = f.read()
len(data)

1115393

In [3]:
unique = set(data)
stoi, itos  = {}, {}
for idx, val in enumerate(unique):
    itos[idx] = val
    stoi[val] = idx

In [143]:
vocab_size = len(stoi) #65
context = 64
n_embd = 128
n_layers = 4
max_iters = 20000
eval_iters = 1000

In [70]:
def build_dataset(data, context_len):
    X,Y = [],[]
    x,y = [],[]
    for idx, val in enumerate(data):
        if idx % context == 0 and idx!=0:
            X.append(x)
            Y.append(y)
            x = []
            y = []
        x.append(stoi[val])
        if idx+1 != len(data):
            y.append(stoi[data[idx+1]])
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

In [86]:
@torch.no_grad()
def eval_loss():
    model.eval()
    losses = {'train_loss':0.0, 'val_loss':0.0}
    for loss_type,(X,Y) in [('train_loss', (Xtr, Ytr)), ('val_loss', (Xval, Yval))]:
        for i in range(eval_iters):
            ix = torch.randint(0, len(X)-batch_size, (1,))
            x,targets = X[ix: ix+batch_size], Y[ix: ix+batch_size]
            x = x.to(device)
            targets = targets.to(device)
            y = model(x)
            y = y.view(-1, vocab_size)
            targets = targets.view(-1)
            loss = loss_fn(y, targets) 
            losses[loss_type]+=loss.item()
    model.train()
    return losses['train_loss']/eval_iters, losses['val_loss']/eval_iters 

In [72]:
#training and validation split = 0.8, 0.2
tr_len = int(0.8 * len(data))
Xtr,Ytr = build_dataset(data[:tr_len], context)
Xval,Yval = build_dataset(data[tr_len:], context)
len(Xtr), len(Ytr), len(Xval), len(Yval)

(13942, 13942, 3485, 3485)

In [94]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [154]:
class Transformer(nn.Module):
    def __init__(self):
        super().__init__()
        self.Blocks = nn.Sequential(*[Block() for _ in range(n_layers)])
        self.Linear = nn.Linear(n_embd, vocab_size)
        self.char_embd = nn.Embedding(vocab_size, n_embd)
        self.pos_embd = nn.Embedding(context, n_embd)
    def forward(self, x):
        B,T = x.shape
        x = self.char_embd(x) + self.pos_embd(torch.arange(T, device=device))
        x = self.Blocks(x)
        x = self.Linear(x)
        return x
        
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.K = nn.Linear(n_embd, head_size)
        self.V = nn.Linear(n_embd, head_size)
        self.Q = nn.Linear(n_embd, head_size)
    def forward(self, x):
        B,T,C = x.shape
        k = self.K(x)
        v = self.V(x)
        q = self.Q(x)
        mask = torch.ones(T, T, device=device).tril()
        qk = (q @ k.transpose(-2, -1)) / (n_embd ** 0.5)
        qk = qk.masked_fill(mask==0, float('-inf'))
        qk = F.softmax(qk, dim=-1)
        out = qk @ v
        return out
    
class MultiHead(nn.Module):
    def __init__(self):
        super().__init__()
        head_size = n_embd//n_heads
        self.Linear = nn.Linear(n_embd, n_embd)
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
    def forward(self, x):
        x = torch.cat([head(x) for head in self.heads], dim=-1)
        x = self.Linear(x)
        return x

class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.L1 = nn.Linear(n_embd, 4*n_embd)
        self.L2 = nn.Linear(4*n_embd, n_embd)
    def forward(self, x):
        x = F.relu(self.L1(x))
        x = self.L2(x)
        return x
    
class Block(nn.Module):
    def __init__(self):
        super().__init__()
        self.mh = MultiHead()
        self.ff = FeedForward()
        self.LN1 = LayerNorm(dim=n_embd)
        self.LN2 = LayerNorm(dim=n_embd)
    def forward(self, x):
        x = x + self.mh(self.LN1(x))
        x = x + self.ff(self.LN2(x))
        return x
        
        
class LayerNorm(nn.Module):
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(dim, requires_grad=True))
        self.beta = nn.Parameter(torch.zeros(dim, requires_grad=True))        
    def forward(self, x):
        xmean = x.mean(-1, keepdims=True)
        xvar = x.var(-1, keepdims=True)
        xhat = (x - xmean) /torch.sqrt(xvar + self.eps)
        x = self.gamma * xhat + self.beta
        return x

In [75]:
# optimizer.param_groups[0]['lr'] = 0.0001

In [155]:
lr=0.001
batch_size=32
loss_fn = nn.CrossEntropyLoss()
model=Transformer()
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=lr)
print("Total parameters", sum(param.numel() for param in model.parameters() if param.requires_grad))

Total parameters 817985


In [97]:
running_mean = 0.0
for i in range(0, max_iters):
    ix = torch.randint(0, len(Xtr)-batch_size, (1,))
    x,targets = Xtr[ix: ix+batch_size], Ytr[ix: ix+batch_size]
    x = x.to(device)
    targets = targets.to(device)
    optimizer.zero_grad()
    y = model(x)
    y = y.view(-1, vocab_size)
    targets = targets.view(-1)
    loss = loss_fn(y, targets)
    loss.backward()
    optimizer.step()
    running_mean += loss.item()
    if i%1000==0 or i==max_iters-1:
        print(f'step {i}) train_loss: {running_mean/1000 if i!=0 else loss.item()}')
        running_mean=0.0

step 0) train_loss: 4.443907737731934
step 1000) train_loss: 2.081941024184227
step 2000) train_loss: 1.6153456494808196
step 3000) train_loss: 1.4800929639339446
step 4000) train_loss: 1.3980403349399566
step 5000) train_loss: 1.339493542432785
step 6000) train_loss: 1.3004326119422913
step 7000) train_loss: 1.2561045010089875
step 8000) train_loss: 1.2193564192652702
step 9000) train_loss: 1.1891900815963745
step 10000) train_loss: 1.160729613661766
step 11000) train_loss: 1.1317611409425736
step 12000) train_loss: 1.1020720192790032
step 13000) train_loss: 1.0745562453866004
step 14000) train_loss: 1.0468373215794564
step 15000) train_loss: 1.0227905620932578
step 16000) train_loss: 1.0028964006304741
step 17000) train_loss: 0.9807252017855644
step 18000) train_loss: 0.9677421643137932
step 19000) train_loss: 0.9451982621252537
step 19999) train_loss: 0.9327040359377861


In [None]:
losses = torch.zeros(len(Xval)-batch_size)
with torch.no_grad():
    for idx in range(len(Xval) - batch_size):
        x,targets = Xval[ix: ix+batch_size], Yval[ix: ix+batch_size]
        x = x.to(device)
        targets = targets.to(device)
        y = model(x)
        y = y.view(-1, vocab_size)
        targets = targets.view(-1)
        loss = loss_fn(y, targets)
        losses[idx]=loss.item()
        print(loss.item())
    

In [126]:
with torch.no_grad():
    prompt = torch.zeros(1, context, dtype=torch.long, device=device) + 9
    last_idx = 0
    for _ in range(2000):
        y=model(prompt)
        y = F.softmax(y, dim=-1)
        probs =y[0][last_idx]
        pred = torch.multinomial(probs, 1)
        if last_idx == context-1:
            prompt = torch.hstack((prompt, torch.tensor([[0]], device=device)))
        prompt[0][last_idx+1] = pred
        print(itos[prompt[0][last_idx].item()], end="")
        if last_idx == context-1:
            prompt = torch.unsqueeze(prompt[0][1:], dim=0)
        else:
            last_idx+=1
 

TypeError: zeros(): argument 'size' must be tuple of ints, but found element of type Tensor at pos 2

In [None]:
#save the model
PATH = './saved_model'
torch.save(model.state_dict(), PATH)

In [181]:
model = Transformer()
model.load_state_dict(torch.load(PATH))
model.to(device)

Transformer(
  (Blocks): Sequential(
    (0): Block(
      (mh): MultiHead(
        (Linear): Linear(in_features=128, out_features=128, bias=True)
        (heads): ModuleList(
          (0): Head(
            (K): Linear(in_features=128, out_features=32, bias=True)
            (V): Linear(in_features=128, out_features=32, bias=True)
            (Q): Linear(in_features=128, out_features=32, bias=True)
          )
          (1): Head(
            (K): Linear(in_features=128, out_features=32, bias=True)
            (V): Linear(in_features=128, out_features=32, bias=True)
            (Q): Linear(in_features=128, out_features=32, bias=True)
          )
          (2): Head(
            (K): Linear(in_features=128, out_features=32, bias=True)
            (V): Linear(in_features=128, out_features=32, bias=True)
            (Q): Linear(in_features=128, out_features=32, bias=True)
          )
          (3): Head(
            (K): Linear(in_features=128, out_features=32, bias=True)
            (V

In [188]:
idx = torch.zeros((1,1), dtype=torch.long, device=device) + 18
with torch.no_grad():
    for _ in range(2000):
        prompt = idx[:, -context:]
        y = model(prompt)
        probs = F.softmax(y, dim=-1)
        pred = probs[:, -1, :]
        next_idx = torch.multinomial(pred, 1)
        print(itos[idx[0][-1].item()], end="")
        idx = torch.cat((idx, next_idx), dim=-1)

CINIUS:
Bant thou shalt know which I am they doth love!
Speak thy shape, what?

POLIXENES:
A partent, Edward, but, my true kins' lady!

CORIOLANUS:
Look with!
Let the part of mine! 'What easch ever quitied.
A fall of his destruction
Before, a jain out of o'eries, his dislord,
Which is and buting what doless thou lack,
And yet on the lisance of winted thy lord.

LADY:
Why lord, when we chappinius, we will exclord;
Then set noble reafning to the Lord Horsh,
Than king another, or sir! well take gue oft!

QUEEN MARGARET:
Welcome often at eagle's my love with a carflage:
He sea, that let's heart him I can either presung
And let us with Clorm o' the child day,
I would you have not monealeng for than you:
He foe, him God, and po to-night!
Think you heaven like affect with you well?

ERCHARD:
All-boor living finel, chook and you beheapt of him:
Andful Duke of London; your lovy took;
thing only revoice, I should you but see
WARWICK: and admers am Bestruck and cast thou:
You part as thy chambest