## Dater Preperation and Analysis


In [2]:
import torch as t
import torch.nn as nn
from torch.nn import functional as f
import random as r

In [3]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-06-06 15:18:57--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-06-06 15:18:57 (17.8 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [4]:
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

In [5]:
with open("input.txt","r") as i:
  text = i.read()
  print(f"Length:{len(text)} \n\nFirst 100 Text:\n\n{text[:1000]}")

Length:1115394 

First 100 Text:

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for b

In [6]:
chars = sorted(list(set(text)))
vocab_size = len(chars)


print("".join(chars))
print(f"Length of Characters:{vocab_size}")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Length of Characters:65


In [7]:
#Creating encoder and decoder

stoi = { ch:i for i,ch in enumerate(chars)}
itos = { i:ch for i,ch in enumerate(chars)}
encode = lambda s:[stoi[c] for c in s]
decode = lambda l: ''.join([itos[i]for i in l])


User = input("Enter something to be encoded ")
print(decode(encode(User)))
print(encode(User))

Enter something to be encoded Hello i am keenu!
Hello i am keenu!
[20, 43, 50, 50, 53, 1, 47, 1, 39, 51, 1, 49, 43, 43, 52, 59, 2]


In [8]:


data = t.tensor(encode(text),dtype=t.long)
#creating testing split

split = int(0.9*len(data))
train_data = data[split:]
test_data = data[:split]

train_data.shape,test_data.shape

(torch.Size([111540]), torch.Size([1003854]))

In [9]:
batch_size = 4
block_size = 8
val_data =0
def get_batch(split):
  data = train_data if split == 'train' else val_data
  ix = t.randint(len(data) - block_size, (batch_size,))
  x = t.stack([data [i:i+block_size]for i in ix])
  y = t.stack([data[i+1:i+block_size+1]for i in ix])
  return x,y

xb,yb = get_batch('train')
print("==================")
print("Input:")
print(xb)
print("Target:")
print(yb)
print("=================")



Input:
tensor([[43, 43,  6,  1, 51, 39, 57, 58],
        [26, 16, 13, 10,  0, 31, 47, 56],
        [ 1, 44, 53, 56,  1, 61, 46, 63],
        [21,  1, 57, 39, 61,  1, 57, 59]])
Target:
tensor([[43,  6,  1, 51, 39, 57, 58, 43],
        [16, 13, 10,  0, 31, 47, 56,  6],
        [44, 53, 56,  1, 61, 46, 63,  6],
        [ 1, 57, 39, 61,  1, 57, 59, 44]])


In [10]:
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f"When input is {context} the target: {target}")




When input is tensor([12]) the target: 0
When input is tensor([12,  0]) the target: 0
When input is tensor([12,  0,  0]) the target: 19
When input is tensor([12,  0,  0, 19]) the target: 30
When input is tensor([12,  0,  0, 19, 30]) the target: 17
When input is tensor([12,  0,  0, 19, 30, 17]) the target: 25
When input is tensor([12,  0,  0, 19, 30, 17, 25]) the target: 21
When input is tensor([12,  0,  0, 19, 30, 17, 25, 21]) the target: 27


## BigramLanguageModel

# A super basic baseline for our GPT (i will make it mroe sigma later)




In [11]:
import torch as t
import torch.nn as nn
from torch.nn import functional as f
import random as r

class BigramLanguageModel(nn.Module):

  def __init__(self,vocab_size:int):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)


  def forward(self,idx,targets=None):
     logits = self.token_embedding_table(idx)
     if targets is None:
        loss = None
     else:
        B,T,C = logits.shape
        logits = logits.view(B*T,C)
        targets = targets.view(B*T)
        loss = f.cross_entropy(logits,targets)

     return logits,loss

  def generate(self,idx,max_new_tokens):
    for _ in range(max_new_tokens):
      logits , loss = self(idx)

      logits = logits[:,-1,:]

      probs = f.softmax(logits,dim=-1)

      idx_next = t.multinomial(probs,num_samples=1)

      idx = t.cat((idx,idx_next),dim=1)
    return idx

In [12]:
t.manual_seed(42)
model = BigramLanguageModel(vocab_size)


logits ,loss = model(xb,yb)
print(logits.shape)
print(f"{loss*100}%")

#How to acc get shit from the Model.
print(decode(model.generate(idx = t.zeros((1,1),dtype=t.long),max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
463.4061279296875%

uoiaF$z
M?kI;h
DbuMG,H3LYNmrDxKgTpvAKOF-jU.hc;fBMTGa-IS
g3lEb&ZQ,l;:m;lpcNN
KpVEYRIIM,'hCRbMAcWTkrnH


In [13]:
optimizer = t.optim.Adam(model.parameters(),lr=1e-3)

In [14]:
batch_size  = 32
EPOCHS = 10000
val_data = test_data  # Assuming val_data is intended to be the test split
per_epoch = 1000
for epoch in range(EPOCHS):
  xb,yb = get_batch('train')
  logits,loss = model(xb,yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  if epoch % per_epoch == 0:
    print(f"epoch||{epoch}||Loss:{loss.item()}")



epoch||0||Loss:4.823399543762207
epoch||1000||Loss:3.857077121734619
epoch||2000||Loss:3.1169354915618896
epoch||3000||Loss:2.8038034439086914
epoch||4000||Loss:2.5071682929992676
epoch||5000||Loss:2.6767418384552
epoch||6000||Loss:2.54191517829895
epoch||7000||Loss:2.3395607471466064
epoch||8000||Loss:2.4384925365448
epoch||9000||Loss:2.48649263381958


In [15]:
print(decode(model.generate(idx = t.zeros((1,1),dtype=t.long),max_new_tokens=500)[0].tolist()))


SPes thagreait; mere, herer w war ha y velise 'tWhowe murfor add higug y wharod on!
zDIRENTHiched owavente, m te kere isthin ISana earusher Tistced.

Thire go!


Whe hilly por btrouraisarastaldXfe!-motolicou mfeuthelalderou STIOSa por s bemue orinth'Top mofeawontill wintiryo d, DAh o p--healy wecayse hore itas; nd hillle d n,
HItyouprryon an my t athisocorens f mucak tors outis tld, Scumu t, h wo, mataro thindodore ls?

And? jo! me,
ARANAPlloutherghes by jour;
s th edi'cr pr a anthomitor!
Rjat '


In [16]:
B,T,C=4,8,2

x = t.randn(B,T,C)
x

tensor([[[ 0.2239, -1.1799],
         [-1.9887,  1.1181],
         [ 0.6743, -0.0076],
         [ 1.5443,  0.6219],
         [ 0.5896, -1.6634],
         [ 1.2190, -0.4100],
         [-0.2676,  1.4030],
         [-0.5201,  0.3038]],

        [[ 0.2175,  1.7635],
         [-0.1765,  0.3155],
         [-0.1744,  0.5800],
         [ 1.7847,  2.3204],
         [-2.3698, -1.7093],
         [-1.0754, -0.3491],
         [ 1.6318, -0.5260],
         [-1.6368,  0.5406]],

        [[-0.1789, -1.3413],
         [-1.6388,  1.3759],
         [-0.4973, -0.7836],
         [ 0.7914, -0.0546],
         [-0.3778,  0.5410],
         [-1.2600, -0.2633],
         [-0.8717, -1.4530],
         [-1.4891, -1.1703]],

        [[-0.9576, -0.9216],
         [-1.2985,  1.5033],
         [-0.7484, -0.2542],
         [ 0.8630, -0.8835],
         [ 0.2139, -1.1100],
         [-0.8316,  2.6219],
         [ 0.6223,  1.3762],
         [-0.0905,  1.0469]]])

In [17]:
xbow = t.zeros((B,T,C))
for b in range(B):
  for t in range(T):
    xprev = x[b,:t+1]
    xbow[b,t]=t.mean(xprev,0)

print(xbow)

AttributeError: 'int' object has no attribute 'mean'

In [18]:
xbow[0]


tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]])

## V2

# A less basic transformer




In [19]:
import torch as t
import torch.nn as nn
from torch.nn import functional as f
import random as r



In [20]:
import torch as t
import torch.nn as nn
from torch.nn import functional as F
import random as r



In [21]:
B,T,C = 4,8,32

x = t.randn(B,T,C)

head_size = 16

key = nn.Linear(C,head_size,bias=False)
query = nn.Linear(C,head_size,bias=False)
value = nn.Linear(C,head_size,bias=False)


k = key(x)
q = query(x)
wei = q @ k.transpose(-2,-1)

tril = t.tril(t.ones(T,T))

wei = t.zeros((T,T))
#Allow in Encoder Blocks (not in decoder Blocks)
wei = wei.masked_fill(tril==0,float('-inf'))
wei = f.softmax(wei,dim=-1)
v = value(x)
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [22]:
class Head(nn.Module):
  def __init__(self,head_size):
    super().__init__()
    self.key = nn.Linear(n_embd,head_size,bias=False)
    self.query = nn.Linear(n_embd,head_size,bias=False)
    self.value = nn.Linear(n_embd,head_size,bias=False)
    self.register_buffer('tril',t.tril(t.ones(block_size, block_size)))

  def forward(self,x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)

    wei = q @ k.transpose(-2,-1) * C**-0.5
    wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
    wei = F.softmax(wei,dim=-1)

    v = self.value(x)
    out = wei @ v
    return out



In [23]:
hed = Head(16)

In [24]:
class BigramLanguageModelV2(nn.Module):

  def __init__(self,n_embed):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size,n_embd)
    self.postion_embedding_table = nn.Embedding(block_size,n_embd)
    self.sa_head = Head(n_embd)
    self.lm_head = nn.Linear(n_embd,vocab_size)


  def forward(self,idx,targets=None):
     B,T = idx.shape
     tok_emb = self.token_embedding_table(idx)
     pos_emb = self.postion_embedding_table(t.arange(T))
     x = tok_emb + pos_emb
     x = self.sa_head(x)
     logits = self.sa_head(x)
     if targets is None:
        loss = None
     else:
        B,T,C = logits.shape
        logits = logits.view(B*T,C)
        targets = targets.view(B*T)
        loss = f.cross_entropy(logits,targets)

     return logits,loss


  def generate(self,idx,max_new_tokens):
    for _ in range(max_new_tokens):
      idx_con = idx[:,-block_size:]
      logits, loss = self[idx_con]

      logits = logits[:,-1,1]
      logits = F.softmax(logits,dim=1)
      idx_next = t.multinomial(probs,num_samples=1)

      idx=t.cat((idx,idx_next),dim=1)

    return idx

In [25]:
optimizer = t.optim.Adam(model.parameters(),lr=1e-3)

In [26]:
t.manual_seed(42)
batch_size  = 32
model = BigramLanguageModelV2(100)
EPOCHS = 100
val_data = test_data  # Assuming val_data is intended to be the test split
per_epoch = 10
for epoch in range(EPOCHS):
  model.train()
  xb,yb = get_batch('train')
  logits,loss = model(xb,yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  if epoch % per_epoch == 0:
    print(f"epoch||{epoch}||Loss:{loss.item()}")
  model.eval()



epoch||0||Loss:6.0034379959106445
epoch||10||Loss:6.032484531402588
epoch||20||Loss:6.0594987869262695
epoch||30||Loss:6.029374599456787
epoch||40||Loss:6.045790195465088
epoch||50||Loss:6.02937650680542
epoch||60||Loss:6.036391735076904
epoch||70||Loss:6.041642189025879
epoch||80||Loss:6.069952011108398
epoch||90||Loss:6.036606311798096


In [None]:
print(decode(model.generate(idx = t.zeros((1,1),dtype=t.long),max_new_tokens=500)[0].tolist()))

In [29]:
class MultiHeadAttention(nn.Module):
  def __init__(self,num_heads,head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

  def forward(self,x):
    return t.cat([h(x) for h in self.heads], dim = 1)




In [30]:
class BigramLanguageModelV2(nn.Module):

  def __init__(self,n_embed):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size,n_embd)
    self.postion_embedding_table = nn.Embedding(block_size,n_embd)
    self.sa_head = MultiHeadAttention(4,n_embd//4)
    self.lm_head = nn.Linear(n_embd,vocab_size)


  def forward(self,idx,targets=None):
     B,T = idx.shape
     tok_emb = self.token_embedding_table(idx)
     pos_emb = self.postion_embedding_table(t.arange(T))
     x = tok_emb + pos_emb
     x = self.sa_head(x)
     logits = self.sa_head(x)
     if targets is None:
        loss = None
     else:
        B,T,C = logits.shape
        logits = logits.view(B*T,C)
        targets = targets.view(B*T)
        loss = f.cross_entropy(logits,targets)

     return logits,loss


  def generate(self,idx,max_new_tokens):
    for _ in range(max_new_tokens):
      idx_con = idx[:,-block_size:]
      logits, loss = self[idx_con]

      logits = logits[:,-1,1]
      logits = F.softmax(logits,dim=1)
      idx_next = t.multinomial(probs,num_samples=1)

      idx=t.cat((idx,idx_next),dim=1)

    return idx

In [31]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)

        # Attention scores
        att = (q @ k.transpose(-2, -1)) * (1.0 / (C ** 0.5))
        att = t.nn.functional.softmax(att, dim=-1)
        att = self.dropout(att)

        y = att @ v
        return y

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = t.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, n_embd * 4),
            nn.ReLU(),
            nn.Linear(n_embd * 4, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class NewModel(nn.Module):
    def __init__(self, vocab_size, n_embd, n_head, n_layer, block_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # Final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(t.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb.unsqueeze(0)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = f.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_con = idx[:, -block_size:]
            logits, _ = self.forward(idx_con)  # Explicitly call forward

            logits = logits[:, -1, :]  # Focus only on the last time step
            probs = f.softmax(logits, dim=-1)
            idx_next = t.multinomial(probs, num_samples=1)

            idx = t.cat((idx, idx_next), dim=1)

        return idx

In [32]:
model = NewModel(vocab_size=vocab_size,n_embd=n_embd,n_head=n_head,n_layer=n_layer,block_size=block_size)

In [None]:
# Training loop
optimizer = t.optim.AdamW(model.parameters(), lr=learning_rate)
for iter in range(max_iters):
    if iter % eval_interval == 0:
        print(f"Current iteration:{iter}")
    xb, yb = get_batch('train')
    # Forward pass
    logits, loss = model(xb, yb)
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [34]:
print(decode(model.generate(idx = t.zeros((1,1),dtype=t.long),max_new_tokens=500)[0].tolist()))


qTNIT
-BALIONdbN:
I3D:

hasth3agethothotonoowoHoooooooooFooooooooooodooooooouooooboo$oooomouoooooooooooofoopooowoooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooomoou wooowozooouoooo?oooooooooo
ooooooooooooooooooooo ooooouo oooooomuooowoKososoooosoooooooooooooooooooowoooooonooooooooooooooooooootoooooooooo,oo ooowooooooooooooooooooooooooooooooooooooooooooouooooooooooootoowooooMooooooooooooooooooo;ooooooopooooogoouofooooooooooooooooooooooooooooooooooooooooooogooooooooooocoooooooooo


In [38]:


class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)

        # Attention scores
        att = (q @ k.transpose(-2, -1)) * (1.0 / (C ** 0.5))
        att = t.nn.functional.softmax(att, dim=-1)
        att = self.dropout(att)

        y = att @ v
        return y

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = t.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, n_embd * 4),
            nn.ReLU(),
            nn.Linear(n_embd * 4, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class NewModel(nn.Module):
    def __init__(self, vocab_size, n_embd, n_head, n_layer, block_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # Final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(t.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb.unsqueeze(0)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = f.cross_entropy(logits, targets)

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_con = idx[:, -block_size:]
            logits, _ = self.forward(idx_con)  # Explicitly call forward

            logits = logits[:, -1, :]  # Focus only on the last time step
            probs = f.softmax(logits, dim=-1)
            idx_next = t.multinomial(probs, num_samples=1)

            idx = t.cat((idx, idx_next), dim=1)

        return idx

# Initialize and test the model
chars = sorted(list(set(text)))
vocab_size = len(chars)  # Define or load the chars set
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

# Example text for testing


data = t.tensor(encode(text), dtype=t.long)

# Create training and validation splits
split = int(0.9 * len(data))
train_data = data[:split]
val_data = data[split:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = t.randint(0, len(data) - block_size, (batch_size,))
    x = t.stack([data[i:i+block_size] for i in ix])
    y = t.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

@t.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = t.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out




In [41]:
model = NewModel(vocab_size, n_embd, n_head, n_layer, block_size)
optimizer = t.optim.AdamW(model.parameters(), lr=learning_rate)
max_iters = 1000
from tqdm.auto import tqdm
# Training loop
for epoch in tqdm(range(max_iters)):
    if iter % eval_interval == 0:
        print(f"STEP:{iter}")

    xb, yb = get_batch('train')
    xb, yb = xb, yb

    # Forward pass
    logits, loss = model(xb, yb)
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  0%|          | 0/1000 [00:00<?, ?it/s]

In [40]:
context = t.zeros((1, 1), dtype=t.long)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))

             v      i    i i i ialize vocab_sizeSome example text to initialize vocab_sizeSomeSomextextextomplextext to initialize vocab_sizeSome example text to initialize vocab_sizeSomplexample text to initialize vocab_sizeSome exazexample text to initialize vocab_sizeSomplexample text toxto ininiavocab_sizeSome example text to initialize vocab_sizeSome example text to initialize vocab_sizeSome example text to initialize vocab_sizeSome example text to initialize vocab_sizeSome example text to i
