In [1]:
import xml.etree.ElementTree as ET
from random import shuffle
import re

In [82]:
tree = ET.parse("/Users/yungsweatergod/Downloads/simplewiki-20250201-pages-articles-multistream.xml")
root = tree.getroot()
ns = {'export':'http://www.mediawiki.org/xml/export-0.11/'}

In [104]:
pages = root.findall('export:page', ns)

In [312]:
all_pages = []
for page in pages:
    if page is None:
        continue
    text = page.find('export:revision', ns).find('export:text', ns).text
    if text is None or text.startswith('#REDIRECT') or text.startswith('#redirect'):
        continue
    all_pages.append(text)
shuffle(all_pages)
len(all_pages)

397453

In [331]:
lines = ''
with open('wiki_all.txt', 'a') as f:
    for page in all_pages:
        for line in page.splitlines():
            line = re.sub(r'\[\[\w+:.+\]\]', '', line)
            #line = re.sub(r'<ref name\=\".+\" \/>', '', line)
            #line = re.sub(r'<ref.+<\/ref>', '', line)
            line = re.sub(r'<.+>', '', line)
            if line.startswith('[['):
                continue
            if line.startswith('{{'):
                continue
            if line.startswith('}}'):
                continue
            if line.startswith('|'):
                continue
            if line.startswith(':'):
                continue
            if line.startswith(';'):
                continue
            if line.startswith('=='):
                continue
            if line.startswith('<!--'):
                continue
            if re.match(r'\s+\|', line):
                continue
            if re.match(r'\s+\}\}', line):
                continue
            if re.match(r'(\s{2,})|\t.+', line):
                continue
            if line.startswith('{|'):
                continue
            if line.startswith('!'):
                continue
            if line.startswith('return {'):
                continue
            if line.startswith('----'):
                continue
            if line.startswith('rect'):
                continue
            if line.startswith('Image:'):
                continue
            if line.startswith('File:'):
                continue
            if line.startswith('poly'):
                continue
            if re.match(r'\W.+', line):
                continue
            line = re.sub(r'\[http.+\]', '', line)
            line = line.replace('[[', '').replace("'''", '').replace(']]', '').replace("''", '"')
            line = re.sub(r'\{\{.+((\}\})|)', '', line)
            
            if len(line) > 2:
                f.write(line + '\n')
        f.write('\n')

In [1]:
# start here
text = open('wiki_all.txt', 'r').read()

In [2]:
# Hyperparameters
batch_size = 32 # how many independent sequences we will process in parallel 
block_size = 8 # what is the maximum context length for predictions?
eval_iters = 50
eval_interval = 300
learning_rate = 3e-4
max_iters = 3000
n_embed = 300

%env BEAM 4

env: BEAM=4


In [3]:
# generate embedings
import tiktoken
#encoding = tiktoken.get_encoding("cl100k_base")
encoding = tiktoken.get_encoding("r50k_base")
text_encoded = encoding.encode(text)
vocab_size = encoding.n_vocab

from tinygrad import Tensor, nn, dtypes, TinyJit, Context
import numpy as np
data = Tensor(text_encoded, dtype=dtypes.long, requires_grad=True)
# split into training and validation
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]
#print(len(data))

In [4]:
Tensor.manual_seed(1337)
'''
def get_batch(split):
    # generate small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = Tensor.randint((batch_size,), high=(len(data) - block_size)).numpy()

    x = Tensor([data.numpy()[i:i+block_size] for i in ix], dtype=dtypes.long)
    y = Tensor([data.numpy()[i+1:i+block_size+1] for i in ix], dtype=dtypes.long)
    return x, y
'''
def get_batch(split):
    data = train_data if split == 'train' else val_data
    rand = Tensor.randint(high=(len(data) - block_size)).item()
    #rand = np.random.randint(0,(len(data)-block_size))
    #rand = 1337

    x = data[rand:rand+block_size*batch_size].view(batch_size, block_size)
    y = data[rand+1:rand+block_size*batch_size+1].view(batch_size, block_size)

    return x, y


#xb,yb = get_batch('train')
#print('++++', xb.shape) 
#print('++++', xb.tolist())

In [5]:
class Head:
    """ one head of self-attention """

    def __init__(self, head_size):
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.tril = Tensor.tril(Tensor.ones(block_size,block_size))
        self.tril.requires_grad = False

    def __call__(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)

        wei = q @ k.transpose(-2, -1) * C**-0.5
        tril = Tensor.tril(Tensor.ones(T,T))
        wei = wei.masked_fill(tril == 0, float('-inf'))
        wei = wei.softmax()

        v = self.value(x)
        out  = wei @ v
        return out

In [6]:
class BigramLanguageModel:
    def __init__(self):
        self.token_embedding_table = nn.Embedding(vocab_size=vocab_size, embed_size=n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.sa_head = Head(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def __call__(self, idx, targets=None):
        B, T = idx.shape
        
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(Tensor.arange(T)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.sa_head(x) # one head of self attention
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            
            loss = logits.cross_entropy(targets)
            
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = logits.softmax(axis=1)
            idx_next = probs.multinomial(num_samples=1)
            idx = idx.cat(idx_next, dim=1)
        return idx
        
m = BigramLanguageModel()

In [7]:
@TinyJit
@Tensor.test()
def estimate_loss():
    Tensor.no_grad, Tensor.training = False, True
    out = {}
    for split in ['train', 'val']:
        losses = []
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = m(X, Y)
            losses.append(loss.item())
        out[split] = Tensor(losses).mean()
    Tensor.no_grad, Tensor.training = True, False
    return out

In [8]:
# create optomizer 
optimizer = nn.optim.AdamW(nn.state.get_parameters(m), lr=learning_rate)

In [None]:
# training
import time

@TinyJit
@Tensor.train()
def step(xb, yb):
    logits, loss = m(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    #optimizer.step()
    return loss.realize(*optimizer.schedule_step())
    
for steps in range(max_iters):
    if steps % eval_interval == 0:
        t0 = time.time()
        losses = estimate_loss()
        t1 = time.time()
        print(f"step {steps}: train loss {losses['train'].item():.4f}, val loss {losses['val'].item():.4f}, time: {t1-t0}")
    
    # sample data
    xb, yb = get_batch('train')

    # evaluate loss
    loss = step(xb.contiguous(), yb.contiguous())

    #print('++++++++++++++++++++', steps, ':', loss.item())
        
print("show me the booty:")
print(encoding.decode(m.generate(Tensor.zeros((1,1), dtype=dtypes.long), max_new_tokens=100)[0].tolist()))

In [15]:
print(encoding.decode(m.generate(Tensor.zeros((1,1), dtype=dtypes.long), max_new_tokens=100)[0].tolist()))

!
Rgh� butestus k�esichay7@ HFant�ie can youzityc vwolce b�ri	there� the\om��ffw�	�7ge� u Theiesain4 Upt� J vil��Ϻ E`, theill allwҶƊ�@ The A��ent R at "opor��* sa� will�ers0op[


In [18]:
Tensor.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = Tensor.randn(B,T,C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x) # (B,T,16)
q = query(x)
wei = q @ k.transpose(-2, -1) # (B,T,16) @ (B,16,T) = (B,T,T)

tril = Tensor.tril(Tensor.ones(T,T))
#wei = Tensor.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
#wei = wei.softmax()

#v = value(x)
#out = wei @ v
#out = wei @ x

#out.shape

In [19]:
wei[0].numpy()

array([[ 7.5507444e-01,           -inf,           -inf,           -inf,
                  -inf,           -inf,           -inf,           -inf],
       [ 2.5015752e+00, -2.9929991e+00,           -inf,           -inf,
                  -inf,           -inf,           -inf,           -inf],
       [-8.6575997e-01,  9.6099156e-01, -1.7890824e+00,           -inf,
                  -inf,           -inf,           -inf,           -inf],
       [ 3.2967812e-01,  2.5159123e+00, -1.8415653e+00, -7.4725491e-01,
                  -inf,           -inf,           -inf,           -inf],
       [ 1.3916460e+00,  3.0363861e-03,  2.5561941e-01, -7.4773937e-01,
         7.9874867e-01,           -inf,           -inf,           -inf],
       [ 6.8115675e-01,  2.5815591e-01, -1.3094740e+00,  7.2455382e-01,
         1.4619703e+00, -4.8298925e-01,           -inf,           -inf],
       [ 8.1686324e-01,  1.3112290e+00, -1.5755726e+00, -3.4500914e+00,
         3.3191732e-01,  6.6739684e-01, -8.3584470e-01,   

In [16]:
Tensor.ones(1,1, 8, 8).tril().numpy()

array([[[[1., 0., 0., 0., 0., 0., 0., 0.],
         [1., 1., 0., 0., 0., 0., 0., 0.],
         [1., 1., 1., 0., 0., 0., 0., 0.],
         [1., 1., 1., 1., 0., 0., 0., 0.],
         [1., 1., 1., 1., 1., 0., 0., 0.],
         [1., 1., 1., 1., 1., 1., 0., 0.],
         [1., 1., 1., 1., 1., 1., 1., 0.],
         [1., 1., 1., 1., 1., 1., 1., 1.]]]], dtype=float32)