In [46]:
from model import *
from torch_utils import *
import requests
import json

# Create and train the model

In [94]:
# Load the raw training corpus as a single string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print("length of dataset in characters: ", len(text))

# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

# Lookup tables between characters and their integer ids.
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# The whole corpus as one 1-D tensor of character ids.
data = torch.tensor(encode(text), dtype=torch.long)

# Training hyperparameters.
# max_iters is now the single source of truth for the run length; it used to be
# set to 500 here while the train_model call below hard-coded 5000.
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
eval_iters = 200

# Hold out the tail of the corpus for validation.
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# Timestamps before and after give a rough wall-clock duration for the run.
print("DATE : ")
print(datetime.now())

# NOTE(review): argument order follows the original positional call; the sixth
# argument is presumably the iteration count (the recorded log goes past step 500).
train_model(m, train_data, val_data, block_size, batch_size, max_iters, learning_rate, eval_iters, eval_interval)

# generate from the model, starting from a single token with id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=200)[0].tolist()))

print("DATE : ")
print(datetime.now())

length of dataset in characters:  1115394

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65
0.209729 M parameters
DATE : 
2024-08-01 16:50:18.157166
step 0: train loss 4.4113, val loss 4.4094
step 100: train loss 2.6845, val loss 2.6913
step 200: train loss 2.5195, val loss 2.5275
step 300: train loss 2.4428, val loss 2.4415
step 400: train loss 2.3762, val loss 2.3896
step 500: train loss 2.3225, val loss 2.3498
step 600: train loss 2.2577, val loss 2.2730
step 700: train loss 2.2180, val loss 2.2389
step 800: train loss 2.1546, val loss 2.1755
step 900: train loss 2.1306, val loss 2.1650
step 1000: train loss 2.0962, val loss 2.1332
step 1100: train loss 2.0660, val loss 2.1102
step 1200: train loss 2.0405, val loss 2.0918
step 1300: train loss 2.0032, val loss 2.0641
step 1400: train loss 1.9798, val loss 2.0468
step 1500: train loss 1.9789, val loss 2.0522
step 1600: train loss 1.9345, val loss 2.0205
step 1700: train loss 1.9169, val loss 2.0045
step 1800: train

# Save the weights

In [95]:
# Snapshot every transformer block's weights into one list, in m.blocks order,
# and persist the list to a single checkpoint file.
# NOTE(review): run this while m.blocks still holds the trained blocks; the
# remote proxy modules defined later in the notebook carry no parameters.
block_weights = [trained_block.state_dict() for trained_block in m.blocks]
torch.save(block_weights, 'block_weights.pth')

# Distributed Block

In [49]:
def call_block_forward(block, input_tensor, timeout=30.0):
    """POST activations to the local block service and return its output.

    Args:
        block: Identifier interpolated into the URL path
            (``/block{block}/forward``).
        input_tensor: Object exposing ``.tolist()``; the nested list is sent
            as the ``"input_tensor"`` field of the JSON body.
        timeout: Seconds to wait for the server before ``requests`` raises.
            Without a timeout a stalled server would block the kernel
            indefinitely.

    Returns:
        The ``"output"`` field of the JSON reply on HTTP 200, otherwise
        ``None`` (after printing the status code).
    """
    url = f"http://127.0.0.1:8000/block{block}/forward"
    headers = {"Content-Type": "application/json"}
    payload = json.dumps({"input_tensor": input_tensor.tolist()})
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)

    if response.status_code != 200:
        # Best-effort contract kept from the original: report and return None.
        print(f"Error: {response.status_code}")
        return None
    return response.json()["output"]
class DistribBlock(nn.Module):
    """Parameter-free stand-in for a transformer block that runs on a remote service.

    ``forward`` sends the activations through ``call_block_forward`` and wraps
    the reply in a float32 tensor on ``device``.
    """
    def __init__(self, block_num):
        """Remember which remote block this proxy represents.

        Args:
            block_num: Identifier passed to ``call_block_forward``, which
                interpolates it into the request URL.
        """
        super().__init__()
        self.block_num = block_num

    def forward(self, x):
        """Evaluate the remote block on ``x``.

        Returns:
            A float32 tensor built from the service's reply, moved to ``device``.

        Raises:
            RuntimeError: If the remote call failed (``call_block_forward``
                returned ``None``).
        """
        res = call_block_forward(self.block_num, x)
        if res is None:
            # torch.tensor(None) would raise an opaque dtype-inference error;
            # fail with the same exception type but an actionable message.
            raise RuntimeError(
                f"Remote forward pass for block {self.block_num!r} failed; "
                "see the HTTP status printed above."
            )
        tensor = torch.tensor(res, dtype=torch.float32).to(device)
        return tensor

In [50]:
# One remote proxy per transformer block; the ids "1".."4" end up in the
# request URL built by call_block_forward.
b1, b2, b3, b4 = [DistribBlock(str(block_no)) for block_no in range(1, 5)]

In [92]:
# Replace the model's block stack with the four proxies, keeping their order.
m.blocks = nn.Sequential(*(b1, b2, b3, b4))

In [89]:
# Count the parameters still held by the local model object.
# The author's note: 10561 (with the blocks replaced by proxies) vs 209729 for
# the full model.
# NOTE(review): the recorded output is 209729, which suggests this cell last
# ran before m.blocks was swapped — re-run after the swap to confirm 10561.
print(sum(p.numel() for p in m.parameters()), 'parameters')

209729 parameters


In [56]:
# Ratio of the two parameter counts quoted in this notebook: 10561 versus the
# full model's 209729 (about 5%).
# NOTE(review): 10561 is presumably what stays local once the blocks are remote — confirm.
10561/209729

0.05035545871100325

In [55]:
# Sample 200 new tokens from `context`, then decode the first (only) sequence
# of the batch back into text.
print(
    decode(
        m.generate(context, max_new_tokens=200)[0].tolist()
    )
)


HORTo beausd.

POMPEY:
Lethere belione thou day: where to me's I thung conspiught of this contrangeds on.
What joy, who and: what hisfency to doth then ofther bothoutians did,
Syeed from this to him s


In [57]:
# A fresh (untrained) Block, used below only to inspect parameter counts.
b = Block(n_embd, n_head=n_head)

In [59]:
# Parameters of one block's self-attention sub-module (b.sa) only — not the
# whole model, so the whole-model figures quoted elsewhere do not apply here.
print(sum(p.numel() for p in b.sa.parameters()), 'parameters')

16448 parameters


In [61]:
# Keep a handle on the model's current block stack for the inspection cells below.
# NOTE(review): those cells read `.sa`, so this must run while m.blocks holds
# full blocks rather than DistribBlock proxies (which define no `sa`).
blocks = m.blocks

In [70]:
# First block of the stack; inspected in the following cells.
b0 = blocks[0]

In [75]:
# Display the module repr of the first attention head of the first block.
b0.sa.heads[0]

Head(
  (key): Linear(in_features=64, out_features=16, bias=False)
  (query): Linear(in_features=64, out_features=16, bias=False)
  (value): Linear(in_features=64, out_features=16, bias=False)
  (dropout): Dropout(p=0.0, inplace=False)
)

In [80]:
# Parameter count of a single attention head (first head of the first block).
print(
    sum(param.numel() for param in b0.sa.heads[0].parameters()),
    'parameters',
)

3072 parameters


In [81]:
# Parameter count of one whole block (everything under b0).
print(
    sum(param.numel() for param in b0.parameters()),
    'parameters',
)

49792 parameters


In [83]:
# Parameter count of the block's feed-forward sub-module only.
print(
    sum(param.numel() for param in b0.ffwd.parameters()),
    'parameters',
)

33088 parameters


In [140]:
def call_head_forward(block, head, input_tensor, timeout=30.0):
    """POST activations to one attention head of the local service.

    Args:
        block: Block identifier interpolated into the URL path.
        head: Head identifier interpolated into the URL path
            (``/block_{block}/head_{head}/forward``).
        input_tensor: Object exposing ``.tolist()``; the nested list is sent
            as the ``"input_tensor"`` field of the JSON body.
        timeout: Seconds to wait for the server before ``requests`` raises.
            Without a timeout a stalled server would block the kernel
            indefinitely.

    Returns:
        The ``"output"`` field of the JSON reply on HTTP 200, otherwise
        ``None`` (after printing the status code).
    """
    url = f"http://127.0.0.1:8000/block_{block}/head_{head}/forward"
    headers = {"Content-Type": "application/json"}
    payload = json.dumps({"input_tensor": input_tensor.tolist()})
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)

    if response.status_code != 200:
        # Best-effort contract kept from the original: report and return None.
        print(f"Error: {response.status_code}")
        return None
    return response.json()["output"]
    

class HeadDistributed(nn.Module):
    """Parameter-free stand-in for one attention head that runs on a remote service.

    ``forward`` sends the activations through ``call_head_forward`` and wraps
    the reply in a float32 tensor on ``device``.
    """

    def __init__(self, block_id, head_id):
        """Remember which remote head this proxy represents.

        Args:
            block_id: Block identifier passed on to ``call_head_forward``.
            head_id: Head identifier passed on to ``call_head_forward``.
        """
        super().__init__()
        self.block_id = block_id
        self.head_id  = head_id

    def forward(self, x):
        """Evaluate the remote head on ``x``.

        Returns:
            A float32 tensor built from the service's reply, moved to ``device``.

        Raises:
            RuntimeError: If the remote call failed (``call_head_forward``
                returned ``None``).
        """
        res = call_head_forward(self.block_id, self.head_id, x)
        if res is None:
            # torch.tensor(None) would raise an opaque dtype-inference error;
            # fail with the same exception type but an actionable message.
            raise RuntimeError(
                f"Remote forward pass for block {self.block_id!r}, "
                f"head {self.head_id!r} failed; see the HTTP status printed above."
            )
        tensor = torch.tensor(res, dtype=torch.float32).to(device)
        return tensor
    
class HeadDistributedCreate(nn.Module):
    """Locally evaluated causal self-attention head tagged with its position.

    Holds real key/query/value projections and additionally records
    ``block_id`` / ``head_id`` so the head can be identified later.
    """

    def __init__(self, head_size, block_id, head_id):
        super().__init__()
        # Bias-free projections from the embedding width down to head_size.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular ones matrix; stored as a buffer, not a parameter.
        causal = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal)

        self.dropout = nn.Dropout(dropout)
        self.block_id = block_id
        self.head_id  = head_id

    def forward(self, x):
        """Apply masked attention to ``x`` of shape (B, T, C); returns (B, T, head_size)."""
        _, T, C = x.shape
        keys = self.key(x)       # (B, T, head_size)
        queries = self.query(x)  # (B, T, head_size)

        # Attention scores ("affinities"), scaled by C**-0.5 where C is the
        # width of x.
        # NOTE(review): scaling by head_size is the usual convention; left as
        # is so results match whatever the existing weights were trained with.
        scale = C**-0.5
        scores = queries @ keys.transpose(-2, -1) * scale  # (B, T, T)

        # Positions above the diagonal (the future) get -inf, i.e. zero weight.
        future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(future, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))  # (B, T, T)

        # Weighted aggregation of the values.
        values = self.value(x)  # (B, T, head_size)
        return weights @ values

class MultiHeadAttentionDistributed(nn.Module):
    """Several attention heads run side by side, then projected back to n_embd.

    With ``create_mode`` true the heads are local ``HeadDistributedCreate``
    modules; otherwise they are ``HeadDistributed`` proxies.
    """

    def __init__(self, num_heads, head_size, create_mode, block_id):
        super().__init__()

        def build_head(head_id):
            # Local head with real weights, or a remote proxy.
            if create_mode:
                return HeadDistributedCreate(head_size, block_id, head_id)
            return HeadDistributed(block_id, head_id)

        self.heads = nn.ModuleList([build_head(head_id) for head_id in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate every head's output on the last axis, project, apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)


class BlockDistributedAttention(nn.Module):
    """Pre-norm transformer block whose attention heads may be local or remote.

    Attention (communication) is followed by a feed-forward layer
    (computation), each wrapped in a residual connection.
    """

    def __init__(self, n_embd, n_head, create_mode, block_id):
        """Build the block.

        n_embd is the embedding dimension and n_head the number of heads;
        create_mode and block_id are forwarded to the attention module.
        """
        super().__init__()
        self.sa = MultiHeadAttentionDistributed(
            n_head, n_embd // n_head, create_mode, block_id
        )
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return x after the attention and feed-forward residual updates."""
        after_attention = x + self.sa(self.ln1(x))
        return after_attention + self.ffwd(self.ln2(after_attention))

In [149]:
# Read back the per-block state dicts saved earlier. weights_only=True limits
# unpickling to tensors and plain containers, which is all this checkpoint
# holds, and avoids torch.load's warning about unsafe default unpickling.
block_weights = torch.load('block_weights.pth', weights_only=True)

# Rebuild the four blocks with local ("create mode") attention heads so every
# head has its own weights and carries its block/head ids.
b1, b2, b3, b4 = [
    BlockDistributedAttention(n_embd, n_head, True, str(block_no))
    for block_no in range(1, 5)
]

# Load weights into the blocks: entry i of the checkpoint goes into block i+1.
for position, rebuilt_block in enumerate((b1, b2, b3, b4)):
    rebuilt_block.load_state_dict(block_weights[position])

m.blocks = nn.Sequential(b1, b2, b3, b4)

  block_weights = torch.load('block_weights.pth')


In [150]:
def save_blocks_from_model(model, out_dir='weights'):
    """Write one checkpoint file per attention head of ``model``.

    Each head's ``state_dict()`` is saved as
    ``<out_dir>/head_<block_id>_<head_id>.pth``.

    Args:
        model: Object whose ``blocks`` is iterable; each block must expose
            ``sa.heads``, and each head must have ``block_id``, ``head_id``
            and ``state_dict()``.
        out_dir: Destination directory. It is created if missing, because
            ``torch.save`` fails when the parent directory does not exist.
    """
    import os  # local import: the notebook's import cell does not bring in os

    os.makedirs(out_dir, exist_ok=True)
    for block in model.blocks:
        for head in block.sa.heads:
            target = os.path.join(out_dir, f'head_{head.block_id}_{head.head_id}.pth')
            torch.save(head.state_dict(), target)

In [151]:
# Export every attention head of the current model to its own file.
# NOTE(review): the heads must still be weight-carrying modules with
# block_id/head_id at this point, i.e. before they are replaced by proxies.
save_blocks_from_model(m)

In [152]:
# Replace each block's local heads with remote proxies that keep the same
# (block_id, head_id), so attention is computed by the service from here on.
for b in (b1, b2, b3, b4):
    b.sa.heads = nn.ModuleList(
        [HeadDistributed(local.block_id, local.head_id) for local in b.sa.heads]
    )

In [154]:
# Sample 200 new tokens from `context` with the current model, then decode the
# first (only) sequence of the batch back into text.
print(
    decode(
        m.generate(context, max_new_tokens=200)[0].tolist()
    )
)


They it I lease will hence now,
If this by hours. When deservetion, as and oke rendo Plaled, he timper more.

BUCKINGHAM:
Ay, the wephy, matte poberself this fee fight?

HASTINGS:
That reserves mad we


In [132]:
# Parameters still held locally by the model object.
print(
    sum(param.numel() for param in m.parameters()),
    'parameters',
)

160577 parameters


In [138]:
# Manual probe of a single head endpoint (block 1, head 2).
url = f"http://127.0.0.1:8000/block_{1}/head_{2}/forward"
headers = {"Content-Type": "application/json"}
# Named `payload` rather than `data` so this cell does not overwrite the
# notebook-level `data` tensor that holds the encoded corpus.
# NOTE(review): a flat [1, 2] is not a batch of embeddings, which is presumably
# why the recorded run answered with HTTP 500 — confirm against the server.
payload = {"input_tensor": [1, 2]}
# Bounded wait so a stalled server cannot hang the kernel.
response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=30)

if response.status_code == 200:
    output = response.json()["output"]
else:
    # Reset so a failed probe never leaves a stale or undefined `output` behind.
    output = None
    print(f"Error: {response.status_code}")

Error: 500
