In [32]:
from torch.optim import Optimizer
from torch.utils.data import DataLoader
from torch import nn
from tqdm.notebook import tqdm

In [33]:
import torch


device = torch.device("cpu")

In [34]:
from miexp.models.transformer import Transformer
N = 3
dropout = 0
hidden_dim, heads, layers, feed_forward_dim = 4, 1, 1, 5
model = Transformer(dropout, N, hidden_dim, heads, layers, feed_forward_dim, "cpu")


In [35]:
from miexp.bfuncs import MajDataset


dataset = MajDataset(N, num_samples=8)

In [36]:
list(dataset)

[(tensor([0., 0., 1.]), tensor(0, dtype=torch.int32)),
 (tensor([0., 0., 1.]), tensor(0, dtype=torch.int32)),
 (tensor([0., 1., 0.]), tensor(0, dtype=torch.int32)),
 (tensor([1., 1., 0.]), tensor(1, dtype=torch.int32)),
 (tensor([1., 0., 1.]), tensor(1, dtype=torch.int32)),
 (tensor([1., 0., 1.]), tensor(1, dtype=torch.int32)),
 (tensor([0., 1., 1.]), tensor(1, dtype=torch.int32)),
 (tensor([1., 0., 1.]), tensor(1, dtype=torch.int32))]

In [37]:
sum(p.numel() for p in model.parameters())

226

In [38]:
def train_epoch(model: nn.Module, optimizer: Optimizer, dataloader: DataLoader, device: torch.device, criterion: nn.Module) -> dict[str, float | None]:
    model = model.to(device)
    total_train_loss = 0
    total_train_acc = 0
    total_items = 0
    for input, labels in dataloader:
        input = input.to(device).to(torch.int32)
        labels = labels.to(device).to(torch.float)
        output = model(input)
        loss = criterion(output, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_train_loss += (loss.item()) * len(input)
        # total_train_acc += torch.sum(torch.argmax(output, dim=1) == labels).item()
        total_items += len(input)
    return {
        # "acc": total_train_acc / total_items,
        "loss": total_train_loss / total_items,
        **{f"norm/{name}": torch.norm(param.grad).item() for name, param in model.named_parameters() if param.grad is not None}
    }

In [39]:
def train_epoch(model: nn.Module, optimizer: Optimizer, dataloader: DataLoader, device: torch.device, criterion: nn.Module) -> dict[str, float | None]:
    model = model.to(device)
    total_train_loss = 0
    total_train_acc = 0
    total_items = 0
    for input, labels in dataloader:
        input = input.to(device).to(torch.int32)
        labels = labels.to(device).to(torch.int64)
        output = model(input)
        loss = criterion(output, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_train_loss += (loss.item()) * len(input)
        total_train_acc += torch.sum(torch.argmax(output, dim=1) == labels).item()
        total_items += len(input)
    return {
        "acc": total_train_acc / total_items,
        "loss": total_train_loss / total_items,
        **{f"norm/{name}": torch.norm(param.grad).item() for name, param in model.named_parameters() if param.grad is not None}
    }

In [40]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

In [41]:
model

Transformer(
  (embeddings): Embedding(2, 2)
  (transformer): Sequential(
    (0): AttentionBlock(
      (attn): CustomMHA(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=5, out_features=5, bias=False)
      )
      (norm1): LayerNorm((5,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((5,), eps=1e-05, elementwise_affine=True)
      (linear): Sequential(
        (0): Linear(in_features=5, out_features=5, bias=True)
        (1): ReLU()
        (2): Linear(in_features=5, out_features=5, bias=True)
      )
    )
  )
  (mlp_head): Sequential(
    (0): Linear(in_features=5, out_features=5, bias=True)
    (1): ReLU()
    (2): Linear(in_features=5, out_features=2, bias=True)
  )
)

In [42]:
results = []
for epoch in tqdm(range(500)):
    results.append(train_epoch(model, optimizer, dataloader, device, criterion))

  0%|          | 0/500 [00:00<?, ?it/s]

In [43]:
print(results[-1]["loss"])

0.00865924172103405


In [44]:
def eval_epoch(model: nn.Module, dataloader: DataLoader, device: torch.device) -> dict[str, list[float]]:
    model = model.to(device)
    inputs = []
    correct_outputs = []
    probabilities = []
    for input, labels in dataloader:
        input = input.to(device).to(torch.int32)
        labels = labels.to(device).to(torch.float)
        output = model(input)
        inputs += input.tolist()
        print(input, output.shape)
        correct_outputs += labels.tolist()
        probabilities += torch.softmax(output, dim=1)[:, 1].tolist()
    return {
        "inputs": inputs,
        "correct_outputs": correct_outputs,
        "probabilities": probabilities
    }

In [45]:
eval_res = eval_epoch(model, dataloader, device)

tensor([[0, 0, 1],
        [1, 0, 1],
        [0, 1, 0],
        [1, 0, 1],
        [0, 1, 1],
        [1, 1, 0],
        [1, 0, 1],
        [0, 0, 1]], dtype=torch.int32) torch.Size([8, 2])


In [46]:
print(eval_res["correct_outputs"])
print(eval_res["probabilities"])

[0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0]
[0.019634317606687546, 0.9985285997390747, 0.019632404670119286, 0.9985285997390747, 0.9966446161270142, 0.9984461665153503, 0.9985285997390747, 0.019634317606687546]


In [47]:
import pandas as pd


# pd.DataFrame(eval_res, )
pd.DataFrame.from_dict(eval_res, orient="columns")

Unnamed: 0,inputs,correct_outputs,probabilities
0,"[0, 0, 1]",0.0,0.019634
1,"[1, 0, 1]",1.0,0.998529
2,"[0, 1, 0]",0.0,0.019632
3,"[1, 0, 1]",1.0,0.998529
4,"[0, 1, 1]",1.0,0.996645
5,"[1, 1, 0]",1.0,0.998446
6,"[1, 0, 1]",1.0,0.998529
7,"[0, 0, 1]",0.0,0.019634


In [48]:
eval_res["inputs"]

[[0, 0, 1],
 [1, 0, 1],
 [0, 1, 0],
 [1, 0, 1],
 [0, 1, 1],
 [1, 1, 0],
 [1, 0, 1],
 [0, 0, 1]]

In [49]:

res = pd.DataFrame.from_records(results)
res

Unnamed: 0,acc,loss,norm/embeddings.weight,norm/transformer.0.attn.in_proj_weight,norm/transformer.0.attn.out_proj.weight,norm/transformer.0.norm1.weight,norm/transformer.0.norm1.bias,norm/transformer.0.norm2.weight,norm/transformer.0.norm2.bias,norm/transformer.0.linear.0.weight,norm/transformer.0.linear.0.bias,norm/transformer.0.linear.2.weight,norm/transformer.0.linear.2.bias,norm/mlp_head.0.weight,norm/mlp_head.0.bias,norm/mlp_head.2.weight,norm/mlp_head.2.bias
0,0.375,0.895623,0.073118,0.032354,0.046777,0.056033,0.056839,0.067692,0.066925,0.047087,0.021105,0.076996,0.055025,0.540949,0.240384,0.248064,0.493352
1,0.375,0.890635,0.072984,0.032548,0.046912,0.055583,0.056761,0.067882,0.067032,0.047552,0.021297,0.076693,0.054999,0.538591,0.239027,0.247857,0.490641
2,0.375,0.885663,0.072819,0.032725,0.047038,0.055109,0.056665,0.068105,0.067147,0.047998,0.021480,0.076368,0.054953,0.536263,0.237675,0.247981,0.487914
3,0.375,0.880707,0.072626,0.032886,0.047155,0.054613,0.056551,0.068359,0.067270,0.048427,0.021654,0.076021,0.054888,0.533965,0.236328,0.248425,0.485171
4,0.375,0.875769,0.072409,0.033031,0.047263,0.054097,0.056422,0.068644,0.067402,0.048838,0.021820,0.075653,0.054805,0.531699,0.234987,0.249176,0.482415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1.000,0.008859,0.062172,0.040623,0.021428,0.014032,0.017154,0.014014,0.007378,0.019976,0.010453,0.025613,0.016256,0.034770,0.010210,0.030774,0.009037
496,1.000,0.008809,0.018811,0.012406,0.007113,0.004124,0.004869,0.010969,0.006662,0.004483,0.002434,0.007213,0.004439,0.026806,0.008353,0.030620,0.008984
497,1.000,0.008763,0.018971,0.012513,0.007166,0.004158,0.004914,0.010925,0.006638,0.004541,0.002469,0.007280,0.004483,0.026692,0.008315,0.030483,0.008936
498,1.000,0.008713,0.019013,0.012540,0.007178,0.004167,0.004926,0.010878,0.006610,0.004559,0.002480,0.007298,0.004496,0.026567,0.008273,0.030336,0.008883


In [50]:
import plotly.express as px

fig = px.scatter(res)
fig.update_traces(mode='lines')
fig.show()

In [51]:
list(dataset)

[(tensor([0., 0., 1.]), tensor(0, dtype=torch.int32)),
 (tensor([0., 0., 1.]), tensor(0, dtype=torch.int32)),
 (tensor([0., 1., 0.]), tensor(0, dtype=torch.int32)),
 (tensor([1., 1., 0.]), tensor(1, dtype=torch.int32)),
 (tensor([1., 0., 1.]), tensor(1, dtype=torch.int32)),
 (tensor([1., 0., 1.]), tensor(1, dtype=torch.int32)),
 (tensor([0., 1., 1.]), tensor(1, dtype=torch.int32)),
 (tensor([1., 0., 1.]), tensor(1, dtype=torch.int32))]

In [52]:
model.to(device)

Transformer(
  (embeddings): Embedding(2, 2)
  (transformer): Sequential(
    (0): AttentionBlock(
      (attn): CustomMHA(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=5, out_features=5, bias=False)
      )
      (norm1): LayerNorm((5,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((5,), eps=1e-05, elementwise_affine=True)
      (linear): Sequential(
        (0): Linear(in_features=5, out_features=5, bias=True)
        (1): ReLU()
        (2): Linear(in_features=5, out_features=5, bias=True)
      )
    )
  )
  (mlp_head): Sequential(
    (0): Linear(in_features=5, out_features=5, bias=True)
    (1): ReLU()
    (2): Linear(in_features=5, out_features=2, bias=True)
  )
)

In [53]:
model(torch.tensor([[1, 0, 0, 1, 0]]).to(device))

RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 3 but got size 5 for tensor number 1 in the list.

In [22]:
model(torch.tensor([[0, 0, 1, 1, 0]]).to(device))

[[2, 0, 0, 1, 1, 0]]


tensor([[-0.0041,  0.0173]], device='mps:0', grad_fn=<LinearBackward0>)

In [23]:
model(torch.tensor([[1, 0, 0, 1, 1]]).to(device))

[[2, 1, 0, 0, 1, 1]]


tensor([[ 0.0132, -0.0067]], device='mps:0', grad_fn=<LinearBackward0>)

In [24]:
model.save_to_checkpoint("../checkpoints/example_transformer.ckpt")