[X]: E01: Tune the hyperparameters of the training to beat my best validation loss of 2.2

[X]: E02: I was not careful with the initialization of the network in this video. (1) What is the loss you'd get if the predicted probabilities at initialization were perfectly uniform? What loss do we achieve? (2) Can you tune the initialization to get a starting loss that is much more similar to (1)?

[X]: E03: Read the Bengio et al 2003 paper (link above), implement and try any idea from the paper. Did it work?

```
cd lectures/makemore
uv run tensorboard --logdir=lightning_logs/
uv run optuna-dashboard sqlite:///makemore_part2.db
```

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import lightning as L
from torch.utils.data import TensorDataset, DataLoader
from lightning.pytorch.callbacks import EarlyStopping
import optuna
from optuna.integration import PyTorchLightningPruningCallback

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the corpus, one entry per line. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open until the
# object happens to be garbage-collected.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
# Number of preceding character indices fed to the model for each prediction.
context_length = 3

# Seed for the train/val/test partition. Without it every kernel restart draws
# a different split, so validation losses stored by earlier sessions of the
# persisted Optuna study would have been measured on different data.
SPLIT_SEED = 42

xs = []
ys = []

# Vocabulary: index 0 is the '.' boundary token, indices 1..26 are 'a'..'z'.
itos = {i + 1: chr(i + ord('a')) for i in range(26)} | {0: '.'}
stoi = {s:i for i,s in itos.items()}

for w in words:
    # Every word starts from a context made entirely of the boundary token.
    x = [0] * context_length
    # The appended '.' makes the end of the word a prediction target too.
    for c in w + '.':
        xs.append(x)
        ix = stoi[c]
        ys.append(ix)
        # Slide the window by one character. This builds a new list, so the
        # row just stored in xs is not modified.
        x = x[1:] + [ix]

xs = torch.tensor(xs)
ys = torch.tensor(ys)

dataset = TensorDataset(xs, ys)

# 80/10/10 split, drawn from a dedicated seeded generator so it is identical
# across runs and independent of the global RNG state.
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [0.8, 0.1, 0.1],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=256)

In [12]:
class MLPBlock(nn.Module):
    """One hidden layer: ``activation(linear(x))``, optionally plus ``x``.

    Args:
        input_dim: Number of input features of the linear layer.
        hidden_dim: Number of output features of the linear layer.
        activation: Zero-argument callable (e.g. ``nn.Tanh``) that returns the
            activation module; it is called once when the block is built.
        skip_connection: If truthy, the block input is added to the activated
            output. Requires ``input_dim == hidden_dim``.

    Raises:
        ValueError: If ``skip_connection`` is requested while ``input_dim``
            and ``hidden_dim`` differ.
    """

    def __init__(self, input_dim, hidden_dim, activation, skip_connection):
        super().__init__()
        # Fail at construction time: with mismatched widths the residual add
        # would either raise only inside forward() or, when one of the widths
        # is 1, broadcast silently and produce a wrong result.
        if skip_connection and input_dim != hidden_dim:
            raise ValueError(
                "skip_connection requires input_dim == hidden_dim, "
                f"got input_dim={input_dim} and hidden_dim={hidden_dim}"
            )
        self.linear = nn.Linear(input_dim, hidden_dim)
        self.activation = activation()
        self.skip_connection = skip_connection

    def forward(self, x):
        """Apply the linear layer and activation, then the optional residual add."""
        out = self.activation(self.linear(x))
        if self.skip_connection:
            out = out + x
        return out
        

class MLP(L.LightningModule):
    """Next-character MLP wrapped as a Lightning module.

    Architecture, in order: ``Embedding(27, embedding_dim)``, ``Flatten(-2)``,
    ``num_layers`` ``MLPBlock`` layers, and a final ``Linear(hidden_dim, 27)``
    producing the logits. The first block never uses a skip connection; later
    blocks use it according to ``skip_connection``.

    Note: the input width of the first block is ``context_length *
    embedding_dim``, where ``context_length`` is read from the module-level
    variable of that name rather than passed in.

    Args:
        embedding_dim: Size of each character embedding.
        hidden_dim: Output width of every ``MLPBlock``.
        lr: Learning rate handed to the optimizer.
        weight_decay: Weight decay handed to the optimizer.
        activation: Name of a class in ``torch.nn`` (looked up with
            ``getattr``), e.g. ``"Tanh"``.
        optimizer: Name of a class in ``torch.optim`` (looked up with
            ``getattr``), e.g. ``"AdamW"``.
        num_layers: Number of ``MLPBlock`` layers.
        skip_connection: Whether blocks after the first add their input to
            their output.
    """

    def __init__(self,
                 embedding_dim=8,
                 hidden_dim=100,
                 lr=0.001,
                 weight_decay=0.01,
                 activation="Tanh",
                 optimizer="AdamW",
                 num_layers=2,
                 skip_connection=True,
                 ):
        super().__init__()
        # Stores the constructor arguments under self.hparams.
        self.save_hyperparameters()

        activation_cls = getattr(nn, self.hparams.activation)

        # Modules are created in network order (embedding, blocks, head).
        stack = [nn.Embedding(27, embedding_dim), nn.Flatten(-2)]
        stack.extend(
            MLPBlock(
                # Block 0 consumes the flattened embeddings; later blocks
                # consume the previous block's hidden_dim-wide output.
                hidden_dim if depth else context_length * embedding_dim,
                hidden_dim,
                activation_cls,
                skip_connection if depth else False,
            )
            for depth in range(num_layers)
        )
        stack.append(nn.Linear(hidden_dim, 27))

        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the logits."""
        return self.model(x)

    def _batch_loss(self, batch):
        """Return the cross-entropy between the logits for a batch and its targets."""
        inputs, targets = batch
        return F.cross_entropy(self(inputs), targets)

    def training_step(self, batch, batch_idx):
        """Compute the batch loss, log it as ``train_loss`` and return it."""
        loss = self._batch_loss(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the batch loss, log it as ``val_loss`` and return it."""
        loss = self._batch_loss(batch)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        """Build the optimizer named in the hyperparameters over the model's parameters."""
        optimizer_cls = getattr(torch.optim, self.hparams.optimizer)
        return optimizer_cls(
            self.model.parameters(),
            lr=self.hparams.lr,
            weight_decay=self.hparams.weight_decay,
        )

In [None]:
def objective(trial: optuna.trial.Trial) -> float:
    """Train one MLP with hyperparameters sampled from ``trial``.

    Samples the architecture and optimizer settings, fits the model on
    ``train_dataloader`` with ``val_dataloader`` for validation, and returns
    the ``val_loss`` entry of ``trainer.callback_metrics`` after fitting.

    Args:
        trial: Optuna trial used to sample hyperparameters; it is also handed
            to the pruning callback.

    Returns:
        The logged ``val_loss`` as a plain Python float.
    """
    # NOTE: the Optuna parameter names (including "nonlinearity") are kept
    # exactly as they are, since the study is persisted and resumed by name.
    embedding_dim = trial.suggest_int("embedding_dim", 4, 27)
    hidden_dim = trial.suggest_int("hidden_dim", 10, 200, log=True)
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e0, log=True)
    nonlinearity = trial.suggest_categorical("nonlinearity", ["Tanh", "ReLU"])
    optimizer = trial.suggest_categorical("optimizer", ["AdamW", "Adam", "SGD"])
    num_layers = trial.suggest_int("num_layers", 1, 4)
    skip_connection = trial.suggest_categorical("skip_connection", [True, False])

    # Keyword arguments keep this call correct even if the MLP signature is
    # reordered; eight positional arguments would silently shift.
    model = MLP(
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        lr=lr,
        weight_decay=weight_decay,
        activation=nonlinearity,
        optimizer=optimizer,
        num_layers=num_layers,
        skip_connection=skip_connection,
    )

    trainer = L.Trainer(
        accelerator="cpu",
        devices=1,
        max_epochs=100,
        callbacks=[
            EarlyStopping(monitor="val_loss", mode="min"),
            PyTorchLightningPruningCallback(trial, monitor="val_loss")
        ],
        enable_model_summary=False,
        enable_progress_bar=False,
        enable_checkpointing=False
    )
    trainer.fit(model, train_dataloader, val_dataloader)

    # callback_metrics holds tensors; Optuna's objective contract is a float.
    # NOTE(review): this is the value from the final validation pass, which
    # after early stopping is presumably not the best one observed. Confirm
    # that this is the intended objective.
    return trainer.callback_metrics["val_loss"].item()

# Median-based pruning of underperforming trials, configured with 5 startup
# trials, 10 warm-up steps and a minimum of 3 trials.
median_pruner = optuna.pruners.MedianPruner(
    n_startup_trials=5,
    n_warmup_steps=10,
    n_min_trials=3,
)

# Trials are stored in a local SQLite file; load_if_exists reuses a study of
# the same name if one is already stored there. No direction is passed, so
# Optuna's default applies.
study = optuna.create_study(
    study_name="makemore_part2",
    storage="sqlite:///makemore_part2.db",
    load_if_exists=True,
    pruner=median_pruner,
)
study.optimize(objective, n_trials=100)

[I 2025-10-01 13:38:31,266] Using an existing study with name 'makemore_part2' instead of creating a new one.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/kevin/nn-zero-to-hero/.venv/lib/python3.13/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 4.1 K  | train
---------------------------------------------
4.1 K     Trainable params
0         Non-trainable params
4.1 K     Total params
0.016     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


                                                                            

/Users/kevin/nn-zero-to-hero/.venv/lib/python3.13/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/kevin/nn-zero-to-hero/.venv/lib/python3.13/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Epoch 43: 100%|██████████| 713/713 [00:01<00:00, 542.59it/s, v_num=26]


[I 2025-10-01 13:39:31,353] Trial 28 finished with value: 2.2000160217285156 and parameters: {'embedding_dim': 25, 'hidden_dim': 33, 'lr': 0.0004665978183545832, 'weight_decay': 0.11228561032697912, 'nonlinearity': 'ReLU', 'optimizer': 'AdamW', 'num_layers': 1, 'skip_connection': False}. Best is trial 8 with value: 2.199681043624878.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 4.5 K  | train
---------------------------------------------
4.5 K     Trainable params
0         Non-trainable params
4.5 K     Total params
0.018     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 77: 100%|██████████| 713/713 [00:01<00:00, 522.78it/s, v_num=27]       


[I 2025-10-01 13:41:35,203] Trial 29 finished with value: 2.1773698329925537 and parameters: {'embedding_dim': 17, 'hidden_dim': 35, 'lr': 0.0004723128830203145, 'weight_decay': 0.07899410497867367, 'nonlinearity': 'ReLU', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 29 with value: 2.1773698329925537.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 5.3 K  | train
---------------------------------------------
5.3 K     Trainable params
0         Non-trainable params
5.3 K     Total params
0.021     Total estimated model params size (MB)
13        Modules in train mode
0         Modules in eval mode


Epoch 10: 100%|██████████| 713/713 [00:01<00:00, 509.07it/s, v_num=28]      

[I 2025-10-01 13:41:56,890] Trial 30 pruned. Trial was pruned at epoch 10.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 4.2 K  | train
---------------------------------------------
4.2 K     Trainable params
0         Non-trainable params
4.2 K     Total params
0.017     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 67: 100%|██████████| 713/713 [00:01<00:00, 512.31it/s, v_num=29]      


[I 2025-10-01 13:43:30,679] Trial 31 finished with value: 2.1672070026397705 and parameters: {'embedding_dim': 16, 'hidden_dim': 49, 'lr': 0.0005697694495290225, 'weight_decay': 0.03033076568194515, 'nonlinearity': 'ReLU', 'optimizer': 'AdamW', 'num_layers': 1, 'skip_connection': False}. Best is trial 31 with value: 2.1672070026397705.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 8.5 K  | train
---------------------------------------------
8.5 K     Trainable params
0         Non-trainable params
8.5 K     Total params
0.034     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 7: 100%|██████████| 713/713 [00:01<00:00, 471.76it/s, v_num=30]       


[I 2025-10-01 13:43:42,875] Trial 32 finished with value: 2.488403558731079 and parameters: {'embedding_dim': 16, 'hidden_dim': 59, 'lr': 0.00012264543785137777, 'weight_decay': 0.025423285308456018, 'nonlinearity': 'ReLU', 'optimizer': 'Adam', 'num_layers': 2, 'skip_connection': True}. Best is trial 31 with value: 2.1672070026397705.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 7.3 K  | train
---------------------------------------------
7.3 K     Trainable params
0         Non-trainable params
7.3 K     Total params
0.029     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 10: 100%|██████████| 713/713 [00:01<00:00, 581.27it/s, v_num=31]      

[I 2025-10-01 13:43:57,868] Trial 33 pruned. Trial was pruned at epoch 10.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 4.0 K  | train
---------------------------------------------
4.0 K     Trainable params
0         Non-trainable params
4.0 K     Total params
0.016     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 59: 100%|██████████| 713/713 [00:01<00:00, 522.27it/s, v_num=32]       


[I 2025-10-01 13:45:24,284] Trial 34 finished with value: 2.1779372692108154 and parameters: {'embedding_dim': 17, 'hidden_dim': 44, 'lr': 0.0005559192282741966, 'weight_decay': 0.06130048532791953, 'nonlinearity': 'ReLU', 'optimizer': 'AdamW', 'num_layers': 1, 'skip_connection': False}. Best is trial 31 with value: 2.1672070026397705.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 3.8 K  | train
---------------------------------------------
3.8 K     Trainable params
0         Non-trainable params
3.8 K     Total params
0.015     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 71: 100%|██████████| 713/713 [00:01<00:00, 528.37it/s, v_num=33]      


[I 2025-10-01 13:47:02,230] Trial 35 finished with value: 2.1761631965637207 and parameters: {'embedding_dim': 17, 'hidden_dim': 42, 'lr': 0.0005825300867348956, 'weight_decay': 0.05204832952306737, 'nonlinearity': 'ReLU', 'optimizer': 'AdamW', 'num_layers': 1, 'skip_connection': False}. Best is trial 31 with value: 2.1672070026397705.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 5.2 K  | train
---------------------------------------------
5.2 K     Trainable params
0         Non-trainable params
5.2 K     Total params
0.021     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 84: 100%|██████████| 713/713 [00:01<00:00, 517.49it/s, v_num=34]       


[I 2025-10-01 13:48:58,845] Trial 36 finished with value: 2.155808687210083 and parameters: {'embedding_dim': 17, 'hidden_dim': 60, 'lr': 0.00034819320527166236, 'weight_decay': 0.05011872103244382, 'nonlinearity': 'ReLU', 'optimizer': 'AdamW', 'num_layers': 1, 'skip_connection': False}. Best is trial 36 with value: 2.155808687210083.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 5.7 K  | train
---------------------------------------------
5.7 K     Trainable params
0         Non-trainable params
5.7 K     Total params
0.023     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 82: 100%|██████████| 713/713 [00:01<00:00, 501.75it/s, v_num=35]      


[I 2025-10-01 13:50:56,931] Trial 37 finished with value: 2.16336727142334 and parameters: {'embedding_dim': 20, 'hidden_dim': 58, 'lr': 0.0002462641597173457, 'weight_decay': 0.030624267582018357, 'nonlinearity': 'ReLU', 'optimizer': 'AdamW', 'num_layers': 1, 'skip_connection': False}. Best is trial 36 with value: 2.155808687210083.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 5.7 K  | train
---------------------------------------------
5.7 K     Trainable params
0         Non-trainable params
5.7 K     Total params
0.023     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 4: 100%|██████████| 713/713 [00:01<00:00, 455.09it/s, v_num=36]       


[I 2025-10-01 13:51:04,132] Trial 38 finished with value: 2.5710835456848145 and parameters: {'embedding_dim': 20, 'hidden_dim': 58, 'lr': 0.0002267507755616385, 'weight_decay': 0.03083193724994157, 'nonlinearity': 'ReLU', 'optimizer': 'Adam', 'num_layers': 1, 'skip_connection': False}. Best is trial 36 with value: 2.155808687210083.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 8.4 K  | train
---------------------------------------------
8.4 K     Trainable params
0         Non-trainable params
8.4 K     Total params
0.034     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 10: 100%|██████████| 713/713 [00:01<00:00, 550.84it/s, v_num=37]       

[I 2025-10-01 13:51:19,867] Trial 39 pruned. Trial was pruned at epoch 10.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 6.0 K  | train
---------------------------------------------
6.0 K     Trainable params
0         Non-trainable params
6.0 K     Total params
0.024     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 10: 100%|██████████| 713/713 [00:18<00:00, 37.90it/s, v_num=37]        
Epoch 13: 100%|██████████| 713/713 [00:01<00:00, 542.01it/s, v_num=38]

[I 2025-10-01 13:51:40,143] Trial 40 pruned. Trial was pruned at epoch 13.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 7.2 K  | train
---------------------------------------------
7.2 K     Trainable params
0         Non-trainable params
7.2 K     Total params
0.029     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 28: 100%|██████████| 713/713 [00:01<00:00, 521.29it/s, v_num=39]       


[I 2025-10-01 13:52:21,220] Trial 41 finished with value: 2.1404693126678467 and parameters: {'embedding_dim': 20, 'hidden_dim': 75, 'lr': 0.001386427889987593, 'weight_decay': 0.006211991924876842, 'nonlinearity': 'ReLU', 'optimizer': 'AdamW', 'num_layers': 1, 'skip_connection': False}. Best is trial 41 with value: 2.1404693126678467.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 7.3 K  | train
---------------------------------------------
7.3 K     Trainable params
0         Non-trainable params
7.3 K     Total params
0.029     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 10: 100%|██████████| 713/713 [00:01<00:00, 588.21it/s, v_num=40]       

[I 2025-10-01 13:52:36,204] Trial 42 pruned. Trial was pruned at epoch 10.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 11.5 K | train
---------------------------------------------
11.5 K    Trainable params
0         Non-trainable params
11.5 K    Total params
0.046     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 10: 100%|██████████| 713/713 [00:01<00:00, 532.91it/s, v_num=41]      

[I 2025-10-01 13:52:52,456] Trial 43 pruned. Trial was pruned at epoch 10.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 4.3 K  | train
---------------------------------------------
4.3 K     Trainable params
0         Non-trainable params
4.3 K     Total params
0.017     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 10: 100%|██████████| 713/713 [00:08<00:00, 84.25it/s, v_num=41]        
Epoch 10: 100%|██████████| 713/713 [11:05<00:00,  1.07it/s, v_num=28] 
Epoch 10: 100%|██████████| 713/713 [09:04<00:00,  1.31it/s, v_num=31] 
Epoch 13: 100%|██████████| 713/713 [01:22<00:00,  8.65it/s, v_num=38] 
Epoch 10: 100%|██████████| 713/713 [00:26<00:00, 27.13it/s, v_num=40] 
Epoch 10: 100%|██████████| 713/713 [00:01<00:00, 575.35it/s, v_num=42]

[I 2025-10-01 13:53:08,356] Trial 44 pruned. Trial was pruned at epoch 10.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 5.8 K  | train
---------------------------------------------
5.8 K     Trainable params
0         Non-trainable params
5.8 K     Total params
0.023     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 81: 100%|██████████| 713/713 [00:01<00:00, 537.22it/s, v_num=43]       


[I 2025-10-01 13:55:03,139] Trial 45 finished with value: 2.153270721435547 and parameters: {'embedding_dim': 18, 'hidden_dim': 65, 'lr': 0.0003103127244384076, 'weight_decay': 0.021257956697551064, 'nonlinearity': 'ReLU', 'optimizer': 'AdamW', 'num_layers': 1, 'skip_connection': False}. Best is trial 41 with value: 2.1404693126678467.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 6.9 K  | train
---------------------------------------------
6.9 K     Trainable params
0         Non-trainable params
6.9 K     Total params
0.028     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 58: 100%|██████████| 713/713 [00:01<00:00, 498.62it/s, v_num=44]       


[I 2025-10-01 13:56:24,176] Trial 46 finished with value: 2.146749258041382 and parameters: {'embedding_dim': 19, 'hidden_dim': 75, 'lr': 0.0003253846204760969, 'weight_decay': 0.01945845037022007, 'nonlinearity': 'ReLU', 'optimizer': 'AdamW', 'num_layers': 1, 'skip_connection': False}. Best is trial 41 with value: 2.1404693126678467.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 8.4 K  | train
---------------------------------------------
8.4 K     Trainable params
0         Non-trainable params
8.4 K     Total params
0.034     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 10: 100%|██████████| 713/713 [00:01<00:00, 523.17it/s, v_num=45]      

[I 2025-10-01 13:56:40,484] Trial 47 pruned. Trial was pruned at epoch 10.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 6.0 K  | train
---------------------------------------------
6.0 K     Trainable params
0         Non-trainable params
6.0 K     Total params
0.024     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 10: 100%|██████████| 713/713 [00:04<00:00, 165.20it/s, v_num=45]      
Epoch 10: 100%|██████████| 713/713 [00:01<00:00, 582.50it/s, v_num=46]

[I 2025-10-01 13:56:55,646] Trial 48 pruned. Trial was pruned at epoch 10.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 7.6 K  | train
---------------------------------------------
7.6 K     Trainable params
0         Non-trainable params
7.6 K     Total params
0.030     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 25: 100%|██████████| 713/713 [00:01<00:00, 529.11it/s, v_num=47]      


[I 2025-10-01 13:57:33,068] Trial 49 finished with value: 2.140131950378418 and parameters: {'embedding_dim': 21, 'hidden_dim': 77, 'lr': 0.0011912515111021551, 'weight_decay': 0.019224226552320204, 'nonlinearity': 'ReLU', 'optimizer': 'AdamW', 'num_layers': 1, 'skip_connection': False}. Best is trial 49 with value: 2.140131950378418.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 7.3 K  | train
---------------------------------------------
7.3 K     Trainable params
0         Non-trainable params
7.3 K     Total params
0.029     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 45: 100%|██████████| 713/713 [00:01<00:00, 481.55it/s, v_num=48]       


[I 2025-10-01 13:58:38,544] Trial 50 finished with value: 2.131796360015869 and parameters: {'embedding_dim': 21, 'hidden_dim': 74, 'lr': 0.0009447724447821449, 'weight_decay': 0.01617222635636376, 'nonlinearity': 'ReLU', 'optimizer': 'AdamW', 'num_layers': 1, 'skip_connection': False}. Best is trial 50 with value: 2.131796360015869.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 13.9 K | train
---------------------------------------------
13.9 K    Trainable params
0         Non-trainable params
13.9 K    Total params
0.055     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 34: 100%|██████████| 713/713 [00:01<00:00, 476.18it/s, v_num=49]      


[I 2025-10-01 13:59:35,389] Trial 51 finished with value: 2.104018449783325 and parameters: {'embedding_dim': 22, 'hidden_dim': 77, 'lr': 0.0012771381424109456, 'weight_decay': 0.019930834132077524, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 51 with value: 2.104018449783325.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 22.5 K | train
---------------------------------------------
22.5 K    Trainable params
0         Non-trainable params
22.5 K    Total params
0.090     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 19: 100%|██████████| 713/713 [00:01<00:00, 440.63it/s, v_num=50]      


[I 2025-10-01 14:00:08,064] Trial 52 finished with value: 2.106266498565674 and parameters: {'embedding_dim': 22, 'hidden_dim': 108, 'lr': 0.0012565471706450088, 'weight_decay': 0.006173760536790875, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 51 with value: 2.104018449783325.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 23.2 K | train
---------------------------------------------
23.2 K    Trainable params
0         Non-trainable params
23.2 K    Total params
0.093     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 10: 100%|██████████| 713/713 [00:01<00:00, 539.05it/s, v_num=51]      

[I 2025-10-01 14:00:25,048] Trial 53 pruned. Trial was pruned at epoch 10.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 30.1 K | train
---------------------------------------------
30.1 K    Trainable params
0         Non-trainable params
30.1 K    Total params
0.120     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 19: 100%|██████████| 713/713 [00:01<00:00, 442.85it/s, v_num=52]      


[I 2025-10-01 14:00:57,697] Trial 54 finished with value: 2.1005465984344482 and parameters: {'embedding_dim': 26, 'hidden_dim': 126, 'lr': 0.0010634826494833414, 'weight_decay': 0.0031462909687132523, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 54 with value: 2.1005465984344482.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 36.1 K | train
---------------------------------------------
36.1 K    Trainable params
0         Non-trainable params
36.1 K    Total params
0.144     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 713/713 [00:01<00:00, 391.87it/s, v_num=53]      


[I 2025-10-01 14:01:40,138] Trial 55 finished with value: 2.0942769050598145 and parameters: {'embedding_dim': 26, 'hidden_dim': 142, 'lr': 0.0009327816837211195, 'weight_decay': 0.00277329644117428, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 55 with value: 2.0942769050598145.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 40.5 K | train
---------------------------------------------
40.5 K    Trainable params
0         Non-trainable params
40.5 K    Total params
0.162     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 28: 100%|██████████| 713/713 [00:01<00:00, 394.95it/s, v_num=54]      


[I 2025-10-01 14:02:33,913] Trial 56 finished with value: 2.0905239582061768 and parameters: {'embedding_dim': 26, 'hidden_dim': 153, 'lr': 0.0008777496578737832, 'weight_decay': 0.002890229623913656, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 56 with value: 2.0905239582061768.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 43.4 K | train
---------------------------------------------
43.4 K    Trainable params
0         Non-trainable params
43.4 K    Total params
0.174     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 12: 100%|██████████| 713/713 [00:01<00:00, 411.12it/s, v_num=55]      


[I 2025-10-01 14:02:58,326] Trial 57 finished with value: 2.145533561706543 and parameters: {'embedding_dim': 26, 'hidden_dim': 160, 'lr': 0.005196961895472182, 'weight_decay': 0.0025268326910676325, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 56 with value: 2.0905239582061768.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 38.9 K | train
---------------------------------------------
38.9 K    Trainable params
0         Non-trainable params
38.9 K    Total params
0.155     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 26: 100%|██████████| 713/713 [00:01<00:00, 412.42it/s, v_num=56]      


[I 2025-10-01 14:03:46,448] Trial 58 finished with value: 2.093871831893921 and parameters: {'embedding_dim': 26, 'hidden_dim': 149, 'lr': 0.0007799701641572431, 'weight_decay': 0.0008898447147567709, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 56 with value: 2.0905239582061768.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 37.7 K | train
---------------------------------------------
37.7 K    Trainable params
0         Non-trainable params
37.7 K    Total params
0.151     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 19: 100%|██████████| 713/713 [00:01<00:00, 413.76it/s, v_num=57]      


[I 2025-10-01 14:04:21,355] Trial 59 finished with value: 2.1026899814605713 and parameters: {'embedding_dim': 27, 'hidden_dim': 145, 'lr': 0.001825279055510972, 'weight_decay': 0.0007031884204813492, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 56 with value: 2.0905239582061768.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 41.0 K | train
---------------------------------------------
41.0 K    Trainable params
0         Non-trainable params
41.0 K    Total params
0.164     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 18: 100%|██████████| 713/713 [00:01<00:00, 439.56it/s, v_num=58]      


[I 2025-10-01 14:04:54,859] Trial 60 finished with value: 2.106043577194214 and parameters: {'embedding_dim': 27, 'hidden_dim': 153, 'lr': 0.002230706856507885, 'weight_decay': 0.000580547414261636, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 56 with value: 2.0905239582061768.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 53.8 K | train
---------------------------------------------
53.8 K    Trainable params
0         Non-trainable params
53.8 K    Total params
0.215     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 15: 100%|██████████| 713/713 [00:01<00:00, 415.78it/s, v_num=59]      

[I 2025-10-01 14:05:24,351] Trial 61 pruned. Trial was pruned at epoch 15.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 49.6 K | train
---------------------------------------------
49.6 K    Trainable params
0         Non-trainable params
49.6 K    Total params
0.198     Total estimated model params size (MB)
13        Modules in train mode
0         Modules in eval mode


Epoch 10: 100%|██████████| 713/713 [00:01<00:00, 373.90it/s, v_num=60]      


[I 2025-10-01 14:05:46,167] Trial 62 finished with value: 2.119905710220337 and parameters: {'embedding_dim': 24, 'hidden_dim': 133, 'lr': 0.001996864697711355, 'weight_decay': 0.0009750502346692682, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 3, 'skip_connection': True}. Best is trial 56 with value: 2.0905239582061768.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 47.4 K | train
---------------------------------------------
47.4 K    Trainable params
0         Non-trainable params
47.4 K    Total params
0.189     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 5: 100%|██████████| 713/713 [00:01<00:00, 404.92it/s, v_num=61]       


[I 2025-10-01 14:05:57,427] Trial 63 finished with value: 2.3397669792175293 and parameters: {'embedding_dim': 26, 'hidden_dim': 169, 'lr': 0.01574284372781331, 'weight_decay': 0.00031308491995447104, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 56 with value: 2.0905239582061768.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 35.0 K | train
---------------------------------------------
35.0 K    Trainable params
0         Non-trainable params
35.0 K    Total params
0.140     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 9: 100%|██████████| 713/713 [00:01<00:00, 371.14it/s, v_num=62]       


[I 2025-10-01 14:06:16,258] Trial 64 finished with value: 2.112555742263794 and parameters: {'embedding_dim': 27, 'hidden_dim': 138, 'lr': 0.0018609375262171955, 'weight_decay': 0.001959827116999578, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 56 with value: 2.0905239582061768.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 37.5 K | train
---------------------------------------------
37.5 K    Trainable params
0         Non-trainable params
37.5 K    Total params
0.150     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 9: 100%|██████████| 713/713 [00:02<00:00, 345.22it/s, v_num=63]        


[I 2025-10-01 14:06:36,766] Trial 65 finished with value: 2.142451286315918 and parameters: {'embedding_dim': 24, 'hidden_dim': 148, 'lr': 0.004223332964783252, 'weight_decay': 0.0003758033857640266, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 56 with value: 2.0905239582061768.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 30.5 K | train
---------------------------------------------
30.5 K    Trainable params
0         Non-trainable params
30.5 K    Total params
0.122     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 14: 100%|██████████| 713/713 [00:01<00:00, 418.15it/s, v_num=64]      


[I 2025-10-01 14:07:04,452] Trial 66 finished with value: 2.1787467002868652 and parameters: {'embedding_dim': 27, 'hidden_dim': 126, 'lr': 0.007633932627039124, 'weight_decay': 0.0007420470907282995, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 56 with value: 2.0905239582061768.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 42.9 K | train
---------------------------------------------
42.9 K    Trainable params
0         Non-trainable params
42.9 K    Total params
0.172     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 17: 100%|██████████| 713/713 [00:02<00:00, 356.30it/s, v_num=65]      


[I 2025-10-01 14:07:43,325] Trial 67 finished with value: 2.1155896186828613 and parameters: {'embedding_dim': 25, 'hidden_dim': 160, 'lr': 0.0028325037051276916, 'weight_decay': 0.0032724649238993434, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 56 with value: 2.0905239582061768.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 58.6 K | train
---------------------------------------------
58.6 K    Trainable params
0         Non-trainable params
58.6 K    Total params
0.235     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 12: 100%|██████████| 713/713 [00:02<00:00, 354.47it/s, v_num=66]      


[I 2025-10-01 14:08:09,642] Trial 68 finished with value: 2.1070501804351807 and parameters: {'embedding_dim': 26, 'hidden_dim': 193, 'lr': 0.0018092132814085924, 'weight_decay': 0.0013342154966771917, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 56 with value: 2.0905239582061768.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 48.4 K | train
---------------------------------------------
48.4 K    Trainable params
0         Non-trainable params
48.4 K    Total params
0.194     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 16: 100%|██████████| 713/713 [00:01<00:00, 368.72it/s, v_num=67]      


[I 2025-10-01 14:08:43,402] Trial 69 finished with value: 2.1008408069610596 and parameters: {'embedding_dim': 23, 'hidden_dim': 175, 'lr': 0.0007334814191777619, 'weight_decay': 0.0004775110863698371, 'nonlinearity': 'Tanh', 'optimizer': 'AdamW', 'num_layers': 2, 'skip_connection': False}. Best is trial 56 with value: 2.0905239582061768.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 49.4 K | train
---------------------------------------------
49.4 K    Trainable params
0         Non-trainable params
49.4 K    Total params
0.198     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 12:  59%|█████▊    | 418/713 [00:01<00:01, 211.33it/s, v_num=68]      

In [None]:
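# E02: loss actually achieved by a freshly constructed (untrained) model,
# evaluated on the full (xs, ys) dataset, for comparison with the
# uniform-prediction baseline ln(27).
# NOTE: the code is left commented out, so the output shown below is
# presumably from an earlier run of this cell.
# init_model = MLP()
# init_loss = F.cross_entropy(init_model(xs), ys)
# print(init_loss)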

tensor(3.3429, grad_fn=<NllLossBackward0>)


In [None]:
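# E02: reference loss if the predicted probabilities were perfectly uniform
# over the 27-symbol vocabulary (26 letters plus '.'): -ln(1/27) = ln(27).
# NOTE: commented out; the output shown below is presumably from an earlier run.
# torch.log(torch.tensor(27))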

tensor(3.2958)