# Learning Multiplication, Incorporate True Multiplication in a Layer 
Architecture:
 - linear layer
 - relu to hidden layer of size N
 - take M hidden units and multiply them, other hidden units pass through to new layer of size N - M
 - linear layer
 - no final ReLU

In [1]:
import sys
from typing import Literal

import numpy as np
import torch
from torch import nn, Tensor
from torch.backends.mps import is_available as mps_is_available
from torch.cuda import is_available as cuda_is_available
from torchinfo import summary
from tqdm.notebook import tqdm

In [2]:
if not sys.version_info >= (3, 10):
    raise RuntimeError("This notebook requires Python 3.10 or later.")
print(f'Python version: {sys.version}')

print(f'PyTorch version: {torch.__version__}')
device = 'cuda' if cuda_is_available() else 'mps' if mps_is_available() else 'cpu'
print(f'Currently, using {device} device.')

Python version: 3.10.11 (main, Apr  7 2023, 07:24:53) [Clang 14.0.0 (clang-1400.0.29.202)]
PyTorch version: 2.0.0
Currently, using mps device.


In [None]:
# Number of random (a, b) samples drawn for the training set and, separately,
# for the test set.
batchsize = 10_000

In [23]:
class Feedforward(torch.nn.Module):
    """Feed-forward regressor with an explicit element-wise multiplication.

    The input feeds three parallel linear + ReLU branches:

    * ``fc1`` produces ``hidden_size`` ordinary hidden units;
    * ``fc2a`` and ``fc2b`` each produce ``mult_size`` units, and the two
      branches are multiplied together element-wise.

    The ordinary hidden units and the products are concatenated along the
    feature dimension and mapped to a single output by ``fc3``. No
    activation is applied to that output.

    Args:
        input_size: Number of input features (size of the last dimension
            of the input tensor).
        hidden_size: Number of ordinary ReLU hidden units.
        mult_size: Width of each of the two branches that are multiplied
            together, i.e. the number of product units. Defaults to 5, the
            value that used to be hard-coded.
    """

    def __init__(self, input_size: int, hidden_size: int, mult_size: int = 5):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.mult_size = mult_size
        # Sub-modules are created in the same order as before so that seeded
        # weight initialisation and the module listing stay unchanged.
        self.fc1 = nn.Linear(self.input_size, self.hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2a = nn.Linear(self.input_size, self.mult_size)
        self.fc2b = nn.Linear(self.input_size, self.mult_size)
        self.relu2a = nn.ReLU()
        self.relu2b = nn.ReLU()
        self.fc3 = nn.Linear(self.hidden_size + self.mult_size, 1)

    def forward(self, x: Tensor) -> Tensor:
        """Run the network on ``x``.

        Args:
            x: Tensor whose last dimension has ``input_size`` features; any
                leading dimensions (including none) are treated as batch
                dimensions.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1.
        """
        hidden1 = self.relu1(self.fc1(x))
        hidden2a = self.relu2a(self.fc2a(x))
        hidden2b = self.relu2b(self.fc2b(x))
        # The true multiplication: element-wise product of the two branches.
        products = hidden2a * hidden2b
        # Concatenate on the feature (last) dimension rather than dim=1 so
        # that unbatched and multi-dimensional batched inputs also work.
        return self.fc3(torch.cat([hidden1, products], dim=-1))

In [3]:
# Create training and test data


def _make_product_data(n_samples: int) -> tuple[Tensor, Tensor]:
    """Draw random factor pairs and their products.

    Args:
        n_samples: Number of (a, b) pairs to draw.

    Returns:
        ``(x, y)`` where ``x`` is a float32 tensor of shape ``(n_samples, 2)``
        sampled uniformly from [-1, 1) and ``y`` is the float32 tensor of
        shape ``(n_samples,)`` holding ``x[:, 0] * x[:, 1]``.
    """
    samples = np.random.uniform(low=-1.0, high=1.0, size=(n_samples, 2))
    # NumPy samples in float64 and torch.from_numpy keeps that dtype; cast to
    # float32 so the data matches the model's default parameter dtype (and
    # can be moved to backends without float64 support, such as MPS).
    x = torch.from_numpy(samples).to(torch.float32)
    y = x[:, 0] * x[:, 1]
    return x, y


# Inputs are (n, 2) values in [-1, 1); targets are the product of the two
# columns. Train and test sets are drawn independently, train first.
x_train, y_train = _make_product_data(batchsize)
x_test, y_test = _make_product_data(batchsize)

print("x_train size", x_train.size())

In [34]:
# Instantiate the variant with 100 ordinary hidden units and show a per-layer
# summary for an input shaped like the training set.
model = Feedforward(input_size=2, hidden_size=100)
summary(model, input_size=x_train.shape)

x_train size torch.Size([10000, 2])


Layer (type:depth-idx)                   Output Shape              Param #
Feedforward                              [10000, 1]                --
├─Linear: 1-1                            [10000, 100]              300
├─ReLU: 1-2                              [10000, 100]              --
├─Linear: 1-3                            [10000, 5]                15
├─ReLU: 1-4                              [10000, 5]                --
├─Linear: 1-5                            [10000, 5]                15
├─ReLU: 1-6                              [10000, 5]                --
├─Linear: 1-7                            [10000, 1]                106
Total params: 436
Trainable params: 436
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 4.36
Input size (MB): 0.08
Forward/backward pass size (MB): 8.88
Params size (MB): 0.00
Estimated Total Size (MB): 8.96

In [33]:
# Build the model under test: one ordinary hidden unit plus the
# multiplicative branch provided by Feedforward.
model = Feedforward(2, 1)
summary(model, input_size=(10000, 2))
model.to(device)
# input is (batchsize, 2)
# output dimension is (batchsize, 1)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)


# Train the model (full batch: every step uses the whole training set)
model.train()
# Kept separate from the loop variable so the epoch count is not overwritten
# while iterating.
num_epochs = 30000
x_train = x_train.to(device)
y_train = y_train.to(device)
for epoch in range(1, num_epochs + 1):
    optimizer.zero_grad()
    # Forward pass (the output already lives on the model's device)
    y_pred = model(x_train)
    # Compute loss; drop only the trailing size-1 dimension so the
    # prediction has the same shape as y_train even for a batch of one.
    loss = criterion(y_pred.squeeze(-1), y_train)

    if epoch % 1000 == 0 or epoch == 1:
        print(f'Epoch {epoch}: train loss: {loss.item()}')
    # Backward pass
    loss.backward()
    optimizer.step()


# Evaluate
model.eval()
x_test = x_test.to(device)
y_test = y_test.to(device)
# No gradients are needed for evaluation, so skip building the autograd graph.
with torch.no_grad():
    y_pred = model(x_test)
    after_train = criterion(y_pred.squeeze(-1), y_test)
print('Test loss after Training', after_train.item())


# What is S/N ratio in dB corresponding to MSE loss?
# The RMS error is sqrt(MSE); expressed in decibels that is 20 * log10(RMS).
actual_error = np.sqrt(after_train.item())
print("In dB:", 20 * np.log10(actual_error))

Epoch 1: train loss: 0.10948698222637177
Epoch 1000: train loss: 0.00045749463606625795
Epoch 2000: train loss: 7.938526687212288e-05
Epoch 3000: train loss: 2.3687311113462783e-05
Epoch 4000: train loss: 5.617737770080566e-06
Epoch 5000: train loss: 8.270103535323869e-07
Epoch 6000: train loss: 8.570611242930681e-08
Epoch 7000: train loss: 1.4497635980603718e-08
Epoch 8000: train loss: 1.153973805401165e-08
Epoch 9000: train loss: 1.2379606229018236e-09
Epoch 10000: train loss: 5.142147818659737e-10
Epoch 11000: train loss: 3.6724616681027555e-09
Epoch 12000: train loss: 1.524626203064372e-09
Epoch 13000: train loss: 1.9868307887804804e-08
Epoch 14000: train loss: 7.867031399655389e-07
Epoch 15000: train loss: 1.0021226160006336e-07
Epoch 16000: train loss: 7.827910541990946e-11
Epoch 17000: train loss: 4.5208636834104254e-08
Epoch 18000: train loss: 1.6265540807580692e-06
Epoch 19000: train loss: 9.136630896300346e-10
Epoch 20000: train loss: 8.960708564043074e-11
Epoch 21000: train 

```
layers hidden   time   batchsize   dB
1        10       7s       10000  -15
2        10      10s       10000  -42
1       100       7s       10000  -54
2       100      15s       10000  -52
1       100    1m02s      100000  -55
1      1000      35s       10000  -65
1      1000    6m14s      100000  -64 
2      1000    3m50s       10000  -59
1      2000    1m06s       10000  -66
1      2000               100000  
2      2000   13m12s       10000  -63
1      4000    2m04s       10000  -63
1      4000   22m13s      100000   17 (!)
With multiplies...
1       1+5    1m23s       10000  -86
```