# MLP with floating point weights and baremetal C implementation

In [16]:
import sys
import time
import serial
from tqdm import tqdm

import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import TensorDataset, DataLoader

import src.python.dataset.yalefaces as yalefaces
import src.python.model.util as util

# Seed every random number generator this notebook relies on. NumPy alone is
# not enough: torch draws the initial layer weights and the DataLoader shuffle
# order from its own generator, so it must be seeded too for repeatable runs.
SEED = 99
np.random.seed(SEED)
torch.manual_seed(SEED)

In [17]:
# Use the CUDA device when torch reports one as available; otherwise use the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

## Load dataset
### Load and normalize the raw dataset

In [18]:
# Load the faces with flatten=True, then convert to float32 and divide by 255
# (presumably mapping 8-bit pixel intensities into [0, 1] -- confirm against
# yalefaces.load).
X, y = yalefaces.load("dataset/yalefaces", flatten=True)
X = X.astype(np.float32) / 255.0

### Compress dataset with PCA
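Note: the PCA parameters are computed from the whole dataset (training and test samples together) for educational purposes only. In a real pipeline, fit PCA on the training split alone so that no information from the test set leaks into the features.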

In [19]:
# Keep as many principal components as X has rows (one row per face).
num_faces, num_pixels = X.shape
num_principal_components = int(num_faces)

# Fit the projection on X, then overwrite X with its projected coordinates.
pca = PCA(n_components=num_principal_components).fit(X)
X = pca.transform(X)

### Split into train and test datasets

In [20]:
# Hold out 20% of the samples for testing. The split is shuffled with a fixed
# random_state and stratified on the labels y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

### Convert datasets to PyTorch tensors

In [21]:
# Wrap each split as a TensorDataset of (features, labels): features go through
# torch.Tensor (default float type) and labels through torch.LongTensor.
train_dataset, test_dataset = (
    TensorDataset(torch.Tensor(features), torch.LongTensor(labels))
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

## Train MLP

In [22]:
class MLP(torch.nn.Module):
  """Two-layer, bias-free perceptron: Linear -> ReLU -> Linear.

  The layers live in ``self.layers`` (a ``torch.nn.Sequential``), so the two
  weight matrices appear in the state dict as ``layers.0.weight`` and
  ``layers.2.weight``.

  Args:
    num_inputs: Size of each input feature vector (default 165).
    num_hidden: Width of the hidden layer (default 96).
    num_classes: Number of output scores (default 15).
  """

  def __init__(self, num_inputs=165, num_hidden=96, num_classes=15):
    super().__init__()
    self.layers = torch.nn.Sequential(
      torch.nn.Linear(num_inputs, num_hidden, bias=False),
      torch.nn.ReLU(),
      torch.nn.Linear(num_hidden, num_classes, bias=False),
    )

  def forward(self, x):
    """Return the output of the final linear layer for input ``x``."""
    return self.layers(x)

In [23]:
# Place the model on the selected device before building the optimizer, so the
# optimizer tracks the parameters that will be trained. This is a no-op on CPU.
# NOTE(review): util.train/util.test receive `device`; presumably they move
# each batch there, which would fail on CUDA with a CPU-resident model.
model = MLP().to(device)

# Adam with a small L2 penalty (weight_decay) on the weights.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, weight_decay=0.0001)

In [24]:
NUM_EPOCHS = 200
BATCH_SIZE = 64

train_accs = []
train_losses = []

# Build the loader once: with shuffle=True a DataLoader draws a fresh
# permutation every time it is iterated, so there is no need to recreate it
# for each epoch.
train_data = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

for epoch in range(NUM_EPOCHS):
    # One optimisation pass; the ratio of the two returned values is recorded
    # as this epoch's loss.
    error, num_samples = util.train(model, device, train_data, optimizer)
    loss = float(error)/float(num_samples)
    train_losses.append(loss)

    # Accuracy on the training data after this epoch's update.
    acc = util.test(model, device, train_data)
    train_accs.append(acc)

In [25]:
# Score the held-out split in one batch that contains every test sample.
test_data = DataLoader(test_dataset, batch_size=len(test_dataset))
acc = util.test(model, device, test_data)
print("Test accuracy: {:.2f}".format(acc * 100))

Test accuracy: 81.82


## Export model weights as C code

In [26]:
# Snapshot the model's state dict; the export cell reads the weights from it.
model_params = model.state_dict()

In [27]:
# Indexes, inside the model's Sequential container, of the layers whose
# weights are exported (the two Linear layers).
layer_indexes = [0, 2]

src_filename = 'src/embedded/1-mlp-baremetal-float/esp32s3/main/mlp_weights.c'
hdr_filename = 'src/embedded/1-mlp-baremetal-float/esp32s3/include/mlp_weights.h'

with open(src_filename, 'w') as source, open(hdr_filename, 'w') as header:

    # Header preamble: include guard plus <stdint.h>.
    header.write('#ifndef MLP_WEIGHTS\n#define MLP_WEIGHTS\n\n')
    header.write('#include <stdint.h>\n\n')

    source.write('#include "mlp_weights.h"\n\n')

    for layer in layer_indexes:
        weights = util.get_weights(model_params, layer).flatten()

        # One comma-separated initializer list per layer; each value is
        # formatted exactly as an f-string renders it.
        values = ", ".join(f"{w}" for w in weights)

        # Declaration goes to the header, definition to the source file.
        header.write(f"extern const float layer_{layer}_weights[{len(weights)}];\n")
        source.write(f"const float layer_{layer}_weights[{len(weights)}] = {{{values}}};\n\n")

    # Close the include guard; the trailing comment names the guard macro.
    header.write('\n#endif // end of MLP_WEIGHTS\n')

## Talk to an ESP32

In [30]:
# Divisor that turns the device-reported duration into milliseconds
# (presumably the duration is a CPU cycle count at 240 MHz -- confirm against
# the firmware).
CPU_FREQ_KHZ = 240000
num_tests = len(X_test)

with serial.Serial("/dev/ttyUSB0", 115200, timeout=None) as esp32, tqdm(total=num_tests, file=sys.stdout) as pbar:
    # Block until the board announces it has booted.
    esp32.read_until(b'Ready\n')

    num_correct = 0
    all_elapsed = []
    total_elapsed = 0  # running sum of all_elapsed, for the average

    for i in range(num_tests):
        face = X_test[i]
        expected = y_test[i]

        # check if device is ready (the prompt is exactly 18 bytes long)
        msg = esp32.read(18)
        assert msg == b'Waiting for input\n', msg

        # send command=1 (new inference)
        esp32.write(b'\x01')

        # send input (raw bytes of the PCA feature vector, not one byte per feature)
        esp32.write(face.tobytes())

        # read output (4-byte little-endian integer: predicted subject)
        subject = esp32.read(4)
        subject = int.from_bytes(subject, byteorder="little")

        # read inference duration (4-byte little-endian integer)
        elapsed = esp32.read(4)
        elapsed = int.from_bytes(elapsed, byteorder="little")
        all_elapsed.append(elapsed)
        total_elapsed += elapsed

        # count correct inferences
        if expected == subject:
            num_correct += 1

        # update progress bar with the running accuracy and the mean duration
        # over every inference so far (not just the latest sample)
        acc = num_correct/(i+1)
        mean_elapsed = total_elapsed/(i+1)
        pbar.set_description(f"Accuracy = {acc*100:.2f}%, Average Inference Duration = {mean_elapsed/CPU_FREQ_KHZ:.3f}ms")
        pbar.update(1)

Accuracy = 81.82%, Average Inference Duration = 4.545ms: 100%|██████████| 33/33 [00:02<00:00, 14.08it/s]
