In [48]:
from sklearn.datasets import load_iris
import numpy as np
import random
from tqdm import tqdm

In [49]:
iris_set = load_iris()
n_classes = iris_set.target_names.__len__()

[dataset specification](https://scikit-learn.org/1.5/modules/generated/sklearn.datasets.load_iris.html)

In [50]:
class Dataset:
  def __init__(self, dataset, indices, transform=None, encoder=None):
    self.dataset, self.indices = dataset, indices
    self.transform, self.encoder = transform, encoder
  def __getitem__(self, item: int):
    assert not item > len(self.indices), "the index is out of bound"
    idx = self.indices[item]
    feature, label = self.dataset.data[idx], self.dataset.target[idx]
    if self.transform: feature = self.transform(feature)
    if self.encoder: label = self.encoder(label)
    return feature, label
  def __len__(self): return len(self.indices)

In [51]:
indices = random.sample(range(iris_set.data.__len__()), 100)
encoder = lambda index: np.eye(n_classes)[index]

# init Datasets
trainset = Dataset(iris_set, indices[:50], encoder=encoder)
testset = Dataset(iris_set, indices[50:], encoder=encoder)

# Understanding Neural Network Foundations: Perceptron, ADALINE and MLP
Understanding Neural Network Foundations: Perceptron, ADALINE and MLP

## Perceptron

**Perceptron (or Binary Classifier)** is an algorithm that classifies data into two categories by determining which class an input belongs to. Unlike regression models which focus on prediction and analysis, the perceptron specializes in classifying inputs into binary classes represented as $0, 1$.

Perceptron was invented by Warren McCulloch and Walter Pitts in 1943. Rather than software, it was implemented as a machine—the IBM 704—designed to classify images. Later, the Mark 1 Perceptron emerged as the software version of this system.

### Formulation and Learning Rule
It does not use partial derivatives, but instead relies on the simple linear separability of the data.

$$
f(z) = f[w(t) \cdot x_j]
$$

1. Initialize weights to $0$ or a random number.
2. Calculate outputs: $y_j(t) = f[w(t) \cdot x_j]$
3.  Update weights: $w_i(t + 1) = w_i(t) + r \cdot (d_j - y_j(t)) \cdot x_{j, i}$

In [52]:
class Perceptron:
  def __init__(self, n_inpt, n_ouput):
    self.n_ouput, self.n_inpt = n_ouput, n_inpt
    self.weight = np.zeros(shape=(self.n_inpt, self.n_ouput))
  # __init__

  def forward(self, x): return np.dot(x.T, self.weight)

In [53]:
def hinge_loss(y, o):
  return np.sum(np.maximum(0, -1 * (y - o)))

In [54]:
def GDR(model, lr):
  def _GDR(x, y):
    pred = model.forward(x)
    error = y - pred
    grads = np.dot(x.reshape(1, -1).T, error.reshape(1, -1))
    model.weight += grads * lr
  return _GDR

In [55]:
progress_bar = tqdm(range(1000), desc="Training Perceptron", unit="epoch", leave=True, dynamic_ncols=True)

# init and train a model
model = Perceptron(4, 3)
optimizer = GDR(model, 0.001)
for _ in progress_bar:
  loss = 0.
  for feature, label in trainset:
    optimizer(feature, label)
    loss += hinge_loss(model.forward(feature), label)
  progress_bar.set_postfix({"loss": f"{loss:.2f}"})
# for for

Training Perceptron: 100%|██████████| 1000/1000 [00:05<00:00, 167.59epoch/s, loss=15.33]


In [56]:
count, n_samples = 0, len(testset)
for feature, label in testset:
  pred = model.forward(feature)
  if np.argmax(pred) == np.argmax(label): count += 1
print(f"accuracy: {count / n_samples:.2f}({count}/{n_samples})")

accuracy: 0.84(42/50)


### Comparing Perceptron and Logistic Regression
The perceptron is often confused with logistic regression. Though they share similarities, they are entirely different concepts.

- Perceptron outputs a hard class label based on a threshold, while logistic regression outputs a probability using the sigmoid function.
- Logistic regression is a probabilistic model, while the perceptron is deterministic.
- The perceptron uses hinge loss instead of gradient descent rules and cross-entropy.
- The perceptron converges only when data is linearly separable, while logistic regression always converges.

## ADALINE
**ADALINE (Adaptive Linear Neuron, Adaptive Linear Element)** is an enhanced version of the perceptron. It classifies data using a layer of parallel perceptrons. This structure serves as a prototype for artificial neural networks.

### Formulation and Learning Rule
It takes multiple inputs and produce a single output as a multi-layer neural network composed of various nodes.

$$
y = \sum^{n}_{j=0}x_jw_j + \theta
$$
- $x$ represents the input vector while $x_0 = 1$ is bias.
- $y$ represents the model's output.
- $w$ represents the weights, where $w_0 = 0$ is used for local bias.
- $n$ is the number of inputs in the dataset.
- $\theta$ represents the global bias constant.

**Learning Rule:**
- The least mean square error is calculated as $E = (o - y)^2$.
- Update the weights: $w \leftarrow w + \eta(o-y) \cdot x$.
    - $\eta$ represents the learning rate.
    - $o$ represents the target output value.

MADALINE (Many ADALINE), a variant of ADALINE, uses a structure that connects three ADALINE units linearly. It is similar to modern neural networks but differs in that it uses different functions per layer, making backpropagation impossible.

In [57]:
class ADALINE:
  def __init__(self, n_inpt, n_ouput):
    self.n_ouput, self.n_inpt = n_ouput, n_inpt
    self.weight = np.zeros(shape=(self.n_inpt, self.n_ouput))
  # __init__

  def forward(self, x): return relu(np.dot(x.T, self.weight))
# ADALINE

In [58]:
def sigmoid(x): return 1 / (1 + np.exp(-1 * x))
def relu(x): return np.maximum(0, x)

In [59]:
progress_bar = tqdm(range(1000))

# init and train a model
model = ADALINE(4, 3)
optimizer = GDR(model, 0.001)
for _ in progress_bar:
  for feature, label in trainset:
    optimizer(feature, label)

100%|██████████| 1000/1000 [00:01<00:00, 659.96it/s]


In [60]:
count, n_samples = 0, len(testset)
for feature, label in testset:
  pred = model.forward(feature)
  if np.argmax(pred) == np.argmax(label): count += 1
print(f"accuracy: {count / n_samples:.2f}({count}/{n_samples})")

accuracy: 0.92(46/50)


## MLP, Multi-Layer Perceptron

**Multi-Layer Perceptron ( or MLP, fully connected artificial neural network)** consists of three layers of perceptrons with non-linear activation functions. Its primary purpose is to classify inputs in a linear manner.

- Every perceptron in an MLP is fully connected to the next layer of perceptrons, giving each perceptron multidimensional weights $w_{i,j}$.
- These perceptrons function as signal processing units, similar to neurons in the human brain, which is why they are called **neurons**.
- Every neuron has an activation function that maps scalar responses to a non-linear number range. This is a crucial concept in MLPs that enables their functionality and improves their performance.
    - In most implementations, MLPs use the hyperbolic tangent or sigmoid function as their activation function.

It is considered a very prototype of modern neural networks and is sometimes called a “vanilla neural network”, though it differs in its use of forward propagation.

### Formulation and Learning Rule

The learning rule is based on the concept of a neuron; it updates the neuron's weights by calculating partial derivatives of their cost/loss function. This represents a fundamental mechanism of modern deep learning.

$$
w_j(l)(n) = \sum_{i} w_{ji}(l)(n)⋅y_i(l−1)(n)+b_j(l)(n)y_j(l)(n)=\phi\large(w_j(l)(n)\large)y_j(l)(n)=\phi \large(w_j(l)(n)\large)
$$

- Calculate the error $\epsilon$ at the output layer using $\epsilon_j(n) = d_j(n) - y_j(n)$, where $d_j(n)$ represents the desired output values of the model.
    - $\therefore e(n) = \frac{1}{2} \cdot \sum
    {e_j^2(n)}$
- Update using the gradient descent rule: $\Delta w_{ji}(n) = \eta \cdot 
\frac{\Delta \epsilon(n)}{\Delta w_{j}(n)} \cdot y_i(n)$.
    - $e_j$ represents the output of layer j.
    - $y_i(n)$ represents the output of neuron i in the previous layer.
    - $w_i$ represents the weights of neuron i, while $\eta$ is the learning rate.
    - $\frac{\Delta \epsilon(n)}{\Delta w_i(n)}$ is the partial derivative of $\epsilon$ with respect to $w_i$, which can be expressed as: -$\frac{d\epsilon(n)}{dv_j(n)}=e_j(n)\phi'(w_j(n)) = \phi'(w_j(n)) \cdot \sum_{k}-\frac{d\epsilon(n)}{dv_k(n)}w_{kj}(n)$ where $\phi$ represents the activation function.

In [61]:
class MLP:
  def __init__(self, n_inpt, n_ouput, n_hidden=10):
    self.n_ouput, self.n_inpt = n_ouput, n_inpt
    self.weight1 = np.zeros(shape=(self.n_inpt, n_hidden))
    self.weight2 = np.zeros(shape=(n_hidden, self.n_ouput))
  # __init__

  def forward(self, x):
    hidden = relu(np.dot(x.T, self.weight1))
    return sigmoid(np.dot(hidden.T, self.weight2))

In [62]:
def GDR(model, lr):
  def _GDR(x, y):
    hidden = relu(np.dot(x.T, model.weight1))
    pred = sigmoid(np.dot(hidden.T, model.weight2))

    error = y - pred
    grads2 = np.dot(hidden.reshape(1, -1).T, error.reshape(1, -1))
    model.weight2 += grads2 * lr

    error_hidden = np.dot(error, model.weight2.T) * (hidden > 0)
    grads1 = np.dot(x.reshape(1, -1).T, error_hidden.reshape(1, -1))
    model.weight1 += grads1 * lr
  return _GDR

In [63]:
progress_bar = tqdm(range(500), desc="Training MLP", leave=True, dynamic_ncols=True)

# init and train a model
model = MLP(4, 3, n_hidden=12)
optimizer = GDR(model, 0.001)
for _ in progress_bar:
  loss = 0.
  for feature, label in trainset:
    optimizer(feature, label)
    loss += hinge_loss(model.forward(feature), label)
  progress_bar.set_postfix(loss=f"{loss.item():.2f}")

Training MLP: 100%|██████████| 500/500 [00:04<00:00, 107.94it/s, loss=25.00]


In [64]:
count, n_samples = 0, len(testset)

for feature, label in testset:
    pred = model.forward(feature)
    if np.argmax(pred) == np.argmax(label): count += 1
print(f"accuracy: {count / n_samples:.2f}({count}/{n_samples})")

accuracy: 0.42(21/50)
