**Импортируем все необходимые библиотеки**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tqdm

**Подгрузим датасет**

In [None]:
def load_mnist_data():
    """Load MNIST through Keras and prepare it for a dense network.

    Each image array is reshaped to one row per sample, cast to float32 and
    divided by 255; each label array is passed through ``one_hot_encode``.

    Returns:
        Tuple ``(X_train, y_train_encoded, X_test, y_test_encoded)``.
    """
    (train_images, train_labels), (test_images, test_labels) = keras.datasets.mnist.load_data()

    # Identical preprocessing for both splits: flatten per sample, then scale.
    X_train, X_test = (
        images.reshape(images.shape[0], -1).astype('float32') / 255
        for images in (train_images, test_images)
    )
    y_train_encoded, y_test_encoded = (
        one_hot_encode(labels) for labels in (train_labels, test_labels)
    )

    return X_train, y_train_encoded, X_test, y_test_encoded

def one_hot_encode(labels, num_classes=10):
    """Convert integer class labels into a one-hot matrix.

    Args:
        labels: Sequence or 1-D array of integer class indices.
        num_classes: Width of the encoding (number of columns).

    Returns:
        Float array of shape ``(len(labels), num_classes)`` holding a single
        1 per row, in the column given by that row's label.
    """
    labels = np.asarray(labels)
    encoded = np.zeros((len(labels), num_classes))
    # One vectorized assignment replaces a Python-level loop over every
    # sample. The size guard keeps an empty input (which NumPy types as
    # float) from being used as an index array.
    if labels.size:
        encoded[np.arange(len(labels)), labels] = 1
    return encoded

**Реализуем функции активации и их производные**

In [None]:
def sigmoid(x):
    """Compute the logistic function ``1 / (1 + exp(-x))`` element-wise.

    Only ``exp(-|x|)`` is evaluated, so large-magnitude inputs cannot
    overflow ``np.exp`` (the direct formula overflows for very negative x).

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        Array with the shape of ``x``; a NumPy scalar when ``x`` is a scalar.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # never larger than 1, so it cannot overflow
    # Two algebraically equal forms of the same function, one per sign of x.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d result into a scalar and leaves
    # ordinary arrays unchanged.
    return result[()]

def sigmoid_derivative(x):
    """Return the derivative of the logistic function evaluated at ``x``.

    Uses the identity ``s'(x) = s(x) * (1 - s(x))`` where ``s`` is ``sigmoid``.
    """
    activation = sigmoid(x)
    return activation * (1 - activation)

def relu(x):
    """Apply the rectified linear unit: element-wise maximum of 0 and ``x``."""
    lower_bound = 0
    return np.maximum(lower_bound, x)

def relu_derivative(x):
    """Return the ReLU derivative: 1.0 where ``x > 0`` and 0.0 elsewhere.

    ``np.greater`` is used instead of the ``>`` operator so that plain Python
    numbers and lists are accepted as well as arrays; ``x > 0`` on a Python
    number yields a built-in ``bool``, which has no ``astype`` method.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        Float array (or NumPy float scalar) with the shape of ``x``.
    """
    return np.greater(x, 0).astype(float)


**Функция ошибки и ее производная**

In [None]:
def cross_entropy_loss(y_true, y_pred):
    """Return the binary cross-entropy averaged over every array element.

    Predictions are clipped to ``[1e-15, 1 - 1e-15]`` before the logarithms
    are taken so that neither ``log`` call receives 0.

    Args:
        y_true: Array of target values.
        y_pred: Array of predicted probabilities, same shape as ``y_true``.

    Returns:
        The negated mean of ``y*log(p) + (1-y)*log(1-p)`` over all elements.
    """
    tiny = 1e-15
    probs = np.clip(y_pred, tiny, 1 - tiny)
    positive_term = y_true * np.log(probs)
    negative_term = (1 - y_true) * np.log(1 - probs)
    return -np.mean(positive_term + negative_term)

def cross_entropy_loss_derivative(y_true, y_pred):
    """Return the element-wise derivative of binary cross-entropy w.r.t. ``y_pred``.

    Predictions are clipped to ``[1e-15, 1 - 1e-15]`` first, which keeps the
    denominator ``p * (1 - p)`` away from zero.

    Args:
        y_true: Array of target values.
        y_pred: Array of predicted probabilities, same shape as ``y_true``.

    Returns:
        Array ``(p - y) / (p * (1 - p))`` computed from the clipped ``p``.
    """
    tiny = 1e-15
    probs = np.clip(y_pred, tiny, 1 - tiny)
    numerator = probs - y_true
    denominator = probs * (1 - probs)
    return numerator / denominator

**Инициализируем параметры**

In [None]:
def initialize_parameters(input_size, hidden_size1, hidden_size2, output_size):
    """Create the weights and biases of a three-layer dense network.

    Weights are drawn from a standard normal and scaled by
    ``sqrt(2 / fan_in)`` (He initialization). A fixed small scale such as
    0.01 shrinks activations and gradients at every ReLU layer, which stalls
    plain gradient descent; scaling by the fan-in keeps their magnitude
    roughly constant from layer to layer.

    Args:
        input_size: Number of input features.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        output_size: Number of output units.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)``; each ``W`` has shape
        ``(fan_in, fan_out)`` and each ``b`` is a zero array of shape
        ``(1, fan_out)``.
    """
    def _he_normal(fan_in, fan_out):
        # max(..., 1) only guards the degenerate zero-width case against a
        # division by zero; the resulting array is empty anyway.
        return np.random.randn(fan_in, fan_out) * np.sqrt(2.0 / max(fan_in, 1))

    W1 = _he_normal(input_size, hidden_size1)
    b1 = np.zeros((1, hidden_size1))
    W2 = _he_normal(hidden_size1, hidden_size2)
    b2 = np.zeros((1, hidden_size2))
    W3 = _he_normal(hidden_size2, output_size)
    b3 = np.zeros((1, output_size))
    return W1, b1, W2, b2, W3, b3

**Реализация прохода вперед**

In [None]:
def forward_propagation(X, W1, b1, W2, b2, W3, b3):
    """Run the network forward: two ReLU layers followed by a sigmoid layer.

    Args:
        X: Input batch (presumably one sample per row; it is right-multiplied
            by ``W1``).
        W1, b1: Weights and bias of the first layer.
        W2, b2: Weights and bias of the second layer.
        W3, b3: Weights and bias of the output layer.

    Returns:
        Tuple ``(Z1, A1, Z2, A2, Z3, A3)`` with each layer's pre-activation
        ``Z`` and activation ``A``; ``A3`` is the network output.
    """
    pre_hidden1 = np.dot(X, W1) + b1
    hidden1 = relu(pre_hidden1)

    pre_hidden2 = np.dot(hidden1, W2) + b2
    hidden2 = relu(pre_hidden2)

    pre_output = np.dot(hidden2, W3) + b3
    output = sigmoid(pre_output)

    return pre_hidden1, hidden1, pre_hidden2, hidden2, pre_output, output

**Реализация алгоритма обратного распространения ошибки**

In [None]:
def backward_propagation(X, y, Z1, A1, Z2, A2, Z3, A3, W1, W2, W3):
    """Compute the gradients of the loss for all weights and biases.

    Args:
        X: Input batch that was fed to the forward pass.
        y: Target array with the same shape as ``A3``.
        Z1, A1: Pre-activation and activation of the first ReLU layer.
        Z2, A2: Pre-activation and activation of the second ReLU layer.
        Z3: Pre-activation of the output layer. Kept for interface
            compatibility; not needed by the simplified output delta.
        A3: Sigmoid output of the network.
        W1, W2, W3: Layer weights (``W1`` is accepted but not used here).

    Returns:
        Tuple ``(dW1, db1, dW2, db2, dW3, db3)``, each divided by the number
        of rows in ``y``.
    """
    m = y.shape[0]

    # For a sigmoid output with binary cross-entropy the chain-rule product
    #   (A3 - y) / (A3 * (1 - A3)) * A3 * (1 - A3)
    # is exactly A3 - y. Evaluating the two factors separately multiplies a
    # huge number by ~0 once the output saturates, which zeroes the gradient
    # precisely for confidently wrong predictions.
    dZ3 = A3 - y
    dW3 = (1/m) * np.dot(A2.T, dZ3)
    db3 = (1/m) * np.sum(dZ3, axis=0, keepdims=True)

    # Propagate through the second ReLU layer.
    dZ2 = np.dot(dZ3, W3.T) * relu_derivative(Z2)
    dW2 = (1/m) * np.dot(A1.T, dZ2)
    db2 = (1/m) * np.sum(dZ2, axis=0, keepdims=True)

    # Propagate through the first ReLU layer.
    dZ1 = np.dot(dZ2, W2.T) * relu_derivative(Z1)
    dW1 = (1/m) * np.dot(X.T, dZ1)
    db1 = (1/m) * np.sum(dZ1, axis=0, keepdims=True)

    return dW1, db1, dW2, db2, dW3, db3

**Обновление параметров сети**

In [None]:
def update_parameters(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, learning_rate):
    """Apply one gradient-descent step to every parameter.

    Each parameter is updated with the augmented assignment
    ``param -= learning_rate * grad``, so array arguments are modified in
    place and the same objects are returned.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)`` after the step.
    """
    pairs = (
        (W1, dW1),
        (b1, db1),
        (W2, dW2),
        (b2, db2),
        (W3, dW3),
        (b3, db3),
    )

    stepped = []
    for param, grad in pairs:
        param -= learning_rate * grad
        stepped.append(param)

    return tuple(stepped)

**Обучение сети**

In [None]:
def train_network(X_train, y_train, hidden_size1, hidden_size2, learning_rate, epochs):
    """Train the three-layer network with full-batch gradient descent.

    Every epoch runs one forward pass, one backward pass and one parameter
    update over the whole of ``X_train``. On every tenth epoch (starting at
    0) the loss of that epoch's forward pass is printed.

    Args:
        X_train: Training inputs; the second dimension gives the input size.
        y_train: Training targets; the second dimension gives the output size.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        learning_rate: Step size passed to ``update_parameters``.
        epochs: Number of training iterations.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)`` of trained parameters.
    """
    params = initialize_parameters(
        X_train.shape[1], hidden_size1, hidden_size2, y_train.shape[1]
    )

    for epoch in tqdm.tqdm(range(epochs), desc="Training"):
        cache = forward_propagation(X_train, *params)

        W1, _, W2, _, W3, _ = params
        grads = backward_propagation(X_train, y_train, *cache, W1, W2, W3)
        params = update_parameters(*params, *grads, learning_rate)

        if epoch % 10 == 0:
            # cache[-1] is A3 from the forward pass above, i.e. the output
            # produced before this epoch's update.
            loss = cross_entropy_loss(y_train, cache[-1])
            print(f'Epoch: {epoch}, Loss: {loss}')

    return params

**Тестирование сети**

In [None]:
def test_network(X_test, y_test, W1, b1, W2, b2, W3, b3):
    """Print the classification accuracy of the network on a test set.

    The predicted class of each sample is the arg-max of the network output.
    ``y_test`` may be given either as class indices or one-hot encoded; a
    one-hot matrix is decoded to indices first, because comparing an
    ``(N,)`` prediction vector with an ``(N, C)`` matrix does not broadcast
    and cannot yield a per-sample match.

    Args:
        X_test: Test inputs.
        y_test: Test labels, shape ``(N,)``/``(N, 1)`` (class indices) or
            ``(N, C)`` (one-hot).
        W1, b1, W2, b2, W3, b3: Trained network parameters.
    """
    *_, A3_test = forward_propagation(X_test, W1, b1, W2, b2, W3, b3)
    y_pred_test = np.argmax(A3_test, axis=1)

    y_true = np.asarray(y_test)
    if y_true.ndim == 2 and y_true.shape[1] > 1:
        # One-hot rows -> class index per sample.
        y_true = np.argmax(y_true, axis=1)
    else:
        # Already class indices; flatten a possible column vector.
        y_true = y_true.reshape(-1)

    accuracy = np.mean(y_pred_test == y_true)
    print(f"Accuracy on test data: {accuracy * 100:.2f}%")

**Пробуем запустить**

In [None]:
# Hyperparameters: hidden-layer widths, gradient-descent step size and the
# number of training iterations.
hidden_size1, hidden_size2 = 256, 128
learning_rate = 0.1
epochs = 100

# load_mnist_data returns the labels of both splits one-hot encoded.
X_train, y_train, X_test, y_test = load_mnist_data()

W1, b1, W2, b2, W3, b3 = train_network(
    X_train,
    y_train,
    hidden_size1=hidden_size1,
    hidden_size2=hidden_size2,
    learning_rate=learning_rate,
    epochs=epochs,
)

test_network(X_test, y_test, W1, b1, W2, b2, W3, b3)

Training:   1%|          | 1/100 [00:04<06:50,  4.15s/it]

Epoch: 0, Loss: 0.6930340312344964


Training:  11%|█         | 11/100 [00:34<04:47,  3.23s/it]

Epoch: 10, Loss: 0.5607586928295839


Training:  21%|██        | 21/100 [01:06<04:06,  3.12s/it]

Epoch: 20, Loss: 0.3440434765695895


Training:  31%|███       | 31/100 [01:35<03:22,  2.94s/it]

Epoch: 30, Loss: 0.33085363713364146


Training:  41%|████      | 41/100 [02:05<02:50,  2.88s/it]

Epoch: 40, Loss: 0.33033314529527824


Training:  51%|█████     | 51/100 [02:35<02:17,  2.81s/it]

Epoch: 50, Loss: 0.32989236864934784


Training:  61%|██████    | 61/100 [03:07<01:51,  2.85s/it]

Epoch: 60, Loss: 0.3295011715879777


Training:  71%|███████   | 71/100 [03:38<01:32,  3.19s/it]

Epoch: 70, Loss: 0.3291393242016581


Training:  81%|████████  | 81/100 [04:08<01:01,  3.22s/it]

Epoch: 80, Loss: 0.32879062047906993


Training:  91%|█████████ | 91/100 [04:38<00:27,  3.08s/it]

Epoch: 90, Loss: 0.32844051216567155


Training: 100%|██████████| 100/100 [05:05<00:00,  3.06s/it]

Accuracy on test data: 10.73%



