# Решение заданий (градиенты + простая сеть)

В ноутбуке реализованы: Softmax, ReLU, Sigmoid, SimpleConv, CrossEntropy и NeuralNetwork.
Также добавлены тесты grad_x через численную проверку градиента.

In [None]:
import numpy as np

def _as_2d(x):
    x = np.asarray(x, dtype=float)
    if x.ndim == 1:
        x = x[None, :]
    return x

class Layer:
    def forward(self, x):
        raise NotImplementedError

    def grad_x(self, x, grad_out):
        raise NotImplementedError

    def grad_params(self, x, grad_out):
        return []

    def num_params(self):
        return 0


class LinearLayer(Layer):                       #Forward:y=xW+b
                                                #Grad по входу: ∂L​/∂x=(∂L/∂y)*​W^T
                                                #Grad по параметрам: ∂L/∂W​=x^T*(∂L​/∂y), ∂L/∂b​=∑_n (​∂L/∂Y_n​​)
    def __init__(self, in_features, out_features, W=None, b=None, rng=None):
        rng = np.random.default_rng() if rng is None else rng
        if W is None:
            
            limit = np.sqrt(6.0 / (in_features + out_features))
            W = rng.uniform(-limit, limit, size=(in_features, out_features))
        if b is None:
            b = np.zeros((out_features,), dtype=float)

        self.W = np.asarray(W, dtype=float)
        self.b = np.asarray(b, dtype=float)

    def forward(self, x):
        x2 = _as_2d(x)
        return x2 @ self.W + self.b

    def grad_x(self, x, grad_out):
        g = _as_2d(grad_out)
        return g @ self.W.T

    def grad_params(self, x, grad_out):
        x2 = _as_2d(x)
        g = _as_2d(grad_out)
        dW = x2.T @ g
        db = g.sum(axis=0)
        return [(self.W, dW), (self.b, db)]

    def num_params(self):
        return self.W.size + self.b.size


class ReLU(Layer):                          #Forward: ReLU(x)=max(0,x)
                                            #Backward: ∂L/∂x​=(∂L/∂y)​⊙1(x>0)
    def forward(self, x):
        x2 = _as_2d(x)
        return np.maximum(0.0, x2)

    def grad_x(self, x, grad_out):
        x2 = _as_2d(x)
        g = _as_2d(grad_out)
        return g * (x2 > 0.0)


class Sigmoid(Layer):                       #Forward: σ(x)=1/1+e^(−x)
                                            #Производная: σ′(x)=σ(x)(1−σ(x))
    def forward(self, x):
        x2 = _as_2d(x)
        out = np.empty_like(x2)
        pos = x2 >= 0
        out[pos] = 1.0 / (1.0 + np.exp(-x2[pos]))
        expx = np.exp(x2[~pos])
        out[~pos] = expx / (1.0 + expx)
        return out

    def grad_x(self, x, grad_out):
        y = self.forward(x)
        g = _as_2d(grad_out)
        return g * y * (1.0 - y)


class Softmax(Layer):                       #Forward: y_i ​= e^(x_i)​​/∑_j(​e^(x_j)​)
                                            #Backward: ∂y_i/∂x_j​​​=y_i​(δ_ij​ − y_i)
    def forward(self, x):
        x2 = _as_2d(x)
        z = x2 - x2.max(axis=1, keepdims=True)
        expz = np.exp(z)
        return expz / expz.sum(axis=1, keepdims=True)

    def grad_x(self, x, grad_out):
        y = self.forward(x)
        g = _as_2d(grad_out)
        
        dot = np.sum(g * y, axis=1, keepdims=True)
        return y * (g - dot)


class CrossEntropy:                          #Если у нас вероятности y_pred и one-hot t: L = −∑_i(​t_i*​log(y_i​))
                                             #Градиент по y_pred: ∂L/∂y_i​=−(t_i​​/y_i​)
    def __init__(self, eps=1e-8, reduction="mean"):
        self.eps = float(eps)
        assert reduction in ("mean", "sum")
        self.reduction = reduction

    def __call__(self, y_pred, y_true):
        return self.forward(y_pred, y_true)

    def forward(self, y_pred, y_true):
        y = _as_2d(y_pred)
        t = _as_2d(y_true)
        losses = -np.sum(t * np.log(y + self.eps), axis=1)
        return float(losses.mean() if self.reduction == "mean" else losses.sum())

    def grad_x(self, y_pred, y_true):
        y = _as_2d(y_pred)
        t = _as_2d(y_true)
        g = -(t / (y + self.eps))
        if self.reduction == "mean":
            g = g / y.shape[0]
        return g


def one_hot(y, num_classes):
    y = np.asarray(y, dtype=int).reshape(-1)
    out = np.zeros((y.size, num_classes), dtype=float)
    out[np.arange(y.size), y] = 1.0
    return out


class NeuralNetwork:                            #Forward + cache: сохраняем входы каждого слоя
                                                #grad = dLoss/dOut
    def __init__(self, layers, loss: CrossEntropy):
        self.layers = list(layers)
        self.loss = loss

    def forward(self, x):
        out = x
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def __call__(self, x):
        return self.forward(x)

    def num_params(self):
        return sum(getattr(l, "num_params", lambda: 0)() for l in self.layers)

    def grad_x(self, x, y_true):
        
        xs = []
        out = x
        for layer in self.layers:
            xs.append(out)
            out = layer.forward(out)

        grad = self.loss.grad_x(out, y_true)
        for layer, x_in in zip(reversed(self.layers), reversed(xs)):
            grad = layer.grad_x(x_in, grad)
        return grad

    def train_one_step(self, X_batch, Y_batch, learning_rate):
        
        xs = []
        out = X_batch
        for layer in self.layers:
            xs.append(out)
            out = layer.forward(out)

        grad = self.loss.grad_x(out, Y_batch)

        # backward + SGD
        for layer, x_in in zip(reversed(self.layers), reversed(xs)):
            for param, dparam in layer.grad_params(x_in, grad):
                param -= learning_rate * dparam
            grad = layer.grad_x(x_in, grad)

    def fit_one_epoch(self, X, Y, batch_size, learning_rate, shuffle=True, rng=None):
        rng = np.random.default_rng() if rng is None else rng
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float)

        n = X.shape[0]
        idx = np.arange(n)
        if shuffle:
            rng.shuffle(idx)

        for start in range(0, n, batch_size):
            bi = idx[start:start + batch_size]
            self.train_one_step(X[bi], Y[bi], learning_rate)


class SimpleConv(Layer):                                #Forward: y[i,j]=∑_(u,v)(​x[i+u,j+v]⋅K[u,v]+b)
                                                        #dL/dx: dx[i+u,j+v]+=go[i,j]⋅K[u,v]
                                                        #dL/dK: dK[u,v]+=go[i,j]⋅x[i+u,j+v]
    
    def __init__(self, kernel, bias=0.0):
        self.K = np.asarray(kernel, dtype=float)        
        self.b = np.asarray([bias], dtype=float)        

    def forward(self, x):
        x = np.asarray(x, dtype=float)
        if x.ndim == 2:
            x = x[None, :, :]

        N, H, W = x.shape
        kh, kw = self.K.shape
        out_h, out_w = H - kh + 1, W - kw + 1

        y = np.zeros((N, out_h, out_w), dtype=float)
        for n in range(N):
            for i in range(out_h):
                for j in range(out_w):
                    patch = x[n, i:i+kh, j:j+kw]
                    y[n, i, j] = np.sum(patch * self.K) + self.b[0]
        return y

    def grad_x(self, x, grad_out):
        x = np.asarray(x, dtype=float)
        if x.ndim == 2:
            x = x[None, :, :]
        go = np.asarray(grad_out, dtype=float)
        if go.ndim == 2:
            go = go[None, :, :]

        N, H, W = x.shape
        kh, kw = self.K.shape
        out_h, out_w = H - kh + 1, W - kw + 1

        dx = np.zeros_like(x)
        for n in range(N):
            for i in range(out_h):
                for j in range(out_w):
                    dx[n, i:i+kh, j:j+kw] += go[n, i, j] * self.K
        return dx

    def grad_params(self, x, grad_out):
        x = np.asarray(x, dtype=float)
        if x.ndim == 2:
            x = x[None, :, :]
        go = np.asarray(grad_out, dtype=float)
        if go.ndim == 2:
            go = go[None, :, :]

        N, H, W = x.shape
        kh, kw = self.K.shape
        out_h, out_w = H - kh + 1, W - kw + 1

        dK = np.zeros_like(self.K)
        db = 0.0
        for n in range(N):
            for i in range(out_h):
                for j in range(out_w):
                    patch = x[n, i:i+kh, j:j+kw]
                    dK += go[n, i, j] * patch
                    db += go[n, i, j]

        return [(self.K, dK), (self.b, np.asarray([db], dtype=float))]

    def num_params(self):
        return self.K.size + self.b.size


## Тесты grad_x (finite differences)

In [2]:
import numpy as np

def finite_diff_grad(f, x, eps=1e-6):
    x = np.asarray(x, dtype=float)
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        old = x[idx]
        x[idx] = old + eps
        f1 = f(x)
        x[idx] = old - eps
        f2 = f(x)
        x[idx] = old
        grad[idx] = (f1 - f2) / (2 * eps)
        it.iternext()
    return grad

def assert_close(a, b, tol=1e-5, name=''):
    a = np.asarray(a); b = np.asarray(b)
    err = np.max(np.abs(a - b))
    if err > tol:
        raise AssertionError(f'{name} max_err={err} > {tol}')

rng = np.random.default_rng(42)

# 1) LinearLayer.grad_x
x = rng.normal(size=(3, 5))
up = rng.normal(size=(3, 4))
lin = LinearLayer(5, 4, rng=rng)
g_num = finite_diff_grad(lambda xx: float(np.sum(lin.forward(xx) * up)), x.copy())
g_an = lin.grad_x(x, up)
assert_close(g_num, g_an, 1e-5, 'LinearLayer.grad_x')

# 2) ReLU.grad_x
relu = ReLU()
x = rng.normal(size=(2, 6))
up = rng.normal(size=(2, 6))
g_num = finite_diff_grad(lambda xx: float(np.sum(relu.forward(xx) * up)), x.copy())
g_an = relu.grad_x(x, up)
assert_close(g_num, g_an, 1e-5, 'ReLU.grad_x')

# 3) Softmax.grad_x
sm = Softmax()
x = rng.normal(size=(2, 7))
up = rng.normal(size=(2, 7))  # dS/dy
g_num = finite_diff_grad(lambda xx: float(np.sum(sm.forward(xx) * up)), x.copy())
g_an = sm.grad_x(x, up)
assert_close(g_num, g_an, 1e-5, 'Softmax.grad_x')

# 4) CrossEntropy.grad_x (по y_pred)
ce = CrossEntropy(eps=1e-8, reduction='mean')
y = sm.forward(rng.normal(size=(4, 5)))
t = one_hot(rng.integers(0, 5, size=4), 5)
g_num = finite_diff_grad(lambda yy: ce.forward(yy.reshape(4, 5), t), y.copy()).reshape(4, 5)
g_an = ce.grad_x(y, t)
assert_close(g_num, g_an, 1e-5, 'CrossEntropy.grad_x')

# 5) NeuralNetwork.grad_x
layers = [LinearLayer(5, 8, rng=rng), ReLU(), LinearLayer(8, 3, rng=rng), Softmax()]
net = NeuralNetwork(layers, loss=CrossEntropy(eps=1e-8))

x = rng.normal(size=(1, 5))
t = one_hot(np.array([1]), 3)

g_num = finite_diff_grad(lambda xx: net.loss.forward(net.forward(xx.reshape(1, 5)), t), x.copy()).reshape(1, 5)
g_an = net.grad_x(x, t)
assert_close(g_num, g_an, 1e-5, 'NeuralNetwork.grad_x')

print('✅ Все grad_x тесты прошли')


✅ Все grad_x тесты прошли


## Мини-тест на MNIST

In [4]:
import numpy as np

def load_data(path='mnist.npz'):
    with np.load(path, allow_pickle=True) as f:
        x_train, y_train = f['x_train'], f['y_train']
        x_test, y_test = f['x_test'], f['y_test']
    return (x_train, y_train), (x_test, y_test)

(x_train, y_train), (x_test, y_test) = load_data('mnist.npz')

x_train = x_train.reshape((x_train.shape[0], -1)).astype(float) / 255.0
x_test = x_test.reshape((x_test.shape[0], -1)).astype(float) / 255.0

y_train_oh = one_hot(y_train, 10)
y_test_oh = one_hot(y_test, 10)

def accuracy(net, x, y_labels):
    probs = net.forward(x)
    pred = np.argmax(probs, axis=1)
    return float(np.mean(pred == y_labels))

layers = [
    LinearLayer(784, 100),
    ReLU(),
    LinearLayer(100, 10),
    Softmax(),
]
net = NeuralNetwork(layers, loss=CrossEntropy(eps=1e-8))

for epoch in range(3):
    net.fit_one_epoch(X=x_train, Y=y_train_oh, batch_size=64, learning_rate=0.05)
    print(f'epoch={epoch+1} acc={accuracy(net, x_test, y_test):.4f}')


epoch=1 acc=0.9225
epoch=2 acc=0.9367
epoch=3 acc=0.9464


In [None]:
import os
import numpy as np

def load_mnist_npz(path="mnist.npz"):
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Файл '{path}' не найден. Положи mnist.npz рядом с ноутбуком или укажи полный путь."
        )
    with np.load(path, allow_pickle=True) as f:
        x_train, y_train = f["x_train"], f["y_train"]
        x_test, y_test = f["x_test"], f["y_test"]
    return (x_train, y_train), (x_test, y_test)

def preprocess_mnist(x):
    
    x = x.reshape((x.shape[0], -1)).astype(np.float32) / 255.0
    return x


def evaluate(net, loss_fn, X, y_labels, y_onehot=None, batch_size=2048):
    n = X.shape[0]
    correct = 0
    total_loss = 0.0

    for start in range(0, n, batch_size):
        xb = X[start:start+batch_size]
        yb = y_labels[start:start+batch_size]

        probs = net.forward(xb)                 
        pred = np.argmax(probs, axis=1)
        correct += int(np.sum(pred == yb))

        if y_onehot is not None:
            yoh = y_onehot[start:start+batch_size]
            
            total_loss += float(loss_fn.forward(probs, yoh)) * xb.shape[0]

    acc = correct / n
    avg_loss = (total_loss / n) if y_onehot is not None else None
    return acc, avg_loss


def build_mnist_net(hidden=128, seed=42):
    rng = np.random.default_rng(seed)
    layers = [
        LinearLayer(784, hidden, rng=rng),
        ReLU(),
        LinearLayer(hidden, 10, rng=rng),
        Softmax(),
    ]
    loss_fn = CrossEntropy(eps=1e-8, reduction="mean")
    net = NeuralNetwork(layers, loss=loss_fn)
    return net, loss_fn


(x_train_raw, y_train), (x_test_raw, y_test) = load_mnist_npz("mnist.npz")
x_train = preprocess_mnist(x_train_raw)
x_test = preprocess_mnist(x_test_raw)

y_train_oh = one_hot(y_train, 10)
y_test_oh = one_hot(y_test, 10)

net, loss_fn = build_mnist_net(hidden=128, seed=42)


test_acc, test_loss = evaluate(net, loss_fn, x_test, y_test, y_onehot=y_test_oh)
print(f"before: test_acc={test_acc:.4f} test_loss={test_loss:.4f}")


EPOCHS = 5
LR = 0.05
BATCH = 64
rng = np.random.default_rng(123) 

for epoch in range(1, EPOCHS + 1):
    net.fit_one_epoch(X=x_train, Y=y_train_oh, batch_size=BATCH, learning_rate=LR, shuffle=True, rng=rng)

    train_acc, train_loss = evaluate(net, loss_fn, x_train, y_train, y_onehot=y_train_oh)
    test_acc, test_loss = evaluate(net, loss_fn, x_test, y_test, y_onehot=y_test_oh)

    print(f"epoch={epoch:02d} "
          f"train_acc={train_acc:.4f} train_loss={train_loss:.4f} | "
          f"test_acc={test_acc:.4f} test_loss={test_loss:.4f}")


before: test_acc=0.1779 test_loss=2.3203
epoch=01 train_acc=0.9148 train_loss=0.2960 | test_acc=0.9177 test_loss=0.2882
epoch=02 train_acc=0.9335 train_loss=0.2325 | test_acc=0.9349 test_loss=0.2282
epoch=03 train_acc=0.9438 train_loss=0.1991 | test_acc=0.9431 test_loss=0.2006
epoch=04 train_acc=0.9521 train_loss=0.1698 | test_acc=0.9507 test_loss=0.1719
epoch=05 train_acc=0.9567 train_loss=0.1517 | test_acc=0.9541 test_loss=0.1563
