In [1]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def dsigmoid(x):
    s = sigmoid(x)
    return s * (1 - s)

def tanh(x):
    return np.tanh(x)

def dtanh(x):
    return 1 - tanh(x) ** 2

class GRU:
    def __init__(self, input_size, hidden_size, output_size, lr=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lr = lr

        # Update gate
        self.Wz = np.random.randn(hidden_size, input_size) * 0.1
        self.Uz = np.random.randn(hidden_size, hidden_size) * 0.1
        self.bz = np.zeros((hidden_size, 1))

        # Reset gate
        self.Wr = np.random.randn(hidden_size, input_size) * 0.1
        self.Ur = np.random.randn(hidden_size, hidden_size) * 0.1
        self.br = np.zeros((hidden_size, 1))

        # Candidate hidden state
        self.Wh = np.random.randn(hidden_size, input_size) * 0.1
        self.Uh = np.random.randn(hidden_size, hidden_size) * 0.1
        self.bh = np.zeros((hidden_size, 1))

        # Output layer
        self.Wy = np.random.randn(output_size, hidden_size) * 0.1
        self.by = np.zeros((output_size, 1))

    def forward(self, inputs):
        h = np.zeros((self.hidden_size, 1))
        self.cache = []

        for x in inputs:
            x = x.reshape(-1, 1)
            z = sigmoid(self.Wz @ x + self.Uz @ h + self.bz)
            r = sigmoid(self.Wr @ x + self.Ur @ h + self.br)
            h_tilde = tanh(self.Wh @ x + self.Uh @ (r * h) + self.bh)
            h_next = (1 - z) * h + z * h_tilde
            self.cache.append((x, h.copy(), z, r, h_tilde))
            h = h_next

        self.h_last = h
        y = self.Wy @ h + self.by
        self.y_pred = y
        return y

    def backward(self, target):
        dy = self.y_pred - target.reshape(-1, 1)
        dWy = dy @ self.h_last.T
        dby = dy
        dh_next = self.Wy.T @ dy

        dWz = np.zeros_like(self.Wz)
        dUz = np.zeros_like(self.Uz)
        dbz = np.zeros_like(self.bz)

        dWr = np.zeros_like(self.Wr)
dUr = np.zeros_like(self.Ur)
dbr = np.zeros_like(self.br)

dWh = np.zeros_like(self.Wh)
dUh = np.zeros_like(self.Uh)
dbh = np.zeros_like(self.bh)


        for t in reversed(range(len(self.cache))):
            x, h_prev, z, r, h_tilde = self.cache[t]
            x = x.reshape(-1, 1)

            dh = dh_next
            dz = dh * (h_tilde - h_prev)
            dh_tilde = dh * z
            dh_prev = dh * (1 - z)

            dz_raw = dz * dsigmoid(self.Wz @ x + self.Uz @ h_prev + self.bz)
            dWz += dz_raw @ x.T
            dUz += dz_raw @ h_prev.T
            dbz += dz_raw

            h_tilde_input = self.Wh @ x + self.Uh @ (r * h_prev) + self.bh
            dh_tilde_raw = dh_tilde * dtanh(h_tilde_input)
            dWh += dh_tilde_raw @ x.T
            dUh += dh_tilde_raw @ (r * h_prev).T
            dbh += dh_tilde_raw

            dr = (self.Uh @ dh_tilde_raw) * h_prev
            dr_raw = dr * dsigmoid(self.Wr @ x + self.Ur @ h_prev + self.br)
            dWr += dr_raw @ x.T
            dUr += dr_raw @ h_prev.T
            dbr += dr_raw

            dh_prev = (
                dh_prev
                + self.Uz.T @ dz_raw
                + self.Ur.T @ dr_raw
                + r * (self.Uh.T @ dh_tilde_raw)
            )
            dh_next = dh_prev

        self.Wy -= self.lr * dWy
        self.by -= self.lr * dby
        for param, grad in zip(
            [self.Wz, self.Uz, self.bz, self.Wr, self.Ur, self.br, self.Wh, self.Uh, self.bh],
            [dWz, dUz, dbz, dWr, dUr, dbr, dWh, dUh, dbh]
        ):
            param -= self.lr * grad

    def train(self, X, y, epochs=10):
        for epoch in range(epochs):
            total_loss = 0
            for i in range(len(X)):
                y_pred = self.forward(X[i])
                loss = np.sum((y_pred - y[i].reshape(-1, 1)) ** 2)
                total_loss += loss
                self.backward(y[i])
            print(f"Epoch {epoch+1}, Loss: {total_loss / len(X):.4f}")

    def predict(self, inputs):
        y = self.forward(inputs)
        return y

# Example usage
if __name__ == "__main__":
    np.random.seed(42)
    input_size = 3
    hidden_size = 4
    output_size = 1
    sequence_length = 5

    gru = GRU(input_size, hidden_size, output_size, lr=0.01)

    # Generate synthetic binary classification data
    X_train = [[np.random.randn(input_size) for _ in range(sequence_length)] for _ in range(100)]
    y_train = [np.array([1 if sum(seq[-1]) > 0 else 0]) for seq in X_train]

    # Train model
    gru.train(X_train, y_train, epochs=20)

    # Predict
    test_seq = [np.random.randn(input_size) for _ in range(sequence_length)]
    output = gru.predict(test_seq)
    print("Predicted value:", output[0][0])
    print("Predicted class:", int(output[0][0] > 0.5))


ValueError: operands could not be broadcast together with shapes (4,3) (4,4) (4,3) 