In [2]:
import numpy as np

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    The textbook formula calls ``exp(-x)``, which overflows for
    large-magnitude negative ``x`` and emits a ``RuntimeWarning`` (or raises
    under ``np.errstate(over="raise")``). This version only ever
    exponentiates ``-|x|``, so the intermediate value stays in ``(0, 1]``.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        The element-wise sigmoid of ``x``; a NumPy scalar for scalar input,
        otherwise an array with the shape of ``x``.
    """
    x = np.asarray(x)
    # exp(-|x|) is at most 1, so it can never overflow.
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x)      (same arithmetic as the textbook formula)
    # x <  0: e^x / (1 + e^x)     (algebraically equal, overflow-free)
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing a 0-d array with () unwraps it, so scalars stay scalars.
    return out[()] if out.ndim == 0 else out

def dsigmoid(x):
    """Derivative of the logistic sigmoid with respect to its input.

    Args:
        x: Pre-activation value(s) at which to evaluate the derivative.

    Returns:
        ``sigmoid(x) * (1 - sigmoid(x))``, element-wise.
    """
    sig = sigmoid(x)
    complement = 1 - sig
    return sig * complement

def tanh(x):
    """Element-wise hyperbolic tangent; a thin wrapper around ``np.tanh``."""
    squashed = np.tanh(x)
    return squashed

def dtanh(x):
    """Derivative of tanh at ``x``, i.e. ``1 - tanh(x) ** 2`` element-wise."""
    t = np.tanh(x)
    return 1 - t ** 2

class LSTM:
    """Single-layer LSTM with a linear read-out on the final hidden state.

    Every gate acts on the column vector ``concat = [h_prev; x_t]`` (hidden
    state stacked on top of the input). Training is plain SGD with one
    parameter update per sequence.

    Attributes:
        Wf, Wi, Wc, Wo: Gate weights, each ``(hidden_size, hidden_size + input_size)``.
        bf, bi, bc, bo: Gate biases, each ``(hidden_size, 1)``.
        Wy, by: Read-out weight ``(output_size, hidden_size)`` and bias
            ``(output_size, 1)``.
        cache: Per-time-step tuples ``(xt, concat, ft, it, c_hat, c, ot, h)``
            recorded by the most recent ``forward`` call.
        y_pred: Output of the most recent ``forward`` call, or ``None`` if
            ``forward`` has not been called yet.
    """

    def __init__(self, input_size, hidden_size, output_size, lr=0.01):
        """Create the parameters.

        Weights are drawn from the global NumPy RNG (``np.random.randn``)
        scaled by 0.1; biases start at zero.

        Args:
            input_size: Number of features in each time-step vector.
            hidden_size: Number of hidden / cell units.
            output_size: Number of read-out units.
            lr: Learning rate used by ``backward``.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lr = lr

        concat_size = input_size + hidden_size

        def init_weights():
            return np.random.randn(hidden_size, concat_size) * 0.1

        def init_bias():
            return np.zeros((hidden_size, 1))

        # Draw order (f, i, c, o, then Wy) is part of the behaviour: changing
        # it would change the weights obtained for a given np.random seed.
        self.Wf, self.bf = init_weights(), init_bias()
        self.Wi, self.bi = init_weights(), init_bias()
        self.Wc, self.bc = init_weights(), init_bias()
        self.Wo, self.bo = init_weights(), init_bias()

        self.Wy = np.random.randn(output_size, hidden_size) * 0.1
        self.by = np.zeros((output_size, 1))

        # State shared between forward() and backward().
        self.cache = []
        self.y_pred = None

    def forward(self, inputs):
        """Run the network over one sequence.

        Hidden and cell state start at zero for every call. The per-step
        activations and the final output are stored on the instance so that
        ``backward`` can be called right afterwards.

        Args:
            inputs: Iterable of time-step vectors, each with ``input_size``
                elements (any array-like; reshaped to a column internally).

        Returns:
            Tuple ``(y, h)``: the read-out ``(output_size, 1)`` computed from
            the last hidden state, and that hidden state ``(hidden_size, 1)``.
        """
        h = np.zeros((self.hidden_size, 1))
        c = np.zeros((self.hidden_size, 1))
        self.cache = []

        for xt in inputs:
            xt = np.asarray(xt).reshape(-1, 1)
            concat = np.vstack((h, xt))  # hidden part first, input part second

            ft = sigmoid(self.Wf @ concat + self.bf)   # forget gate
            it = sigmoid(self.Wi @ concat + self.bi)   # input gate
            c_hat = tanh(self.Wc @ concat + self.bc)   # candidate cell state
            c = ft * c + it * c_hat
            ot = sigmoid(self.Wo @ concat + self.bo)   # output gate
            h = ot * tanh(c)

            self.cache.append((xt, concat, ft, it, c_hat, c, ot, h))

        y = self.Wy @ h + self.by
        # Remember the prediction so backward() does not depend on train()
        # having assigned it.
        self.y_pred = y
        return y, h

    def backward(self, target):
        """Back-propagate through time and apply one SGD step.

        Uses the activations cached by the most recent ``forward`` call. The
        output error is ``y_pred - target``, i.e. the gradient of
        ``0.5 * sum((y_pred - target) ** 2)``.

        Args:
            target: Desired output with ``output_size`` elements; reshaped to
                a column vector.

        Raises:
            RuntimeError: If ``forward`` has not been called on this instance.
        """
        if self.y_pred is None:
            raise RuntimeError("backward() called before forward()")

        # A column target avoids (k, 1) - (k,) broadcasting into a (k, k) matrix.
        target = np.asarray(target).reshape(-1, 1)

        dWf = np.zeros_like(self.Wf)
        dWi = np.zeros_like(self.Wi)
        dWc = np.zeros_like(self.Wc)
        dWo = np.zeros_like(self.Wo)

        dbf = np.zeros_like(self.bf)
        dbi = np.zeros_like(self.bi)
        dbc = np.zeros_like(self.bc)
        dbo = np.zeros_like(self.bo)

        dy = self.y_pred - target
        if self.cache:
            last_h = self.cache[-1][-1]
        else:
            # Empty sequence: forward() read out from the zero initial state.
            last_h = np.zeros((self.hidden_size, 1))
        dWy = dy @ last_h.T
        dby = dy.copy()

        dh_next = self.Wy.T @ dy
        dc_next = np.zeros_like(dh_next)

        for t in reversed(range(len(self.cache))):
            _, concat, ft, it, c_hat, c, ot, _ = self.cache[t]
            c_prev = self.cache[t - 1][5] if t > 0 else np.zeros_like(c)
            tanh_c = tanh(c)

            # Gate derivatives come straight from the cached activations
            # (sigmoid' = s * (1 - s), tanh' = 1 - tanh**2), so the
            # pre-activations never have to be recomputed.

            # Output gate.
            dOt_raw = dh_next * tanh_c * (ot * (1 - ot))
            dWo += dOt_raw @ concat.T
            dbo += dOt_raw

            # Cell state: contribution through h plus the one carried back
            # from the following time step.
            dc = dh_next * ot * (1 - tanh_c ** 2) + dc_next

            # Candidate cell state.
            dc_hat_raw = dc * it * (1 - c_hat ** 2)
            dWc += dc_hat_raw @ concat.T
            dbc += dc_hat_raw

            # Input gate.
            dIt_raw = dc * c_hat * (it * (1 - it))
            dWi += dIt_raw @ concat.T
            dbi += dIt_raw

            # Forget gate.
            dFt_raw = dc * c_prev * (ft * (1 - ft))
            dWf += dFt_raw @ concat.T
            dbf += dFt_raw

            d_concat = (
                self.Wf.T @ dFt_raw +
                self.Wi.T @ dIt_raw +
                self.Wc.T @ dc_hat_raw +
                self.Wo.T @ dOt_raw
            )
            # concat = [h_prev; x_t], so the leading rows belong to h_prev.
            dh_next = d_concat[:self.hidden_size, :]
            dc_next = dc * ft

        # Gradient descent update (in place, so the attributes keep their identity).
        for W, dW in zip(
            [self.Wf, self.Wi, self.Wc, self.Wo, self.Wy],
            [dWf, dWi, dWc, dWo, dWy]
        ):
            W -= self.lr * dW

        for b, db in zip(
            [self.bf, self.bi, self.bc, self.bo, self.by],
            [dbf, dbi, dbc, dbo, dby]
        ):
            b -= self.lr * db

    def train(self, X_train, y_train, epochs=10):
        """Train with one SGD update per sequence and print the epoch loss.

        The printed value is the sum of squared errors accumulated over the
        epoch, divided by ``len(X_train)``.

        Args:
            X_train: Sequence of input sequences (each accepted by ``forward``).
            y_train: Matching targets, one per input sequence.
            epochs: Number of passes over the data.
        """
        for epoch in range(epochs):
            loss = 0
            for x_seq, y_true in zip(X_train, y_train):
                target = np.asarray(y_true).reshape(-1, 1)
                y_pred, _ = self.forward(x_seq)
                loss += np.sum((y_pred - target) ** 2)
                self.backward(target)
            avg_loss = loss / len(X_train)
            print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}")


In [4]:
if __name__ == "__main__":
    # Seed the global NumPy RNG first: the weight initialisation and the data
    # generation below both draw from it, in this order.
    np.random.seed(42)

    # Problem dimensions.
    input_size, hidden_size, output_size = 3, 4, 1
    seq_len = 5

    model = LSTM(input_size, hidden_size, output_size, lr=0.01)

    # Synthetic training set: 100 sequences of `seq_len` Gaussian vectors.
    X_train = [
        [np.random.randn(input_size) for _step in range(seq_len)]
        for _sample in range(100)
    ]
    # Binary label: 1 when the components of the final time step sum to a
    # positive number, otherwise 0.
    y_train = [np.array([int(sum(seq[-1]) > 0)]) for seq in X_train]

    model.train(X_train, y_train, epochs=20)

    # Score one freshly sampled sequence and threshold the raw output at 0.5.
    test_seq = [np.random.randn(input_size) for _step in range(seq_len)]
    prediction, _ = model.forward(test_seq)
    print("Test Prediction (raw):", prediction)
    print("Predicted Label:", int(prediction[0, 0] > 0.5))


Epoch 1: Loss = 0.3382
Epoch 2: Loss = 0.2578
Epoch 3: Loss = 0.2462
Epoch 4: Loss = 0.2432
Epoch 5: Loss = 0.2411
Epoch 6: Loss = 0.2390
Epoch 7: Loss = 0.2366
Epoch 8: Loss = 0.2340
Epoch 9: Loss = 0.2311
Epoch 10: Loss = 0.2279
Epoch 11: Loss = 0.2244
Epoch 12: Loss = 0.2206
Epoch 13: Loss = 0.2165
Epoch 14: Loss = 0.2121
Epoch 15: Loss = 0.2075
Epoch 16: Loss = 0.2026
Epoch 17: Loss = 0.1975
Epoch 18: Loss = 0.1923
Epoch 19: Loss = 0.1869
Epoch 20: Loss = 0.1816
Test Prediction (raw): [[0.6210284]]
Predicted Label: 1
