In [1]:
import numpy as np

class SimpleRNN:
    """
    Vanilla tanh recurrent layer trained with backpropagation through time.

    The recurrence is ``h_t = tanh(x_t @ W_x + h_{t-1} @ W_h + b)``.
    ``forward`` returns only the hidden state of the last time step, so
    ``backward`` expects the gradient with respect to that final state.
    """

    def __init__(self, n_features, n_nodes):
        """
        Initialize SimpleRNN

        Parameters:
            n_features: Number of input features per time step
            n_nodes: Number of hidden units
        """
        # Small Gaussian weights, zero bias
        self.W_x = np.random.randn(n_features, n_nodes) * 0.01  # Input weights
        self.W_h = np.random.randn(n_nodes, n_nodes) * 0.01     # Hidden state weights
        self.b = np.zeros(n_nodes)                               # Bias term

        # Values saved by forward() for use in backward()
        self.cache = None
        self.dW_x = None  # Gradients for input weights
        self.dW_h = None  # Gradients for hidden state weights
        self.db = None    # Gradients for bias

    def forward(self, x, h_prev=None):
        """
        Forward pass of SimpleRNN

        Parameters:
            x: Input data with shape (batch_size, n_sequences, n_features)
            h_prev: Initial hidden state (if None, initialize with zeros)

        Returns:
            h: Final hidden state after processing all sequences
        """
        batch_size, n_sequences, n_features = x.shape
        n_nodes = self.W_x.shape[1]

        # Initialize the hidden state if not provided
        if h_prev is None:
            h_prev = np.zeros((batch_size, n_nodes))

        # Per-step inputs, hidden states, incoming hidden states and
        # pre-activations, keyed by time step
        xs, hs, hs_prev, as_ = {}, {}, {}, {}
        hs[-1] = h_prev
        h = h_prev

        for t in range(n_sequences):
            xs[t] = x[:, t, :]
            hs_prev[t] = h

            # Pre-activation, then tanh
            a = np.dot(xs[t], self.W_x) + np.dot(h, self.W_h) + self.b
            as_[t] = a
            h = np.tanh(a)
            hs[t] = h

        # Store values for backpropagation
        self.cache = (xs, hs, hs_prev, as_)

        return h

    def backward(self, dh, learning_rate=0.01):
        """
        Backward pass of SimpleRNN (backpropagation through time)

        Must be called after forward(); it uses the values cached there.
        The parameters are updated in place with one gradient-descent step,
        and the raw gradients are kept in dW_x, dW_h and db.

        Parameters:
            dh: Gradient of loss with respect to the final hidden state,
                shape (batch_size, n_nodes)
            learning_rate: Learning rate for weight updates

        Returns:
            dx: Gradient of loss with respect to input x
            dh_prev: Gradient of loss with respect to initial hidden state
        """
        xs, hs, hs_prev, as_ = self.cache

        batch_size, n_nodes = dh.shape
        n_sequences = len(xs)
        n_features = self.W_x.shape[0]

        # Initialize gradients
        dW_x = np.zeros_like(self.W_x)
        dW_h = np.zeros_like(self.W_h)
        db = np.zeros_like(self.b)
        dx = np.zeros((batch_size, n_sequences, n_features))

        # Gradient flowing into h_t. The loss only sees the last hidden
        # state, so at the last step this is dh, and at every earlier step
        # it is exactly what the following step sends back through W_h.
        dh_t = dh

        # Backpropagate through time
        for t in reversed(range(n_sequences)):
            # Through tanh: d tanh(a)/da = 1 - tanh(a)^2 = 1 - h_t^2
            da = dh_t * (1 - hs[t] ** 2)

            # Parameters are shared across time steps, so their gradients sum
            db += np.sum(da, axis=0)
            dW_x += np.dot(xs[t].T, da)
            dW_h += np.dot(hs_prev[t].T, da)

            # Gradient of loss with respect to this step's input
            dx[:, t, :] = np.dot(da, self.W_x.T)

            # Gradient handed to the previous hidden state
            dh_t = np.dot(da, self.W_h.T)

        # After the loop dh_t is the gradient w.r.t. the initial hidden state
        dh_prev = dh_t

        # Update parameters (after the loop, so every step used the same weights)
        self.W_x -= learning_rate * dW_x
        self.W_h -= learning_rate * dW_h
        self.b -= learning_rate * db

        # Store gradients
        self.dW_x = dW_x
        self.dW_h = dW_h
        self.db = db

        return dx, dh_prev

class FullyConnected:
    """
    Affine layer ``out = x @ W + b`` that applies its own SGD step in backward().
    """

    def __init__(self, input_size, output_size):
        """
        Initialize FullyConnected

        Parameters:
            input_size: Number of input features
            output_size: Number of output units
        """
        # Trainable parameters: small random weights, zero bias
        self.W = 0.01 * np.random.randn(input_size, output_size)
        self.b = np.zeros(output_size)

        # Filled in by forward() / backward()
        self.cache = None
        self.dW = None
        self.db = None

    def forward(self, x):
        """
        Forward pass of fully connected layer

        Parameters:
            x: Input data with shape (batch_size, input_size)

        Returns:
            out: Output of the fully connected layer
        """
        affine = np.dot(x, self.W) + self.b
        # Keep the input; backward() needs it for the weight gradient
        self.cache = x
        return affine

    def backward(self, dout, learning_rate=0.01):
        """
        Backward pass of fully connected layer

        Updates W and b in place and keeps the raw gradients in dW and db.

        Parameters:
            dout: Gradient of loss with respect to output
            learning_rate: Learning rate for weight updates

        Returns:
            dx: Gradient of loss with respect to input x
        """
        layer_input = self.cache

        # All gradients are taken with the pre-update weights
        grad_input = np.dot(dout, self.W.T)
        grad_weights = np.dot(layer_input.T, dout)
        grad_bias = np.sum(dout, axis=0)

        # Gradient-descent step
        self.W -= learning_rate * grad_weights
        self.b -= learning_rate * grad_bias

        # Expose the gradients for inspection
        self.dW, self.db = grad_weights, grad_bias

        return grad_input

class ScratchSimpleRNNClassifier:
    """
    Sequence classifier: a SimpleRNN followed by a FullyConnected layer,
    trained with softmax cross-entropy and mini-batch gradient descent.
    """

    def __init__(self, n_features, n_hidden, n_classes, learning_rate=0.01):
        """
        Initialize ScratchSimpleRNNClassifier

        Parameters:
            n_features: Number of input features
            n_hidden: Number of hidden nodes in RNN
            n_classes: Number of output classes
            learning_rate: Learning rate for gradient descent
        """
        self.rnn = SimpleRNN(n_features, n_hidden)
        self.fc = FullyConnected(n_hidden, n_classes)
        self.learning_rate = learning_rate

    def forward(self, x):
        """
        Forward pass through the network

        Parameters:
            x: Input data with shape (batch_size, n_sequences, n_features)

        Returns:
            scores: Raw output scores before softmax
        """
        # The final RNN hidden state feeds the fully connected layer
        h = self.rnn.forward(x)
        scores = self.fc.forward(h)

        return scores

    def predict(self, x):
        """
        Predict class labels for input data

        Parameters:
            x: Input data with shape (batch_size, n_sequences, n_features)

        Returns:
            y_pred: Predicted class labels
        """
        scores = self.forward(x)
        y_pred = np.argmax(scores, axis=1)
        return y_pred

    def softmax(self, x):
        """
        Compute softmax values for each set of scores

        Parameters:
            x: Input scores

        Returns:
            softmax_output: Softmax probabilities
        """
        # Subtracting the row maximum keeps exp() from overflowing
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    @staticmethod
    def _softmax_cross_entropy(scores, y):
        """
        Mean softmax cross-entropy and its gradient, computed in log space

        Working with log-probabilities (log-sum-exp) keeps the loss finite
        even when a true-class probability underflows to zero, where
        np.log(softmax(scores)) would return -inf.

        Parameters:
            scores: Raw scores with shape (batch_size, n_classes)
            y: Integer class labels with shape (batch_size,)

        Returns:
            loss: Cross-entropy loss averaged over the batch
            dscores: Gradient of loss with respect to scores
        """
        batch_size = scores.shape[0]
        rows = np.arange(batch_size)

        shifted = scores - np.max(scores, axis=1, keepdims=True)
        log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

        loss = -np.sum(log_probs[rows, y]) / batch_size

        # d(loss)/d(scores) = (softmax - one_hot(y)) / batch_size
        dscores = np.exp(log_probs)
        dscores[rows, y] -= 1
        dscores /= batch_size

        return loss, dscores

    def loss(self, x, y):
        """
        Compute cross-entropy loss and gradients

        Runs a forward pass, so the layer caches are refreshed as a side effect.

        Parameters:
            x: Input data with shape (batch_size, n_sequences, n_features)
            y: True class labels

        Returns:
            loss: Cross-entropy loss
            dscores: Gradient of loss with respect to scores
        """
        scores = self.forward(x)
        return self._softmax_cross_entropy(scores, y)

    def backward(self, x, y):
        """
        Backward pass through the network

        Performs a forward pass, then backpropagates and lets each layer
        apply its own parameter update with the model's learning rate.

        Parameters:
            x: Input data with shape (batch_size, n_sequences, n_features)
            y: True class labels

        Returns:
            loss: Cross-entropy loss
        """
        # Compute loss and gradients
        loss, dscores = self.loss(x, y)

        # Backpropagate through fully connected layer
        dh = self.fc.backward(dscores, self.learning_rate)

        # Backpropagate through RNN; the input gradients are not needed here
        self.rnn.backward(dh, self.learning_rate)

        return loss

    def fit(self, X, y, epochs=100, batch_size=32, verbose=True):
        """
        Train the model

        Parameters:
            X: Input data with shape (n_samples, n_sequences, n_features)
            y: Target labels
            epochs: Number of training epochs
            batch_size: Size of mini-batches
            verbose: Whether to print training progress

        Returns:
            losses: List with one entry per epoch, the loss of that
                epoch's last mini-batch

        Raises:
            ValueError: If X contains no samples
        """
        # Accept plain sequences as well as arrays (fancy indexing below)
        X = np.asarray(X)
        y = np.asarray(y)

        n_samples = X.shape[0]
        if n_samples == 0:
            # Without this check no batch runs and `loss` is never assigned
            raise ValueError("cannot fit on an empty training set")

        losses = []

        for epoch in range(epochs):
            # Shuffle data
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            # Mini-batch training
            for i in range(0, n_samples, batch_size):
                X_batch = X_shuffled[i:i+batch_size]
                y_batch = y_shuffled[i:i+batch_size]

                # Backward pass (updates weights)
                loss = self.backward(X_batch, y_batch)

            losses.append(loss)

            if verbose and (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}")

        return losses

# Sanity checks on a tiny hand-computable example, then a short training demo
if __name__ == "__main__":
    # Fixed toy inputs and weights, exactly as specified
    x = np.array([[[1, 2], [2, 3], [3, 4]]])/100  # (batch_size, n_sequences, n_features)
    w_x = np.array([[1, 3, 5, 7], [3, 5, 7, 8]])/100  # (n_features, n_nodes)
    w_h = np.array([[1, 3, 5, 7], [2, 4, 6, 8], [3, 5, 7, 8], [4, 6, 8, 10]])/100  # (n_nodes, n_nodes)
    batch_size, n_sequences, n_features = x.shape  # 1, 3, 2
    n_nodes = w_x.shape[1]  # 4
    h = np.zeros((batch_size, n_nodes))  # (batch_size, n_nodes)
    b = np.array([1, 1, 1, 1])  # (n_nodes,)

    # Reference values for the final hidden state
    expected = np.array([[0.79494228, 0.81839002, 0.83939649, 0.85584174]])

    print("Experiment with small arrays of forward propagation:")

    # Apply the recurrence by hand, one time step after another,
    # starting from the all-zero hidden state
    h_final = h
    for step in range(n_sequences):
        x_step = x[:, step, :]  # (1, 2)
        a_step = np.dot(x_step, w_x) + np.dot(h_final, w_h) + b
        h_final = np.tanh(a_step)

    print("Expected output from document:")
    print("h = np.array([[0.79494228, 0.81839002, 0.83939649, 0.85584174]])")
    print("\nActual calculated output:")
    print(f"h = np.array({h_final.tolist()})")

    # Compare the manual result with the reference
    manual_ok = np.allclose(h_final, expected, rtol=1e-5, atol=1e-5)
    if not manual_ok:
        print("\nThe output doesn't match the expected values.")
        print("Difference:", h_final - expected)
    else:
        print("\nThe output matches the expected values!")

    # Same computation through the SimpleRNN class with the weights set by hand
    print("\nUsing SimpleRNN class:")
    rnn = SimpleRNN(n_features, n_nodes)
    rnn.W_x, rnn.W_h = w_x.copy(), w_h.copy()
    rnn.b = b.copy()  # Bias is used as given, not divided by 100

    output = rnn.forward(x)
    print(f"SimpleRNN output: {output}")
    class_ok = np.allclose(output, expected, rtol=1e-5, atol=1e-5)
    if not class_ok:
        print("The SimpleRNN output doesn't match the expected values.")
        print("Difference:", output - expected)
    else:
        print("The SimpleRNN output matches the expected values!")

    # Short end-to-end run of ScratchSimpleRNNClassifier on random data
    print("\nExample of ScratchSimpleRNNClassifier:")
    X_sample = np.random.randn(10, 5, 2)  # 10 samples, 5 time steps, 2 features
    y_sample = np.random.randint(0, 3, 10)  # 10 samples, 3 classes

    # Build and train the model
    model = ScratchSimpleRNNClassifier(n_features=2, n_hidden=4, n_classes=3, learning_rate=0.01)
    losses = model.fit(X_sample, y_sample, epochs=50, verbose=False)

    # Predict on the training samples
    predictions = model.predict(X_sample)
    print("Sample predictions:", predictions)

Experiment with small arrays of forward propagation:
Expected output from document:
h = np.array([[0.79494228, 0.81839002, 0.83939649, 0.85584174]])

Actual calculated output:
h = np.array([[0.7949422790422093, 0.8183900239382846, 0.8393964886275626, 0.8558417381114936]])

The output matches the expected values!

Using SimpleRNN class:
SimpleRNN output: [[0.79494228 0.81839002 0.83939649 0.85584174]]
The SimpleRNN output matches the expected values!

Example of ScratchSimpleRNNClassifier:
Sample predictions: [2 2 2 2 2 2 2 2 2 2]
