# 🧮 Neural Network from Scratch

Building a complete neural network with forward pass, backpropagation, and training — using only NumPy.

**What you'll learn:**
- How neurons compute outputs
- Forward propagation step by step
- Loss functions and why they matter
- Backpropagation (gradient computation)
- Gradient descent optimization

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Fix the global NumPy seed so the generated data and the weight
# initialisation further down are reproducible from run to run.
np.random.seed(42)
print('✅ Setup complete')

## Step 1: Activation Functions

Activation functions introduce non-linearity. Without them, stacking layers would be equivalent to a single linear transformation.

In [None]:
def sigmoid(z):
    """Logistic sigmoid: σ(z) = 1 / (1 + e^(-z)).

    The input is clipped to [-500, 500] before exponentiation so that
    ``np.exp`` cannot overflow for extreme values.
    """
    bounded = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-bounded))

def sigmoid_derivative(z):
    """Derivative of the sigmoid: σ'(z) = σ(z) × (1 - σ(z))."""
    activation = sigmoid(z)
    complement = 1 - activation
    return activation * complement

def relu(z):
    """Rectified linear unit: ReLU(z) = max(0, z), applied element-wise."""
    rectified = np.maximum(0, z)
    return rectified

def relu_derivative(z):
    """Derivative of ReLU: 1 where z > 0, else 0 (the value at z == 0 is 0).

    Args:
        z: Pre-activation values; an array, a nested sequence, or a scalar.

    Returns:
        Float array with the same shape as ``z`` (a NumPy float for scalars).
    """
    # np.asarray lets plain Python scalars and lists through, consistent with
    # sigmoid() and relu(); a bare `z > 0` on a float yields a bool that has
    # no .astype method.
    return (np.asarray(z) > 0).astype(float)

# Visualize each activation next to its derivative over a shared input range
z = np.linspace(-5, 5, 200)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# (axis, function, derivative, function label, derivative label, panel title)
panels = [
    (axes[0], sigmoid, sigmoid_derivative, 'sigmoid', "sigmoid'", 'Sigmoid & Derivative'),
    (axes[1], relu, relu_derivative, 'ReLU', "ReLU'", 'ReLU & Derivative'),
]

for panel_ax, act_fn, act_grad, act_label, grad_label, panel_title in panels:
    panel_ax.plot(z, act_fn(z), 'b-', linewidth=2, label=act_label)
    panel_ax.plot(z, act_grad(z), 'r--', linewidth=2, label=grad_label)
    panel_ax.set_title(panel_title)
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Step 2: Generate Data

We'll create a simple classification problem — two concentric circles.

In [None]:
def make_circles(n=300, noise=0.1):
    """Generate two concentric circles for binary classification.

    Args:
        n: Total number of points. The inner circle receives ``n // 2`` points
            and the outer circle the remaining ``n - n // 2``, so odd values
            are supported.
        noise: Standard deviation of the Gaussian noise added to each radius.

    Returns:
        Tuple ``(X, y)``. ``X`` has shape ``(n, 2)`` and holds the point
        coordinates; ``y`` has shape ``(n, 1)`` and holds the labels
        (0 = inner circle, radius ~1; 1 = outer circle, radius ~3).
        Both are shuffled with the same random permutation.
    """
    n_inner = n // 2
    n_outer = n - n_inner  # takes the extra point when n is odd

    # The draw order (angles, inner radii, outer radii, permutation) is kept
    # fixed so that results for a given seed stay reproducible.
    t = np.random.uniform(0, 2 * np.pi, n)

    # Inner circle (class 0)
    r_inner = 1 + np.random.randn(n_inner) * noise
    t_inner = t[:n_inner]
    X_inner = np.column_stack([r_inner * np.cos(t_inner), r_inner * np.sin(t_inner)])

    # Outer circle (class 1)
    r_outer = 3 + np.random.randn(n_outer) * noise
    t_outer = t[n_inner:]
    X_outer = np.column_stack([r_outer * np.cos(t_outer), r_outer * np.sin(t_outer)])

    X = np.vstack([X_inner, X_outer])
    y = np.array([0] * n_inner + [1] * n_outer).reshape(-1, 1)

    # Shuffle points and labels together
    idx = np.random.permutation(n)
    return X[idx], y[idx]

X, y = make_circles(400, noise=0.15)

# Flatten the (n, 1) label column once so it can be used as a boolean row mask.
labels = y.ravel()

plt.figure(figsize=(6, 6))
plt.scatter(X[labels == 0, 0], X[labels == 0, 1], c='blue', alpha=0.5, label='Class 0')
plt.scatter(X[labels == 1, 0], X[labels == 1, 1], c='red', alpha=0.5, label='Class 1')
plt.title('Training Data — Two Circles')
plt.legend()
plt.axis('equal')  # equal scaling so the circles are not drawn as ellipses
plt.grid(True, alpha=0.3)
plt.show()
print(f'X shape: {X.shape}, y shape: {y.shape}')

## Step 3: Build the Neural Network

Architecture: `Input(2) → Hidden(16, ReLU) → Hidden(8, ReLU) → Output(1, Sigmoid)`

In [None]:
class NeuralNetwork:
    """Fully connected network for binary classification.

    Hidden layers use ReLU, the output layer uses sigmoid, and training is
    full-batch gradient descent on binary cross-entropy. The class relies on
    the module-level ``relu``, ``relu_derivative`` and ``sigmoid`` helpers.
    """

    def __init__(self, layer_sizes):
        """Initialize weights with He initialization and biases with zeros.

        Args:
            layer_sizes: Sequence of layer widths, input first and output
                last, e.g. ``[2, 16, 8, 1]``.
        """
        self.weights = []
        self.biases = []

        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            # He initialization: scale by sqrt(2/fan_in)
            w = np.random.randn(fan_in, fan_out) * np.sqrt(2 / fan_in)
            b = np.zeros((1, fan_out))
            self.weights.append(w)
            self.biases.append(b)

        self.n_layers = len(self.weights)
        print(f'Network: {", ".join(str(s) for s in layer_sizes)}')
        print(f'Total parameters: {sum(w.size + b.size for w, b in zip(self.weights, self.biases)):,}')

    @staticmethod
    def _match_shape(y_true, y_pred):
        """Return ``y_true`` as an array shaped like ``y_pred``.

        Without this, 1-D labels of shape (m,) would broadcast against (m, 1)
        predictions into an (m, m) matrix and silently corrupt the loss and
        the gradients. Raises ValueError if the element counts differ.
        """
        return np.asarray(y_true).reshape(np.shape(y_pred))

    def forward(self, X):
        """Forward pass — store activations for backprop.

        Args:
            X: Input batch with one sample per row.

        Returns:
            The output-layer activations (sigmoid probabilities).
        """
        self.activations = [X]
        self.z_values = []

        for i in range(self.n_layers):
            z = self.activations[-1] @ self.weights[i] + self.biases[i]
            self.z_values.append(z)

            # ReLU for hidden layers, sigmoid for output
            if i < self.n_layers - 1:
                a = relu(z)
            else:
                a = sigmoid(z)
            self.activations.append(a)

        return self.activations[-1]

    def compute_loss(self, y_pred, y_true):
        """Binary cross-entropy loss, averaged over all elements.

        Args:
            y_pred: Predicted probabilities.
            y_true: Target labels; reshaped to the shape of ``y_pred``.

        Returns:
            The mean loss as a scalar.
        """
        y_true = self._match_shape(y_true, y_pred)
        epsilon = 1e-8  # keeps log() finite when a prediction is exactly 0 or 1
        loss = -np.mean(y_true * np.log(y_pred + epsilon) + (1 - y_true) * np.log(1 - y_pred + epsilon))
        return loss

    def backward(self, y_true, learning_rate=0.01):
        """Backpropagation — compute gradients and update weights in place.

        Must be called after ``forward``, whose cached activations and
        pre-activations it reads.

        Args:
            y_true: Target labels; reshaped to the shape of the last output.
            learning_rate: Gradient descent step size.
        """
        y_true = self._match_shape(y_true, self.activations[-1])
        m = y_true.shape[0]

        # Output layer gradient
        delta = self.activations[-1] - y_true  # dL/dz for BCE + sigmoid

        for i in range(self.n_layers - 1, -1, -1):
            # Compute gradients
            dW = (self.activations[i].T @ delta) / m
            db = np.mean(delta, axis=0, keepdims=True)

            # Propagate to previous layer; this must use weights[i] before
            # they are updated below.
            if i > 0:
                delta = (delta @ self.weights[i].T) * relu_derivative(self.z_values[i-1])

            # Update weights
            self.weights[i] -= learning_rate * dW
            self.biases[i] -= learning_rate * db

    def train(self, X, y, epochs=1000, lr=0.1, print_every=100):
        """Training loop using full-batch gradient descent.

        Args:
            X: Training inputs, one sample per row.
            y: Training labels; 1-D or column-shaped.
            epochs: Number of passes over the full batch.
            lr: Learning rate passed to ``backward``.
            print_every: Log loss and accuracy every this many epochs;
                ``0`` or ``None`` disables logging.

        Returns:
            List with the loss measured at each epoch, before that epoch's update.
        """
        losses = []

        for epoch in range(epochs):
            # Forward
            y_pred = self.forward(X)
            y_true = self._match_shape(y, y_pred)
            loss = self.compute_loss(y_pred, y_true)
            losses.append(loss)

            # Backward
            self.backward(y_true, learning_rate=lr)

            if print_every and (epoch + 1) % print_every == 0:
                acc = np.mean((y_pred > 0.5).astype(float) == y_true)
                print(f'Epoch {epoch+1:4d} | Loss: {loss:.4f} | Accuracy: {acc:.2%}')

        return losses

# Build the 2-16-8-1 network, then fit it to the circles data with
# full-batch gradient descent, logging progress every 200 epochs.
architecture = [2, 16, 8, 1]
nn = NeuralNetwork(architecture)
losses = nn.train(X, y, lr=0.1, epochs=2000, print_every=200)

## Step 4: Visualize Results

In [None]:
# One figure, two panels: training loss on the left, decision surface on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
loss_ax, boundary_ax = axes

# Loss curve
loss_ax.plot(losses, 'b-', alpha=0.7)
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training Loss')
loss_ax.grid(True, alpha=0.3)

# Decision boundary: evaluate the network on a dense grid that extends
# one unit beyond the data in every direction
h = 0.05
x_min, y_min = X.min(axis=0) - 1
x_max, y_max = X.max(axis=0) + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
grid = np.column_stack([xx.ravel(), yy.ravel()])

Z = nn.forward(grid).reshape(xx.shape)

boundary_ax.contourf(xx, yy, Z, levels=50, cmap='RdBu_r', alpha=0.8)
for class_label, class_color in ((0, 'blue'), (1, 'red')):
    members = y.ravel() == class_label
    boundary_ax.scatter(X[members, 0], X[members, 1], c=class_color, s=20, edgecolors='white', linewidth=0.5)
boundary_ax.set_title('Decision Boundary')
boundary_ax.axis('equal')

plt.tight_layout()
plt.show()

# Final accuracy on the training set, thresholding the sigmoid output at 0.5
y_pred = nn.forward(X)
final_acc = np.mean((y_pred > 0.5).astype(float) == y)
print(f'\n🎯 Final Accuracy: {final_acc:.2%}')

## 💡 Key Takeaways

| Concept | What We Used | Why |
|---------|-------------|-----|
| **He Initialization** | `√(2/fan_in)` scaling | Prevents vanishing/exploding gradients |
| **ReLU** | Hidden layers | Non-linear, mitigates vanishing gradients, cheap to compute |
| **Sigmoid** | Output layer | Squashes to [0,1] for binary classification |
| **BCE Loss** | `-y·log(ŷ) - (1-y)·log(1-ŷ)` | Standard for binary classification |
| **Backprop** | Chain rule through layers | Efficient gradient computation |
| **Learning Rate** | 0.1 | Controls step size (too high = diverge, too low = slow) |

### Interview Questions This Covers
- "Implement a neural network from scratch"
- "Explain backpropagation"
- "Why do we use ReLU over sigmoid in hidden layers?"
- "What is the vanishing gradient problem?"
- "Explain He vs Xavier initialization"