# Neural Network Fundamentals

This notebook covers the mathematical foundations and practical implementation of neural networks.
We'll build neural networks from scratch to understand how they work internally.

## The Perceptron

The perceptron is the simplest neural network unit, consisting of:
- Inputs with weights
- A bias term
- An activation function

Mathematically: $y = f(w_1x_1 + w_2x_2 + \dots + w_nx_n + b)$, where $x_i$ are the inputs, $w_i$ the weights, $b$ the bias, and $f$ the activation function.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_moons
from sklearn.model_selection import train_test_split

class Perceptron:
    """Single-unit binary classifier trained with the perceptron learning rule.

    The unit computes ``step(X @ weights + bias)`` where ``step`` returns 1 for
    non-negative input and 0 otherwise. Targets are expected to be 0/1 labels.

    Attributes:
        learning_rate: Scale applied to every weight/bias correction.
        n_iterations: Maximum number of passes over the training set.
        weights: 1-D array of per-feature weights (``None`` until ``fit``).
        bias: Scalar bias term (``None`` until ``fit``).
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Learn weights and bias from the samples ``X`` and labels ``y``.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.
            y: Array-like of ``n_samples`` labels (0 or 1).

        Samples are visited one at a time, in order; a misclassified sample
        moves the weights by ``learning_rate * (y - prediction) * x``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape

        # Start from the zero vector, as in the classic perceptron algorithm.
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iterations):
            changed = False
            for i in range(n_samples):
                # Forward pass for a single sample.
                linear_output = np.dot(X[i], self.weights) + self.bias
                y_predicted = self.activation_function(linear_output)

                # A zero update means the sample is already classified correctly.
                update = self.learning_rate * (y[i] - y_predicted)
                if update != 0:
                    self.weights += update * X[i]
                    self.bias += update
                    changed = True

            # An epoch without any correction leaves the parameters untouched,
            # so every later epoch would be identical: stop early. For finite
            # inputs the learned parameters are the same as running all epochs.
            if not changed:
                break

    def activation_function(self, x):
        """Unit step: return 1 if the scalar ``x`` is >= 0, else 0."""
        return 1 if x >= 0 else 0

    def predict(self, X):
        """Return an integer array of 0/1 predictions for ``X``.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``, or a single
                sample of shape ``(n_features,)``.

        Returns:
            1-D array with one prediction per sample (length 1 for a single
            1-D sample).
        """
        # atleast_1d lets a single 1-D sample (scalar dot product) through.
        linear_output = np.atleast_1d(np.dot(X, self.weights) + self.bias)
        # Vectorized equivalent of applying the unit step to every element.
        return np.where(linear_output >= 0, 1, 0)

# Printed once the class definition above has executed, as a visible cell result.
print("Perceptron class defined.")

In [None]:
# Create a simple linear classification problem
X, y = make_classification(n_samples=100, n_features=2, n_redundant=0, 
                           n_informative=2, n_clusters_per_class=1, random_state=42)
# make_classification already returns integer 0/1 labels for two classes,
# so no label conversion is needed before training.

# Train perceptron
perceptron = Perceptron(learning_rate=0.01, n_iterations=1000)
perceptron.fit(X, y)

# Visualize decision boundary
def plot_decision_boundary(X, y, model, title="Decision Boundary"):
    """Scatter a two-feature, two-class dataset and shade the model's predictions.

    Args:
        X: Array of shape ``(n_samples, 2)``; column 0 is drawn on the x-axis
            and column 1 on the y-axis.
        y: Array of labels; samples with label 0 and label 1 are drawn as
            separate series.
        model: Object with a ``predict`` method. The shaded regions are drawn
            only when the object also has a ``weights`` attribute.
        title: Figure title.
    """
    plt.figure(figsize=(10, 6))

    # One scatter series per class so each gets its own legend entry.
    class_zero = X[y == 0]
    class_one = X[y == 1]
    plt.scatter(class_zero[:, 0], class_zero[:, 1], label='Class 0', alpha=0.7)
    plt.scatter(class_one[:, 0], class_one[:, 1], label='Class 1', alpha=0.7)

    if hasattr(model, 'weights'):
        # Evaluate the model on a 0.1-spaced grid extending 1 unit beyond the data.
        first, second = X[:, 0], X[:, 1]
        grid_x, grid_y = np.meshgrid(
            np.arange(first.min() - 1, first.max() + 1, 0.1),
            np.arange(second.min() - 1, second.max() + 1, 0.1),
        )
        grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
        region = model.predict(grid_points).reshape(grid_x.shape)

        plt.contourf(grid_x, grid_y, region, alpha=0.3,
                     levels=[-1, 0, 1, 2], colors=['blue', 'red'])

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Draw the training data together with the regions predicted by the trained perceptron.
plot_decision_boundary(X, y, perceptron, "Perceptron Decision Boundary")

## Multi-Layer Perceptron (MLP)

A multi-layer perceptron extends the perceptron with hidden layers, enabling it to learn non-linear decision boundaries.

In [None]:
class MLP:
    """Fully connected feed-forward network trained with full-batch gradient descent.

    Hidden layers use a sigmoid activation. The output layer is linear and is
    trained on a squared-error objective; ``predict`` turns the linear output
    into 0/1 labels by thresholding at 0.5.

    Attributes:
        layer_sizes: Number of units per layer, input layer first.
        learning_rate: Step size used for every parameter update.
        n_iterations: Number of full-batch epochs run by ``fit``.
        weights: List of weight matrices; ``weights[i]`` has shape
            ``(layer_sizes[i], layer_sizes[i + 1])``.
        biases: List of bias vectors; ``biases[i]`` has shape
            ``(layer_sizes[i + 1],)``.
    """

    def __init__(self, layer_sizes, learning_rate=0.01, n_iterations=1000,
                 random_state=None):
        """Create the network and draw small random initial weights.

        Args:
            layer_sizes: Sequence of layer widths, e.g. ``[2, 10, 5, 1]``.
            learning_rate: Gradient-descent step size.
            n_iterations: Number of training epochs.
            random_state: Optional seed for weight initialization. ``None``
                (the default) draws from NumPy's global random state.
        """
        self.layer_sizes = layer_sizes
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = []
        self.biases = []

        # Use a private generator only when a seed is given so that the
        # default keeps drawing from the global NumPy state.
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        # Weights: standard normal scaled by 0.1; biases start at zero.
        for i in range(len(layer_sizes) - 1):
            self.weights.append(rng.randn(layer_sizes[i], layer_sizes[i + 1]) * 0.1)
            self.biases.append(np.zeros(layer_sizes[i + 1]))

    def sigmoid(self, x):
        """Logistic function; the input is clipped to [-250, 250] to avoid overflow in exp."""
        return 1 / (1 + np.exp(-np.clip(x, -250, 250)))

    def sigmoid_derivative(self, x):
        """Derivative of the logistic function evaluated at pre-activation ``x``."""
        s = self.sigmoid(x)
        return s * (1 - s)

    def forward(self, X):
        """Run a forward pass and cache intermediate values for ``backward``.

        Stores ``self.activations`` (the input followed by each layer's output)
        and ``self.z_values`` (each layer's pre-activation).

        Args:
            X: Array of shape ``(n_samples, layer_sizes[0])``.

        Returns:
            Output-layer values, shape ``(n_samples, layer_sizes[-1])``.
        """
        self.activations = [X]
        self.z_values = []

        current = X
        last = len(self.weights) - 1
        for i, (w, b) in enumerate(zip(self.weights, self.biases)):
            z = np.dot(current, w) + b
            self.z_values.append(z)

            # Hidden layers are squashed; the output layer stays linear.
            current = self.sigmoid(z) if i < last else z
            self.activations.append(current)

        return current

    def backward(self, X, y):
        """Backpropagate the squared-error gradient and take one descent step.

        Must be called after ``forward`` on the same ``X``. All gradients are
        computed from the weights used in that forward pass; the parameter
        update is applied only after every layer's gradient is known.

        Args:
            X: Array of shape ``(n_samples, layer_sizes[0])``.
            y: Array-like of ``n_samples`` targets.
        """
        m = X.shape[0]
        n_layers = len(self.weights)

        # Gradient of 0.5 * (output - y)^2 with respect to the linear output.
        delta = self.activations[-1] - np.asarray(y).reshape(-1, 1)

        weight_grads = [None] * n_layers
        bias_grads = [None] * n_layers

        for i in range(n_layers - 1, -1, -1):
            if i < n_layers - 1:
                # Push the error through layer i+1's (not yet updated) weights
                # and through the sigmoid of layer i.
                delta = np.dot(delta, self.weights[i + 1].T) * self.sigmoid_derivative(self.z_values[i])
            weight_grads[i] = np.dot(self.activations[i].T, delta) / m
            bias_grads[i] = np.sum(delta, axis=0) / m

        # Apply the step only now, so no gradient was computed from weights
        # that had already been modified in this pass.
        for i in range(n_layers):
            self.weights[i] -= self.learning_rate * weight_grads[i]
            self.biases[i] -= self.learning_rate * bias_grads[i]

    def fit(self, X, y):
        """Train for ``n_iterations`` full-batch epochs, printing the MSE every 100 epochs.

        The printed loss is computed from the predictions made before that
        epoch's parameter update.
        """
        y = np.asarray(y)
        for epoch in range(self.n_iterations):
            # Forward pass
            predictions = self.forward(X)

            # Backward pass
            self.backward(X, y)

            # Print progress
            if epoch % 100 == 0:
                loss = np.mean((predictions.flatten() - y) ** 2)
                print(f"Epoch {epoch}, Loss: {loss:.4f}")

    def predict(self, X):
        """Return 0/1 labels: 1 where the flattened linear output exceeds 0.5."""
        predictions = self.forward(X)
        return (predictions.flatten() > 0.5).astype(int)

# Printed once the class definition above has executed, as a visible cell result.
print("MLP class defined.")

In [None]:
# Create a non-linear classification problem
X_nonlinear, y_nonlinear = make_moons(n_samples=200, noise=0.1, random_state=42)

# Seed NumPy's global RNG so the random weight initialization (and therefore
# the printed losses and the plot) is reproducible, like the seeded dataset.
np.random.seed(42)

# Train MLP
mlp = MLP(layer_sizes=[2, 10, 5, 1], learning_rate=0.01, n_iterations=1000)
mlp.fit(X_nonlinear, y_nonlinear)

# Visualize results
plot_decision_boundary(X_nonlinear, y_nonlinear, mlp, "MLP Decision Boundary")

## Activation Functions

Activation functions introduce non-linearity into neural networks. Let's explore common ones:

In [None]:
def plot_activation_functions():
    """Plot six activation functions on a 2x3 grid of axes.

    Each curve is evaluated on 100 evenly spaced points in [-5, 5]. Panels are
    filled row by row: Sigmoid, Tanh, ReLU, then Leaky ReLU, ELU, Swish.
    """
    x = np.linspace(-5, 5, 100)

    # The logistic curve is plotted on its own and also gates Swish.
    logistic = 1 / (1 + np.exp(-x))

    curves = [
        ('Sigmoid', logistic),
        ('Tanh', np.tanh(x)),
        ('ReLU', np.maximum(0, x)),
        ('Leaky ReLU', np.where(x > 0, x, 0.01 * x)),
        ('ELU', np.where(x > 0, x, np.exp(x) - 1)),
        ('Swish', x * logistic),
    ]

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))

    # axes.flat walks the grid in row-major order, matching the list above.
    for panel, (name, values) in zip(axes.flat, curves):
        panel.plot(x, values)
        panel.set_title(name)
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Render the 2x3 grid of activation-function plots.
plot_activation_functions()

## Gradient Descent Variants

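Different optimization algorithms update the parameters in different ways. Below we implement plain SGD, SGD with momentum, and Adam, and compare their paths on a simple quadratic function: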

In [None]:
class Optimizer:
    """Gradient-based parameter updater supporting SGD, momentum and Adam.

    State is kept per ``param_name`` so one instance can drive several
    independent parameters.

    Attributes:
        method: One of ``'sgd'``, ``'momentum'`` or ``'adam'``.
        learning_rate: Step size.
        momentum: Decay factor of the momentum buffer (``'momentum'`` only).
        beta1: Decay rate of the first-moment estimate (``'adam'`` only).
        beta2: Decay rate of the second-moment estimate (``'adam'`` only).
        epsilon: Added to the denominator of the Adam step for stability.
        momentum_buffer: Per-parameter velocity for ``'momentum'``.
        v_buffer: Per-parameter running mean of gradients (Adam first moment).
        m_buffer: Per-parameter running mean of squared gradients (Adam
            second moment).
        t: Total number of ``update`` calls across all parameters.
        step_counts: Number of ``update`` calls per parameter name; used for
            Adam's bias correction.
    """

    # NOTE: v_buffer/m_buffer are named the other way round from the Adam
    # paper (where m is the first moment and v the second). The names are kept
    # so existing attribute access keeps working.

    _METHODS = ('sgd', 'momentum', 'adam')

    def __init__(self, method='sgd', learning_rate=0.01, momentum=0.9, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Configure the optimizer.

        Raises:
            ValueError: If ``method`` is not a supported optimizer name.
        """
        if method not in self._METHODS:
            raise ValueError(
                f"Unknown optimization method {method!r}; expected one of {self._METHODS}"
            )
        self.method = method
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.momentum_buffer = {}
        self.v_buffer = {}
        self.m_buffer = {}
        self.t = 0
        self.step_counts = {}

    def update(self, params, grads, param_name):
        """Return the value of ``params`` after one optimization step.

        Args:
            params: Current parameter value (scalar or array).
            grads: Gradient of the objective with respect to ``params``.
            param_name: Hashable key identifying this parameter's state.

        Returns:
            The updated parameter value; ``params`` itself is not modified.

        Raises:
            ValueError: If ``self.method`` is not a supported optimizer name.
        """
        self.t += 1

        # Bias correction must use the number of steps taken for *this*
        # parameter, not the number of update() calls made on the optimizer.
        step = self.step_counts.get(param_name, 0) + 1
        self.step_counts[param_name] = step

        if self.method == 'sgd':
            return params - self.learning_rate * grads

        if self.method == 'momentum':
            if param_name not in self.momentum_buffer:
                self.momentum_buffer[param_name] = np.zeros_like(params)

            # The velocity accumulates learning-rate-scaled gradients.
            velocity = self.momentum * self.momentum_buffer[param_name] + self.learning_rate * grads
            self.momentum_buffer[param_name] = velocity
            return params - velocity

        if self.method == 'adam':
            if param_name not in self.v_buffer:
                self.v_buffer[param_name] = np.zeros_like(params)
                self.m_buffer[param_name] = np.zeros_like(params)

            # Exponential moving averages of the gradient and its square.
            self.v_buffer[param_name] = self.beta1 * self.v_buffer[param_name] + (1 - self.beta1) * grads
            self.m_buffer[param_name] = self.beta2 * self.m_buffer[param_name] + (1 - self.beta2) * (grads ** 2)

            # Undo the bias toward zero caused by the zero-initialized averages.
            first_moment = self.v_buffer[param_name] / (1 - self.beta1 ** step)
            second_moment = self.m_buffer[param_name] / (1 - self.beta2 ** step)

            return params - self.learning_rate * first_moment / (np.sqrt(second_moment) + self.epsilon)

        # Reached only if self.method was changed after construction.
        raise ValueError(
            f"Unknown optimization method {self.method!r}; expected one of {self._METHODS}"
        )

# Test optimizers on a simple quadratic function
def quadratic_function(x, y):
    """Return the sum of the squares of ``x`` and ``y`` (scalars or arrays)."""
    x_squared = x ** 2
    y_squared = y ** 2
    return x_squared + y_squared

def quadratic_gradient(x, y):
    """Return the tuple ``(2*x, 2*y)``, the gradient of ``x**2 + y**2``."""
    d_dx = 2 * x
    d_dy = 2 * y
    return d_dx, d_dy

def optimize_function(optimizer_name, n_steps=50):
    """Run an optimizer on the quadratic and record every visited point.

    Args:
        optimizer_name: Value passed to ``Optimizer`` as ``method``.
        n_steps: Number of update steps to take.

    Returns:
        Array of shape ``(n_steps + 1, 2)``; row 0 is the start point
        ``(3.0, 2.0)`` and each following row is the point after one step.
    """
    opt = Optimizer(method=optimizer_name, learning_rate=0.1)

    point = (3.0, 2.0)
    path = [point]

    for _ in range(n_steps):
        px, py = point
        gx, gy = quadratic_gradient(px, py)

        # Each coordinate is updated under its own name, x first and then y.
        point = (opt.update(px, gx, 'x'), opt.update(py, gy, 'y'))
        path.append(point)

    return np.array(path)

# Compare optimizers
optimizers = ['sgd', 'momentum', 'adam']
colors = ['red', 'blue', 'green']

plt.figure(figsize=(10, 8))
X, Y = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
Z = quadratic_function(X, Y)

plt.contour(X, Y, Z, levels=20, alpha=0.3)

for opt_name, color in zip(optimizers, colors):
    trajectory = optimize_function(opt_name)
    plt.plot(trajectory[:, 0], trajectory[:, 1], 'o-', color=color, label=opt_name, markersize=4)

plt.xlabel('x')
plt.ylabel('y')
plt.title('Optimization Algorithms Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Key Takeaways

1. **Perceptrons** can only learn linear decision boundaries
2. **Multi-layer networks** can learn complex non-linear patterns
3. **Activation functions** introduce non-linearity and are crucial for deep networks
4. **Backpropagation** efficiently computes gradients for training
5. **Optimization algorithms** significantly affect training speed and convergence

These fundamentals form the foundation for modern deep learning architectures.