In [1]:
import numpy as np

class Conv2d:
    def __init__(self, input_channels, output_channels, filter_size, padding=0, stride=1):
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.filter_size = filter_size  # (F_h, F_w) for filter height and width
        self.padding = padding
        self.stride = stride
        
        # Initialize weights and biases for the convolutional layer
        self.w = np.random.randn(output_channels, input_channels, filter_size[0], filter_size[1]) * 0.1  # (M, K, F_h, F_w)
        self.b = np.zeros(output_channels)  # Bias for each output channel
    
    def pad_input(self, x):
        """
        Apply zero padding to the input.
        """
        return np.pad(x, ((0, 0), (self.padding, self.padding), (self.padding, self.padding), (0, 0)), mode='constant', constant_values=0)
    
    def forward(self, x):
        """
        Perform the forward pass for the convolutional layer.
        x: input data with shape (n_samples, input_channels, height, width)
        """
        x_padded = self.pad_input(x)
        
        n_samples, input_channels, input_height, input_width = x_padded.shape
        filter_height, filter_width = self.filter_size
        
        # Calculate output dimensions
        output_height = (input_height - filter_height) // self.stride + 1
        output_width = (input_width - filter_width) // self.stride + 1
        
        # Initialize the output array
        output = np.zeros((n_samples, self.output_channels, output_height, output_width), dtype=np.float64)
        
        # Perform the convolution operation
        for i in range(self.output_channels):  # Iterate over output channels
            for b in range(n_samples):  # Iterate over batch
                for j in range(output_height):  # Iterate over output rows
                    for k in range(output_width):  # Iterate over output columns
                        # Calculate the slice of the input for the convolution
                        start_i = j * self.stride
                        start_k = k * self.stride
                        end_i = start_i + filter_height
                        end_k = start_k + filter_width
                        
                        # Apply the convolution (sum of element-wise multiplication)
                        output[b, i, j, k] = np.sum(x_padded[b, :, start_i:end_i, start_k:end_k] * self.w[i, :, :, :]) + self.b[i]
        
        return output
    
    def backward(self, x, delta_a, learning_rate=0.01):
        """
        Perform the backward pass for the convolutional layer.
        x: input data with shape (n_samples, input_channels, height, width)
        delta_a: gradient of the loss with respect to the output
        learning_rate: learning rate for parameter updates
        """
        x_padded = self.pad_input(x)
        
        n_samples, input_channels, input_height, input_width = x.shape
        filter_height, filter_width = self.filter_size
        
        # Initialize gradients for weights, biases, and input
        delta_w = np.zeros_like(self.w)
        delta_b = np.zeros_like(self.b)
        delta_x = np.zeros_like(x_padded)
        
        # Perform backpropagation
        for i in range(self.output_channels):
            for b in range(n_samples):
                for j in range(delta_a.shape[2]):
                    for k in range(delta_a.shape[3]):
                        delta_b[i] += delta_a[b, i, j, k]
                        for c in range(input_channels):
                            # Calculate gradient for weights
                            start_i = j * self.stride
                            start_k = k * self.stride
                            end_i = start_i + filter_height
                            end_k = start_k + filter_width
                            
                            delta_w[i, c] += delta_a[b, i, j, k] * x_padded[b, c, start_i:end_i, start_k:end_k]
                            
                            # Calculate gradient for input
                            delta_x[b, c, start_i:end_i, start_k:end_k] += delta_a[b, i, j, k] * self.w[i, c]
        
        # Remove padding from delta_x
        delta_x = delta_x[:, :, self.padding:-self.padding, self.padding:-self.padding] if self.padding > 0 else delta_x
        
        # Update weights and biases using gradient descent
        self.w -= learning_rate * delta_w
        self.b -= learning_rate * delta_b
        
        return delta_x


In [2]:
import numpy as np

class Conv2d:
    def __init__(self, input_channels, output_channels, filter_size, padding=0, stride=1):
        # Initialize convolutional layer parameters
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.filter_size = filter_size  # (F_h, F_w) - height, width of the filter
        self.padding = padding
        self.stride = stride
        
        # Initialize weights (filter), bias for each output channel
        self.w = np.random.randn(output_channels, input_channels, filter_size[0], filter_size[1]) * 0.1  # (M, K, F_h, F_w)
        self.b = np.zeros(output_channels)  # (M,)
    
    def pad_input(self, x):
        """
        Apply padding to the input image to maintain output size.
        """
        return np.pad(x, ((0, 0), (self.padding, self.padding), (self.padding, self.padding), (0, 0)), mode='constant', constant_values=0)
    
    def forward(self, x):
        """
        Perform the forward pass of the convolution operation.
        x: Input data of shape (n_samples, input_channels, height, width)
        """
        x_padded = self.pad_input(x)
        
        n_samples, input_channels, input_height, input_width = x_padded.shape
        filter_height, filter_width = self.filter_size
        
        # Calculate output dimensions (assuming valid padding)
        output_height = (input_height - filter_height) // self.stride + 1
        output_width = (input_width - filter_width) // self.stride + 1
        
        # Initialize output tensor
        output = np.zeros((n_samples, self.output_channels, output_height, output_width), dtype=np.float64)
        
        # Convolution operation
        for i in range(self.output_channels):  # Iterate over output channels
            for b in range(n_samples):  # Iterate over batch
                for j in range(output_height):  # Iterate over output height
                    for k in range(output_width):  # Iterate over output width
                        # Calculate the slice of the input for the convolution
                        start_i = j * self.stride
                        start_k = k * self.stride
                        end_i = start_i + filter_height
                        end_k = start_k + filter_width
                        
                        # Apply the convolution (sum of element-wise multiplication)
                        output[b, i, j, k] = np.sum(x_padded[b, :, start_i:end_i, start_k:end_k] * self.w[i, :, :, :]) + self.b[i]
        
        return output
    
    def backward(self, x, delta_a, learning_rate=0.01):
        """
        Perform the backward pass of the convolution operation.
        x: Input data of shape (n_samples, input_channels, height, width)
        delta_a: Gradient of the loss with respect to the output (a)
        learning_rate: Learning rate for updating weights
        """
        x_padded = self.pad_input(x)
        
        n_samples, input_channels, input_height, input_width = x.shape
        filter_height, filter_width = self.filter_size
        
        # Initialize gradients for weights, biases, and input
        delta_w = np.zeros_like(self.w)
        delta_b = np.zeros_like(self.b)
        delta_x = np.zeros_like(x_padded)
        
        # Backpropagation step
        for i in range(self.output_channels):  # Iterate over output channels
            for b in range(n_samples):  # Iterate over batch
                for j in range(delta_a.shape[2]):  # Iterate over output height
                    for k in range(delta_a.shape[3]):  # Iterate over output width
                        delta_b[i] += delta_a[b, i, j, k]
                        
                        for c in range(input_channels):  # Iterate over input channels
                            start_i = j * self.stride
                            start_k = k * self.stride
                            end_i = start_i + filter_height
                            end_k = start_k + filter_width
                            
                            # Compute the gradient for weights
                            delta_w[i, c] += delta_a[b, i, j, k] * x_padded[b, c, start_i:end_i, start_k:end_k]
                            
                            # Compute the gradient for the input
                            delta_x[b, c, start_i:end_i, start_k:end_k] += delta_a[b, i, j, k] * self.w[i, c]
        
        # Remove padding from delta_x
        if self.padding > 0:
            delta_x = delta_x[:, :, self.padding:-self.padding, self.padding:-self.padding]
        
        # Update weights and biases using gradient descent
        self.w -= learning_rate * delta_w
        self.b -= learning_rate * delta_b
        
        return delta_x


In [6]:
def backward(self, x, delta_a, learning_rate=0.01):
    n_samples, n_channels, height, width = x.shape
    F_h, F_w = self.filter_size
    out_h, out_w = delta_a.shape[2], delta_a.shape[3]
    
    # Gradients for weights and biases
    delta_w = np.zeros_like(self.w)
    delta_b = np.zeros_like(self.b)
    
    # Gradient for input x
    delta_x = np.zeros_like(x)
    
    # Perform backpropagation
    for b in range(n_samples):
        for m in range(self.output_channels):
            # Compute gradient for bias
            delta_b[m] += np.sum(delta_a[b, m])
            
            for i in range(out_h):
                for j in range(out_w):
                    for k in range(n_channels):
                        # Compute the weight gradients (sum over the patch of the input image)
                        delta_w[m, k] += delta_a[b, m, i, j] * x[b, k, i:i+F_h, j:j+F_w]
                        # Compute the gradient for the input x (error propagation)
                        delta_x[b, k, i:i+F_h, j:j+F_w] += delta_a[b, m, i, j] * self.w[m, k]
    
    # Update the weights and biases
    self.w -= learning_rate * delta_w
    self.b -= learning_rate * delta_b
    
    return delta_x


In [11]:
import numpy as np

# Define the Conv2d class for the 2D convolution operation
class Conv2d:
    def __init__(self, input_channels, output_channels, filter_size):
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.filter_size = filter_size
        
        # Initialize weights and biases
        self.w = np.random.randn(output_channels, input_channels, filter_size[0], filter_size[1])
        self.b = np.zeros(output_channels)
    
    def forward(self, x):
        # x shape: (n_samples, n_channels, height, width)
        n_samples, n_channels, height, width = x.shape
        F_h, F_w = self.filter_size
        
        # Calculate output dimensions
        out_h = height - F_h + 1
        out_w = width - F_w + 1
        
        # Initialize the output
        output = np.zeros((n_samples, self.output_channels, out_h, out_w))
        
        # Perform convolution operation
        for b in range(n_samples):  # Iterate over samples
            for m in range(self.output_channels):  # Iterate over output channels
                for i in range(out_h):  # Iterate over height of output
                    for j in range(out_w):  # Iterate over width of output
                        # Apply the filter on the input image (convolution)
                        for k in range(n_channels):  # Iterate over input channels
                            output[b, m, i, j] += np.sum(
                                x[b, k, i:i+F_h, j:j+F_w] * self.w[m, k]
                            )
                        # Add bias for the output channel
                        output[b, m, i, j] += self.b[m]
        
        return output
    
    def backward(self, x, delta_a, learning_rate=0.01):
        n_samples, n_channels, height, width = x.shape
        F_h, F_w = self.filter_size
        out_h, out_w = delta_a.shape[2], delta_a.shape[3]
        
        # Gradients for weights and biases
        delta_w = np.zeros_like(self.w)
        delta_b = np.zeros_like(self.b)
        
        # Gradient for input x
        delta_x = np.zeros_like(x)
        
        # Perform backpropagation
        for b in range(n_samples):  # Iterate over samples
            for m in range(self.output_channels):  # Iterate over output channels
                # Compute gradient for bias
                delta_b[m] += np.sum(delta_a[b, m])
                
                for i in range(out_h):  # Iterate over height of output
                    for j in range(out_w):  # Iterate over width of output
                        for k in range(n_channels):  # Iterate over input channels
                            # Ensure the correct slice from the input is selected
                            x_patch = x[b, k, i:i+F_h, j:j+F_w]
                            
                            # Compute the weight gradients (sum over the patch of the input image)
                            delta_w[m, k] += delta_a[b, m, i, j] * x_patch
                            
                            # Compute the gradient for the input x (error propagation)
                            delta_x[b, k, i:i+F_h, j:j+F_w] += delta_a[b, m, i, j] * self.w[m, k]
        
        # Update the weights and biases
        self.w -= learning_rate * delta_w
        self.b -= learning_rate * delta_b
        
        return delta_x


# Test Case: Small Array Example

# Input x (1 sample, 1 channel, 4x4 image)
x = np.array([[[[ 1,  2,  3,  4],
                [ 5,  6,  7,  8],
                [ 9, 10, 11, 12],
                [13, 14, 15, 16]]]])

# Weights (2 filters, 1 input channel, 3x3 filter size)
w = np.array([[[ 0.,  0.,  0.],
               [ 0.,  1.,  0.],
               [ 0., -1.,  0.]],

              [[ 0.,  0.,  0.],
               [ 0., -1.,  1.],
               [ 0.,  0.,  0.]]])

# Biases (2 filters)
b = np.array([0., 0.])

# Create Conv2d layer with 1 input channel, 2 output channels, and 3x3 filters
conv = Conv2d(input_channels=1, output_channels=2, filter_size=(3, 3))
conv.w = w  # Set the weights directly for this example
conv.b = b  # Set the biases

# Forward propagation
print("Forward pass output:")
print(output)

# Backpropagation (delta_a from the next layer, assumed)
delta_a = np.array([[[[-4, -4], [10, 11]], [[ 1, -7], [ 1, -11]]]])


Forward pass output:
[[[[0. 0.]
   [0. 0.]]

  [[0. 0.]
   [0. 0.]]]]


In [14]:
# Perform backward pass
print("\nBackward pass (delta_x):")
# (?,1,2,2,)
delta = np.array([[[ -4,  -4],
                   [ 10,  11]],

                  [[  1,  -7],
                   [  1, -11]]])


Backward pass (delta_x):


In [15]:
# Define the function to calculate the output size of a convolution operation
def conv_output_size(N_in, P, F, S):
    """
    Calculate the output size (height and width) of a convolution operation.
    
    Parameters:
    - N_in: Input size (height/width)
    - P: Padding size
    - F: Filter size
    - S: Stride size
    
    Returns:
    - N_out: Output size (height/width)
    """
    N_out = (N_in + 2 * P - F) // S + 1
    return N_out

# Test the function with example inputs
N_in = 4  # Example input size (height or width)
P = 1      # Padding
F = 3      # Filter size
S = 1      # Stride

# Calculate output size for both height and width
output_height = conv_output_size(N_in, P, F, S)
output_width = conv_output_size(N_in, P, F, S)

output_height, output_width


(4, 4)

In [16]:
import numpy as np

class MaxPool2D:
    def __init__(self, pool_size=(2, 2), stride=(2, 2)):
        self.pool_size = pool_size  # Pooling window size (height, width)
        self.stride = stride        # Stride size (height, width)

    def forward(self, x):
        """
        Perform forward pass through max pooling layer.

        x: Input array (batch_size, channels, height, width)

        Returns:
        Output array after applying max pooling (batch_size, channels, pooled_height, pooled_width)
        """
        batch_size, channels, height, width = x.shape
        pool_height, pool_width = self.pool_size
        stride_height, stride_width = self.stride
        
        # Calculate output dimensions
        pooled_height = (height - pool_height) // stride_height + 1
        pooled_width = (width - pool_width) // stride_width + 1
        
        # Create an output array to store pooled results
        output = np.zeros((batch_size, channels, pooled_height, pooled_width))
        
        # For storing the indices of the max values for backpropagation
        self.max_indices = np.zeros((batch_size, channels, pooled_height, pooled_width, 2), dtype=int)

        # Perform max pooling
        for b in range(batch_size):  # Iterate over batches
            for c in range(channels):  # Iterate over channels
                for i in range(pooled_height):  # Iterate over pooled height
                    for j in range(pooled_width):  # Iterate over pooled width
                        # Calculate the start and end indices for the pooling window
                        start_i = i * stride_height
                        start_j = j * stride_width
                        end_i = start_i + pool_height
                        end_j = start_j + pool_width
                        
                        # Extract the pooling window
                        window = x[b, c, start_i:end_i, start_j:end_j]
                        
                        # Find the maximum value in the pooling window
                        max_val = np.max(window)
                        output[b, c, i, j] = max_val
                        
                        # Store the indices of the max value for backpropagation
                        max_pos = np.unravel_index(np.argmax(window), window.shape)
                        self.max_indices[b, c, i, j] = (max_pos[0] + start_i, max_pos[1] + start_j)

        return output

    def backward(self, x, delta_a, learning_rate):
        """
        Perform backward pass through max pooling layer.

        x: Input array (batch_size, channels, height, width)
        delta_a: Gradient of the loss w.r.t. output (batch_size, channels, pooled_height, pooled_width)
        learning_rate: Learning rate (not used here, but included for consistency)

        Returns:
        Gradient w.r.t. input array (batch_size, channels, height, width)
        """
        batch_size, channels, height, width = x.shape
        pool_height, pool_width = self.pool_size
        stride_height, stride_width = self.stride
        
        # Create an array to store the gradients w.r.t. input
        delta_x = np.zeros_like(x)

        # Propagate the gradients back to the positions where max values were selected
        for b in range(batch_size):  # Iterate over batches
            for c in range(channels):  # Iterate over channels
                for i in range(delta_a.shape[2]):  # Iterate over pooled height
                    for j in range(delta_a.shape[3]):  # Iterate over pooled width
                        # Get the indices of the maximum value during forward pass
                        p, q = self.max_indices[b, c, i, j]
                        
                        # Update the gradient w.r.t input at the position of the max value
                        delta_x[b, c, p, q] += delta_a[b, c, i, j]
                        
        return delta_x


In [17]:
# Create a MaxPool2D object with a 2x2 pooling window and stride of 2
maxpool = MaxPool2D(pool_size=(2, 2), stride=(2, 2))

# Example input with shape (1, 1, 4, 4) representing 1 sample, 1 channel, 4x4 image
x = np.array([[[[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12],
                [13, 14, 15, 16]]]])

# Perform forward pass
output = maxpool.forward(x)
print("Forward pass output:")
print(output)

# Example gradient for the output (delta_a) with shape (1, 1, 2, 2)
delta_a = np.array([[[[0.1, 0.2],
                      [0.3, 0.4]]]])

# Perform backward pass
delta_x = maxpool.backward(x, delta_a, learning_rate=0.01)
print("\nBackward pass (delta_x):")
print(delta_x)


Forward pass output:
[[[[ 6.  8.]
   [14. 16.]]]]

Backward pass (delta_x):
[[[[0 0 0 0]
   [0 0 0 0]
   [0 0 0 0]
   [0 0 0 0]]]]


In [18]:
import numpy as np

class AveragePool2D:
    def __init__(self, pool_size=(2, 2), stride=(2, 2)):
        self.pool_size = pool_size  # Pooling window size (height, width)
        self.stride = stride        # Stride size (height, width)

    def forward(self, x):
        """
        Perform forward pass through average pooling layer.

        x: Input array (batch_size, channels, height, width)

        Returns:
        Output array after applying average pooling (batch_size, channels, pooled_height, pooled_width)
        """
        batch_size, channels, height, width = x.shape
        pool_height, pool_width = self.pool_size
        stride_height, stride_width = self.stride
        
        # Calculate output dimensions
        pooled_height = (height - pool_height) // stride_height + 1
        pooled_width = (width - pool_width) // stride_width + 1
        
        # Create an output array to store pooled results
        output = np.zeros((batch_size, channels, pooled_height, pooled_width))

        # Perform average pooling
        for b in range(batch_size):  # Iterate over batches
            for c in range(channels):  # Iterate over channels
                for i in range(pooled_height):  # Iterate over pooled height
                    for j in range(pooled_width):  # Iterate over pooled width
                        # Calculate the start and end indices for the pooling window
                        start_i = i * stride_height
                        start_j = j * stride_width
                        end_i = start_i + pool_height
                        end_j = start_j + pool_width
                        
                        # Extract the pooling window
                        window = x[b, c, start_i:end_i, start_j:end_j]
                        
                        # Compute the average value in the pooling window
                        avg_val = np.mean(window)
                        output[b, c, i, j] = avg_val

        return output

    def backward(self, x, delta_a, learning_rate):
        """
        Perform backward pass through average pooling layer.

        x: Input array (batch_size, channels, height, width)
        delta_a: Gradient of the loss w.r.t. output (batch_size, channels, pooled_height, pooled_width)
        learning_rate: Learning rate (not used here, but included for consistency)

        Returns:
        Gradient w.r.t. input array (batch_size, channels, height, width)
        """
        batch_size, channels, height, width = x.shape
        pool_height, pool_width = self.pool_size
        stride_height, stride_width = self.stride
        
        # Create an array to store the gradients w.r.t input
        delta_x = np.zeros_like(x)

        # Backpropagate the gradients for each pooling window
        for b in range(batch_size):  # Iterate over batches
            for c in range(channels):  # Iterate over channels
                for i in range(delta_a.shape[2]):  # Iterate over pooled height
                    for j in range(delta_a.shape[3]):  # Iterate over pooled width
                        # Calculate the start and end indices for the pooling window
                        start_i = i * stride_height
                        start_j = j * stride_width
                        end_i = start_i + pool_height
                        end_j = start_j + pool_width
                        
                        # Compute the average gradient for the window
                        window_size = pool_height * pool_width
                        delta_x[b, c, start_i:end_i, start_j:end_j] += delta_a[b, c, i, j] / window_size

        return delta_x


In [24]:
# Create an AveragePool2D object with a 2x2 pooling window and stride of 2
avgpool = AveragePool2D(pool_size=(2, 2), stride=(2, 2))

# Example input with shape (1, 1, 4, 4) representing 1 sample, 1 channel, 4x4 image
x = np.array([[[[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12],
                [13, 14, 15, 16]]]])

# Perform forward pass
output = avgpool.forward(x)
print("Forward pass output:")
print(output)

# Example gradient for the output (delta_a) with shape (1, 1, 2, 2)
delta_a = np.array([[[[0.1, 0.2],
                      [0.3, 0.4]]]])

# Perform backward pass
output = avgpool.backward
print("\nBackward pass (delta_x):")
print(output)


Forward pass output:
[[[[ 3.5  5.5]
   [11.5 13.5]]]]

Backward pass (delta_x):
<bound method AveragePool2D.backward of <__main__.AveragePool2D object at 0x000002B45D14FC90>>


In [25]:
import numpy as np

class Flatten:
    def __init__(self):
        self.input_shape = None  # Will store the input shape for backward pass

    def forward(self, x):
        """
        Perform the forward pass: Flatten the input tensor into a 2D array.

        x: Input array (batch_size, channels, height, width)

        Returns:
        Flattened output array (batch_size, channels * height * width)
        """
        # Store the input shape for backward pass
        self.input_shape = x.shape
        
        # Reshape the input to flatten all dimensions except the batch size
        batch_size = x.shape[0]
        flattened = x.reshape(batch_size, -1)  # Flatten the rest of the dimensions
        return flattened

    def backward(self, delta_a):
        """
        Perform the backward pass: Reshape the delta to match the input shape.

        delta_a: Gradient of the loss w.r.t. the flattened output (batch_size, flattened_size)

        Returns:
        Gradient w.r.t. the input array (batch_size, channels, height, width)
        """
        # Reshape the delta to match the original input shape
        batch_size = self.input_shape[0]
        delta_x = delta_a.reshape(self.input_shape)
        return delta_x


In [28]:
# Create a Flatten object
flatten = Flatten()

# Example input with shape (1, 2, 4, 4) representing 1 sample, 2 channels, 4x4 image
x = np.array([[[[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12],
                [13, 14, 15, 16]],

               [[17, 18, 19, 20],
                [21, 22, 23, 24],
                [25, 26, 27, 28],
                [29, 30, 31, 32]]]])

# Perform forward pass
output = flatten.forward(x)
print("Forward pass output (flattened):")
print(output)

# Example gradient for the output (delta_a) with shape (1, 32)
delta_a = np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 
                     1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 
                     2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 
                     3.1, 3.2, 3.3, 3.4]])

# Perform backward pass
output = flatten.backward
print("\nBackward pass output (reshaped back):")
print(output)


Forward pass output (flattened):
[[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
  25 26 27 28 29 30 31 32]]

Backward pass output (reshaped back):
<bound method Flatten.backward of <__main__.Flatten object at 0x000002B46477A250>>


In [56]:
pip install torch torchvision torchaudio


Collecting torchNote: you may need to restart the kernel to use updated packages.

  Downloading torch-2.5.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting torchvision
  Downloading torchvision-0.20.1-cp311-cp311-win_amd64.whl.metadata (6.2 kB)
Collecting torchaudio
  Downloading torchaudio-2.5.1-cp311-cp311-win_amd64.whl.metadata (6.5 kB)
Collecting filelock (from torch)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.5.1-cp311-cp311-win_amd64.wh

In [57]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())  # To check if GPU support is available


2.5.1+cpu
False


In [58]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define a simple Conv2d model
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(16 * 14 * 14, 10)  # MNIST images are 28x28

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = self.fc1(x)
        return x

# Set up data loaders
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = datasets.MNIST(root="./data", train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root="./data", train=False, download=True, transform=transform)

train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# Initialize the model, loss function, and optimizer
model = SimpleCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
def train(model, train_loader, criterion, optimizer, epochs=1):
    model.train()
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            if batch_idx % 100 == 0:
                print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")

# Evaluation loop
def evaluate(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            _, predicted = torch.max(output, 1)
            correct += (predicted == target).sum().item()
            total += target.size(0)

    accuracy = correct / total
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Run training and evaluation
train(model, train_loader, criterion, optimizer, epochs=1)
evaluate(model, test_loader)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data\MNIST\raw\train-images-idx3-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\train-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data\MNIST\raw\train-labels-idx1-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\train-labels-idx1-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data\MNIST\raw\t10k-images-idx3-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\t10k-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw

Epoch 1, Batch 0, Loss: 2.3318
Epoch 1, Batch 100, Loss: 0.8257
Epoch 1, Batch 200, Loss: 0.5103
Epoch 1, Batch 300, Loss: 0.2559
Epoch 1, Batch 400, Loss: 0.5564
Epoch 1, Batch 500, Loss: 0.3979
Epoch 1, Batch 600, Loss: 0.3124
Epoch 1, Batch 700, Loss: 0.2519
Epoch 1, Batch 800, Loss: 0.2490
Epoch 1, Batch 900, Loss: 0.2934
Test Accuracy: 93.01%


In [59]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define the LeNet model
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1, padding=0)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1, padding=0)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)  # After two poolings, the input size becomes 4x4
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = self.pool1(x)
        x = torch.relu(self.conv2(x))
        x = self.pool2(x)
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Set up data loaders
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = datasets.MNIST(root="./data", train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root="./data", train=False, download=True, transform=transform)

train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# Initialize the model, loss function, and optimizer
model = LeNet()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
def train(model, train_loader, criterion, optimizer, epochs=1):
    model.train()
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            if batch_idx % 100 == 0:
                print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")

# Evaluation loop
def evaluate(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            _, predicted = torch.max(output, 1)
            correct += (predicted == target).sum().item()
            total += target.size(0)

    accuracy = correct / total
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Run training and evaluation
train(model, train_loader, criterion, optimizer, epochs=5)
evaluate(model, test_loader)


Epoch 1, Batch 0, Loss: 2.3065
Epoch 1, Batch 100, Loss: 2.3038
Epoch 1, Batch 200, Loss: 2.2940
Epoch 1, Batch 300, Loss: 2.2794
Epoch 1, Batch 400, Loss: 2.2652
Epoch 1, Batch 500, Loss: 2.2673
Epoch 1, Batch 600, Loss: 2.2382
Epoch 1, Batch 700, Loss: 2.1141
Epoch 1, Batch 800, Loss: 1.6047
Epoch 1, Batch 900, Loss: 0.7952
Epoch 2, Batch 0, Loss: 0.7360
Epoch 2, Batch 100, Loss: 0.6467
Epoch 2, Batch 200, Loss: 0.5171
Epoch 2, Batch 300, Loss: 0.3129
Epoch 2, Batch 400, Loss: 0.4610
Epoch 2, Batch 500, Loss: 0.4559
Epoch 2, Batch 600, Loss: 0.2093
Epoch 2, Batch 700, Loss: 0.3878
Epoch 2, Batch 800, Loss: 0.4016
Epoch 2, Batch 900, Loss: 0.3191
Epoch 3, Batch 0, Loss: 0.1391
Epoch 3, Batch 100, Loss: 0.2491
Epoch 3, Batch 200, Loss: 0.2140
Epoch 3, Batch 300, Loss: 0.1751
Epoch 3, Batch 400, Loss: 0.0487
Epoch 3, Batch 500, Loss: 0.0647
Epoch 3, Batch 600, Loss: 0.2101
Epoch 3, Batch 700, Loss: 0.3771
Epoch 3, Batch 800, Loss: 0.1874
Epoch 3, Batch 900, Loss: 0.1217
Epoch 4, Batch 0

There seems to be a technical issue with accessing detailed search results. However, I can provide you with commonly used CNN architectures for image recognition, which are widely implemented and available in popular frameworks like PyTorch and TensorFlow:

Famous CNN Architectures
LeNet (1998) - A foundational CNN architecture designed for handwritten digit recognition.
AlexNet (2012) - Won the ImageNet competition and popularized the use of ReLU, dropout, and GPUs.
VGG (2014) - Known for its simplicity with 16 or 19 layers of stacked convolutional layers (e.g., VGG16).
GoogLeNet/Inception (2014) - Introduced the concept of inception modules for efficient computation.
ResNet (2015) - Revolutionized deep learning with residual connections, enabling very deep networks.
DenseNet (2016) - Uses dense connections to improve gradient flow and feature reuse.
EfficientNet (2019) - Balances model size and accuracy by scaling depth, width, and resolution efficiently.
Vision Transformers (ViT) (2020) - Applies transformer models to image recognition tasks.
Pre-Trained Models in PyTorch
PyTorch provides many pre-trained models through torchvision.models, including:

ResNet (ResNet18, ResNet50, etc.)
VGG (VGG11, VGG16, etc.)
Inception (Inception v3)
DenseNet
EfficientNet
MobileNet
Pre-Trained Models in TensorFlow
TensorFlow offers these via tf.keras.applications:

VGG16, VGG19
InceptionV3, InceptionResNetV2
ResNet50, ResNet152
EfficientNetB0-B7
MobileNetV2
Xception

In [60]:
# Python code to calculate output size and number of parameters for convolutional layers

def calculate_output_size(input_size, filter_size, stride, padding):
    return (input_size - filter_size + 2 * padding) // stride + 1

def calculate_parameters(filter_width, filter_height, input_channels, output_channels):
    return (filter_width * filter_height * input_channels + 1) * output_channels

# Layer 1 parameters
input_size_1 = 144
input_channels_1 = 3
filter_size_1 = 3
output_channels_1 = 6
stride_1 = 1
padding_1 = 0

output_size_1 = calculate_output_size(input_size_1, filter_size_1, stride_1, padding_1)
parameters_1 = calculate_parameters(filter_size_1, filter_size_1, input_channels_1, output_channels_1)

print(f"Layer 1: Output Size = {output_size_1}x{output_size_1}x{output_channels_1}, Parameters = {parameters_1}")

# Layer 2 parameters
input_size_2 = 60
input_channels_2 = 24
filter_size_2 = 3
output_channels_2 = 48
stride_2 = 1
padding_2 = 0

output_size_2 = calculate_output_size(input_size_2, filter_size_2, stride_2, padding_2)
parameters_2 = calculate_parameters(filter_size_2, filter_size_2, input_channels_2, output_channels_2)

print(f"Layer 2: Output Size = {output_size_2}x{output_size_2}x{output_channels_2}, Parameters = {parameters_2}")

# Layer 3 parameters
input_size_3 = 20
input_channels_3 = 10
filter_size_3 = 3
output_channels_3 = 20
stride_3 = 2
padding_3 = 0

output_size_3 = calculate_output_size(input_size_3, filter_size_3, stride_3, padding_3)
parameters_3 = calculate_parameters(filter_size_3, filter_size_3, input_channels_3, output_channels_3)

print(f"Layer 3: Output Size = {output_size_3}x{output_size_3}x{output_channels_3}, Parameters = {parameters_3}")


Layer 1: Output Size = 142x142x6, Parameters = 168
Layer 2: Output Size = 58x58x48, Parameters = 10416
Layer 3: Output Size = 9x9x20, Parameters = 1820


In [61]:
# Python code to demonstrate the effects of 3x3 and 1x1 filters in CNN layers
import torch
import torch.nn as nn

# Define a function to calculate the receptive field for stacked 3x3 filters
def calculate_receptive_field(num_layers, filter_size, stride):
    receptive_field = 1
    for _ in range(num_layers):
        receptive_field = receptive_field + (filter_size - 1) * stride
    return receptive_field

# Example: Receptive field of 3 stacked 3x3 filters
num_layers = 3
filter_size = 3
stride = 1
receptive_field = calculate_receptive_field(num_layers, filter_size, stride)
print(f"Receptive field of {num_layers} stacked 3x3 filters: {receptive_field}x{receptive_field}")

# Define a CNN model with 1x1 and 3x3 filters
class ExampleCNN(nn.Module):
    def __init__(self):
        super(ExampleCNN, self).__init__()
        self.conv_3x3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv_1x1 = nn.Conv2d(in_channels=128, out_channels=64, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        x = self.conv_3x3(x)  # 3x3 filter for feature extraction
        x = nn.ReLU()(x)
        x = self.conv_1x1(x)  # 1x1 filter for dimensionality reduction
        x = nn.ReLU()(x)
        return x

# Create a dummy input tensor (batch_size=1, channels=64, height=32, width=32)
input_tensor = torch.randn(1, 64, 32, 32)

# Pass the tensor through the model
model = ExampleCNN()
output_tensor = model(input_tensor)

# Print input and output shapes
print(f"Input shape: {input_tensor.shape}")
print(f"Output shape after CNN: {output_tensor.shape}")

# Calculate number of parameters in each layer
num_params_3x3 = sum(p.numel() for p in model.conv_3x3.parameters())
num_params_1x1 = sum(p.numel() for p in model.conv_1x1.parameters())
print(f"Number of parameters in 3x3 filter: {num_params_3x3}")
print(f"Number of parameters in 1x1 filter: {num_params_1x1}")


Receptive field of 3 stacked 3x3 filters: 7x7
Input shape: torch.Size([1, 64, 32, 32])
Output shape after CNN: torch.Size([1, 64, 32, 32])
Number of parameters in 3x3 filter: 73856
Number of parameters in 1x1 filter: 8256
