In [1]:
import torch 
import torch.nn as nn

In [2]:
class conv_block(nn.Module):
    """Convolution followed by ReLU and then 2-D batch normalisation.

    Args:
        in_channels: Number of channels in the incoming feature map.
        out_channels: Number of filters produced by the convolution; also the
            feature count handed to the batch-norm layer.
        **kwargs: Passed through untouched to ``nn.Conv2d`` (for example
            ``kernel_size``, ``stride`` and ``padding``).
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, **kwargs)
        self.relu = nn.ReLU()
        self.batchnorm = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        """Run ``x`` through conv -> ReLU -> batch norm and return the result."""
        # The activation is applied *before* normalisation in this block.
        activated = self.relu(self.conv(x))
        return self.batchnorm(activated)
    

<figure>
  <img src="asset/inception_module.png" alt="InceptionModule" width="600">
</figure>


---
This diagram is the **Inception module** (from Google’s *GoogLeNet/Inception v1* architecture), and it shows how the model combines multiple convolutional operations of different sizes *in parallel*.

### 1. The problem before Inception

If you just stacked (3 X 3) and (5 X 5) convolutions directly on top of a previous layer, the number of parameters would explode. A convolution layer has (kernel height × kernel width × input channels × number of filters) weights, so a (5 X 5) kernel costs 25 times as much as a (1 X 1) kernel for the same channel counts — computationally heavy and memory-hungry when the input has many channels.

So, the Inception idea was: *What if we shrink the depth (number of channels) before applying big kernels?*

That’s where **dimensionality reduction** with (1 X 1) convolutions comes in.

---

### 2. Step-by-step flow

#### **From the “Previous Layer”**

You start with a feature map — think of it as a cube of data with many channels.

From here, the data flows into **four parallel paths**:


#### **Path 1: Simple 1×1 Convolution**

This path directly applies a (1 X 1) convolution.
It mixes information across channels at each spatial position and provides a lightweight transformation — useful for introducing non-linearity (after ReLU) while adding only a small number of parameters.



#### **Path 2: 1×1 → 3×3 Convolution (Reduction before Expansion)**

* First, a **(1 X 1)** convolution reduces the depth — fewer channels.
* Then, a **(3 X 3)** convolution processes the spatial information, but now it’s operating on a smaller number of input channels.

This reduction step drastically cuts down computation while keeping the receptive field of (3 X 3).



#### **Path 3: 1×1 → 5×5 Convolution (Heavier kernel, but reduced input)**

* The **(1 X 1)** convolution again reduces the depth.
* The **(5 X 5)** convolution then captures larger spatial patterns (wider context), but thanks to the reduction, it’s not computationally prohibitive.

This is the main *dimensionality reduction* path — otherwise, (5 X 5) filters would be very expensive.


#### **Path 4: 3×3 Max Pooling → 1×1 Convolution**

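Pooling operations don’t change the number of channels — so right after pooling, a **(1 X 1)** convolution is used to *project* (reduce or transform) the pooled output to a smaller or more balanced depth, so that the pooling branch does not dominate the concatenated output. The pooling itself uses stride 1 with padding 1, which keeps its spatial size equal to that of the other branches — this is what allows all branches to be concatenated later.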


---
### 3. Final Step: **Filter Concatenation**

After all these paths process the same input in different ways:

* The feature maps from all paths are **concatenated along the channel dimension**.
  This gives the network multiple levels of abstraction (fine to coarse features) in one unified output tensor.


In [3]:
class InceptionModule(nn.Module):
    """Four parallel branches over the same input, joined along channels.

    Every convolution is followed by an in-place ReLU. All branches keep the
    spatial size of the input (1x1 kernels, or stride 1 with matching
    padding), so their outputs can be concatenated on dimension 1.

    Args:
        in_channels: Channels of the incoming feature map.
        out_1x1: Output channels of the stand-alone 1x1 branch.
        red_3x3: Channels after the 1x1 reduction that precedes the 3x3 conv.
        out_3x3: Output channels of the 3x3 conv.
        red_5x5: Channels after the 1x1 reduction that precedes the 5x5 conv.
        out_5x5: Output channels of the 5x5 conv.
        out_pool: Output channels of the 1x1 conv that follows max pooling.

    The module emits ``out_1x1 + out_3x3 + out_5x5 + out_pool`` channels.
    """

    def __init__(self, in_channels, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, out_pool):
        super().__init__()

        # Branch 1: a single 1x1 convolution applied straight to the input.
        self.branch1 = nn.Sequential(
            nn.Conv2d(in_channels, out_1x1, kernel_size=1),
            nn.ReLU(inplace=True),
        )

        # Branch 2: 1x1 channel reduction, then a padded 3x3 convolution.
        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels, red_3x3, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(red_3x3, out_3x3, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
        )

        # Branch 3: 1x1 channel reduction, then a padded 5x5 convolution.
        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels, red_5x5, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(red_5x5, out_5x5, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
        )

        # Branch 4: size-preserving 3x3 max pool, then a 1x1 projection.
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            nn.Conv2d(in_channels, out_pool, kernel_size=1),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Feed ``x`` to every branch and concatenate the results on dim 1."""
        paths = (self.branch1, self.branch2, self.branch3, self.branch4)
        # Branch order fixes the channel layout of the output tensor.
        return torch.cat([path(x) for path in paths], dim=1)


<figure>
  <img src="asset/google_net_arch.png" alt="GoogleNet" width="700" height ="500">
  <figcaption>GoogLeNet architecture, from the original paper
  </figcaption>
</figure>

In [4]:
class GoogleNet(nn.Module):
    """Inception-style image classifier.

    Layout: two conv blocks (each followed by max pooling), nine Inception
    modules grouped into three stages separated by max pooling, global
    average pooling, dropout (p=0.4) and a final linear layer.

    Args:
        in_channels: Channels of the input images (default 3).
        num_classes: Size of the output layer (default 1000).

    ``forward`` expects a batched ``(N, in_channels, H, W)`` tensor and
    returns ``(N, num_classes)`` scores; no softmax is applied.

    NOTE(review): the architecture table in the paper also lists a 1x1
    reduction before the second convolution and auxiliary classifiers;
    neither is built here — presumably a deliberate simplification, confirm.
    """

    def __init__(self, in_channels=3, num_classes=1000):
        super().__init__()

        # Stem: 7x7/2 conv block, then a single 3x3 conv block, each
        # followed by a stride-2 max pool.
        self.conv1 = conv_block(in_channels, 64, kernel_size=7, stride=2, padding=3)
        self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.conv2 = conv_block(64, 192, kernel_size=3, stride=1, padding=1)
        self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Inception stages. Positional argument order:
        #   in_channels, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, out_pool
        # Each module's in_channels equals the sum of the previous module's
        # four output widths (out_1x1 + out_3x3 + out_5x5 + out_pool).
        self.inception3a = InceptionModule(192, 64, 96, 128, 16, 32, 32)
        self.inception3b = InceptionModule(256, 128, 128, 192, 32, 96, 64)
        self.maxpool3 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.inception4a = InceptionModule(480, 192, 96, 208, 16, 48, 64)
        self.inception4b = InceptionModule(512, 160, 112, 224, 24, 64, 64)
        self.inception4c = InceptionModule(512, 128, 128, 256, 24, 64, 64)
        self.inception4d = InceptionModule(512, 112, 144, 288, 32, 64, 64)
        self.inception4e = InceptionModule(528, 256, 160, 320, 32, 128, 128)
        self.maxpool4 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.inception5a = InceptionModule(832, 256, 160, 320, 32, 128, 128)
        self.inception5b = InceptionModule(832, 384, 192, 384, 48, 128, 128)

        # Classifier head; 1024 = 384 + 384 + 128 + 128 from inception5b.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(p=0.4)
        self.fc = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Compute class scores for a batch of images."""
        feature_stages = (
            self.conv1, self.maxpool1,
            self.conv2, self.maxpool2,
            self.inception3a, self.inception3b, self.maxpool3,
            self.inception4a, self.inception4b, self.inception4c,
            self.inception4d, self.inception4e, self.maxpool4,
            self.inception5a, self.inception5b,
        )
        for stage in feature_stages:
            x = stage(x)

        # Collapse each channel to one value, then drop the 1x1 spatial dims.
        pooled = torch.flatten(self.avgpool(x), 1)
        return self.fc(self.dropout(pooled))
        
         