In [4]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms

In [None]:
class CNN_Encoder(nn.Module):
    """Image encoder: a frozen, ImageNet-pretrained ResNet-50 plus a trainable linear projection.

    The ResNet's classification head (``fc``) is dropped; the remaining layers
    act purely as a feature extractor. Their pooled output is flattened and
    projected to ``embeded_size`` values, which is the vector handed to the RNN.

    Args:
        embeded_size: Number of values in the embedding produced per image.
    """

    def __init__(self, embeded_size):
        super(CNN_Encoder, self).__init__()

        # Load the ImageNet-pretrained weights. Newer torchvision releases
        # deprecate ``pretrained=True`` in favour of the ``weights=`` enum API;
        # IMAGENET1K_V1 is the checkpoint the legacy flag selects, so both
        # branches load the same weights.
        weights_enum = getattr(models, "ResNet50_Weights", None)
        if weights_enum is not None:
            resnet = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
        else:
            resnet = models.resnet50(pretrained=True)

        # Freeze the backbone: it is only used for feature extraction, so its
        # weights must not receive gradient updates.
        for param in resnet.parameters():
            param.requires_grad_(False)

        # Drop the last child (the ``fc`` classification layer) and wrap the
        # remaining layers into a single sequential module.
        backbone_layers = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*backbone_layers)

        # ``requires_grad_(False)`` does not stop BatchNorm layers from updating
        # their running statistics while in training mode, so the backbone is
        # kept in eval mode from the start (see ``train`` below).
        self.resnet.eval()

        # Final encoder layer: projects the ResNet features
        # (``resnet.fc.in_features`` of them) to the embedding size.
        self.embeded_layer = nn.Linear(resnet.fc.in_features, embeded_size)

    def train(self, mode=True):
        """Switch train/eval mode while always keeping the frozen backbone in eval mode.

        Without this, ``encoder.train()`` would put the ResNet's BatchNorm
        layers back into training mode and silently alter the pretrained
        running statistics on every forward pass.

        Args:
            mode: ``True`` for training mode, ``False`` for evaluation mode.

        Returns:
            ``self``, matching ``nn.Module.train``.
        """
        super(CNN_Encoder, self).train(mode)
        self.resnet.eval()
        return self

    def forward(self, images):
        """Encode a batch of images into embedding vectors.

        Args:
            images: Batch of image tensors accepted by the ResNet backbone
                (the notebook feeds shape ``(1, 3, 224, 224)``).

        Returns:
            Tensor of shape ``(batch_size, embeded_size)``.
        """
        # Feature maps from the frozen ResNet backbone.
        features = self.resnet(images)
        # Flatten everything except the batch dimension -> (batch_size, num_features).
        # torch.flatten, unlike .view, does not require contiguous memory.
        features = torch.flatten(features, start_dim=1)
        # Project the flattened features to the embedding vector.
        return self.embeded_layer(features)


### Testing the Encoder on a Single Image:

In [None]:
from PIL import Image

# Load the sample image and force three colour channels.
img_path = "/Users/laibaqureshi/Desktop/BAI project/000000000025.jpg"
image = Image.open(img_path)
image = image.convert('RGB')

# The encoder uses a pretrained ResNet, so the input goes through the standard
# resize / centre-crop / normalise pipeline used with such models.
# ResNet paper: https://arxiv.org/pdf/1512.03385
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

preprocess_steps = [
    transforms.Resize(256),      # shortest side -> 256 pixels
    transforms.CenterCrop(224),  # central 224x224 region
    transforms.ToTensor(),       # PIL image -> tensor of shape (3, 224, 224)
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
preprocess = transforms.Compose(preprocess_steps)

# Add a leading batch dimension: (3, 224, 224) -> (1, 3, 224, 224).
image_tensor = preprocess(image).unsqueeze(dim=0)

In [None]:
# Initialize the encoder; each image's embedded vector will have 256 values.
embed_size = 256
encoder = CNN_Encoder(embed_size)

# Evaluation mode: disables training-only behaviour such as dropout.
encoder.eval()

# This cell only inspects shapes, so run it without autograd: otherwise a
# computation graph is built through the trainable embedding layer for nothing.
with torch.no_grad():
    output = image_tensor
    for name, module in encoder.resnet.named_children():
        output = module(output)
        # Shape after each ResNet stage, e.g.
        #   Output after 0: torch.Size([1, 64, 112, 112])
        # meaning 64 feature maps of size 112 x 112.
        print(f"Output after {name}: {output.shape}")

    # Final feature vector: flatten everything but the batch dimension,
    # then project through the embedding layer.
    output = torch.flatten(output, start_dim=1)
    output = encoder.embeded_layer(output)

print(f"Final feature vector shape: {output.shape}")
# Final feature vector shape: torch.Size([1, 256]) -> 1 image, 256 values




Output after 0: torch.Size([1, 64, 112, 112])
Output after 1: torch.Size([1, 64, 112, 112])
Output after 2: torch.Size([1, 64, 112, 112])
Output after 3: torch.Size([1, 64, 56, 56])
Output after 4: torch.Size([1, 256, 56, 56])
Output after 5: torch.Size([1, 512, 28, 28])
Output after 6: torch.Size([1, 1024, 14, 14])
Output after 7: torch.Size([1, 2048, 7, 7])
Output after 8: torch.Size([1, 2048, 1, 1])
Final feature vector shape: torch.Size([1, 256])


In [9]:
# Print the embedding held in `output`, which is assigned by the encoder test cell
# above (result of encoder.embeded_layer). No random seed is set in the visible
# cells, so the freshly initialised linear layer will give different values per run.
print("Embedding vector", output)

Embedding vector tensor([[-2.9099e-01, -5.2723e-01, -1.8377e-01, -5.3323e-01,  5.1252e-02,
          6.1005e-02, -1.9923e-01,  4.7904e-02, -1.6908e-01,  3.0531e-01,
          2.9983e-02,  4.2404e-01, -1.3624e-01, -5.7255e-02,  2.4101e-01,
         -1.1460e-01,  1.8864e-01, -2.1262e-02,  3.1975e-01,  3.1031e-01,
          2.5426e-02, -1.3570e-01,  6.8599e-01,  8.1763e-01, -3.1593e-01,
          1.6124e-01,  1.3674e-01,  4.4296e-01,  2.7504e-01, -4.1518e-02,
          1.3502e-01, -2.7734e-01, -9.9362e-02, -3.9854e-02,  1.3523e-02,
         -1.1696e-04, -2.3635e-01, -4.4412e-01,  5.0014e-02, -6.0238e-01,
          7.3476e-02,  2.9803e-01,  8.3499e-02, -1.4206e-01,  2.6459e-02,
         -2.0448e-01, -3.0230e-01,  2.9800e-01,  6.8996e-01,  2.1789e-01,
          4.3664e-01, -2.4174e-01,  3.0750e-01, -3.3873e-01,  3.0980e-01,
          4.2058e-02,  3.4191e-01,  2.8138e-01,  6.4282e-02, -2.5940e-01,
          5.4373e-01, -4.2265e-01,  5.6677e-01,  3.5769e-01, -3.6796e-01,
          1.9611e-01,