In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

# Define the encoder
class EncoderCNN(nn.Module):
    """Image encoder: ResNet-152 backbone followed by a linear projection.

    The backbone's final fully-connected classifier is dropped and replaced
    by a new linear layer that maps the backbone features to ``embed_size``.
    """

    def __init__(self, embed_size):
        """Build the encoder.

        Args:
            embed_size: Output dimensionality of the projected image features.
        """
        super().__init__()
        # NOTE(review): recent torchvision releases prefer the ``weights=``
        # argument over ``pretrained=True`` -- confirm against the installed
        # version before changing.
        backbone = models.resnet152(pretrained=True)
        # Keep every child module except the last one (the classifier head).
        self.resnet = nn.Sequential(*list(backbone.children())[:-1])
        # Pools the spatial dimensions of the feature map down to 1x1.
        # NOTE(review): the truncated backbone presumably still ends with its
        # own average pooling, which would make this a no-op -- confirm.
        self.adaptive_pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        # Projects the backbone feature vector into the embedding space.
        self.fc = nn.Linear(backbone.fc.in_features, embed_size)
        # Regularisation applied to the projected features.
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, images):
        """Encode a batch of images into embedding vectors.

        Args:
            images: Batch of image tensors accepted by the ResNet backbone
                (assumes ``(batch, 3, H, W)`` -- TODO confirm with caller).

        Returns:
            Tensor of shape ``(batch, embed_size)``; dropout is applied to it
            when the module is in training mode.
        """
        pooled = self.adaptive_pool(self.resnet(images))
        # Collapse everything but the batch dimension into one feature axis.
        flat = pooled.view(pooled.size(0), -1)
        return self.dropout(self.fc(flat))

# Define the decoder
class DecoderRNN(nn.Module):
    """LSTM caption decoder conditioned on an image feature vector.

    The image feature is fed to the LSTM as the first time step, followed by
    the embedded caption tokens; every LSTM output is projected to
    vocabulary-sized scores.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        """Build the decoder.

        Args:
            embed_size: Dimensionality of token embeddings (and of the image
                feature that is concatenated with them).
            hidden_size: Hidden state size of the LSTM.
            vocab_size: Number of tokens in the vocabulary.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        # Token id -> dense vector lookup table.
        self.embed = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
        )
        # Sequence model; batch_first so inputs are (batch, time, feature).
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        # Maps each hidden state to one score per vocabulary entry.
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, features, captions):
        """Compute vocabulary scores for every step of the input sequence.

        Args:
            features: Image features (assumes ``(batch, embed_size)`` -- TODO
                confirm with caller).
            captions: Caption token ids (assumes an integer tensor of shape
                ``(batch, seq_len)`` -- TODO confirm with caller).

        Returns:
            Tensor of scores; under the shape assumptions above it is
            ``(batch, seq_len, vocab_size)``.
        """
        # The final token is never used as an input, so drop it.
        input_tokens = captions[:, :-1]
        token_vectors = self.embed(input_tokens)
        # Prepend the image feature as the first step of the sequence.
        image_step = torch.unsqueeze(features, 1)
        sequence = torch.cat([image_step, token_vectors], dim=1)
        # Only the per-step outputs are needed; the final state is discarded.
        lstm_out = self.lstm(sequence)[0]
        return self.fc(lstm_out)

# Define the captioning model
class CaptioningModel(nn.Module):
    """End-to-end captioning model: a CNN encoder feeding an RNN decoder."""

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        """Build the encoder/decoder pair.

        Args:
            embed_size: Size of the shared image/token embedding space.
            hidden_size: Hidden state size passed to the decoder.
            vocab_size: Vocabulary size passed to the decoder.
            num_layers: Number of recurrent layers passed to the decoder.
        """
        super().__init__()
        # Image -> embedding vector.
        self.encoder = EncoderCNN(embed_size)
        # Embedding vector + caption tokens -> vocabulary scores.
        self.decoder = DecoderRNN(
            embed_size,
            hidden_size,
            vocab_size,
            num_layers,
        )

    def forward(self, images, captions):
        """Encode ``images`` and decode them together with ``captions``.

        Args:
            images: Batch of images forwarded to the encoder.
            captions: Caption token ids forwarded to the decoder.

        Returns:
            Whatever the decoder returns for the encoded images and captions.
        """
        return self.decoder(self.encoder(images), captions)


In [None]:
# Load an image
image_path = "image.jpg"
image = Image.open(image_path).convert("RGB")
# Preprocess the image
transform = transforms.Compose([
    transforms.Resize(
