In [13]:
import torch
import torch.nn as nn
from transformers import BertTokenizer
import json

In [14]:

# Load the BERT WordPiece tokenizer from the 'bert-base-uncased' checkpoint.
# The resulting `tokenizer` object is reused by the cells below (vocabulary
# size for the embedding tables and text -> token-id conversion).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [15]:
# Embedding widths and padded sequence lengths used throughout the notebook
dc = dq = 768  # caption (dc) and question (dq) embedding dimension
caption_max_len, question_max_len = 15, 20  # Lc = 15 for captions, Lq = 20 for questions

In [16]:
# Embedding layers with token embedding and positional encoding
class EmbeddingLayer(nn.Module):
    """Token embedding plus learned positional embedding, followed by LayerNorm.

    Args:
        vocab_size: number of rows in the token embedding table.
        embed_dim: width of every embedding vector.
        max_len: number of rows in the position table; inputs longer than
            this cannot be indexed.
    """

    def __init__(self, vocab_size, embed_dim, max_len):
        super().__init__()
        # Lookup tables for token ids and for absolute positions
        self.word_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_len, embed_dim)
        # Normalizes the summed embedding over the feature dimension
        self.layer_norm = nn.LayerNorm(embed_dim)

    def forward(self, input_ids):
        """Embed a 2-D tensor of token ids [batch, seq_len] -> [batch, seq_len, embed_dim]."""
        # Positions 0..seq_len-1, repeated for every row of the batch
        positions = torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device)
        positions = positions[None, :].expand_as(input_ids)
        # Token and position vectors are added, then normalized
        summed = self.word_embedding(input_ids) + self.position_embedding(positions)
        return self.layer_norm(summed)

In [17]:
# Transformer Encoder Layer Block
class TransformerBlock(nn.Module):
    """Post-norm transformer encoder block: self-attention then feed-forward.

    Inputs and outputs use the sequence-first layout expected by
    ``nn.MultiheadAttention`` with its default ``batch_first=False``:
    ``[seq_len, batch, embed_dim]``.

    Args:
        embed_dim: feature width of the input and output.
        num_heads: number of attention heads (must divide ``embed_dim``).
        ff_dim: hidden width of the feed-forward sub-layer.
        dropout: dropout probability used inside attention and on both
            residual branches.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        # Multi-head self-attention
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        # Position-wise feed-forward network
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )
        # One LayerNorm per residual branch, shared dropout module
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.layer_norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, key_padding_mask=None):
        """Run the block.

        Args:
            x: tensor of shape ``[seq_len, batch, embed_dim]``.
            key_padding_mask: optional ``[batch, seq_len]`` boolean tensor in
                which ``True`` marks positions (e.g. padding) that attention
                must ignore. ``None`` (the default) attends to every
                position, which is the previous behaviour.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention with residual connection; masked keys get no weight
        attn_output, _ = self.attention(x, x, x, key_padding_mask=key_padding_mask)
        x = self.layer_norm1(x + self.dropout(attn_output))

        # Feed-forward with residual connection
        ff_output = self.feed_forward(x)
        x = self.layer_norm2(x + self.dropout(ff_output))

        return x

In [18]:
# Text encoder consisting of NC = NQ = 9 transformer layers
class TextEncoder(nn.Module):
    """A stack of ``num_layers`` TransformerBlock modules applied in sequence.

    The remaining constructor arguments are forwarded unchanged to every block.
    """

    def __init__(self, embed_dim, num_layers, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerBlock(embed_dim, num_heads, ff_dim, dropout))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        """Feed ``x`` through each block in order and return the last output."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return hidden

In [19]:
# Function to tokenize and preprocess a single sequence
def tokenize_and_prepare(sequence, tokenizer, max_len):
    """Convert one text into a ``[1, max_len]`` tensor of token ids.

    The tokenizer is called with special tokens enabled, padding to
    ``max_length`` and truncation, so the id list it returns is expected to
    have exactly ``max_len`` entries.

    Args:
        sequence: the text to tokenize.
        tokenizer: callable returning a mapping with an ``'input_ids'`` entry.
        max_len: target length passed to the tokenizer as ``max_length``.

    Returns:
        Tensor of token ids with a leading batch dimension of size 1.
    """
    encoding_options = {
        'add_special_tokens': True,
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_len,
    }
    encoded = tokenizer(sequence, **encoding_options)
    ids = torch.tensor(encoded['input_ids'])
    # Leading axis of size 1 so several sequences can be concatenated later
    return ids.unsqueeze(0)

In [20]:
# Load data from your annotation and question JSON files
def load_data(annotation_file, question_file):
    """Read caption and question strings from two JSON files.

    Both files are decoded explicitly as UTF-8 (the encoding JSON is
    exchanged in) rather than with the platform default, which is not UTF-8
    on every system and can mis-decode or reject non-ASCII text.

    Args:
        annotation_file: path to a JSON file whose top-level ``'annotations'``
            list holds objects with a ``'caption'`` field.
        question_file: path to a JSON file whose top-level ``'questions'``
            list holds objects with a ``'question'`` field.

    Returns:
        ``(captions, questions)``: two lists of strings, each in file order.
        The two lists are independent and may differ in length.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if either file is not valid JSON.
        KeyError: if an expected key is missing.
    """
    with open(annotation_file, 'r', encoding='utf-8') as f:
        captions_data = json.load(f)
    with open(question_file, 'r', encoding='utf-8') as f:
        questions_data = json.load(f)

    # Keep only the text fields; ids and other metadata are dropped
    captions = [ann['caption'] for ann in captions_data['annotations']]
    questions = [q['question'] for q in questions_data['questions']]

    return captions, questions

In [26]:
# Define the full model that handles both caption and question embedding
class TextEmbeddingModel(nn.Module):
    """Embed and encode a batch of captions and a batch of questions.

    Captions and questions each get their own EmbeddingLayer (their padded
    lengths differ) and are then passed through one shared TextEncoder.

    Args:
        tokenizer: tokenizer handed to ``tokenize_and_prepare``.
        vocab_size: size of both token embedding tables.
        embed_dim: embedding / encoder width.
        caption_max_len: padded length of every caption.
        question_max_len: padded length of every question.
        num_layers: number of transformer blocks in the shared encoder.
    """

    def __init__(self, tokenizer, vocab_size, embed_dim, caption_max_len, question_max_len, num_layers=9):
        super().__init__()
        # Embedding layers for captions and questions
        self.caption_embedding = EmbeddingLayer(vocab_size, embed_dim, caption_max_len)
        self.question_embedding = EmbeddingLayer(vocab_size, embed_dim, question_max_len)
        # Shared text encoder
        self.text_encoder = TextEncoder(embed_dim, num_layers, num_heads=8, ff_dim=2048)
        self.tokenizer = tokenizer
        # Keep the lengths the position tables were built for, so forward()
        # pads/truncates to exactly these sizes instead of depending on
        # module-level globals that may be missing or hold other values.
        self.caption_max_len = caption_max_len
        self.question_max_len = question_max_len

    def _tokenize_batch(self, texts, max_len):
        """Tokenize each text to ``max_len`` ids and stack into one [batch, max_len] tensor."""
        rows = [tokenize_and_prepare(text, self.tokenizer, max_len) for text in texts]
        return torch.cat(rows, dim=0)

    def forward(self, captions, questions):
        """Encode raw caption and question strings.

        Args:
            captions: non-empty sequence of caption strings.
            questions: non-empty sequence of question strings.

        Returns:
            ``(encoded_C, encoded_Q)`` in sequence-first layout:
            ``[caption_max_len, n_captions, embed_dim]`` and
            ``[question_max_len, n_questions, embed_dim]``.
        """
        # Token ids are created on CPU; move them to wherever the model lives
        device = next(self.parameters()).device
        caption_inputs = self._tokenize_batch(captions, self.caption_max_len).to(device)
        question_inputs = self._tokenize_batch(questions, self.question_max_len).to(device)

        # Get caption and question embeddings: [batch, seq_len, embed_dim]
        C = self.caption_embedding(caption_inputs)
        Q = self.question_embedding(question_inputs)

        # Permute for transformer input format [seq_len, batch, embed_dim]
        C = C.permute(1, 0, 2)
        Q = Q.permute(1, 0, 2)

        # Encode captions and questions using shared encoder
        encoded_C = self.text_encoder(C)
        encoded_Q = self.text_encoder(Q)

        return encoded_C, encoded_Q

In [27]:
# Build the caption/question model, sized to the tokenizer's vocabulary
vocab_size = tokenizer.vocab_size
model = TextEmbeddingModel(
    tokenizer,
    vocab_size=vocab_size,
    embed_dim=768,
    caption_max_len=caption_max_len,
    question_max_len=question_max_len,
)

In [28]:
# Load the caption and question strings from the two JSON files below.
# NOTE(review): both paths are absolute and machine-specific; this cell fails
# on any machine that does not have these exact files at these locations.
annotation_file = 'D:/Project_phase_1/new_ds/annotations_trainval2014/annotations/captions_train2014.json'  # captions JSON (has an 'annotations' list); replace with your path
question_file = 'D:/Project_phase_1/question ds/v2_OpenEnded_mscoco_train2014_questions.json'  # questions JSON (has a 'questions' list); replace with your path
# `captions` and `questions` are two independent lists of strings.
captions, questions = load_data(annotation_file, question_file)

In [29]:
# Forward pass in batches (inference only)
batch_size = 8  # Set a batch size to process smaller chunks of data

# Captions and questions are sliced with the same index, so stop at the
# shorter list rather than handing the model an empty batch (torch.cat
# raises on an empty list of tensors).
num_items = min(len(captions), len(questions))

was_training = model.training
model.eval()  # disable dropout so the representations are deterministic
try:
    with torch.no_grad():  # no autograd graph is needed just to inspect outputs
        for i in range(0, num_items, batch_size):
            batch_captions = captions[i:i + batch_size]
            batch_questions = questions[i:i + batch_size]

            # Forward pass
            C, Q = model(batch_captions, batch_questions)

            # Output the final representations for the batch
            print(f"Batch {i // batch_size + 1}:")
            print(f"Caption representation shape: {C.shape}")  # Shape: [caption_max_len, batch_size, dc]
            print(f"Question representation shape: {Q.shape}")  # Shape: [question_max_len, batch_size, dq]
finally:
    model.train(was_training)  # leave the model in the mode it was found in

Batch 1:
Caption representation shape: torch.Size([15, 8, 768])
Question representation shape: torch.Size([20, 8, 768])
Batch 2:
Caption representation shape: torch.Size([15, 8, 768])
Question representation shape: torch.Size([20, 8, 768])
Batch 3:
Caption representation shape: torch.Size([15, 8, 768])
Question representation shape: torch.Size([20, 8, 768])
Batch 4:
Caption representation shape: torch.Size([15, 8, 768])
Question representation shape: torch.Size([20, 8, 768])
Batch 5:
Caption representation shape: torch.Size([15, 8, 768])
Question representation shape: torch.Size([20, 8, 768])
Batch 6:
Caption representation shape: torch.Size([15, 8, 768])
Question representation shape: torch.Size([20, 8, 768])
Batch 7:
Caption representation shape: torch.Size([15, 8, 768])
Question representation shape: torch.Size([20, 8, 768])
Batch 8:
Caption representation shape: torch.Size([15, 8, 768])
Question representation shape: torch.Size([20, 8, 768])
Batch 9:
Caption representation shape: t

In [30]:
import torch
import torch.nn as nn
from transformers import BertTokenizer

# Load BERT tokenizer (WordPiece Tokenizer)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Model hyper-parameters: embedding widths, padded sequence lengths, encoder depth
dc = dq = 768  # caption (dc) and question (dq) embedding dimension
caption_max_len, question_max_len = 15, 20  # Lc = 15 for captions, Lq = 20 for questions
num_transformer_layers = 9  # NC = NQ = 9 transformer layers

# Embedding layers with token embedding and positional encoding
class EmbeddingLayer(nn.Module):
    """Token embedding plus learned positional embedding, followed by LayerNorm.

    Args:
        vocab_size: number of rows in the token embedding table.
        embed_dim: width of every embedding vector.
        max_len: number of rows in the position table; inputs longer than
            this cannot be indexed.
    """

    def __init__(self, vocab_size, embed_dim, max_len):
        super().__init__()
        # Lookup tables for token ids and for absolute positions
        self.word_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_len, embed_dim)
        # Normalizes the summed embedding over the feature dimension
        self.layer_norm = nn.LayerNorm(embed_dim)

    def forward(self, input_ids):
        """Embed a 2-D tensor of token ids [batch, seq_len] -> [batch, seq_len, embed_dim]."""
        # Positions 0..seq_len-1, repeated for every row of the batch
        positions = torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device)
        positions = positions[None, :].expand_as(input_ids)
        # Token and position vectors are added, then normalized
        summed = self.word_embedding(input_ids) + self.position_embedding(positions)
        return self.layer_norm(summed)

# Transformer Encoder Layer Block
class TransformerBlock(nn.Module):
    """Post-norm transformer encoder block: self-attention then feed-forward.

    Inputs and outputs use the sequence-first layout expected by
    ``nn.MultiheadAttention`` with its default ``batch_first=False``:
    ``[seq_len, batch, embed_dim]``.

    Args:
        embed_dim: feature width of the input and output.
        num_heads: number of attention heads (must divide ``embed_dim``).
        ff_dim: hidden width of the feed-forward sub-layer.
        dropout: dropout probability used inside attention and on both
            residual branches.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        # Multi-head self-attention
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        # Position-wise feed-forward network
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )
        # One LayerNorm per residual branch, shared dropout module
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.layer_norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, key_padding_mask=None):
        """Run the block.

        Args:
            x: tensor of shape ``[seq_len, batch, embed_dim]``.
            key_padding_mask: optional ``[batch, seq_len]`` boolean tensor in
                which ``True`` marks positions (e.g. padding) that attention
                must ignore. ``None`` (the default) attends to every
                position, which is the previous behaviour.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention with residual connection; masked keys get no weight
        attn_output, _ = self.attention(x, x, x, key_padding_mask=key_padding_mask)
        x = self.layer_norm1(x + self.dropout(attn_output))

        # Feed-forward with residual connection
        ff_output = self.feed_forward(x)
        x = self.layer_norm2(x + self.dropout(ff_output))

        return x

# Text encoder consisting of NC = NQ = 9 transformer layers
class TextEncoder(nn.Module):
    """A stack of ``num_layers`` TransformerBlock modules applied in sequence.

    The remaining constructor arguments are forwarded unchanged to every block.
    """

    def __init__(self, embed_dim, num_layers, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerBlock(embed_dim, num_heads, ff_dim, dropout))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        """Feed ``x`` through each block in order and return the last output."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return hidden

# Function to tokenize and preprocess a single sequence
def tokenize_and_prepare(sequence, tokenizer, max_len):
    """Convert one text into a ``[1, max_len]`` tensor of token ids.

    The tokenizer is called with special tokens enabled, padding to
    ``max_length`` and truncation, so the id list it returns is expected to
    have exactly ``max_len`` entries.

    Args:
        sequence: the text to tokenize.
        tokenizer: callable returning a mapping with an ``'input_ids'`` entry.
        max_len: target length passed to the tokenizer as ``max_length``.

    Returns:
        Tensor of token ids with a leading batch dimension of size 1.
    """
    encoding_options = {
        'add_special_tokens': True,
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_len,
    }
    encoded = tokenizer(sequence, **encoding_options)
    ids = torch.tensor(encoded['input_ids'])
    # Leading axis of size 1 so several sequences can be concatenated later
    return ids.unsqueeze(0)

# One caption and one question used to eyeball the encoder output
sample_caption = "A clean and well decorated bathroom."
sample_question = "What is the color of the bathroom?"

# Build the modules (caption table, question table, then the shared encoder
# with NC = NQ = 9 transformer layers)
vocab_size = tokenizer.vocab_size
caption_embedding_layer = EmbeddingLayer(vocab_size, embed_dim=dc, max_len=caption_max_len)
question_embedding_layer = EmbeddingLayer(vocab_size, embed_dim=dq, max_len=question_max_len)
text_encoder = TextEncoder(embed_dim=768, num_layers=num_transformer_layers, num_heads=8, ff_dim=2048)

# Token ids, each with a leading batch dimension of 1
caption_input_ids = tokenize_and_prepare(sample_caption, tokenizer, caption_max_len)
question_input_ids = tokenize_and_prepare(sample_question, tokenizer, question_max_len)

# Caption branch: embed, switch to [seq_len, batch, embed_dim], encode
caption_embeddings = caption_embedding_layer(caption_input_ids).transpose(0, 1)
caption_encoded = text_encoder(caption_embeddings)

# Question branch: same steps through the same encoder
question_embeddings = question_embedding_layer(question_input_ids).transpose(0, 1)
question_encoded = text_encoder(question_embeddings)

# Print results for inspection
print("Final encoded caption representation:", caption_encoded, sep="\n")
print("\nFinal encoded question representation:", question_encoded, sep="\n")


Final encoded caption representation:
tensor([[[-0.0282,  1.0450, -0.8794,  ...,  0.6521, -0.1339, -1.3370]],

        [[ 0.0528,  1.0046, -1.0900,  ...,  0.3792,  1.0604, -1.1710]],

        [[-0.7996,  0.3187, -1.3398,  ...,  0.6482,  0.6351,  0.6388]],

        ...,

        [[-0.6262,  2.4144,  1.2887,  ..., -0.6935, -0.6277,  0.2695]],

        [[-0.4056,  1.7878, -0.5779,  ...,  0.0551,  0.7041, -0.3234]],

        [[-0.8926,  1.4964, -0.2063,  ..., -0.0712, -0.1415, -0.2849]]],
       grad_fn=<NativeLayerNormBackward0>)

Final encoded question representation:
tensor([[[ 0.0421,  1.2622,  2.1795,  ..., -1.7125, -0.4545, -1.1356]],

        [[-0.8936, -0.7951,  2.3566,  ..., -1.7779, -0.8435,  0.1709]],

        [[-0.7397,  0.7212,  0.6223,  ..., -2.3446,  0.6982,  1.0346]],

        ...,

        [[ 0.4293, -0.3887,  1.3559,  ..., -1.5393, -1.0007, -0.2346]],

        [[ 0.8189, -0.0774,  0.8615,  ..., -0.9842, -0.6266, -0.2021]],

        [[-0.0132,  1.0287,  1.7057,  ..., -1.49

In [31]:
import torch
# NOTE: `batch_captions` / `batch_questions` are whatever the batch loop above
# left behind, i.e. only its final batch is encoded and saved here.
was_training = model.training
model.eval()  # dropout off: saved features must not contain random noise
try:
    with torch.no_grad():  # plain tensors, no autograd graph attached to the saved data
        C, Q = model(batch_captions, batch_questions)
finally:
    model.train(was_training)  # restore the mode the model was in

# save the caption and question features
torch.save(C, 'caption_features.pt')
torch.save(Q, 'question_features.pt')

print("Features saved successfully")

Features saved successfully
