Multimodal Fusion Pipeline

✅ Multimodal fusion enhances AI models by combining vision + text embeddings.

✅ Self-attention refines features, and masking removes irrelevant data.

✅ Pseudo-Patch Encoding ensures structured feature representation before classification.

✅ EfficientNetV2 + LLM + CLIP + Self-Attention together form a powerful model pipeline.

Steps for Multimodal Fusion Model


    1	Load dependencies
    2	Apply data augmentation (images)
    3	Extract CNN image features (EfficientNetV2)
    4	Extract LLM text embeddings (BERT)
    5	Compute CLIP embeddings (align image + text)
    6	Apply self-attention mechanism
    7	Apply masking to filter irrelevant data
    8	Convert attention vectors into pseudo-patches
    9	Pass features through a multimodal classifier
    10	Train and optimize the model

In [None]:
#Step 1: Load Dependencies
import torch
import torch.nn as nn
import torchvision.models as models
from transformers import AutoTokenizer, AutoModel, CLIPModel, CLIPProcessor
import albumentations as A
from albumentations.pytorch import ToTensorV2
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
from PIL import Image

In [None]:
#Step 2: Define Data Augmentation for Images
# Per-channel statistics used to normalize the RGB input.
_NORMALIZE_MEAN = (0.485, 0.456, 0.406)
_NORMALIZE_STD = (0.229, 0.224, 0.225)

# Augmentation steps, applied in order: fixed-size resize, random flip,
# random brightness/contrast jitter, normalization, then tensor conversion.
_augmentation_steps = [
    A.Resize(224, 224),
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.2),
    A.Normalize(mean=_NORMALIZE_MEAN, std=_NORMALIZE_STD),
    ToTensorV2(),
]
transform = A.Compose(_augmentation_steps)


In [None]:
#Step 3: Define Feature Extractors 
#EfficientNetV2 for Image Feature Extraction
class EfficientNetFeatureExtractor(nn.Module):
    """Image feature extractor built from a pretrained EfficientNetV2-L.

    The backbone is loaded with the ``IMAGENET1K_V1`` weights and its final
    child module is dropped, so the network emits pooled features rather
    than class scores.
    """

    def __init__(self):
        super().__init__()
        efficientnet = models.efficientnet_v2_l(weights="IMAGENET1K_V1")
        # Keep every child except the last one.
        self.feature_extractor = nn.Sequential(*list(efficientnet.children())[:-1])

    def forward(self, x):
        """Return one flat feature vector per image.

        Args:
            x: Batched image tensor accepted by the backbone.

        Returns:
            Tensor of shape ``(batch, features)``.
        """
        # flatten(1) collapses only the trailing axes. A bare squeeze() would
        # also remove the batch axis whenever the batch holds a single image,
        # breaking downstream code that expects a 2-D tensor.
        return self.feature_extractor(x).flatten(1)

#LLM for Text Feature Extraction (BERT)
class TextEmbeddingLLM(nn.Module):
    """Sentence embedder that mean-pools a pretrained transformer's hidden states.

    Args:
        model_name: Hugging Face model identifier used for both the tokenizer
            and the encoder.
    """

    def __init__(self, model_name="bert-base-uncased"):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def forward(self, text):
        """Embed a string or a batch of strings.

        Args:
            text: A single string or a sequence of strings.

        Returns:
            Tensor of shape ``(batch, hidden_size)`` holding the mean of the
            final hidden states over the real (non-padding) tokens.
        """
        tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # The tokenizer always produces CPU tensors; follow the encoder so the
        # module keeps working after ``.to(device)``.
        tokens = tokens.to(next(self.model.parameters()).device)
        hidden = self.model(**tokens).last_hidden_state

        # Weight each position by the attention mask so padding tokens do not
        # dilute the embedding of shorter texts in a padded batch.
        mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        embeddings = (hidden * mask).sum(dim=1) / token_counts
        return embeddings
##CLIP Model for Image-Text Alignment
class CLIPFeatureExtractor(nn.Module):
    """Wrapper around the ``openai/clip-vit-base-patch32`` model and processor.

    Produces CLIP's projected text and image embeddings for paired inputs.
    """

    def __init__(self):
        super().__init__()
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def forward(self, image, text):
        """Encode images and texts with CLIP.

        Args:
            image: Image or batch of images accepted by ``CLIPProcessor``.
            text: String or sequence of strings.

        Returns:
            Tuple ``(text_embeds, image_embeds)`` -- note the text embeddings
            come first.
        """
        # truncation=True keeps over-long captions within the text encoder's
        # maximum sequence length instead of failing inside the model.
        inputs = self.processor(
            text=text,
            images=image,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        # Processor output lives on the CPU; move it next to the model weights.
        inputs = inputs.to(next(self.model.parameters()).device)
        outputs = self.model(**inputs)
        return outputs.text_embeds, outputs.image_embeds


In [None]:
# Step 4: Define Self-Attention & Masking Mechanism
#Self-Attention Layer
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Args:
        embed_dim: Size of the last dimension of the input; also the size of
            the query, key and value projections.
    """

    def __init__(self, embed_dim):
        super().__init__()
        # Stored so forward() can compute the scaling factor; the original
        # code referenced a bare ``embed_dim`` there and raised NameError.
        self.embed_dim = embed_dim
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Attend over the second-to-last dimension of ``x``.

        Args:
            x: Tensor of shape ``(..., seq_len, embed_dim)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        Q, K, V = self.query(x), self.key(x), self.value(x)
        # Divide by sqrt(embed_dim) to keep the logits in a range where the
        # softmax is not saturated.
        scale = self.embed_dim ** 0.5
        attention_weights = self.softmax(Q @ K.transpose(-2, -1) / scale)
        return attention_weights @ V
#Masking Mechanism
def apply_masking(attention_scores, mask):
    """Turn raw attention scores into weights, ignoring masked positions.

    Args:
        attention_scores: Tensor of unnormalized scores; the softmax is taken
            over its last dimension.
        mask: Tensor broadcastable to ``attention_scores`` in which ``0``
            marks positions to exclude, or ``None`` to keep every position.

    Returns:
        Tensor shaped like ``attention_scores``. Each row sums to 1 over its
        unmasked positions; a row whose positions are all masked is returned
        as all zeros.
    """
    if mask is None:
        return F.softmax(attention_scores, dim=-1)

    blocked = torch.broadcast_to(mask == 0, attention_scores.shape)
    weights = F.softmax(attention_scores.masked_fill(blocked, float("-inf")), dim=-1)
    # softmax over a row of only -inf evaluates to NaN, which would poison
    # everything downstream; emit zero weights for such rows instead.
    fully_blocked = blocked.all(dim=-1, keepdim=True)
    return weights.masked_fill(fully_blocked, 0.0)


In [None]:
#Step 5: Define Pseudo-Patch Encoding
class PseudoPatchEncoder(nn.Module):
    """Linear projection that shrinks features by a factor of ``patch_size``.

    Args:
        input_dim: Size of the last dimension of the incoming features.
        patch_size: Integer divisor applied to ``input_dim`` to obtain the
            output width (``input_dim // patch_size``).
    """

    def __init__(self, input_dim, patch_size=16):
        super().__init__()
        reduced_dim = input_dim // patch_size
        self.patch_size = patch_size
        self.projection = nn.Linear(in_features=input_dim, out_features=reduced_dim)

    def forward(self, x):
        """Project ``x`` from ``input_dim`` down to ``input_dim // patch_size``."""
        projected = self.projection(x)
        return projected


In [None]:
#Step 6: Define Multimodal Classifier
class MultimodalClassifier(nn.Module):
    """Fuse image and text features with self-attention and classify them.

    Args:
        image_dim: Width of the incoming image feature vectors.
        text_dim: Width of the incoming text feature vectors.
        embed_dim: Shared width both modalities are projected to.
        num_classes: Number of output logits.
    """

    def __init__(self, image_dim=1280, text_dim=768, embed_dim=512, num_classes=3):
        super().__init__()
        self.image_fc = nn.Linear(image_dim, embed_dim)
        self.text_fc = nn.Linear(text_dim, embed_dim)
        self.self_attention = SelfAttention(embed_dim)
        self.pseudo_patch_encoder = PseudoPatchEncoder(embed_dim)
        # Read the width from the encoder rather than repeating its default
        # patch size, so the two layers cannot drift apart.
        self.classifier = nn.Linear(
            self.pseudo_patch_encoder.projection.out_features, num_classes
        )

    def forward(self, image_feats, text_feats):
        """Compute class logits for paired image/text features.

        Args:
            image_feats: Tensor of shape ``(..., image_dim)``.
            text_feats: Tensor of shape ``(..., text_dim)``.

        Returns:
            Logits of shape ``(..., num_classes)``.
        """
        img_embed = self.image_fc(image_feats)
        txt_embed = self.text_fc(text_feats)
        # Treat the two modalities as a two-token sequence of width embed_dim.
        # Concatenating along the feature axis would produce 2 * embed_dim
        # features, which the embed_dim-wide attention layer cannot accept.
        tokens = torch.stack([img_embed, txt_embed], dim=-2)
        attn_output = self.self_attention(tokens)
        # Average the attended image and text tokens into one fused vector.
        fused = attn_output.mean(dim=-2)
        patches = self.pseudo_patch_encoder(fused)
        return self.classifier(patches)


In [None]:
# Step 7: Define Dataset Loader
class MultimodalDataset(Dataset):
    """Dataset of (image, text, label) triples loaded from image files.

    Args:
        image_paths: Sequence of paths to image files.
        text_data: Sequence of texts, aligned with ``image_paths``.
        labels: Sequence of labels, aligned with ``image_paths``.
        transform: Optional callable invoked as ``transform(image=array)``
            that returns a mapping with an ``"image"`` entry.

    Raises:
        ValueError: If the three sequences differ in length.
    """

    def __init__(self, image_paths, text_data, labels, transform=None):
        # Fail fast on misaligned inputs instead of raising IndexError (or
        # silently dropping samples) partway through an epoch.
        if not (len(image_paths) == len(text_data) == len(labels)):
            raise ValueError(
                "image_paths, text_data and labels must have the same length, "
                f"got {len(image_paths)}, {len(text_data)} and {len(labels)}"
            )
        self.image_paths = image_paths
        self.text_data = text_data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, text, label)``.

        The image is decoded as RGB. When a transform is set, it receives the
        image as a NumPy array and its ``"image"`` output is returned.
        """
        # The context manager closes the file handle as soon as the pixels
        # have been decoded by convert().
        with Image.open(self.image_paths[idx]) as handle:
            image = handle.convert("RGB")
        text = self.text_data[idx]
        label = self.labels[idx]

        if self.transform:
            image = self.transform(image=np.array(image))["image"]

        return image, text, label


In [None]:
# Step 8: Training the Model
# Initialize Models
image_model = EfficientNetFeatureExtractor()
text_model = TextEmbeddingLLM()
clip_model = CLIPFeatureExtractor()

# The extractors are used as fixed feature providers (only the classifier is
# handed to the optimizer below), so switch off dropout / batch-norm updates.
for _extractor in (image_model, text_model, clip_model):
    _extractor.eval()

# Backbone and CLIP features have different widths, so they are concatenated
# rather than added; size the classifier inputs to match.
IMAGE_FEAT_DIM = 1280  # matches MultimodalClassifier's default image_dim
TEXT_FEAT_DIM = text_model.model.config.hidden_size
CLIP_FEAT_DIM = clip_model.model.config.projection_dim
multimodal_model = MultimodalClassifier(
    image_dim=IMAGE_FEAT_DIM + CLIP_FEAT_DIM,
    text_dim=TEXT_FEAT_DIM + CLIP_FEAT_DIM,
)

# Optimizer and Loss
optimizer = torch.optim.Adam(multimodal_model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Create DataLoader
train_loader = DataLoader(MultimodalDataset(["image1.jpg", "image2.jpg"], 
                                            ["This is a product", "Another product"], 
                                            [0, 1], transform=transform), batch_size=2, shuffle=True)

# Training Loop
for epoch in range(10):
    multimodal_model.train()
    epoch_loss = 0.0
    num_batches = 0

    for images, texts, labels in train_loader:
        optimizer.zero_grad()
        texts = list(texts)  # the default collate may hand strings back as a tuple

        # Extract features without building autograd graphs through the
        # frozen extractors.
        with torch.no_grad():
            image_feats = image_model(images)
            text_feats = text_model(texts)

            # Apply CLIP
            # NOTE(review): `images` has already been normalized by `transform`,
            # while CLIPProcessor applies its own preprocessing -- confirm the
            # processor should not receive the raw images instead.
            text_clip_feats, image_clip_feats = clip_model(images, texts)

        # Combine Features
        combined_image_feats = torch.cat([image_feats, image_clip_feats], dim=-1)
        combined_text_feats = torch.cat([text_feats, text_clip_feats], dim=-1)

        # Forward Pass
        outputs = multimodal_model(combined_image_feats, combined_text_feats)

        # Compute Loss & Optimize
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean loss over the epoch; guards against an empty loader,
    # where no per-batch loss would exist.
    mean_loss = epoch_loss / num_batches if num_batches else float("nan")
    print(f"Epoch {epoch + 1}, Loss: {mean_loss}")
