Multimodal Fusion Pipeline

✅ Multimodal fusion enhances AI models by combining vision + text embeddings.

✅ Self-attention refines features, and masking removes irrelevant data.

✅ Pseudo-Patch Encoding ensures structured feature representation before classification.

✅ EfficientNetV2 + LLM text encoder (BERT) + Self-Attention form a powerful model pipeline; CLIP is imported but not used in the code below.

Steps for Multimodal Fusion Model

    1	Load dependencies
    2	Image Feature Extractor (EfficientNetV2)- 1280-dimensional image embedding
    3	Text Feature Extractor (BERT)- 768-dimensional embedding
    4	Feature Projection Layers - 512D space
    5	Fusion and Attention - combine the projected image and text embeddings (2 × 512D → 1024D) and apply self-attention
    6	Encode the attention output with the Pseudo-Patch Encoder
    7	Regression Head
    8	Train and optimize the model

In [None]:
#Step 1: Load Dependencies
import torch
import torch.nn as nn
import torchvision.models as models
from transformers import AutoTokenizer, AutoModel, CLIPModel, CLIPProcessor
import albumentations as A
from albumentations.pytorch import ToTensorV2
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
from PIL import Image

In [None]:
#Step 2: Define Data Augmentation for Images
# Image augmentation pipeline, applied in order: resize to 224x224, random
# horizontal flip, random brightness/contrast jitter, per-channel
# normalisation, and finally conversion to a tensor via ToTensorV2.
transform = A.Compose(
    [
        A.Resize(height=224, width=224),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.2),
        # NOTE(review): these look like the usual ImageNet channel
        # statistics — confirm they match the backbone's pretraining.
        A.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
        ToTensorV2(),
    ]
)


In [None]:
# Best Simplified Multimodal Regression Architecture using EfficientNetV2 + BERT + Self-Attention

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import numpy as np

# ---------------------------
# EfficientNetV2 Feature Extractor (Image)
# ---------------------------
class EfficientNetFeatureExtractor(nn.Module):
    """Image feature extractor built from a pretrained EfficientNetV2-L.

    The torchvision model is loaded with the ``IMAGENET1K_V1`` weights and its
    last child module is removed, so the network emits pooled features instead
    of class logits (the last child is the classification head in
    torchvision's EfficientNet layout).
    """

    def __init__(self):
        super().__init__()
        efficientnet = models.efficientnet_v2_l(weights="IMAGENET1K_V1")
        # Keep every child except the final one.
        self.feature_extractor = nn.Sequential(*list(efficientnet.children())[:-1])

    def forward(self, x):
        """Return one flat feature vector per image.

        Args:
            x: Batch of images with the batch on dimension 0.

        Returns:
            Tensor of shape ``(batch, features)``.
        """
        features = self.feature_extractor(x)
        # Flatten everything after the batch axis. A bare ``squeeze()`` would
        # also drop the batch axis when the batch holds a single image and
        # hand a 1-D tensor to downstream layers.
        return torch.flatten(features, start_dim=1)

# ---------------------------
# Text Embedding using BERT
# ---------------------------
class TextEmbeddingLLM(nn.Module):
    """Sentence embedding from a pretrained transformer encoder.

    Args:
        model_name: Hugging Face model identifier used to load both the
            tokenizer and the encoder (default ``"bert-base-uncased"``).
    """

    def __init__(self, model_name="bert-base-uncased"):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average token vectors over dimension 1, ignoring masked positions.

        Args:
            hidden_states: Tensor of shape ``(batch, tokens, hidden)``.
            attention_mask: Tensor of shape ``(batch, tokens)``; non-zero
                entries mark real tokens, zeros mark padding.

        Returns:
            Tensor of shape ``(batch, hidden)``.
        """
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * mask).sum(dim=1)
        # Clamp so a fully masked row yields zeros instead of dividing by 0.
        counts = mask.sum(dim=1).clamp(min=1.0)
        return summed / counts

    def forward(self, text):
        """Embed a string or a list of strings.

        Args:
            text: Input accepted by the tokenizer (a string or list of strings).

        Returns:
            One pooled embedding per input text.
        """
        tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        outputs = self.model(**tokens)
        # ``padding=True`` pads shorter texts up to the longest one in the
        # batch. A plain mean over the token axis would mix those padding
        # vectors into the embedding, so pool only over real tokens.
        return self._masked_mean(outputs.last_hidden_state, tokens["attention_mask"])

# ---------------------------
# Self-Attention Layer
# ---------------------------
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Args:
        embed_dim: Size of the last input dimension. The query, key and value
            projections each map ``embed_dim -> embed_dim``.
    """

    def __init__(self, embed_dim):
        super().__init__()
        # Built in the order query, key, value so that seeded parameter
        # initialisation matches attribute-by-attribute construction.
        for name in ("query", "key", "value"):
            setattr(self, name, nn.Linear(embed_dim, embed_dim))
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Attend over the second-to-last dimension of ``x``.

        Args:
            x: Tensor whose last dimension has size ``embed_dim``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x)

        # Scores are scaled by the square root of the feature width before
        # being normalised over the last axis.
        scale = queries.size(-1) ** 0.5
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / scale
        weights = self.softmax(scores)
        return torch.matmul(weights, values)

# ---------------------------
# Pseudo-Patch Encoder
# ---------------------------
class PseudoPatchEncoder(nn.Module):
    """Linear projection that shrinks the feature width by ``patch_size``.

    Args:
        input_dim: Width of the incoming feature vectors.
        patch_size: Integer reduction factor; the output width is
            ``input_dim // patch_size``.
    """

    def __init__(self, input_dim, patch_size=16):
        super().__init__()
        reduced_dim = input_dim // patch_size
        self.projection = nn.Linear(in_features=input_dim, out_features=reduced_dim)

    def forward(self, x):
        """Project ``x`` from ``input_dim`` down to the reduced width."""
        encoded = self.projection(x)
        return encoded

# ---------------------------
# Multimodal Regressor with Attention and Patch Encoding
# ---------------------------
class MultimodalRegressor(nn.Module):
    """Regress a single scalar from an image feature and a text feature.

    Pipeline: project both modalities to ``embed_dim``, let the two projected
    vectors attend to each other, flatten the attended pair into one
    ``2 * embed_dim`` vector, compress it with the pseudo-patch encoder and
    map the result to one output value.

    Args:
        image_dim: Width of the incoming image features.
        text_dim: Width of the incoming text features.
        embed_dim: Shared width both modalities are projected to.
    """

    def __init__(self, image_dim=1280, text_dim=768, embed_dim=512):
        super().__init__()
        self.image_fc = nn.Linear(image_dim, embed_dim)
        self.text_fc = nn.Linear(text_dim, embed_dim)
        self.attention = SelfAttention(embed_dim)
        # The fused vector holds both attended modality tokens side by side,
        # so everything downstream of the attention is sized for 2 * embed_dim.
        fused_dim = 2 * embed_dim
        self.pseudo_patch = PseudoPatchEncoder(fused_dim)
        # Read the encoder's real output width rather than repeating its
        # default reduction factor here.
        self.regressor = nn.Linear(self.pseudo_patch.projection.out_features, 1)

    def forward(self, image_feat, text_feat):
        """Predict one value per sample.

        Args:
            image_feat: Tensor of shape ``(batch, image_dim)``; a single
                unbatched ``(image_dim,)`` vector is also accepted.
            text_feat: Tensor of shape ``(batch, text_dim)``; a single
                unbatched ``(text_dim,)`` vector is also accepted.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        # Treat a lone feature vector as a batch of one.
        if image_feat.dim() == 1:
            image_feat = image_feat.unsqueeze(0)
        if text_feat.dim() == 1:
            text_feat = text_feat.unsqueeze(0)

        img_embed = self.image_fc(image_feat)
        txt_embed = self.text_fc(text_feat)

        # Stack the modalities as a two-token sequence per sample:
        # (batch, 2, embed_dim). Attention then mixes image and text within
        # each sample; it never mixes different samples of the batch, so a
        # prediction does not depend on what else is in the batch.
        tokens = torch.stack([img_embed, txt_embed], dim=1)
        attended = self.attention(tokens)

        # (batch, 2, embed_dim) -> (batch, 2 * embed_dim)
        fused = attended.flatten(start_dim=1)
        encoded = self.pseudo_patch(fused)
        return self.regressor(encoded)

# ---------------------------
# Dummy Dataset (for Regression)
# ---------------------------
class DummyMultimodalDataset(Dataset):
    """Synthetic image/text/target triples for smoke-testing the pipeline.

    Args:
        num_samples: Number of samples to generate.
    """

    def __init__(self, num_samples=100):
        self.num_samples = num_samples
        # Images are drawn before targets; keep this order so a seeded RNG
        # yields the same data.
        self.images = torch.randn(num_samples, 3, 224, 224)
        self.texts = ["This is a sample text." for _ in range(num_samples)]
        self.kpi_targets = 100 * torch.randn(num_samples, 1)

    def __len__(self):
        """Return the number of generated samples."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict with image, text and kpi."""
        image = self.images[idx]
        text = self.texts[idx]
        kpi = self.kpi_targets[idx]
        return dict(image=image, text=text, kpi=kpi)

# ---------------------------
# Training Function
# ---------------------------
def train_model(model, dataloader, optimizer, criterion, image_model, text_model):
    """Train ``model`` for one pass over ``dataloader``.

    The two feature extractors are used as frozen encoders: they are put in
    eval mode and run without gradient tracking, so only ``model`` receives
    gradients.

    Args:
        model: Module called as ``model(image_feat, text_feat)``.
        dataloader: Iterable of batches; each batch is a mapping with the
            keys ``"image"``, ``"text"`` and ``"kpi"``.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(output, target)``.
        image_model: Module that maps ``batch["image"]`` to image features.
        text_model: Module that maps ``batch["text"]`` to text features.

    Returns:
        Mean of the per-batch losses as a float, or ``0.0`` when the
        dataloader yields no batches.
    """
    model.train()
    image_model.eval()
    text_model.eval()

    total_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        optimizer.zero_grad()

        # The extractors are not being optimised here, so skip building an
        # autograd graph through them; this avoids storing their activations
        # and back-propagating through them for nothing.
        with torch.no_grad():
            image_feat = image_model(batch["image"])
            text_feat = text_model(batch["text"])

        output = model(image_feat, text_feat)
        loss = criterion(output, batch["kpi"])
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Counting batches in the loop avoids dividing by zero on an empty
    # dataloader and also works for iterables that do not define ``len``.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

# ---------------------------
# Main Execution
# ---------------------------
if __name__ == "__main__":
    # Synthetic data, served in shuffled mini-batches of 8.
    dataset = DummyMultimodalDataset(num_samples=100)
    loader = DataLoader(dataset, batch_size=8, shuffle=True)

    # Feature extractors first, then the regressor that is trained on top.
    image_encoder = EfficientNetFeatureExtractor()
    text_encoder = TextEmbeddingLLM()
    regressor = MultimodalRegressor()

    # Only the regressor's parameters are handed to the optimizer.
    adam = torch.optim.Adam(regressor.parameters(), lr=1e-4)
    mae_loss = nn.L1Loss()  # mean absolute error

    for epoch in range(5):
        train_loss = train_model(
            model=regressor,
            dataloader=loader,
            optimizer=adam,
            criterion=mae_loss,
            image_model=image_encoder,
            text_model=text_encoder,
        )
        print(f"Epoch {epoch+1} - Train MAE: {train_loss:.4f}")

    # The weights written here are those after the final epoch.
    torch.save(regressor.state_dict(), "best_multimodal_regressor.pt")
    print("Model saved as 'best_multimodal_regressor.pt'")
