In [None]:
import pandas as pd
import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader

class PixelRecDataset(Dataset):
    """Map-style dataset producing one image/text/engagement sample per record.

    Every record in ``data`` is indexed with the keys ``'image'`` (handed to
    ``Image.open``), ``'title'``, ``'tag'``, ``'description'`` and
    ``'engagement'`` (handed to ``normalizer.transform``).

    Args:
        data: Indexable collection of records (must support ``len`` and ``[idx]``).
        image_transform: Callable applied to the RGB PIL image.
        tokenizer: Callable invoked with the combined text plus
            ``padding='max_length'``, ``truncation=True``, ``max_length=128``
            and ``return_tensors='pt'``; its result is indexed with
            ``'input_ids'`` and ``'attention_mask'``.
        normalizer: Object whose ``transform(engagement)`` returns a sequence
            holding at least three values. ``EngagementNormalizer`` in this
            notebook returns them in the order ``[likes, comments, views]``.
    """

    def __init__(self, data, image_transform, tokenizer, normalizer):
        self.data = data
        self.image_transform = image_transform
        self.tokenizer = tokenizer
        self.normalizer = normalizer

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the sample for record ``idx``.

        Returns:
            dict with keys ``'image'`` (output of ``image_transform``),
            ``'input_ids'`` and ``'attention_mask'`` (tokenizer tensors with
            the leading dimension removed) and ``'engagement'`` (float32
            tensor holding three normalised target values).
        """
        record = self.data[idx]

        # Open the file inside a context manager so the underlying handle is
        # released immediately instead of lingering until garbage collection.
        with Image.open(record['image']) as raw_image:
            rgb_image = raw_image.convert('RGB')
        image = self.image_transform(rgb_image)

        # Combine text fields into a single prompt for the tokenizer
        text = f"Title: {record['title']} Tag: {record['tag']} Description: {record['description']}"
        text_encoding = self.tokenizer(text, padding='max_length',
                                       truncation=True, max_length=128,
                                       return_tensors='pt')

        # The normalizer returns the already-selected metrics, so positions
        # 0, 1, 2 are likes, comments, views (shares/favorites are not part of
        # the normalised vector produced by EngagementNormalizer).
        normalized_engagement = self.normalizer.transform(record['engagement'])
        engagement = torch.tensor([
            normalized_engagement[0],  # likes
            normalized_engagement[1],  # comments
            normalized_engagement[2],  # views
        ], dtype=torch.float32)

        return {
            'image': image,
            # return_tensors='pt' adds a leading dimension of size 1; drop only
            # that dimension so the DataLoader can stack samples into a batch.
            'input_ids': text_encoding['input_ids'].squeeze(0),
            'attention_mask': text_encoding['attention_mask'].squeeze(0),
            'engagement': engagement
        }

In [3]:
import torch
import torch.nn as nn
from transformers import CLIPModel, CLIPProcessor, BertModel, BertTokenizer
from torchvision import models, transforms
from alive_progress import alive_bar

# Number of regression targets per sample. Used as the default output width of
# both predictor classes below; it has to match the number of values the
# dataset/normalizer emit per sample (currently likes, comments, views).
NUM_ENGAGEMENT_METRICS = 3

class MultimodalEngagementPredictor(nn.Module):
    """
    Two-tower regressor that predicts engagement metrics from an image and text.

    The image tower is a torchvision ResNet-50 with its classification layer
    replaced by ``nn.Identity``; the text tower is ``bert-base-uncased``. Both
    feature vectors are projected to ``hidden_dim``, concatenated, passed
    through a small MLP, and fed to one linear head per engagement metric.

    Args:
        num_engagement_metrics: Number of single-output heads to create.
        hidden_dim: Width of each projection; the fusion MLP ends at
            ``hidden_dim // 2``.
        dropout: Dropout probability used in the projections and fusion MLP.
    """

    @staticmethod
    def _projection_block(in_features, out_features, dropout):
        """Return a Linear -> ReLU -> Dropout stack."""
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

    def __init__(self, num_engagement_metrics=NUM_ENGAGEMENT_METRICS, hidden_dim=512, dropout=0.3):
        super().__init__()

        resnet_feature_dim = 2048
        bert_feature_dim = 768
        half_hidden = hidden_dim // 2

        # Image tower: ResNet-50 backbone with the final fc layer removed.
        # NOTE(review): `pretrained=True` is flagged as deprecated by newer
        # torchvision releases in favour of `weights=` — confirm the installed
        # version before switching.
        self.vision_encoder = models.resnet50(pretrained=True)
        self.vision_encoder.fc = nn.Identity()

        # Text tower: BERT; the [CLS] position is used as the text embedding.
        self.text_encoder = BertModel.from_pretrained('bert-base-uncased')

        # Bring both modalities to the same width before fusing.
        self.vision_projection = self._projection_block(resnet_feature_dim, hidden_dim, dropout)
        self.text_projection = self._projection_block(bert_feature_dim, hidden_dim, dropout)

        # Fusion MLP over the concatenated (image, text) projection.
        fusion_layers = [
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, half_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
        ]
        self.fusion = nn.Sequential(*fusion_layers)

        # One independent linear head per engagement metric.
        self.engagement_heads = nn.ModuleList()
        for _ in range(num_engagement_metrics):
            self.engagement_heads.append(nn.Linear(half_hidden, 1))

    def forward(self, images, input_ids, attention_mask):
        """Return the head outputs concatenated along dim 1 (one column per metric)."""
        # Image branch
        image_vec = self.vision_projection(self.vision_encoder(images))

        # Text branch: take the hidden state at sequence position 0 ([CLS])
        bert_out = self.text_encoder(input_ids=input_ids,
                                     attention_mask=attention_mask)
        cls_vec = bert_out.last_hidden_state[:, 0, :]
        text_vec = self.text_projection(cls_vec)

        # Fuse and run every head on the shared representation
        joint = self.fusion(torch.cat([image_vec, text_vec], dim=1))
        per_metric = [head(joint) for head in self.engagement_heads]

        return torch.cat(per_metric, dim=1)


class CLIPBasedEngagementPredictor(nn.Module):
    """
    Engagement regressor built on a frozen CLIP backbone.

    The image and text embeddings returned by CLIP are averaged and passed to
    a three-layer MLP that outputs ``num_engagement_metrics`` values.

    NOTE(review): the training pipeline in this notebook tokenizes with
    ``BertTokenizer`` and normalises images with different statistics than
    CLIP's own processor would — confirm the inputs are prepared for CLIP
    before swapping this model in.

    Args:
        num_engagement_metrics: Size of the output vector.
        hidden_dim: Width of the first MLP layer (the second is half of it).
        dropout: Dropout probability inside the MLP.
    """
    def __init__(self, num_engagement_metrics=NUM_ENGAGEMENT_METRICS, hidden_dim=512, dropout=0.3):
        super().__init__()

        # Pre-trained CLIP backbone
        self.clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

        # Keep the backbone frozen; call `self.clip.requires_grad_(True)` later
        # to fine-tune it end to end.
        self.clip.requires_grad_(False)

        embed_dim = 512  # size of the CLIP embeddings consumed by the head
        half_hidden = hidden_dim // 2

        # Regression MLP on top of the fused embedding
        head_layers = [
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, half_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(half_hidden, num_engagement_metrics),
        ]
        self.predictor = nn.Sequential(*head_layers)

    def forward(self, images, input_ids, attention_mask):
        """Return the MLP output for the averaged CLIP image/text embeddings."""
        clip_out = self.clip(
            pixel_values=images,
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Simple late fusion: element-wise mean of the two modality embeddings
        joint = (clip_out.image_embeds + clip_out.text_embeds) / 2

        return self.predictor(joint)


# Training utilities
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(images, input_ids, attention_mask)``.
        dataloader: Sized iterable of batches; each batch is indexed with
            ``'image'``, ``'input_ids'``, ``'attention_mask'`` and ``'engagement'``.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss callable invoked as ``criterion(predictions, targets)``.
        device: Device every batch tensor is moved to.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``dataloader`` contains no batches (previously this
            surfaced as an unexplained ``ZeroDivisionError`` at the very end).
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("train_epoch received an empty dataloader; nothing to train on")

    model.train()
    total_loss = 0.0

    # The progress bar already reports the batch count, so no extra logging here.
    with alive_bar(num_batches, title="Training Progress") as bar:
        for batch in dataloader:
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['engagement'].to(device)

            optimizer.zero_grad()

            predictions = model(images, input_ids, attention_mask)
            loss = criterion(predictions, targets)

            loss.backward()
            optimizer.step()

            # .item() detaches the scalar so the graph is not kept alive
            total_loss += loss.item()
            bar()  # advance the progress bar by one batch

    return total_loss / num_batches


def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without updating any weights.

    The model is switched to eval mode and gradients are disabled for the
    whole pass.

    Args:
        model: Module called as ``model(images, input_ids, attention_mask)``.
        dataloader: Sized iterable of batches indexed with ``'image'``,
            ``'input_ids'``, ``'attention_mask'`` and ``'engagement'``.
        criterion: Loss callable invoked as ``criterion(predictions, targets)``.
        device: Device every batch tensor is moved to.

    Returns:
        tuple: ``(mean_batch_loss, predictions, targets)`` where the last two
        are the per-batch tensors moved to CPU and concatenated along dim 0.
    """
    model.eval()

    batch_losses = []
    prediction_chunks = []
    target_chunks = []
    input_keys = ('image', 'input_ids', 'attention_mask')

    with torch.no_grad():
        for batch in dataloader:
            model_inputs = [batch[key].to(device) for key in input_keys]
            expected = batch['engagement'].to(device)

            outputs = model(*model_inputs)
            batch_losses.append(criterion(outputs, expected).item())

            # Collect on CPU so accelerator memory is not held across batches
            prediction_chunks.append(outputs.cpu())
            target_chunks.append(expected.cpu())

    stacked_predictions = torch.cat(prediction_chunks)
    stacked_targets = torch.cat(target_chunks)
    mean_loss = sum(batch_losses) / len(dataloader)

    return mean_loss, stacked_predictions, stacked_targets

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from transformers import BertTokenizer, CLIPProcessor
from torchvision import transforms
import numpy as np
from sklearn.preprocessing import StandardScaler
import json
import pickle

# Data normalization
class EngagementNormalizer:
    """Standardise the engagement metrics used as regression targets.

    Wraps a ``StandardScaler`` and fixes which keys of an engagement dict are
    used, and in which column order, so ``fit`` and ``transform`` can never
    disagree about the layout.
    """

    # Keys read from every engagement dict, in scaler column order. This is the
    # single place to change when adding/removing a target metric.
    METRIC_NAMES = ('likes', 'comments', 'views')

    def __init__(self):
        self.scaler = StandardScaler()

    def _to_row(self, engagement):
        """Pick the configured metrics out of one engagement dict, in order."""
        return [engagement[name] for name in self.METRIC_NAMES]

    def fit(self, engagement_data):
        """Fit the scaler on an iterable of engagement dicts (training data)."""
        engagement_array = np.array([self._to_row(e) for e in engagement_data])
        self.scaler.fit(engagement_array)

    def transform(self, engagement):
        """Return the scaled metrics for one engagement dict.

        The dict is turned into a single-row 2-D array for the scaler and the
        first (only) row of the result is returned.
        """
        arr = np.array([self._to_row(engagement)])
        return self.scaler.transform(arr)[0]

    def inverse_transform(self, normalized_engagement):
        """Convert normalized predictions back to the original scale.

        ``normalized_engagement`` is forwarded unchanged to the scaler's
        ``inverse_transform``.
        """
        return self.scaler.inverse_transform(normalized_engagement)


# Complete training pipeline
def train_engagement_model(
    train_path,
    val_path,
    batch_size=32,
    num_epochs=50,
    learning_rate=1e-4,
    weight_decay=1e-5,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    num_workers=4,
    patience=10,
    checkpoint_path='best_engagement_model.pth',
    normalizer_path='engagement_normalizer.pkl',
):
    """
    Complete training pipeline for engagement prediction.

    Loads the train/validation records, fits an ``EngagementNormalizer`` on the
    training targets, trains a ``MultimodalEngagementPredictor`` with AdamW and
    MSE loss, halves the learning rate when validation loss plateaus, and stops
    early after ``patience`` epochs without improvement.

    Args:
        train_path: Path to the JSON file with the training records.
        val_path: Path to the JSON file with the validation records.
        batch_size: Batch size for both data loaders.
        num_epochs: Maximum number of training epochs.
        learning_rate: Initial learning rate for AdamW.
        weight_decay: Weight decay passed to AdamW.
        device: Device to train on.
        num_workers: Worker processes per DataLoader.
        patience: Epochs without validation improvement before early stopping.
        checkpoint_path: Where the best model checkpoint is written.
        normalizer_path: Where the fitted normalizer is pickled.

    Returns:
        tuple: ``(model, normalizer)``. When at least one checkpoint was
        written, ``model`` carries the weights of the best-validation epoch
        (not those of the last epoch run).
    """

    # Load data. JSON is UTF-8 by specification, so do not depend on the
    # platform's default text encoding.
    with open(train_path, 'r', encoding='utf-8') as f:
        train_data = json.load(f)

    with open(val_path, 'r', encoding='utf-8') as f:
        val_data = json.load(f)

    # Fit target scaling on the training split only
    normalizer = EngagementNormalizer()
    normalizer.fit([item['engagement'] for item in train_data])

    # Image preprocessing; predict_engagement applies the same steps
    image_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                           std=[0.229, 0.224, 0.225])
    ])

    # Tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Datasets
    train_dataset = PixelRecDataset(train_data, image_transform, tokenizer, normalizer)
    val_dataset = PixelRecDataset(val_data, image_transform, tokenizer, normalizer)

    # DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                            shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size,
                          shuffle=False, num_workers=num_workers)

    # Model, loss (MSE for regression), optimizer
    model = MultimodalEngagementPredictor().to(device)
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate,
                           weight_decay=weight_decay)

    # Halve the learning rate after 5 epochs without validation improvement
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=5
    )

    # Training loop state
    best_val_loss = float('inf')
    patience_counter = 0
    checkpoint_saved = False

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        # Train
        train_loss = train_epoch(model, train_loader, optimizer,
                                criterion, device)

        # Validate
        val_loss, predictions, targets = evaluate(model, val_loader,
                                                  criterion, device)

        # MAE is computed on the normalised targets, not the raw counts
        mae = torch.mean(torch.abs(predictions - targets)).item()

        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val Loss: {val_loss:.4f}")
        print(f"Val MAE: {mae:.4f}")

        # Learning rate scheduling
        scheduler.step(val_loss)

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,
            }, checkpoint_path)
            checkpoint_saved = True

            # Save normalizer next to the checkpoint it belongs to
            with open(normalizer_path, 'wb') as f:
                pickle.dump(normalizer, f)

            print(f"Model saved with val_loss: {val_loss:.4f}")
        else:
            patience_counter += 1

        # Early stopping
        if patience_counter >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs")
            break

    # The in-memory model holds the *last* epoch's weights, which may be up to
    # `patience` epochs past the best one. Restore the best checkpoint so the
    # returned model matches what was saved to disk.
    if checkpoint_saved:
        best_checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(best_checkpoint['model_state_dict'])

    return model, normalizer


# Inference function
def predict_engagement(model, normalizer, image_path, title, tag, description, device='cuda'):
    """
    Predict engagement metrics for a new post.

    The image and text are preprocessed the same way as in
    ``train_engagement_model``; the model output is mapped back to the
    original scale with ``normalizer.inverse_transform`` and clipped at zero.

    Args:
        model: Trained predictor called as ``model(image, input_ids, attention_mask)``.
        normalizer: Fitted normalizer providing ``inverse_transform``.
        image_path: Path of the image file to score.
        title: Post title.
        tag: Post tag.
        description: Post description.
        device: Device for the input tensors. A CUDA device is replaced by
            ``'cpu'`` when CUDA is not available.

    Returns:
        dict: Predicted metrics as non-negative ints. With the three-output
        model trained in this notebook the keys are ``likes``, ``comments``,
        ``views``; a five-output model yields ``likes``, ``comments``,
        ``shares``, ``views``, ``favorites``.

    Raises:
        ValueError: If the number of predicted values matches neither layout.
    """
    from PIL import Image
    from torchvision import transforms
    from transformers import BertTokenizer

    # Output layouts keyed by vector length. The three-metric order is the one
    # EngagementNormalizer fits on; the five-metric order is the earlier full
    # layout this function used to assume unconditionally.
    metric_layouts = {
        3: ('likes', 'comments', 'views'),
        5: ('likes', 'comments', 'shares', 'views', 'favorites'),
    }

    # train_engagement_model falls back to CPU without CUDA; do the same here
    # so a CPU-trained model can be used with the default argument.
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        device = 'cpu'

    model.eval()

    # Prepare image (context manager releases the file handle right away)
    image_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                           std=[0.229, 0.224, 0.225])
    ])
    with Image.open(image_path) as raw_image:
        rgb_image = raw_image.convert('RGB')
    image = image_transform(rgb_image).unsqueeze(0).to(device)

    # Prepare text
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    text = f"Title: {title} Tag: {tag} Description: {description}"
    encoding = tokenizer(text, padding='max_length', truncation=True,
                        max_length=128, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Predict
    with torch.no_grad():
        predictions = model(image, input_ids, attention_mask)

    # Denormalize; [0] selects the single sample in the batch
    predictions_np = predictions.cpu().numpy()
    denormalized = normalizer.inverse_transform(predictions_np)[0]

    metric_names = metric_layouts.get(len(denormalized))
    if metric_names is None:
        raise ValueError(
            f"Expected 3 or 5 predicted engagement values, got {len(denormalized)}"
        )

    # Counts cannot be negative, so clip at zero before truncating to int
    return {
        name: int(max(0, value))
        for name, value in zip(metric_names, denormalized)
    }


# Example usage
# if __name__ == "__main__":
#     # Train model (train_engagement_model takes separate train/validation files)
#     model, normalizer = train_engagement_model(
#         train_path='engagement_train.json',
#         val_path='engagement_val.json',
#         batch_size=32,
#         num_epochs=50,
#         learning_rate=1e-4
#     )
#
#     # Make prediction
#     prediction = predict_engagement(
#         model,
#         normalizer,
#         image_path='test_image.jpg',
#         title='King of Thieves Feature',
#         tag='Miscellaneous',
#         description='Luffy\'s relative Belo Betty'
#     )
#
#     print("Predicted Engagement:", prediction)

In [5]:
# Train the engagement model and keep the fitted normalizer for inference.
# NOTE(review): train_path points at 'data/test.json', i.e. the model is being
# fit on what looks like the test split. Confirm this is intentional (for
# example a small subset used for debugging) and not a mix-up with the real
# training file.
model, normalizer = train_engagement_model(
        train_path = 'data/test.json',
        val_path = 'data/validation.json',
        batch_size=32,
        num_epochs=50,
        learning_rate=1e-4
    )



Epoch 1/50
inside train epoch... dataloader length is 130
Training Progress |████████████████████████████████████████| 130/130 [100%] in 1:32:38.8 (0.02/s) 
Train Loss: 0.9948
Val Loss: 1.5471
Val MAE: 0.5844
Model saved with val_loss: 1.5471
Epoch 2/50
inside train epoch... dataloader length is 130
Training Progress |████████████████████████████████████████| 130/130 [100%] in 1:01:38.2 (0.04/s) 
Train Loss: 0.9470
Val Loss: 1.5461
Val MAE: 0.5756
Model saved with val_loss: 1.5461
Epoch 3/50
inside train epoch... dataloader length is 130
Training Progress |████████████████████████████████████████| 130/130 [100%] in 1:01:31.2 (0.04/s) 
Train Loss: 0.8051
Val Loss: 1.8591
Val MAE: 0.6730
Epoch 4/50
inside train epoch... dataloader length is 130
Training Progress |████████████████████████████████████████| 130/130 [100%] in 59:56.6 (0.04/s) 
Train Loss: 0.6463
Val Loss: 1.6333
Val MAE: 0.5349
Epoch 5/50
inside train epoch... dataloader length is 130
Training Progress |█████████████████████

KeyboardInterrupt: 