In [31]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
import torch
import pandas as pd
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from tqdm.auto import tqdm
import numpy as np
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import torchvision.transforms.functional as TF

In [32]:
df = pd.read_csv('data/paintings_with_food_nlp.csv')
df['has_food'] = df.iloc[:, 2:].sum(axis=1) > 0
df = df[['item', 'image_path', 'has_food']]
df

Unnamed: 0,item,image_path,has_food
0,http://www.wikidata.org/entity/Q27064304,img/img_512/Intérieur de cuisine - Joachim Beu...,True
1,http://www.wikidata.org/entity/Q12900365,img/img_512/The Luncheon (SM sg170).png,True
2,http://www.wikidata.org/entity/Q776175,img/img_512/Pieter Bruegel the Elder- The Harv...,True
3,http://www.wikidata.org/entity/Q72701665,img/img_512/Lille PdBA quellin fyt jesus marth...,True
4,http://www.wikidata.org/entity/Q20532659,"img/img_512/OA Hermansen, Et frokostbord, 1884...",True
...,...,...,...
71165,http://www.wikidata.org/entity/Q51247485,"img/img_512/Alex Colville - Infantry, near Nij...",False
71166,http://www.wikidata.org/entity/Q51244389,img/img_512/Ivan Žabota - dekliški portret.jpg,False
71167,http://www.wikidata.org/entity/Q51235353,img/img_512/Ivan Žabota - Marta Krásovej.jpg,False
71168,http://www.wikidata.org/entity/Q51265369,img/img_512/Ivan Žabota - ženski portret.jpg,False


In [33]:

# Assuming your dataframe is called df
print("Total samples:", len(df))
print("With food:", sum(df['has_food']))
print("Without food:", sum(~df['has_food']))
print("Ratio food/no-food:", sum(df['has_food'])/len(df))
  


Total samples: 71170
With food: 1407
Without food: 69763
Ratio food/no-food: 0.01976956582829844


In [34]:


def prepare_input(image_path, has_food, processor):
    """Process a single image and create corresponding text"""
    try:
        # Open image and ensure it's RGB
        image = Image.open(image_path).convert('RGB')
        
        # Resize image to CLIP's expected size (224x224)
        image = TF.resize(image, (224, 224), interpolation=TF.InterpolationMode.BICUBIC)
        image = TF.center_crop(image, (224, 224))
        
        text = 'a painting containing food' if has_food else 'a painting not containing food'
        
        # Process using CLIP processor
        inputs = processor(
            images=image,
            text=[text],
            return_tensors="pt",
            padding="max_length",
            max_length=77,
            truncation=True
        )
        
        # Add label (squeeze to make it a scalar)
        inputs['labels'] = torch.tensor([float(has_food)])
        return inputs
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return None

def create_batch(samples):
    """Collate function to create batches, handling None values"""
    # Filter out None values
    samples = [s for s in samples if s is not None]
    
    if not samples:
        return None
        
    batch = {
        'pixel_values': torch.stack([x['pixel_values'][0] for x in samples]),
        'input_ids': torch.stack([x['input_ids'][0] for x in samples]),
        'attention_mask': torch.stack([x['attention_mask'][0] for x in samples]),
        'labels': torch.stack([x['labels'] for x in samples])
    }
    return batch

def train_epoch(model, train_loader, optimizer, device):
    """Train for one epoch"""
    model.train()
    total_loss = 0
    valid_batches = 0
    progress_bar = tqdm(train_loader, desc="Training")
    
    for batch in progress_bar:
        if batch is None:
            continue
            
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Forward pass
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            pixel_values=batch['pixel_values']
        )
        
        # Get image and text features
        image_features = outputs.image_embeds
        text_features = outputs.text_embeds
        
        # Compute similarity scores (ensure it's the right shape)
        similarity = torch.sum(image_features * text_features, dim=-1)  # [batch_size]
        
        # Compute loss (make sure both tensors are the same shape)
        loss = F.binary_cross_entropy_with_logits(
            similarity,  # [batch_size]
            batch['labels'].float().squeeze(),  # [batch_size]
        )
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        valid_batches += 1
        progress_bar.set_postfix({'loss': loss.item()})
    
    return total_loss / valid_batches if valid_batches > 0 else float('inf')

def evaluate(model, eval_loader, device):
    """Evaluate the model"""
    model.eval()
    correct = 0
    total = 0
    total_loss = 0
    
    with torch.no_grad():
        for batch in eval_loader:
            if batch is None:
                continue
                
            batch = {k: v.to(device) for k, v in batch.items()}
            
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                pixel_values=batch['pixel_values']
            )
            
            image_features = outputs.image_embeds
            text_features = outputs.text_embeds
            similarity = torch.sum(image_features * text_features, dim=-1)
            
            loss = F.binary_cross_entropy_with_logits(
                similarity,
                batch['labels'].float().squeeze()
            )
            
            predictions = (torch.sigmoid(similarity) > 0.5).float()
            correct += (predictions == batch['labels'].squeeze()).sum().item()
            total += batch['labels'].size(0)
            total_loss += loss.item()
    
    accuracy = correct / total if total > 0 else 0
    avg_loss = total_loss / len(eval_loader) if len(eval_loader) > 0 else float('inf')
    
    return avg_loss, accuracy

def train_model(df, num_epochs=3, batch_size=16, learning_rate=5e-5):
    """Main training function"""
    # Set device
    device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Load model and processor
    model_name = "openai/clip-vit-base-patch32"
    processor = CLIPProcessor.from_pretrained(model_name)
    model = CLIPModel.from_pretrained(model_name)
    model = model.to(device)
    
    # Balance dataset
    food_samples = df[df['has_food']]
    no_food_samples = df[~df['has_food']]
    min_samples = min(len(food_samples), len(no_food_samples))
    
    # Undersample majority class
    if len(food_samples) > len(no_food_samples):
        food_samples = food_samples.sample(n=min_samples, random_state=42)
    else:
        no_food_samples = no_food_samples.sample(n=min_samples, random_state=42)
    
    balanced_df = pd.concat([food_samples, no_food_samples])
    print(f"Balanced dataset size: {len(balanced_df)}")
    
    # Split data
    train_df, val_df = train_test_split(balanced_df, test_size=0.2, random_state=42)
    
    # Process datasets with error handling
    train_samples = []
    for _, row in tqdm(train_df.iterrows(), desc="Processing train data"):
        sample = prepare_input(row['image_path'], row['has_food'], processor)
        if sample is not None:
            train_samples.append(sample)
    
    val_samples = []
    for _, row in tqdm(val_df.iterrows(), desc="Processing val data"):
        sample = prepare_input(row['image_path'], row['has_food'], processor)
        if sample is not None:
            val_samples.append(sample)
    
    print(f"Processed {len(train_samples)} training samples")
    print(f"Processed {len(val_samples)} validation samples")
    
    # Create data loaders
    train_loader = DataLoader(
        train_samples,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=create_batch
    )
    
    val_loader = DataLoader(
        val_samples,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=create_batch
    )
    
    # Initialize optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    
    # Training loop
    best_accuracy = 0
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        
        # Train
        train_loss = train_epoch(model, train_loader, optimizer, device)
        
        # Evaluate
        val_loss, accuracy = evaluate(model, val_loader, device)
        
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val Loss: {val_loss:.4f}")
        print(f"Accuracy: {accuracy:.4f}")
        
        # Save if best model
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'accuracy': accuracy,
            }, 'best_food_detector.pth')
    
    return model, processor

def predict(image_path, model, processor, device):
    """Make prediction for a single image"""
    inputs = prepare_input(image_path, False, processor)  # label doesn't matter here
    if inputs is None:
        return None, None
        
    # Move to device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            pixel_values=inputs['pixel_values']
        )
        
        image_features = outputs.image_embeds
        text_features = outputs.text_embeds
        similarity = torch.sum(image_features * text_features, dim=-1)
        
    probability = torch.sigmoid(similarity).cpu().numpy()[0]
    return probability > 0.5, probability


    


In [35]:


def prepare_balanced_data(df, balance_strategy='undersample'):
    """
    Balance the dataset using different strategies
    
    Parameters:
    - df: DataFrame with 'has_food' column
    - balance_strategy: 'undersample', 'oversample', or 'weighted'
    
    Returns:
    - Balanced DataFrame or (DataFrame, sample_weights)
    """
    food_samples = df[df['has_food']]
    no_food_samples = df[~df['has_food']]
    
    print(f"Original distribution:")
    print(f"Food samples: {len(food_samples)}")
    print(f"No food samples: {len(no_food_samples)}")
    
    if balance_strategy == 'undersample':
        # Undersample majority class
        no_food_balanced = resample(
            no_food_samples,
            replace=False,
            n_samples=len(food_samples),
            random_state=42
        )
        balanced_df = pd.concat([food_samples, no_food_balanced])
        print(f"\nAfter undersampling:")
        print(f"Total samples: {len(balanced_df)}")
        return balanced_df
        
    elif balance_strategy == 'oversample':
        # Oversample minority class
        food_balanced = resample(
            food_samples,
            replace=True,
            n_samples=len(no_food_samples),
            random_state=42
        )
        balanced_df = pd.concat([food_balanced, no_food_samples])
        print(f"\nAfter oversampling:")
        print(f"Total samples: {len(balanced_df)}")
        return balanced_df
        
    elif balance_strategy == 'weighted':
        # Calculate class weights
        total_samples = len(df)
        weight_for_0 = (1 / len(no_food_samples)) * (total_samples / 2)
        weight_for_1 = (1 / len(food_samples)) * (total_samples / 2)
        
        sample_weights = np.where(df['has_food'], weight_for_1, weight_for_0)
        print("\nUsing weighted sampling")
        print(f"Weight for no food: {weight_for_0:.3f}")
        print(f"Weight for food: {weight_for_1:.3f}")
        return df, sample_weights

def modify_train_function_for_weights(train_model_fn):
    """
    Modify the training function to use sample weights
    """
    def weighted_loss(predictions, targets, weights):
        return F.binary_cross_entropy_with_logits(
            predictions, 
            targets,
            weight=weights,
            reduction='mean'
        )
    
    # Modify the training loop to include weights
    def train_epoch_weighted(model, train_loader, optimizer, device):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_loader, desc="Training")
        
        for batch in progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                pixel_values=batch['pixel_values']
            )
            
            image_features = outputs.image_embeds
            text_features = outputs.text_embeds
            similarity = torch.sum(image_features * text_features, dim=-1)
            
            loss = weighted_loss(
                similarity, 
                batch['labels'],
                batch['weights'].to(device)
            )
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})
        
        return total_loss / len(train_loader)
    
    return train_epoch_weighted


    

  

In [36]:



# Choose one of these approaches:

# 1. Undersampling approach
balanced_df = prepare_balanced_data(df, 'undersample')

"""  # 2. Oversampling approach
balanced_df = prepare_balanced_data(df, 'oversample')
model, processor = train_model(balanced_df)

# 3. Weighted approach
df, sample_weights = prepare_balanced_data(df, 'weighted')
# You'll need to modify the train_model function to use weights """

# Train model
# Verify image paths exist and are accessible
#balanced_df['exists'] = balanced_df['image_path'].apply(lambda x: Path(x).exists())
#print(f"Found {sum(df['exists'])} valid images out of {len(df)} total")

# Filter to only existing images
#balanced_df = balanced_df[balanced_df['exists']]

# Train model
model, processor = train_model(balanced_df)

# Example prediction
    # Test prediction
test_image = df['image_path'].iloc[0]
contains_food, confidence = predict(test_image, model, processor, 
                                    torch.device('mps' if torch.backends.mps.is_available() else 'cpu'))
print(f"Contains food: {contains_food} (confidence: {confidence:.2f})")




Original distribution:
Food samples: 1407
No food samples: 69763

After undersampling:
Total samples: 2814
Using device: mps
Balanced dataset size: 2814


Processing train data: 0it [00:00, ?it/s]

Processing val data: 0it [00:00, ?it/s]

Processed 2251 training samples
Processed 563 validation samples

Epoch 1/3


Training:   0%|          | 0/141 [00:00<?, ?it/s]

Train Loss: 0.3272
Val Loss: 0.3133
Accuracy: 1.0000

Epoch 2/3


Training:   0%|          | 0/141 [00:00<?, ?it/s]

Train Loss: 0.3133
Val Loss: 0.3133
Accuracy: 1.0000

Epoch 3/3


Training:   0%|          | 0/141 [00:00<?, ?it/s]

Train Loss: 0.3133
Val Loss: 0.3133
Accuracy: 1.0000
Contains food: False (confidence: 0.27)


In [37]:
test_image

'img/img_512/Intérieur de cuisine - Joachim Beuckelaer - Musée du louvre Peintures RF 2659.jpg'