In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/train.csv
/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/test.csv
/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/data/31973.jpg
/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/data/30778.jpg
/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/data/19812.jpg
/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/data/22735.jpg
/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/data/38246.jpg
/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/data/16916.jpg
/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/data/39500.jpg
/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/data/7981.jpg
/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/data

In [2]:
# ======================================================================================
#                              SETUP AND IMPORTS
# ======================================================================================
import os
import sys
import pandas as pd
import numpy as np
from PIL import Image
import cv2
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm if 'ipykernel' in sys.modules else tqdm # Progress bar for notebooks or scripts
import timm
from sentence_transformers import SentenceTransformer
import gc
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

# ======================================================================================
#                          CONFIGURATION & HYPERPARAMETERS
# ======================================================================================
class CFG:
    """
    Configuration class for all hyperparameters and settings.
    """
    # General
    SEED = 42
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Data paths
    DATA_DIR = "/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/data/"
    TRAIN_CSV = "/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/train.csv"
    TEST_CSV = "/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/test.csv"
    SUBMISSION_FILE = "/kaggle/input/multi-label-classification-competition-2025/COMP5329S1A2Dataset/submission.csv"

    # Model parameters
    IMAGE_MODEL_NAME = 'vit_base_patch16_224' # A powerful Vision Transformer
    TEXT_MODEL_NAME = 'all-MiniLM-L6-v2'      # A fast and effective sentence transformer
    IMAGE_SIZE = 224
    TEXT_EMBEDDING_DIM = 384 # Dimension of 'all-MiniLM-L6-v2' embeddings
    
    # Training parameters
    EPOCHS = 8
    BATCH_SIZE = 32
    LEARNING_RATE = 1e-4
    WEIGHT_DECAY = 1e-6
    N_FOLDS = 1 # For simplicity, we'll run a single train/validation split. Change to 5 for 5-fold CV.
    
    # Prediction
    PREDICTION_THRESHOLD = 0.4 # Threshold to convert probabilities to binary labels

# Helper function to seed everything for reproducibility
def set_seed(seed=CFG.SEED):
    """Sets the seed for reproducibility."""
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

# ======================================================================================
#                          GENERATE DUMMY DATA (for testing)
# ======================================================================================
def create_dummy_data():
    """
    Creates dummy data files if they don't exist. This allows the script
    to be run end-to-end without the actual dataset.
    """
    # Create data directory
    os.makedirs(CFG.DATA_DIR, exist_ok=True)
    
    # Create dummy train.csv
    if not os.path.exists(CFG.TRAIN_CSV):
        print("Creating dummy train.csv...")
        train_data = []
        all_labels = [i for i in range(1, 20) if i != 12]
        for i in range(100): # 100 dummy samples
            img_id = f"train_{i:04d}.jpg"
            # Generate 1 to 3 random labels
            num_labels = np.random.randint(1, 4)
            labels = np.random.choice(all_labels, num_labels, replace=False)
            labels_str = " ".join(map(str, labels))
            caption = f"A sample caption for image {i}"
            train_data.append([img_id, labels_str, caption])
            # Create a dummy image file
            dummy_image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
            cv2.imwrite(os.path.join(CFG.DATA_DIR, img_id), dummy_image)
        pd.DataFrame(train_data, columns=['ImageID', 'Labels', 'Caption']).to_csv(CFG.TRAIN_CSV, index=False)
        
    # Create dummy test.csv
    if not os.path.exists(CFG.TEST_CSV):
        print("Creating dummy test.csv...")
        test_data = []
        for i in range(50): # 50 dummy test samples
            img_id = f"test_{i:04d}.jpg"
            caption = f"A test caption for image {i}"
            test_data.append([img_id, caption])
            # Create a dummy image file
            dummy_image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
            cv2.imwrite(os.path.join(CFG.DATA_DIR, img_id), dummy_image)
        pd.DataFrame(test_data, columns=['ImageID', 'Caption']).to_csv(CFG.TEST_CSV, index=False)

# ======================================================================================
#                        DATA PREPARATION AND PREPROCESSING
# ======================================================================================
def prepare_data():
    """
    Loads and preprocesses the training and testing data.
    - Encodes labels into a multi-hot format.
    - Generates text embeddings using a sentence transformer.
    """
    print("Preparing data...")
    train_df = pd.read_csv(CFG.TRAIN_CSV)
    test_df = pd.read_csv(CFG.TEST_CSV)
    
    # --- Process file paths ---
    train_df['image_path'] = train_df['ImageID'].apply(lambda x: os.path.join(CFG.DATA_DIR, x))
    test_df['image_path'] = test_df['ImageID'].apply(lambda x: os.path.join(CFG.DATA_DIR, x))
    
    # --- Handle optional captions ---
    # Fill missing captions with an empty string
    train_df['Caption'] = train_df['Caption'].fillna('')
    test_df['Caption'] = test_df['Caption'].fillna('')
    
    # --- Process Labels ---
    # Convert space-separated string of labels to a list of integers
    train_df['Labels'] = train_df['Labels'].apply(lambda x: [int(i) for i in x.split()])
    
    # Use MultiLabelBinarizer to one-hot encode the labels
    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform(train_df['Labels'])
    
    # Store encoded labels back into the DataFrame
    train_df['encoded_labels'] = list(train_labels)
    
    print(f"Label classes found: {mlb.classes_}")
    
    # --- Generate Text Embeddings ---
    print("Generating text embeddings... (This might take a while)")
    text_model = SentenceTransformer(CFG.TEXT_MODEL_NAME, device=CFG.DEVICE)
    
    # It's more efficient to encode all captions at once
    train_captions = train_df['Caption'].tolist()
    test_captions = test_df['Caption'].tolist()
    
    # Generate embeddings
    train_text_embeddings = text_model.encode(train_captions, show_progress_bar=True, batch_size=CFG.BATCH_SIZE*2)
    test_text_embeddings = text_model.encode(test_captions, show_progress_bar=True, batch_size=CFG.BATCH_SIZE*2)
    
    train_df['text_embeddings'] = list(train_text_embeddings)
    test_df['text_embeddings'] = list(test_text_embeddings)
    
    # Clean up GPU memory
    del text_model
    gc.collect()
    torch.cuda.empty_cache()
    
    return train_df, test_df, mlb

# ======================================================================================
#                              PYTORCH DATASET
# ======================================================================================
class MultimodalDataset(Dataset):
    """
    Custom PyTorch Dataset for multimodal data (image + text).
    """
    def __init__(self, df, transforms=None, is_test=False):
        self.df = df
        self.transforms = transforms
        self.is_test = is_test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        # --- Image ---
        image = Image.open(row['image_path']).convert('RGB')
        if self.transforms:
            image = np.array(image) # Albumentations requires numpy array
            image = self.transforms(image=image)['image']
        
        # --- Text ---
        text_embedding = torch.tensor(row['text_embeddings'], dtype=torch.float)
        
        if self.is_test:
            return image, text_embedding
        else:
            # --- Labels ---
            labels = torch.tensor(row['encoded_labels'], dtype=torch.float)
            return image, text_embedding, labels

# Image transformations using Albumentations
def get_transforms():
    """Returns a dictionary of train and validation transforms."""
    # Mean and std for ImageNet pre-trained models
    mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
    
    train_transforms = transforms.Compose([
        transforms.Resize((CFG.IMAGE_SIZE, CFG.IMAGE_SIZE)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(degrees=15),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])
    
    # For validation and test, we only resize, convert to tensor and normalize
    valid_transforms = transforms.Compose([
        transforms.Resize((CFG.IMAGE_SIZE, CFG.IMAGE_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])

    # Let's switch to torchvision transforms for simplicity in this script
    # To use Albumentations, you would keep the numpy conversion in __getitem__
    # For now, we will use PIL-based transforms from torchvision
    return {'train': train_transforms, 'valid': valid_transforms}

# ======================================================================================
#                                MODEL ARCHITECTURE
# ======================================================================================
class MultimodalModel(nn.Module):
    """
    A multimodal model that fuses features from an image model and a text model.
    """
    def __init__(self, num_classes, image_model_name=CFG.IMAGE_MODEL_NAME, text_embedding_dim=CFG.TEXT_EMBEDDING_DIM):
        super().__init__()
        
        # --- Image Branch ---
        self.image_model = timm.create_model(image_model_name, pretrained=True, num_classes=0) # num_classes=0 returns features
        num_image_features = self.image_model.num_features
        
        # --- Text Branch (a simple MLP) ---
        self.text_model = nn.Sequential(
            nn.Linear(text_embedding_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        # --- Classifier Head ---
        # Concatenated features from both branches
        fused_features_dim = num_image_features + 512
        self.classifier = nn.Sequential(
            nn.Linear(fused_features_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, image, text_embedding):
        image_features = self.image_model(image)
        text_features = self.text_model(text_embedding)
        
        # Concatenate features
        fused_features = torch.cat([image_features, text_features], dim=1)
        
        # Get logits
        output = self.classifier(fused_features)
        return output

# ======================================================================================
#                             TRAINING & VALIDATION LOOPS
# ======================================================================================

def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """Trains the model for one epoch."""
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    
    for images, text_embeds, labels in progress_bar:
        images, text_embeds, labels = images.to(device), text_embeds.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(images, text_embeds)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
        
    return total_loss / len(dataloader)

def validate_one_epoch(model, dataloader, criterion, device, threshold):
    """Validates the model for one epoch."""
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc="Validation", leave=False)
        for images, text_embeds, labels in progress_bar:
            images, text_embeds, labels = images.to(device), text_embeds.to(device), labels.to(device)
            
            outputs = model(images, text_embeds)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            # Get predictions
            probs = torch.sigmoid(outputs)
            preds = (probs > threshold).cpu().numpy()
            
            all_preds.append(preds)
            all_labels.append(labels.cpu().numpy())
            
    # Concatenate all batches
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    
    # Calculate Mean F1-Score (macro average)
    f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)
    
    return total_loss / len(dataloader), f1

# ======================================================================================
#                                    MAIN EXECUTION
# ======================================================================================
def main():
    """Main function to run the entire pipeline."""
    set_seed(CFG.SEED)
    create_dummy_data() # Create dummy data if real data is not available
    
    train_df, test_df, mlb = prepare_data()
    num_classes = len(mlb.classes_)
    
    # --- Data Splitting ---
    # For a robust solution, consider StratifiedKFold from skmultilearn
    # Here, we use a simple train_test_split for demonstration
    train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=CFG.SEED)
    
    transforms_dict = get_transforms()
    
    # --- Create Datasets and Dataloaders ---
    train_dataset = MultimodalDataset(train_data, transforms=transforms_dict['train'])
    val_dataset = MultimodalDataset(val_data, transforms=transforms_dict['valid'])
    
    train_loader = DataLoader(train_dataset, batch_size=CFG.BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=CFG.BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
    
    # --- Model, Loss, Optimizer ---
    print(f"Initializing model on device: {CFG.DEVICE}")
    model = MultimodalModel(num_classes=num_classes).to(CFG.DEVICE)
    
    # BCEWithLogitsLoss is ideal for multi-label classification
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.LEARNING_RATE, weight_decay=CFG.WEIGHT_DECAY)
    
    # --- Training Loop ---
    best_f1 = -1
    best_model_path = "best_model.pth"
    
    for epoch in range(CFG.EPOCHS):
        print(f"\n===== Epoch {epoch+1}/{CFG.EPOCHS} =====")
        
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, CFG.DEVICE)
        val_loss, val_f1 = validate_one_epoch(model, val_loader, criterion, CFG.DEVICE, CFG.PREDICTION_THRESHOLD)
        
        print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1 (macro): {val_f1:.4f}")
        
        # Save the best model based on validation F1 score
        if val_f1 > best_f1:
            best_f1 = val_f1
            print(f"🎉 New best F1 score: {best_f1:.4f}. Saving model to {best_model_path}")
            torch.save(model.state_dict(), best_model_path)
    
    print(f"\nTraining complete. Best validation F1 score: {best_f1:.4f}")
    
    # --- Inference on Test Set ---
    print("\nStarting inference on the test set...")
    
    # Load the best model for inference
    model.load_state_dict(torch.load(best_model_path))
    model.eval()
    
    test_dataset = MultimodalDataset(test_df, transforms=transforms_dict['valid'], is_test=True)
    test_loader = DataLoader(test_dataset, batch_size=CFG.BATCH_SIZE, shuffle=False, num_workers=2)
    
    all_test_preds = []
    with torch.no_grad():
        for images, text_embeds in tqdm(test_loader, desc="Predicting"):
            images, text_embeds = images.to(CFG.DEVICE), text_embeds.to(CFG.DEVICE)
            
            outputs = model(images, text_embeds)
            probs = torch.sigmoid(outputs)
            preds = (probs > CFG.PREDICTION_THRESHOLD).cpu().numpy()
            all_test_preds.append(preds)
            
    all_test_preds = np.concatenate(all_test_preds, axis=0)
    
    # --- Create Submission File ---
    # Inverse transform the one-hot predictions back to original label numbers
    predicted_labels = mlb.inverse_transform(all_test_preds)
    
    # Format for submission: space-separated string
    predicted_labels_str = [" ".join(map(str, labels)) if labels else "" for labels in predicted_labels]
    
    submission_df = pd.DataFrame({
        'ImageID': test_df['ImageID'],
        'Labels': predicted_labels_str
    })
    
    submission_df.to_csv(CFG.SUBMISSION_FILE, index=False)
    print(f"\nSubmission file created at '{CFG.SUBMISSION_FILE}'")
    print("Sample of submission file:")
    print(submission_df.head())

if __name__ == '__main__':
    main()

SyntaxError: invalid syntax (1780463329.py, line 17)