In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import os

# Project root: two levels above this file when run as a script; the current
# working directory when __file__ is unavailable (Kaggle/Jupyter).
try:
    ROOT_DIR = Path(__file__).resolve().parent.parent
except NameError:
    ROOT_DIR = Path.cwd()

# Dataset directory: prefer the copy under ROOT_DIR, otherwise fall back to the
# same relative path resolved against the current working directory.
BASE_PATH_CANDIDATE_1 = (ROOT_DIR / 'input/Datasets/datasets/multi_label_classification').resolve()
BASE_PATH_CANDIDATE_2 = Path('input/Datasets/datasets/multi_label_classification').resolve()
BASE_PATH = BASE_PATH_CANDIDATE_1 if BASE_PATH_CANDIDATE_1.exists() else BASE_PATH_CANDIDATE_2

print(f"Resolved BASE_PATH: {BASE_PATH}")

# Locations of the individual dataset artefacts.
TRAIN_CSV_PATH = BASE_PATH / 'train.csv'
TEST_CSV_PATH = BASE_PATH / 'test.csv'
IMAGE_FOLDER_PATH = BASE_PATH / 'data'
# The metadata JSON sits two directories above BASE_PATH, i.e. in the
# 'input/Datasets' directory next to the 'datasets' folder.
METADATA_JSON_PATH = BASE_PATH.parent.parent / 'multi_label_classification.json'


# Image loading settings: target (height, width) passed to Resize and the
# DataLoader batch size.
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32

class MultiLabelImageDataset(Dataset):
    """Image-only dataset yielding ``(image, image_id)`` pairs.

    Labels are intentionally not returned here: they are binarized outside the
    dataset and joined with image features later via the image id.

    Args:
        dataframe (pd.DataFrame): Frame with an 'ImageID' column holding image
            file names relative to ``image_folder_path``.
        image_folder_path (str or Path): Directory containing the image files.
        transform (callable, optional): Applied to the RGB PIL image when truthy.
    """

    def __init__(self, dataframe, image_folder_path, transform=None):
        self.dataframe = dataframe
        self.image_folder_path = image_folder_path
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the backing dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the image at positional index ``idx``.

        Returns:
            tuple: ``(image, img_name)`` where ``image`` is the (optionally
            transformed) RGB image and ``img_name`` is the 'ImageID' value.
        """
        # Select the column first so pandas does not materialise the whole row
        # (the processed frames carry thousands of TF-IDF columns).
        img_name = self.dataframe['ImageID'].iloc[idx]
        img_path = os.path.join(self.image_folder_path, img_name)

        # convert() returns an independent image, so the file handle can be
        # released as soon as the pixel data has been decoded.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, img_name

def load_metadata(metadata_path):
    """Load the dataset metadata JSON file and return the parsed object.

    Args:
        metadata_path (str or Path): Path to the metadata JSON file.

    Returns:
        The deserialized JSON content (a dict for the metadata used here).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(metadata_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def preprocess_data(df, metadata, mlb=None, tfidf_vectorizer=None, scaler=None, is_train=True):
    """
    Binarize the label column and TF-IDF-encode the caption column of ``df``.

    The caller's dataframe is never modified; a new dataframe is returned.
    Missing labels are treated as "no labels" and missing captions as empty text.

    Args:
        df (pd.DataFrame): The input dataframe (train or test).
        metadata (dict): The dataset metadata. Only the presence of the
                         'Labels' and 'Caption' keys in
                         ``metadata['profiling_summary']['variables']`` is used.
        mlb (MultiLabelBinarizer, optional): Fitted MultiLabelBinarizer for labels.
                                             Required when ``is_train`` is False and
                                             ``df`` has a label column.
        tfidf_vectorizer (TfidfVectorizer, optional): Fitted TfidfVectorizer for captions.
                                                      Required when ``is_train`` is False
                                                      and ``df`` has a caption column.
        scaler (StandardScaler, optional): Passed through unchanged; this function
                                           performs no numerical scaling.
        is_train (bool): True to fit new transformers, False to reuse the given ones.
    Returns:
        tuple: Processed dataframe, mlb, tfidf_vectorizer, scaler.
    Raises:
        ValueError: If ``is_train`` is False and the transformer needed for a
                    column present in ``df`` was not provided.
    """
    variables_info = metadata['profiling_summary']['variables']
    label_col = 'Labels' if 'Labels' in variables_info else None
    caption_col = 'Caption' if 'Caption' in variables_info else None

    def _to_label_list(value):
        # Labels are space-separated strings, e.g. "1 3 5".
        if isinstance(value, str):
            return value.split()
        if pd.isna(value):
            return []
        # e.g. a column pandas parsed as integers because every row has one label
        return str(value).split()

    # --- Handle Labels (Multi-label Binarization) ---
    if label_col and label_col in df.columns:
        if not is_train and mlb is None:
            raise ValueError("MultiLabelBinarizer must be fitted on training data and provided for test data.")

        # Work on a derived Series so the caller's column is left untouched.
        label_lists = df[label_col].apply(_to_label_list)
        if is_train:
            mlb = MultiLabelBinarizer()
            labels_encoded = mlb.fit_transform(label_lists)
        else:
            labels_encoded = mlb.transform(label_lists)

        # One indicator column per class, named after the class value.
        labels_df = pd.DataFrame(labels_encoded, columns=mlb.classes_, index=df.index)
        df = pd.concat([df.drop(columns=[label_col]), labels_df], axis=1)
    else:
        print(f"Warning: Label column '{label_col}' not found or not specified in metadata.")

    # --- Handle Caption (TF-IDF) ---
    if caption_col and caption_col in df.columns:
        if not is_train and tfidf_vectorizer is None:
            raise ValueError("TfidfVectorizer must be fitted on training data and provided for test data.")

        # TfidfVectorizer rejects NaN documents, so normalise to plain strings.
        captions = df[caption_col].fillna('').astype(str)
        if is_train:
            tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit features to manage dimensionality
            caption_features = tfidf_vectorizer.fit_transform(captions)
        else:
            caption_features = tfidf_vectorizer.transform(captions)

        # Convert TF-IDF sparse matrix to DataFrame
        caption_df = pd.DataFrame(caption_features.toarray(),
                                  columns=[f'caption_tfidf_{i}' for i in range(caption_features.shape[1])],
                                  index=df.index)
        df = pd.concat([df.drop(columns=[caption_col]), caption_df], axis=1)
    else:
        print(f"Warning: Caption column '{caption_col}' not found or not specified in metadata.")

    # 'ImageID' is left as-is: it is the key used to join image features later.
    # No numerical columns are scaled here, so ``scaler`` is returned as given.
    return df, mlb, tfidf_vectorizer, scaler


def main():
    """Run Stage 1: load the CSVs, preprocess them and build image DataLoaders.

    Returns:
        dict: The processed train/test dataframes, the train/test image
        DataLoaders, and the fitted ``mlb``, ``tfidf_vectorizer`` and ``scaler``
        objects returned by ``preprocess_data``.
    """
    metadata = load_metadata(METADATA_JSON_PATH)

    # Raw tabular inputs.
    train_df = pd.read_csv(TRAIN_CSV_PATH)
    test_df = pd.read_csv(TEST_CSV_PATH)

    for heading, frame in (
        ("Original Train DataFrame head:", train_df),
        ("\nOriginal Test DataFrame head:", test_df),
    ):
        print(heading)
        print(frame.head())

    # Fit the transformers on the training frame, then reuse them on the test frame.
    processed_train_df, mlb, tfidf_vectorizer, scaler = preprocess_data(
        train_df.copy(), metadata, is_train=True
    )
    processed_test_df, _, _, _ = preprocess_data(
        test_df.copy(),
        metadata,
        mlb=mlb,
        tfidf_vectorizer=tfidf_vectorizer,
        scaler=scaler,
        is_train=False,
    )

    for heading, frame in (
        ("\nProcessed Train DataFrame head (tabular features):", processed_train_df),
        ("\nProcessed Test DataFrame head (tabular features):", processed_test_df),
    ):
        print(heading)
        print(frame.head())

    # Resize, convert to tensor, then normalise with the ImageNet statistics.
    image_transform = transforms.Compose([
        transforms.Resize(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Use half of the available CPUs as loader workers (0 when the count is unknown).
    cpu_total = os.cpu_count()
    worker_count = cpu_total // 2 if cpu_total else 0

    train_image_dataset = MultiLabelImageDataset(processed_train_df, IMAGE_FOLDER_PATH, transform=image_transform)
    test_image_dataset = MultiLabelImageDataset(processed_test_df, IMAGE_FOLDER_PATH, transform=image_transform)

    train_image_dataloader = DataLoader(
        train_image_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=worker_count
    )
    test_image_dataloader = DataLoader(
        test_image_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=worker_count
    )

    print(f"\nTrain image dataset size: {len(train_image_dataset)}")
    print(f"Test image dataset size: {len(test_image_dataset)}")
    print("Example image batch from train_image_dataloader (shape):")
    # Peek at the first batch only, as a smoke test of the loading path.
    for batch_no, (images, img_ids) in enumerate(train_image_dataloader, start=1):
        print(f"Batch {batch_no}: Images shape: {images.shape}, Image IDs: {img_ids[:5]}")
        break

    # The dataframes hold the tabular features (TF-IDF columns, binarized
    # labels); the loaders provide the images, keyed by 'ImageID'.
    results = {
        "processed_train_df": processed_train_df,
        "processed_test_df": processed_test_df,
        "train_image_dataloader": train_image_dataloader,
        "test_image_dataloader": test_image_dataloader,
        "mlb": mlb,
        "tfidf_vectorizer": tfidf_vectorizer,
        "scaler": scaler,  # whatever preprocess_data returned for the scaler
    }
    return results

if __name__ == "__main__":
    # Run the Stage 1 pipeline only when this code is executed directly
    # (script or notebook cell), not when it is imported.
    preprocessed_data = main()
    # The returned dict exposes:
    # preprocessed_data["processed_train_df"]
    # preprocessed_data["processed_test_df"]
    # preprocessed_data["train_image_dataloader"]
    # preprocessed_data["test_image_dataloader"]
    # preprocessed_data["mlb"]
    # preprocessed_data["tfidf_vectorizer"]

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score, log_loss, roc_auc_score
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from tqdm import tqdm
import warnings

# Silence UserWarning-category warnings raised from scikit-learn modules.
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

# Project root: two levels above this file when run as a script; the current
# working directory when __file__ is unavailable (Kaggle/Jupyter).
try:
    ROOT_DIR = Path(__file__).resolve().parent.parent
except NameError:
    ROOT_DIR = Path.cwd()

# Dataset directory: prefer the copy under ROOT_DIR, otherwise fall back to the
# same relative path resolved against the current working directory.
BASE_PATH_CANDIDATE_1 = (ROOT_DIR / 'input/Datasets/datasets/multi_label_classification').resolve()
BASE_PATH_CANDIDATE_2 = Path('input/Datasets/datasets/multi_label_classification').resolve()
BASE_PATH = BASE_PATH_CANDIDATE_1 if BASE_PATH_CANDIDATE_1.exists() else BASE_PATH_CANDIDATE_2

print(f"Resolved BASE_PATH: {BASE_PATH}")

# Locations of the individual dataset artefacts.
TRAIN_CSV_PATH = BASE_PATH / 'train.csv'
TEST_CSV_PATH = BASE_PATH / 'test.csv'
IMAGE_FOLDER_PATH = BASE_PATH / 'data'
# The metadata JSON sits two directories above BASE_PATH, i.e. in the
# 'input/Datasets' directory next to the 'datasets' folder.
METADATA_JSON_PATH = BASE_PATH.parent.parent / 'multi_label_classification.json'


# Output artefacts: metrics go to ./outputs, the model weights and the fitted
# preprocessors go to ./models. Both directories are created up front.
OUTPUT_DIR = Path("./outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
METRICS_PATH = OUTPUT_DIR / "metrics.json"
MODEL_PATH = Path("./models/multi_label_classification_model.pkl")
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
MLB_PATH = Path("./models/mlb.pkl")
TFIDF_PATH = Path("./models/tfidf_vectorizer.pkl")


# Image loading and training hyper-parameters.
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32
NUM_EPOCHS = 20  # upper bound on training epochs
EARLY_STOPPING_ROUNDS = 5  # epochs without validation-F1 improvement before stopping
LEARNING_RATE = 1e-4

# Train on the GPU when one is available, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

class MultiLabelImageDataset(Dataset):
    """Dataset pairing each image with its TF-IDF caption features and labels.

    Args:
        dataframe (pd.DataFrame): Processed frame with an 'ImageID' column,
            TF-IDF columns prefixed 'caption_tfidf_' and (for labelled data)
            label indicator columns whose names consist only of digits.
        image_folder_path (str or Path): Directory containing the image files.
        transform (callable, optional): Applied to the RGB PIL image when truthy.
        is_test (bool): When True, ``__getitem__`` omits the label tensor.
    """

    def __init__(self, dataframe, image_folder_path, transform=None, is_test=False):
        self.dataframe = dataframe
        self.image_folder_path = image_folder_path
        self.transform = transform
        self.is_test = is_test
        # Label indicator columns are recognised by their all-digit names.
        self.label_columns = [col for col in dataframe.columns if col.isdigit()]
        # TF-IDF feature columns are recognised by their common prefix.
        self.tfidf_columns = [col for col in dataframe.columns if col.startswith('caption_tfidf_')]

    def __len__(self):
        """Return the number of rows in the backing dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx``.

        Returns:
            tuple: ``(image, tfidf_features, img_name)`` when ``is_test`` is
            True, otherwise ``(image, labels, tfidf_features, img_name)``.
            ``labels`` and ``tfidf_features`` are float32 tensors.
        """
        # Materialise the (very wide) row once and reuse it for every field.
        row = self.dataframe.iloc[idx]
        img_name = row['ImageID']
        img_path = os.path.join(self.image_folder_path, img_name)

        # convert() returns an independent image, so the file handle can be
        # released as soon as the pixel data has been decoded.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        tfidf_features = torch.tensor(row[self.tfidf_columns].values.astype(np.float32))

        if self.is_test:
            return image, tfidf_features, img_name

        labels = torch.tensor(row[self.label_columns].values.astype(np.float32))
        return image, labels, tfidf_features, img_name

def load_metadata(metadata_path):
    """Load the dataset metadata JSON file and return the parsed object.

    Args:
        metadata_path (str or Path): Path to the metadata JSON file.

    Returns:
        The deserialized JSON content (a dict for the metadata used here).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(metadata_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def preprocess_data(df, metadata, mlb=None, tfidf_vectorizer=None, scaler=None, is_train=True):
    """
    Binarize the label column and TF-IDF-encode the caption column of ``df``.

    The caller's dataframe is never modified; a new dataframe is returned.
    Missing labels are treated as "no labels" and missing captions as empty text.

    Args:
        df (pd.DataFrame): The input dataframe (train or test).
        metadata (dict): The dataset metadata. Only the presence of the
                         'Labels' and 'Caption' keys in
                         ``metadata['profiling_summary']['variables']`` is used.
        mlb (MultiLabelBinarizer, optional): Fitted MultiLabelBinarizer for labels.
                                             Required when ``is_train`` is False and
                                             ``df`` has a label column.
        tfidf_vectorizer (TfidfVectorizer, optional): Fitted TfidfVectorizer for captions.
                                                      Required when ``is_train`` is False
                                                      and ``df`` has a caption column.
        scaler (StandardScaler, optional): Passed through unchanged; this function
                                           performs no numerical scaling.
        is_train (bool): True to fit new transformers, False to reuse the given ones.
    Returns:
        tuple: Processed dataframe, mlb, tfidf_vectorizer, scaler.
    Raises:
        ValueError: If ``is_train`` is False and the transformer needed for a
                    column present in ``df`` was not provided.
    """
    variables_info = metadata['profiling_summary']['variables']
    label_col = 'Labels' if 'Labels' in variables_info else None
    caption_col = 'Caption' if 'Caption' in variables_info else None

    def _to_label_list(value):
        # Labels are space-separated strings, e.g. "1 3 5".
        if isinstance(value, str):
            return value.split()
        if pd.isna(value):
            return []
        # e.g. a column pandas parsed as integers because every row has one label
        return str(value).split()

    # --- Handle Labels (Multi-label Binarization) ---
    if label_col and label_col in df.columns:
        if not is_train and mlb is None:
            raise ValueError("MultiLabelBinarizer must be fitted on training data and provided for test data.")

        # Work on a derived Series so the caller's column is left untouched.
        label_lists = df[label_col].apply(_to_label_list)
        if is_train:
            mlb = MultiLabelBinarizer()
            labels_encoded = mlb.fit_transform(label_lists)
        else:
            labels_encoded = mlb.transform(label_lists)

        # One indicator column per class, named after the class value.
        labels_df = pd.DataFrame(labels_encoded, columns=mlb.classes_, index=df.index)
        df = pd.concat([df.drop(columns=[label_col]), labels_df], axis=1)
    else:
        print(f"Warning: Label column '{label_col}' not found or not specified in metadata.")

    # --- Handle Caption (TF-IDF) ---
    if caption_col and caption_col in df.columns:
        if not is_train and tfidf_vectorizer is None:
            raise ValueError("TfidfVectorizer must be fitted on training data and provided for test data.")

        # TfidfVectorizer rejects NaN documents, so normalise to plain strings.
        captions = df[caption_col].fillna('').astype(str)
        if is_train:
            tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit features to manage dimensionality
            caption_features = tfidf_vectorizer.fit_transform(captions)
        else:
            caption_features = tfidf_vectorizer.transform(captions)

        # Convert TF-IDF sparse matrix to DataFrame
        caption_df = pd.DataFrame(caption_features.toarray(),
                                  columns=[f'caption_tfidf_{i}' for i in range(caption_features.shape[1])],
                                  index=df.index)
        df = pd.concat([df.drop(columns=[caption_col]), caption_df], axis=1)
    else:
        print(f"Warning: Caption column '{caption_col}' not found or not specified in metadata.")

    return df, mlb, tfidf_vectorizer, scaler

class MultiModalModel(nn.Module):
    """Two-branch classifier combining ResNet-50 image features with TF-IDF text features.

    The ResNet-50 backbone is loaded with ImageNet weights, frozen, and has its
    final fully connected layer replaced by an identity so it emits feature
    vectors. TF-IDF vectors are projected to 512 dimensions, concatenated with
    the image features and passed through a two-layer head that outputs one
    logit per class (no sigmoid is applied here).

    Args:
        num_classes (int): Number of output logits.
        tfidf_features_dim (int): Length of each TF-IDF input vector.
    """

    def __init__(self, num_classes, tfidf_features_dim):
        super().__init__()
        # Image branch: pre-trained ResNet. ``pretrained=True`` is deprecated in
        # torchvision >= 0.13 in favour of ``weights=``; IMAGENET1K_V1 is the
        # weight set the legacy flag maps to. Fall back for older torchvision.
        resnet_weights = getattr(models, 'ResNet50_Weights', None)
        if resnet_weights is not None:
            self.resnet = models.resnet50(weights=resnet_weights.IMAGENET1K_V1)
        else:
            self.resnet = models.resnet50(pretrained=True)

        # Freeze all backbone parameters so only the new layers are trained.
        # NOTE(review): BatchNorm running statistics still update while the
        # module is in train() mode -- confirm whether that is intended.
        for param in self.resnet.parameters():
            param.requires_grad = False

        # Drop the ImageNet classifier so the backbone returns pooled features.
        num_ftrs = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()

        # TF-IDF branch: project the sparse-derived vector to a compact size.
        self.tfidf_fc = nn.Linear(tfidf_features_dim, 512)

        # Combined head over the concatenated image + text features.
        self.combined_fc1 = nn.Linear(num_ftrs + 512, 1024)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.combined_fc2 = nn.Linear(1024, num_classes)

    def forward(self, image_input, tfidf_input):
        """Return per-class logits for a batch of images and TF-IDF vectors.

        Args:
            image_input (torch.Tensor): Image batch accepted by ResNet-50.
            tfidf_input (torch.Tensor): Batch of TF-IDF vectors whose last
                dimension equals ``tfidf_features_dim``.

        Returns:
            torch.Tensor: Logits with ``num_classes`` entries per sample.
        """
        image_features = self.resnet(image_input)
        tfidf_features = self.relu(self.tfidf_fc(tfidf_input))

        # Fuse both modalities along the feature dimension.
        combined_features = torch.cat((image_features, tfidf_features), dim=1)

        output = self.combined_fc1(combined_features)
        output = self.relu(output)
        output = self.dropout(output)
        output = self.combined_fc2(output)
        return output

def train_model(model, train_dataloader, val_dataloader, optimizer, criterion, num_epochs, early_stopping_rounds):
    """Train ``model`` with per-epoch validation, checkpointing and early stopping.

    After every epoch the sample-averaged F1 on the validation loader is
    computed from sigmoid outputs thresholded at 0.5. Whenever it improves, the
    model's state dict is written to ``MODEL_PATH``. Training stops once
    ``early_stopping_rounds`` consecutive epochs bring no improvement.

    Args:
        model: Module called as ``model(images, tfidf_features)`` returning logits.
        train_dataloader: Yields ``(images, labels, tfidf_features, ids)`` batches.
        val_dataloader: Same batch layout, used for validation.
        optimizer: Optimizer stepping the model parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        num_epochs (int): Maximum number of epochs.
        early_stopping_rounds (int): Patience, in epochs.

    Returns:
        tuple: ``(model, history)``; ``history`` maps 'train_loss', 'val_loss'
        and 'val_f1' to per-epoch lists. The returned model holds the weights
        of the last epoch run, not necessarily the checkpointed ones.
    """
    history = {'train_loss': [], 'val_loss': [], 'val_f1': []}
    best_val_f1 = -1
    stale_epochs = 0

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        train_loss_sum = 0.0
        train_prob_batches = []
        train_label_batches = []

        train_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training")
        for images, labels, tfidf_features, _ in train_bar:
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)
            tfidf_features = tfidf_features.to(DEVICE)

            optimizer.zero_grad()
            logits = model(images, tfidf_features)
            batch_loss = criterion(logits, labels)
            batch_loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch loss is a per-sample mean.
            train_loss_sum += batch_loss.item() * images.size(0)
            # Detach before leaving the graph: numpy() is not allowed on
            # tensors that require grad.
            train_prob_batches.append(torch.sigmoid(logits).detach().cpu().numpy())
            train_label_batches.append(labels.cpu().numpy())

        epoch_train_loss = train_loss_sum / len(train_dataloader.dataset)
        train_probs = np.vstack(train_prob_batches)
        train_truth = np.vstack(train_label_batches)
        train_hard_preds = (train_probs > 0.5).astype(int)
        # 'samples' averaging computes F1 per sample, then averages.
        epoch_train_f1 = f1_score(train_truth, train_hard_preds, average='samples')

        # ---- validation pass ----
        model.eval()
        val_loss_sum = 0.0
        val_prob_batches = []
        val_label_batches = []
        with torch.no_grad():
            val_bar = tqdm(val_dataloader, desc=f"Epoch {epoch+1} Validation")
            for images, labels, tfidf_features, _ in val_bar:
                images = images.to(DEVICE)
                labels = labels.to(DEVICE)
                tfidf_features = tfidf_features.to(DEVICE)

                logits = model(images, tfidf_features)
                batch_loss = criterion(logits, labels)

                val_loss_sum += batch_loss.item() * images.size(0)
                val_prob_batches.append(torch.sigmoid(logits).cpu().numpy())
                val_label_batches.append(labels.cpu().numpy())

        epoch_val_loss = val_loss_sum / len(val_dataloader.dataset)
        val_probs = np.vstack(val_prob_batches)
        val_truth = np.vstack(val_label_batches)
        val_hard_preds = (val_probs > 0.5).astype(int)
        epoch_val_f1 = f1_score(val_truth, val_hard_preds, average='samples')

        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {epoch_train_loss:.4f}, Train F1: {epoch_train_f1:.4f}, "
              f"Val Loss: {epoch_val_loss:.4f}, Val F1: {epoch_val_f1:.4f}")

        for key, value in (
            ('train_loss', epoch_train_loss),
            ('val_loss', epoch_val_loss),
            ('val_f1', epoch_val_f1),
        ):
            history[key].append(value)

        # ---- checkpointing / early stopping ----
        if epoch_val_f1 > best_val_f1:
            best_val_f1 = epoch_val_f1
            stale_epochs = 0
            torch.save(model.state_dict(), MODEL_PATH)
            print(f"Model saved to {MODEL_PATH} with improved F1: {best_val_f1:.4f}")
            continue

        stale_epochs += 1
        if stale_epochs == early_stopping_rounds:
            print(f"Early stopping triggered after {early_stopping_rounds} epochs without improvement.")
            break

    return model, history

def evaluate_model(model, dataloader, mlb):
    """Compute multi-label metrics for ``model`` on a labelled dataloader.

    Args:
        model: Module called as ``model(images, tfidf_features)`` returning logits.
        dataloader: Yields ``(images, labels, tfidf_features, ids)`` batches.
        mlb: Unused; kept so existing callers keep working.

    Returns:
        dict: Plain-float metrics under the keys 'f1_score' (sample-averaged,
        threshold 0.5), 'accuracy' (exact-match), 'log_loss' (over all
        flattened label decisions) and 'roc_auc_score' (unweighted mean of the
        per-label AUCs; labels whose targets are all 0 or all 1 are skipped,
        and the value is NaN when no label can be scored).

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for images, labels, tfidf_features, _ in tqdm(dataloader, desc="Evaluating"):
            images = images.to(DEVICE)
            tfidf_features = tfidf_features.to(DEVICE)

            outputs = model(images, tfidf_features)
            all_preds.append(torch.sigmoid(outputs).cpu().numpy())
            # Targets are only needed on the host, so no device round-trip.
            all_targets.append(labels.cpu().numpy())

    if not all_preds:
        raise ValueError("Cannot evaluate: the dataloader yielded no batches.")

    predictions_array = np.vstack(all_preds)
    targets_array = np.vstack(all_targets)

    # Convert probabilities to binary labels for F1 score and accuracy.
    binary_predictions = (predictions_array > 0.5).astype(int)

    f1 = f1_score(targets_array, binary_predictions, average='samples')
    accuracy = accuracy_score(targets_array, binary_predictions)
    # Flatten so every (sample, label) decision is one binary event; pass the
    # label set explicitly so a split with only negatives is still valid.
    loss = log_loss(targets_array.ravel(), predictions_array.ravel(), labels=[0, 1])

    # ROC AUC is undefined for a label with a single class in the targets,
    # which is common for rare labels. Score the remaining labels and take
    # their unweighted mean, i.e. the macro average over scorable labels.
    positives_per_label = targets_array.sum(axis=0)
    scorable = np.flatnonzero(
        (positives_per_label > 0) & (positives_per_label < targets_array.shape[0])
    )
    per_label_auc = [
        roc_auc_score(targets_array[:, j], predictions_array[:, j]) for j in scorable
    ]
    roc_auc = float(np.mean(per_label_auc)) if per_label_auc else float('nan')

    # Cast to built-in floats so the dict serialises cleanly with json.
    metrics = {
        "f1_score": float(f1),
        "accuracy": float(accuracy),
        "log_loss": float(loss),
        "roc_auc_score": roc_auc
    }
    return metrics

def _build_stratify_target(label_frame):
    """Return one stratification key per row of a binarized label frame.

    Each distinct label combination becomes its own class (the string form of
    the row tuple); combinations occurring fewer than twice are pooled into a
    single 'RARE_COMBINATION' class because a class with one member cannot be
    split between train and validation.
    """
    combo_keys = label_frame.apply(lambda row: str(tuple(row)), axis=1)
    combo_sizes = combo_keys.map(combo_keys.value_counts())
    return combo_keys.where(combo_sizes >= 2, 'RARE_COMBINATION')


def _split_train_val(processed_train_df, label_cols):
    """Split the frame's index 80/20, stratifying on label combinations when possible."""
    stratify_target = _build_stratify_target(processed_train_df[label_cols])
    try:
        return train_test_split(
            processed_train_df.index,
            test_size=0.2,
            random_state=42,
            stratify=stratify_target,
        )
    except ValueError as err:
        # Stratification can still be infeasible, e.g. when the pooled rare
        # class has a single member or there are more classes than
        # validation rows. A random split is better than aborting the run.
        print(f"Warning: stratified split failed ({err}); falling back to a random split.")
        return train_test_split(
            processed_train_df.index,
            test_size=0.2,
            random_state=42,
        )


def main():
    """Run preprocessing and Stage 2 training, then evaluate and save metrics.

    Side effects: writes the fitted ``mlb`` and ``tfidf_vectorizer`` to
    ``MLB_PATH`` / ``TFIDF_PATH``, the best model weights to ``MODEL_PATH``
    (via ``train_model``) and the validation metrics to ``METRICS_PATH``.

    Returns:
        The trained model with the best checkpoint's weights loaded.

    Raises:
        ValueError: If preprocessing produced no label or caption features.
    """
    # --- Stage 1: Data Preprocessing ---
    metadata = load_metadata(METADATA_JSON_PATH)
    train_df = pd.read_csv(TRAIN_CSV_PATH)

    processed_train_df, mlb, tfidf_vectorizer, _ = preprocess_data(train_df.copy(), metadata, is_train=True)
    if mlb is None or tfidf_vectorizer is None:
        raise ValueError(
            "Training data must contain the 'Labels' and 'Caption' columns described in the metadata."
        )

    # Persist the fitted preprocessors for the inference stage.
    joblib.dump(mlb, MLB_PATH)
    joblib.dump(tfidf_vectorizer, TFIDF_PATH)

    # Binarized label columns are the ones with all-digit names.
    label_cols = [col for col in processed_train_df.columns if col.isdigit()]
    if not label_cols:
        raise ValueError("No binarized label columns found in the processed training data.")

    train_idx, val_idx = _split_train_val(processed_train_df, label_cols)
    train_df_split = processed_train_df.loc[train_idx].reset_index(drop=True)
    val_df_split = processed_train_df.loc[val_idx].reset_index(drop=True)

    # Resize, convert to tensor, then normalise with the ImageNet statistics.
    image_transform = transforms.Compose([
        transforms.Resize(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # The processed frames carry the TF-IDF features and binarized labels.
    train_image_dataset = MultiLabelImageDataset(train_df_split, IMAGE_FOLDER_PATH, transform=image_transform, is_test=False)
    val_image_dataset = MultiLabelImageDataset(val_df_split, IMAGE_FOLDER_PATH, transform=image_transform, is_test=False)

    # Use half of the available CPUs as loader workers (0 when the count is unknown).
    cpu_total = os.cpu_count()
    worker_count = cpu_total // 2 if cpu_total else 0
    train_dataloader = DataLoader(train_image_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=worker_count)
    val_dataloader = DataLoader(val_image_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=worker_count)

    # --- Stage 2: Model Training ---
    num_classes = len(mlb.classes_)
    # Use the fitted vocabulary size: it can be smaller than max_features.
    tfidf_features_dim = len(tfidf_vectorizer.vocabulary_)

    model = MultiModalModel(num_classes=num_classes, tfidf_features_dim=tfidf_features_dim).to(DEVICE)

    criterion = nn.BCEWithLogitsLoss()  # independent sigmoid per label
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    trained_model, history = train_model(model, train_dataloader, val_dataloader, optimizer, criterion, NUM_EPOCHS, EARLY_STOPPING_ROUNDS)

    # train_model returns the last-epoch weights; reload the best checkpoint
    # before the final validation pass.
    trained_model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
    val_metrics = evaluate_model(trained_model, val_dataloader, mlb)

    print("\nValidation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric}: {value:.4f}")

    with open(METRICS_PATH, 'w') as f:
        json.dump(val_metrics, f, indent=4)
    print(f"Metrics saved to {METRICS_PATH}")

    return trained_model  # Return the trained model for Stage 3 if needed

if __name__ == "__main__":
    # Run preprocessing, training and evaluation only when this code is
    # executed directly (script or notebook cell), not when it is imported.
    main()

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
import os
import joblib
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from tqdm import tqdm
import torch.nn as nn
import warnings

# Hide UserWarning messages attributed to scikit-learn modules.
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

# Project root: two directories above this file when __file__ exists,
# otherwise (e.g. inside Kaggle/Jupyter) the current working directory.
try:
    ROOT_DIR = Path(__file__).resolve().parent.parent
except NameError:
    ROOT_DIR = Path.cwd()

# Dataset directory: prefer the copy located under ROOT_DIR and fall back to
# the same relative path resolved against the current working directory.
BASE_PATH_CANDIDATE_1 = Path(ROOT_DIR, 'input/Datasets/datasets/multi_label_classification').resolve()
BASE_PATH_CANDIDATE_2 = Path('input/Datasets/datasets/multi_label_classification').resolve()
BASE_PATH = BASE_PATH_CANDIDATE_1 if BASE_PATH_CANDIDATE_1.exists() else BASE_PATH_CANDIDATE_2

print(f"Resolved BASE_PATH: {BASE_PATH}")

# Input locations derived from BASE_PATH.
TEST_CSV_PATH = BASE_PATH / 'test.csv'
IMAGE_FOLDER_PATH = BASE_PATH / 'data'
# The metadata JSON is looked up two directory levels above BASE_PATH
# (i.e. in '<...>/input/Datasets/' for the candidate paths above).
METADATA_JSON_PATH = BASE_PATH.parent.parent / 'multi_label_classification.json'

# Output location (created on import if missing) and the submission file.
OUTPUT_DIR = Path("./outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
SUBMISSION_PATH = OUTPUT_DIR / "submission.csv"

# Artifacts read by this stage: model weights and the fitted preprocessors.
MODEL_PATH = Path("./models") / "multi_label_classification_model.pkl"
MLB_PATH = Path("./models") / "mlb.pkl"
TFIDF_PATH = Path("./models") / "tfidf_vectorizer.pkl"

# Image preprocessing / batching constants.
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32

# Run on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class MultiLabelImageDataset(Dataset):
    """Dataset yielding an image, its TF-IDF caption features and, optionally, labels.

    Args:
        dataframe: Table with an 'ImageID' column. Columns whose name starts
            with 'caption_tfidf_' are used as text features; columns whose
            name consists only of digits are used as label columns.
        image_folder_path: Directory the 'ImageID' file names are joined to.
        transform: Optional callable applied to the loaded RGB image.
        is_test: When True, samples are returned without a label tensor.
    """

    def __init__(self, dataframe, image_folder_path, transform=None, is_test=False):
        self.dataframe = dataframe
        self.image_folder_path = image_folder_path
        self.transform = transform
        self.is_test = is_test
        # Column groups are resolved once here so __getitem__ only indexes.
        self.label_columns = [col for col in dataframe.columns if col.isdigit()]
        self.tfidf_columns = [col for col in dataframe.columns if col.startswith('caption_tfidf_')]

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return one sample.

        Returns:
            ``(image, tfidf_features, img_name)`` when ``is_test`` is True,
            otherwise ``(image, labels, tfidf_features, img_name)``.
        """
        # Fetch the row a single time; every .iloc[idx] call builds a new
        # Series, which is costly on a frame with many feature columns.
        row = self.dataframe.iloc[idx]
        img_name = row['ImageID']
        img_path = os.path.join(self.image_folder_path, img_name)

        # convert() returns a new, fully loaded image, so the source file can
        # be closed as soon as the with-block exits.
        with Image.open(img_path) as source_image:
            image = source_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        tfidf_features = torch.tensor(row[self.tfidf_columns].values.astype(np.float32))

        if self.is_test:
            return image, tfidf_features, img_name

        labels = row[self.label_columns].values.astype(np.float32)
        return image, torch.tensor(labels), tfidf_features, img_name

class MultiModalModel(nn.Module):
    """Two-branch classifier that fuses ResNet-50 image features with TF-IDF features.

    Args:
        num_classes: Size of the output layer (one score per label).
        tfidf_features_dim: Length of the TF-IDF vector fed to the text branch.
    """

    def __init__(self, num_classes, tfidf_features_dim):
        super().__init__()

        # Image branch: pretrained ResNet-50 with every parameter frozen and
        # its classification head swapped for a pass-through, so the network
        # outputs the pooled feature vector instead of class scores.
        backbone = models.resnet50(pretrained=True)
        backbone.requires_grad_(False)
        image_feature_dim = backbone.fc.in_features
        backbone.fc = nn.Identity()
        self.resnet = backbone

        # Text branch: project the TF-IDF vector to 512 dimensions.
        self.tfidf_fc = nn.Linear(tfidf_features_dim, 512)

        # Fusion head operating on the concatenated image + text features.
        self.combined_fc1 = nn.Linear(image_feature_dim + 512, 1024)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.combined_fc2 = nn.Linear(1024, num_classes)

    def forward(self, image_input, tfidf_input):
        """Return the output of the final linear layer (no sigmoid is applied here)."""
        image_branch = self.resnet(image_input)
        text_branch = self.relu(self.tfidf_fc(tfidf_input))

        # Join both modalities along the feature dimension.
        fused = torch.cat([image_branch, text_branch], dim=1)

        hidden = self.dropout(self.relu(self.combined_fc1(fused)))
        return self.combined_fc2(hidden)

def load_metadata(metadata_path):
    """Load and return the dataset metadata JSON.

    Args:
        metadata_path: Path of the JSON file to read.

    Returns:
        The parsed JSON document.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # locale encoding when the file contains non-ASCII text.
    with open(metadata_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def preprocess_data(df, metadata, mlb=None, tfidf_vectorizer=None, scaler=None, is_train=True):
    """
    Performs preprocessing steps on the dataframe.
    Args:
        df (pd.DataFrame): The input dataframe (train or test).
        metadata (dict): The dataset metadata.
        mlb (MultiLabelBinarizer, optional): Fitted MultiLabelBinarizer for labels.
                                             Required for test set.
        tfidf_vectorizer (TfidfVectorizer, optional): Fitted TfidfVectorizer for captions.
                                                      Required for test set.
        scaler (StandardScaler, optional): Fitted StandardScaler for numerical features.
                                           Required for test set.
        is_train (bool): True if processing the training set, False for test set.
    Returns:
        tuple: Processed dataframe, fitted mlb, fitted tfidf_vectorizer, fitted scaler.
    """
    # Identify column types from metadata
    variables_info = metadata['profiling_summary']['variables']
    image_id_col = None
    label_col = None
    caption_col = None

    for col_name, col_info in variables_info.items():
        if col_name == 'ImageID':
            image_id_col = col_name
        elif col_name == 'Labels':
            label_col = col_name
        elif col_name == 'Caption':
            caption_col = col_name

    # --- Handle Labels (Multi-label Binarization) ---
    if label_col and label_col in df.columns:
        # Labels are space-separated strings, e.g., "1 3 5"
        df[label_col] = df[label_col].apply(lambda x: x.split())

        if is_train:
            # This branch should ideally not be called for test data in Stage 3
            # as mlb should be loaded. But keeping it for completeness if this function
            # is reused in a different context.
            mlb = MultiLabelBinarizer()
            labels_encoded = mlb.fit_transform(df[label_col])
            # Create a DataFrame for encoded labels
            labels_df = pd.DataFrame(labels_encoded, columns=mlb.classes_, index=df.index)
            df = pd.concat([df.drop(columns=[label_col]), labels_df], axis=1)
        else:
            if mlb is None:
                raise ValueError("MultiLabelBinarizer must be fitted on training data and provided for test data.")
            # For test data, the 'Labels' column might not exist.
            # If it exists (e.g., for a validation set treated as test), transform it.
            # If it doesn't exist, we just proceed without label processing.
            if label_col in df.columns:
                labels_encoded = mlb.transform(df[label_col])
                labels_df = pd.DataFrame(labels_encoded, columns=mlb.classes_, index=df.index)
                df = pd.concat([df.drop(columns=[label_col]), labels_df], axis=1)
            else:
                print(f"Label column '{label_col}' not found in test dataframe. Skipping label binarization.")
    else:
        print(f"Warning: Label column '{label_col}' not found or not specified in metadata. Skipping label processing.")


    # --- Handle Caption (TF-IDF) ---
    if caption_col and caption_col in df.columns:
        if is_train:
            # This branch should ideally not be called for test data in Stage 3
            # as tfidf_vectorizer should be loaded.
            tfidf_vectorizer = TfidfVectorizer(max_features=5000) # Limit features to manage dimensionality
            caption_features = tfidf_vectorizer.fit_transform(df[caption_col])
        else:
            if tfidf_vectorizer is None:
                raise ValueError("TfidfVectorizer must be fitted on training data and provided for test data.")
            caption_features = tfidf_vectorizer.transform(df[caption_col])

        # Convert TF-IDF sparse matrix to DataFrame
        caption_df = pd.DataFrame(caption_features.toarray(),
                                  columns=[f'caption_tfidf_{i}' for i in range(caption_features.shape[1])],
                                  index=df.index)
        df = pd.concat([df.drop(columns=[caption_col]), caption_df], axis=1)
    else:
        print(f"Warning: Caption column '{caption_col}' not found or not specified in metadata. Skipping TF-IDF processing.")

    return df, mlb, tfidf_vectorizer, scaler

def main(trained_model=None):
    """Generate the test-set submission CSV.

    Loads the metadata JSON and the fitted label binarizer / TF-IDF
    vectorizer, preprocesses the test CSV, runs the model over the test
    images, thresholds the sigmoid outputs at 0.5 and writes an
    'ImageID' / 'Labels' table to SUBMISSION_PATH.

    Args:
        trained_model: Optional model to predict with. When None, a
            MultiModalModel is built and its weights are loaded from
            MODEL_PATH.
    """
    # Load metadata
    metadata = load_metadata(METADATA_JSON_PATH)

    # Load preprocessors.
    # NOTE(review): joblib.load unpickles arbitrary objects; these files must
    # only ever come from our own training stage.
    mlb = joblib.load(MLB_PATH)
    tfidf_vectorizer = joblib.load(TFIDF_PATH)

    # Load and preprocess the test data. A copy is passed so the raw frame is
    # left untouched; no scaler is used, the argument is passed for
    # signature consistency only.
    test_df = pd.read_csv(TEST_CSV_PATH)
    processed_test_df, _, _, _ = preprocess_data(
        test_df.copy(),
        metadata,
        mlb=mlb,
        tfidf_vectorizer=tfidf_vectorizer,
        scaler=None,
        is_train=False,
    )

    # Image preprocessing setup
    image_transform = transforms.Compose([
        transforms.Resize(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet normalization
    ])

    # Dataset / dataloader over the processed frame (ImageID + TF-IDF columns).
    test_image_dataset = MultiLabelImageDataset(
        processed_test_df, IMAGE_FOLDER_PATH, transform=image_transform, is_test=True
    )
    # Half of the available cores; 0 workers when the core count is unknown.
    num_workers = (os.cpu_count() or 0) // 2
    test_image_dataloader = DataLoader(
        test_image_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=num_workers
    )

    # Load the trained model
    if trained_model is None:
        # Both dimensions come from the fitted preprocessors so they match
        # the sizes the preprocessors produce: one output per known class and
        # one input per vocabulary entry of the TF-IDF vectorizer.
        num_classes = len(mlb.classes_)
        tfidf_features_dim = len(tfidf_vectorizer.vocabulary_)

        model = MultiModalModel(num_classes=num_classes, tfidf_features_dim=tfidf_features_dim)
        model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
    else:
        model = trained_model

    # Inputs are moved to DEVICE below, so the model must live there too,
    # including a model handed in by the caller.
    model = model.to(DEVICE)
    model.eval()  # Set model to evaluation mode

    all_predictions = []
    image_ids = []

    with torch.no_grad():
        for images, tfidf_features, img_names in tqdm(test_image_dataloader, desc="Generating predictions"):
            images = images.to(DEVICE)
            tfidf_features = tfidf_features.to(DEVICE)

            outputs = model(images, tfidf_features)
            probabilities = torch.sigmoid(outputs).cpu().numpy()  # Sigmoid for multi-label probabilities
            all_predictions.append(probabilities)
            image_ids.extend(img_names)

    # Stack the per-batch outputs; np.vstack rejects an empty list, so an
    # empty test set is represented by a (0, num_classes) array instead.
    if all_predictions:
        predictions_array = np.vstack(all_predictions)
    else:
        predictions_array = np.empty((0, len(mlb.classes_)), dtype=np.float32)

    # Convert probabilities to binary labels (threshold 0.5).
    # NOTE(review): 0.5 is a default choice, not a tuned value; confirm
    # against the competition's metric / submission requirements.
    binary_predictions = (predictions_array > 0.5).astype(int)

    # Map every indicator row back to its label names in one call and join
    # them into the space-separated submission format.
    predicted_labels = [" ".join(row_labels) for row_labels in mlb.inverse_transform(binary_predictions)]

    # Create submission DataFrame
    submission_df = pd.DataFrame({'ImageID': image_ids, 'Labels': predicted_labels})

    # Save submission file
    submission_df.to_csv(SUBMISSION_PATH, index=False)

    print(f"Submission file generated successfully at: {SUBMISSION_PATH}")

# Script entry point: call main() with its default arguments only when this
# code is executed directly (not when it is imported as a module).
if __name__ == "__main__":
    main()