In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import json

# Project root: two directories above this file when it runs as a script;
# the current working directory when __file__ is undefined.
try:
    ROOT_DIR = Path(__file__).resolve().parent.parent
except NameError:  # __file__ is not defined inside Kaggle/Jupyter
    ROOT_DIR = Path.cwd()

# Dataset root, hard-coded to the Kaggle input mount (there is no fallback).
# An earlier path resolution looked for images under
# '/kaggle/input/planttraits2024/PlantTraits2024/train_images/', which was
# reported as incorrect, so the mount point is set explicitly instead.
# NOTE(review): ROOT_DIR is not consulted when building this path.
BASE_PATH = Path('/kaggle/input/planttraits2024')

print(f"Resolved BASE_PATH: {BASE_PATH}")

# Locations of the CSV files, metadata and image folders under BASE_PATH
TRAIN_CSV_PATH = BASE_PATH / 'train.csv'
TEST_CSV_PATH = BASE_PATH / 'test.csv'
TARGET_NAME_META_PATH = BASE_PATH / 'target_name_meta.tsv'
SAMPLE_SUBMISSION_PATH = BASE_PATH / 'sample_submission.csv'
TRAIN_IMAGES_PATH = BASE_PATH / 'train_images'
TEST_IMAGES_PATH = BASE_PATH / 'test_images'
DATASET_METADATA_PATH = BASE_PATH / 'PlantTraits2024.json' # Assumed to be the dataset metadata JSON itself - TODO confirm

# Size handed to transforms.Resize when preparing images
IMAGE_SIZE = (224, 224)

class PlantDataset(Dataset):
    """Dataset yielding ``(image, tabular_values)`` pairs for rows of a DataFrame.

    Args:
        dataframe: frame with an 'id' column; the image for a row is
            ``<img_dir>/<int(id)>.jpeg``.
        img_dir: ``pathlib.Path`` of the directory holding the JPEG files.
        transform: optional callable applied to the loaded RGB image.

    Each item is a tuple ``(image, values)`` where ``values`` is a float32
    NumPy array of the row's columns excluding 'id' and every column whose
    name starts with 'X' and contains '_mean' (the target columns).
    """

    def __init__(self, dataframe, img_dir, transform=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def _image_id(self, idx):
        """Return the integer image id stored at positional row ``idx``."""
        # Read from the 'id' column directly so a float-typed id (e.g. 192154313.0)
        # still becomes '192154313' and no whole-row Series has to be built.
        return int(self.dataframe['id'].iloc[idx])

    def _tabular_values(self, idx):
        """Return the row's feature values (no 'id', no targets) as float32."""
        excluded = ['id'] + [
            col for col in self.dataframe.columns
            if col.startswith('X') and '_mean' in col
        ]
        row = self.dataframe.iloc[idx]
        # A row is a Series, and Series.drop ignores `columns=`; the labels must
        # be passed as `labels=` for anything to be removed.
        return row.drop(labels=excluded, errors='ignore').values.astype(np.float32)

    def __getitem__(self, idx):
        img_path = self.img_dir / f"{self._image_id(idx)}.jpeg"
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        # Targets are handled separately during training, so only features are returned.
        return image, self._tabular_values(idx)

def load_metadata(metadata_path):
    """Parse the JSON file at ``metadata_path`` and return the decoded object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(metadata_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def preprocess_data():
    """Load the train/test CSVs and return imputed, standardised tabular features.

    A column is used as a feature when it is present in both CSVs (after the
    target columns are removed from train), is not 'id', and is typed
    'Numeric' in the metadata profiling summary. Missing values are replaced
    by the training median, then features are standardised; both transformers
    are fitted on the training frame and only applied to the test frame.

    Returns:
        tuple: ``(X_train_processed, y_train_processed, X_test_processed,
        numerical_cols, categorical_cols, scaler, numerical_imputer,
        target_columns)``. The three frames are indexed by 'id',
        ``numerical_cols`` is sorted, and ``categorical_cols`` is always empty.

    Raises:
        ValueError: if no shared numeric feature column is found.
    """
    metadata = load_metadata(DATASET_METADATA_PATH)
    target_columns = metadata['task_definition']['target_columns']
    profiled_variables = metadata['profiling_summary']['variables']

    train_df = pd.read_csv(TRAIN_CSV_PATH)
    test_df = pd.read_csv(TEST_CSV_PATH)

    # The test set has no target columns, so it is used as-is.
    X_train = train_df.drop(columns=target_columns)
    y_train = train_df[target_columns]
    X_test = test_df

    # Features must exist in both splits; sorting makes the column order deterministic.
    shared_cols = (set(X_train.columns) & set(X_test.columns)) - {'id'}
    numerical_cols = sorted(
        col for col in shared_cols
        if col in profiled_variables and profiled_variables[col]['type'] == 'Numeric'
    )
    # No categorical handling is implemented; the list is kept for the return contract.
    categorical_cols = []

    if not numerical_cols:
        raise ValueError(
            "No numeric feature columns shared by train and test were found in the metadata profiling summary"
        )

    numerical_imputer = SimpleImputer(strategy='median')
    X_train_numerical_imputed = numerical_imputer.fit_transform(X_train[numerical_cols])
    X_test_numerical_imputed = numerical_imputer.transform(X_test[numerical_cols])

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_numerical_imputed)
    X_test_scaled = scaler.transform(X_test_numerical_imputed)

    X_train_processed = pd.DataFrame(X_train_scaled, columns=numerical_cols, index=X_train['id'])
    X_test_processed = pd.DataFrame(X_test_scaled, columns=numerical_cols, index=X_test['id'])

    # y_train is already row-aligned with X_train (same source frame, same order),
    # so re-indexing by 'id' is enough; a label-based re-lookup would multiply
    # rows if an id ever appeared more than once.
    y_train_processed = y_train.set_index(train_df['id'])

    return X_train_processed, y_train_processed, X_test_processed, numerical_cols, categorical_cols, scaler, numerical_imputer, target_columns

def main():
    """Run Stage 1: preprocess the tabular data and set up the image datasets.

    Prints a summary of the preprocessing output, then builds one
    ``PlantDataset`` per split from the 'id' column of each CSV. No image is
    opened here; the datasets are only constructed.

    Returns:
        tuple: ``(X_train_processed, y_train_processed, X_test_processed,
        train_image_dataset, test_image_dataset, numerical_cols,
        categorical_cols, scaler, numerical_imputer, target_columns)``.
    """
    (X_train_processed, y_train_processed, X_test_processed, numerical_cols,
     categorical_cols, scaler, numerical_imputer, target_columns) = preprocess_data()

    # Only 'id' is needed to locate images; load it once per split and reuse it
    # for both the row counts and the datasets.
    train_image_df = pd.read_csv(TRAIN_CSV_PATH, usecols=['id'])
    test_image_df = pd.read_csv(TEST_CSV_PATH, usecols=['id'])

    print("\n--- Preprocessing Summary ---")
    print(f"Shape of X_train_processed: {X_train_processed.shape}")
    print(f"Shape of y_train_processed: {y_train_processed.shape}")
    print(f"Shape of X_test_processed: {X_test_processed.shape}")
    print(f"Number of training images (from original train_df): {len(train_image_df)}")
    print(f"Number of test images (from original test_df): {len(test_image_df)}")
    print(f"Numerical columns processed: {numerical_cols[:5]}...")
    print(f"Categorical columns processed: {categorical_cols}")
    print(f"Missing values in X_train_processed after imputation: {X_train_processed.isnull().sum().sum()}")
    print(f"Missing values in X_test_processed after imputation: {X_test_processed.isnull().sum().sum()}")

    image_transform = transforms.Compose([
        transforms.Resize(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    train_image_dataset = PlantDataset(train_image_df, TRAIN_IMAGES_PATH, transform=image_transform)
    test_image_dataset = PlantDataset(test_image_df, TEST_IMAGES_PATH, transform=image_transform)

    return X_train_processed, y_train_processed, X_test_processed, train_image_dataset, test_image_dataset, numerical_cols, categorical_cols, scaler, numerical_imputer, target_columns

# Run Stage 1 when this module is the entry point and keep every returned
# object available as a module-level name.
if __name__ == '__main__':
    X_train_processed, y_train_processed, X_test_processed, train_image_dataset, test_image_dataset, numerical_cols, categorical_cols, scaler, numerical_imputer, target_columns = main()

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import json
import joblib
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch.nn as nn
import torch.optim as optim
from torchvision import models
import warnings
from tqdm import tqdm # Import tqdm for progress bar

# Silence UserWarning messages attributed to the lightgbm module
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

# Project root: two directories above this file when it runs as a script;
# the current working directory when __file__ is undefined.
try:
    ROOT_DIR = Path(__file__).resolve().parent.parent
except NameError:  # __file__ is not defined inside Kaggle/Jupyter
    ROOT_DIR = Path.cwd()

# Dataset root, hard-coded to the Kaggle input mount (there is no fallback).
# An earlier path resolution looked for images under
# '/kaggle/input/planttraits2024/PlantTraits2024/train_images/', which was
# reported as incorrect, so the mount point is set explicitly instead.
# NOTE(review): ROOT_DIR is not consulted when building this path.
BASE_PATH = Path('/kaggle/input/planttraits2024')

print(f"Resolved BASE_PATH: {BASE_PATH}")

# Locations of the CSV files, metadata and image folders under BASE_PATH
TRAIN_CSV_PATH = BASE_PATH / 'train.csv'
TEST_CSV_PATH = BASE_PATH / 'test.csv'
TARGET_NAME_META_PATH = BASE_PATH / 'target_name_meta.tsv'
SAMPLE_SUBMISSION_PATH = BASE_PATH / 'sample_submission.csv'
TRAIN_IMAGES_PATH = BASE_PATH / 'train_images'
TEST_IMAGES_PATH = BASE_PATH / 'test_images'
DATASET_METADATA_PATH = BASE_PATH / 'PlantTraits2024.json' # Assumed to be the dataset metadata JSON itself - TODO confirm

# Output locations, relative to the current working directory
MODEL_PATH = Path("./models/PlantTraits2024_model.pth") # .pth because a PyTorch state_dict is stored here
METRICS_PATH = Path("./outputs/metrics.json")

# Create the output directories up front so later saves cannot fail on a missing folder
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
METRICS_PATH.parent.mkdir(parents=True, exist_ok=True)

# Size handed to transforms.Resize when preparing images
IMAGE_SIZE = (224, 224)

class PlantDataset(Dataset):
    """Pairs each image with its pre-processed tabular features (and targets).

    Args:
        dataframe: frame providing the 'id' column and, when ``is_train`` is
            true, the target columns.
        img_dir: ``pathlib.Path`` of the directory containing ``<id>.jpeg`` files.
        transform: optional callable applied to the loaded RGB image.
        is_train: when true, items also carry a target tensor.
        target_columns: names of the target columns; ``None`` means none.
        tabular_features_df: pre-processed features indexed by image id.

    Items are ``(image, features, targets)`` when ``is_train`` is true and
    ``(image, features)`` otherwise; features and targets are float32 tensors.
    """

    def __init__(self, dataframe, img_dir, transform=None, is_train=True, target_columns=None, tabular_features_df=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.transform = transform
        self.is_train = is_train
        self.target_columns = [] if target_columns is None else target_columns
        self.tabular_features_df = tabular_features_df

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        record = self.dataframe.iloc[idx]

        # Cast to int so a float-valued id does not leak a '.0' into the file name.
        sample_id = int(record['id'])

        picture = Image.open(self.img_dir / f"{sample_id}.jpeg").convert('RGB')
        if self.transform:
            picture = self.transform(picture)

        # Features are looked up by id (not by position) to stay aligned with the image.
        feature_values = self.tabular_features_df.loc[sample_id].values.astype(np.float32)
        features = torch.tensor(feature_values)

        if not self.is_train:
            return picture, features

        target_values = record[self.target_columns].values.astype(np.float32)
        return picture, features, torch.tensor(target_values)

def load_metadata(metadata_path):
    """Parse the JSON file at ``metadata_path`` and return the decoded object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(metadata_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def preprocess_data():
    """Load the train/test CSVs and return imputed, standardised tabular features.

    A column is used as a feature when it is present in both CSVs (after the
    target columns are removed from train), is not 'id', and is typed
    'Numeric' in the metadata profiling summary. Missing values are replaced
    by the training median, then features are standardised; both transformers
    are fitted on the training frame and only applied to the test frame.

    Returns:
        tuple: ``(X_train_processed, y_train_processed, X_test_processed,
        numerical_cols, categorical_cols, scaler, numerical_imputer,
        target_columns)``. The three frames are indexed by 'id',
        ``numerical_cols`` is sorted, and ``categorical_cols`` is always empty.

    Raises:
        ValueError: if no shared numeric feature column is found.
    """
    metadata = load_metadata(DATASET_METADATA_PATH)
    target_columns = metadata['task_definition']['target_columns']
    profiled_variables = metadata['profiling_summary']['variables']

    train_df = pd.read_csv(TRAIN_CSV_PATH)
    test_df = pd.read_csv(TEST_CSV_PATH)

    # The test set has no target columns, so it is used as-is.
    X_train = train_df.drop(columns=target_columns)
    y_train = train_df[target_columns]
    X_test = test_df

    # Features must exist in both splits; sorting makes the column order deterministic.
    shared_cols = (set(X_train.columns) & set(X_test.columns)) - {'id'}
    numerical_cols = sorted(
        col for col in shared_cols
        if col in profiled_variables and profiled_variables[col]['type'] == 'Numeric'
    )
    # No categorical handling is implemented; the list is kept for the return contract.
    categorical_cols = []

    if not numerical_cols:
        raise ValueError(
            "No numeric feature columns shared by train and test were found in the metadata profiling summary"
        )

    numerical_imputer = SimpleImputer(strategy='median')
    X_train_numerical_imputed = numerical_imputer.fit_transform(X_train[numerical_cols])
    X_test_numerical_imputed = numerical_imputer.transform(X_test[numerical_cols])

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_numerical_imputed)
    X_test_scaled = scaler.transform(X_test_numerical_imputed)

    # The processed frames hold only the selected feature columns, indexed by 'id'.
    X_train_processed = pd.DataFrame(X_train_scaled, columns=numerical_cols, index=X_train['id'])
    X_test_processed = pd.DataFrame(X_test_scaled, columns=numerical_cols, index=X_test['id'])

    # y_train is already row-aligned with X_train (same source frame, same order),
    # so re-indexing by 'id' is enough; a label-based re-lookup would multiply
    # rows if an id ever appeared more than once.
    y_train_processed = y_train.set_index(train_df['id'])

    return X_train_processed, y_train_processed, X_test_processed, numerical_cols, categorical_cols, scaler, numerical_imputer, target_columns

class MultiModalModel(nn.Module):
    """Regression network fusing ResNet18 image features with tabular features.

    Args:
        num_tabular_features: width of the tabular input vector.
        num_targets: number of regression outputs.

    The image branch is a pre-trained ResNet18 whose classification head is
    replaced by an identity (512 features out); the tabular branch maps the
    input to 128 features; both are concatenated and passed through a small
    MLP that produces ``num_targets`` values.
    """

    def __init__(self, num_tabular_features, num_targets):
        super().__init__()

        # Image branch: drop the final classification layer of the backbone
        backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        backbone.fc = nn.Identity()
        self.resnet = backbone

        # Tabular branch
        tabular_layers = [
            nn.Linear(num_tabular_features, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
        ]
        self.tabular_fc = nn.Sequential(*tabular_layers)

        # Fusion head: 512 image features + 128 tabular features
        fusion_layers = [
            nn.Linear(512 + 128, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_targets),
        ]
        self.fusion_fc = nn.Sequential(*fusion_layers)

    def forward(self, image_input, tabular_input):
        """Return predictions for a batch of images and matching tabular rows."""
        branches = (self.resnet(image_input), self.tabular_fc(tabular_input))
        return self.fusion_fc(torch.cat(branches, dim=1))

def main():
    """Train the multi-modal model, evaluate it on a hold-out split and save results.

    Workflow:
        1. Preprocess the tabular data via ``preprocess_data``.
        2. Randomly split the training ids 80/20 into train and validation
           sets (no stratification: the targets are multi-output regression
           values, which ``stratify`` cannot handle directly).
        3. Train ``MultiModalModel`` with Adam and MSE loss, writing the
           weights to ``MODEL_PATH`` whenever the validation loss improves and
           stopping early after ``patience`` epochs without improvement.
        4. Reload the best weights, compute per-target and overall
           RMSE / MAE / R2 on the validation split and write them to
           ``METRICS_PATH`` as JSON.

    Returns:
        MultiModalModel: the trained model, in eval mode.
    """
    (X_train_processed, y_train_processed, X_test_processed, numerical_cols,
     categorical_cols, scaler, numerical_imputer, target_columns) = preprocess_data()

    print("\n--- Preprocessing Summary ---")
    print(f"Shape of X_train_processed: {X_train_processed.shape}")
    print(f"Shape of y_train_processed: {y_train_processed.shape}")
    print(f"Shape of X_test_processed: {X_test_processed.shape}")
    print(f"Numerical columns processed: {numerical_cols[:5]}...")
    print(f"Categorical columns processed: {categorical_cols}")
    print(f"Missing values in X_train_processed after imputation: {X_train_processed.isnull().sum().sum()}")
    print(f"Missing values in X_test_processed after imputation: {X_test_processed.isnull().sum().sum()}")

    # Split on ids; features and targets share this index, so no combined
    # frame is needed just to obtain it.
    train_idx, val_idx = train_test_split(X_train_processed.index, test_size=0.2, random_state=42)

    # The image datasets need 'id' as a column plus the raw target columns.
    original_train_df = pd.read_csv(TRAIN_CSV_PATH)
    id_column = original_train_df['id']
    train_image_df_split = original_train_df[id_column.isin(train_idx)].reset_index(drop=True)
    val_image_df_split = original_train_df[id_column.isin(val_idx)].reset_index(drop=True)

    X_train_split_tabular = X_train_processed.loc[train_idx]
    y_train_split = y_train_processed.loc[train_idx]
    X_val_split_tabular = X_train_processed.loc[val_idx]
    y_val_split = y_train_processed.loc[val_idx]

    print(f"Shape of X_train_split_tabular: {X_train_split_tabular.shape}")
    print(f"Shape of y_train_split: {y_train_split.shape}")
    print(f"Shape of X_val_split_tabular: {X_val_split_tabular.shape}")
    print(f"Shape of y_val_split: {y_val_split.shape}")

    image_transform = transforms.Compose([
        transforms.Resize(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # No test-set dataset/loader is built here: this function never predicts on the test set.
    train_dataset = PlantDataset(train_image_df_split, TRAIN_IMAGES_PATH, transform=image_transform, is_train=True, target_columns=target_columns, tabular_features_df=X_train_split_tabular)
    val_dataset = PlantDataset(val_image_df_split, TRAIN_IMAGES_PATH, transform=image_transform, is_train=True, target_columns=target_columns, tabular_features_df=X_val_split_tabular)

    BATCH_SIZE = 32
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    num_tabular_features = X_train_processed.shape[1]
    num_targets = y_train_processed.shape[1]

    model = MultiModalModel(num_tabular_features, num_targets).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    num_epochs = 50
    patience = 10  # epochs without validation improvement before stopping
    best_val_loss = float('inf')
    epochs_no_improve = 0
    checkpoint_saved = False  # whether THIS run has written MODEL_PATH

    print("\n--- Training Multi-Modal Model ---")
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for images, tabular_data, targets in tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training"):
            images, tabular_data, targets = images.to(device), tabular_data.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(images, tabular_data)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch loss is a per-sample mean
            running_loss += loss.item() * images.size(0)

        epoch_loss = running_loss / len(train_dataloader.dataset)

        model.eval()
        val_running_loss = 0.0
        with torch.no_grad():
            for images, tabular_data, targets in tqdm(val_dataloader, desc=f"Epoch {epoch+1} Validation"):
                images, tabular_data, targets = images.to(device), tabular_data.to(device), targets.to(device)
                outputs = model(images, tabular_data)
                loss = criterion(outputs, targets)
                val_running_loss += loss.item() * images.size(0)

        val_epoch_loss = val_running_loss / len(val_dataloader.dataset)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_loss:.4f}, Val Loss: {val_epoch_loss:.4f}")

        if val_epoch_loss < best_val_loss:
            best_val_loss = val_epoch_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), MODEL_PATH)
            checkpoint_saved = True
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs!")
                break

    if checkpoint_saved:
        model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    else:
        # The validation loss never beat +inf (e.g. it was NaN), so nothing was
        # written this run. Do not load a checkpoint left over from an earlier
        # run; persist the current weights so MODEL_PATH reflects this run.
        print(f"Warning: validation loss never improved; saving the final weights to {MODEL_PATH}")
        torch.save(model.state_dict(), MODEL_PATH)
    model.eval()

    # Collect validation predictions with the selected weights
    val_predictions = []
    val_true = []
    with torch.no_grad():
        for images, tabular_data, targets in val_dataloader:
            images, tabular_data = images.to(device), tabular_data.to(device)
            outputs = model(images, tabular_data)
            val_predictions.append(outputs.cpu().numpy())
            val_true.append(targets.cpu().numpy())

    val_predictions = np.vstack(val_predictions)
    val_true = np.vstack(val_true)

    metrics = {}
    overall_r2 = []
    per_target_rmse = []

    for i, target_col in enumerate(target_columns):
        y_true_target = val_true[:, i]
        y_pred_target = val_predictions[:, i]

        rmse = np.sqrt(mean_squared_error(y_true_target, y_pred_target))
        mae = mean_absolute_error(y_true_target, y_pred_target)
        r2 = r2_score(y_true_target, y_pred_target)

        # Plain floats so the dictionary is JSON-serialisable
        metrics[target_col] = {
            "RMSE": float(rmse),
            "MAE": float(mae),
            "R2": float(r2),
        }
        per_target_rmse.append(rmse)
        overall_r2.append(r2)

        print(f"\nMetrics for {target_col}:")
        print(f"  RMSE: {rmse:.4f}")
        print(f"  MAE:  {mae:.4f}")
        print(f"  R2:   {r2:.4f}")

    # overall_RMSE pools every (sample, target) error; mean_per_target_RMSE
    # averages the individual per-target RMSE values.
    overall_mse = np.mean((val_predictions - val_true) ** 2)
    overall_rmse = np.sqrt(overall_mse)
    mean_r2_overall = np.mean(overall_r2)
    metrics["overall_RMSE"] = float(overall_rmse)
    metrics["mean_per_target_RMSE"] = float(np.mean(per_target_rmse))
    metrics["overall_mean_R2"] = float(mean_r2_overall)
    print(f"\nOverall Mean R2: {mean_r2_overall:.4f}")

    with open(METRICS_PATH, 'w') as f:
        json.dump(metrics, f, indent=2)
    print(f"Metrics saved to {METRICS_PATH}")

    # Only the state_dict is persisted (at MODEL_PATH); the model instance itself is returned.
    print(f"Trained model state saved to {MODEL_PATH}")

    return model

# Run training when this module is the entry point and keep the returned
# model available as a module-level name.
if __name__ == '__main__':
    trained_model = main()

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import json
import joblib
import torch.nn as nn
import torch.optim as optim # Added for completeness, though not used in main() of Stage 3
from torchvision import models
import warnings
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm # Import tqdm for progress bar

# Silence UserWarning messages attributed to the lightgbm module
# NOTE(review): lightgbm is not imported in this cell; this filter looks like a leftover - confirm
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

# Project root: two directories above this file when it runs as a script;
# the current working directory when __file__ is undefined.
try:
    ROOT_DIR = Path(__file__).resolve().parent.parent
except NameError:  # __file__ is not defined inside Kaggle/Jupyter
    ROOT_DIR = Path.cwd()

# Dataset root, hard-coded to the Kaggle input mount (there is no fallback).
# An earlier path resolution looked for images under
# '/kaggle/input/planttraits2024/PlantTraits2024/train_images/', which was
# reported as incorrect, so the mount point is set explicitly instead.
# NOTE(review): ROOT_DIR is not consulted when building this path.
BASE_PATH = Path('/kaggle/input/planttraits2024')

print(f"Resolved BASE_PATH: {BASE_PATH}")

# Locations of the CSV files, metadata and image folders under BASE_PATH
TRAIN_CSV_PATH = BASE_PATH / 'train.csv'
TEST_CSV_PATH = BASE_PATH / 'test.csv'
TARGET_NAME_META_PATH = BASE_PATH / 'target_name_meta.tsv'
SAMPLE_SUBMISSION_PATH = BASE_PATH / 'sample_submission.csv'
TRAIN_IMAGES_PATH = BASE_PATH / 'train_images'
TEST_IMAGES_PATH = BASE_PATH / 'test_images'
DATASET_METADATA_PATH = BASE_PATH / 'PlantTraits2024.json' # Assumed to be the dataset metadata JSON itself - TODO confirm

# Model and metrics locations, relative to the current working directory
MODEL_PATH = Path("./models/PlantTraits2024_model.pth") # .pth because a PyTorch state_dict is stored here
METRICS_PATH = Path("./outputs/metrics.json")

# Create the output directories up front so later saves cannot fail on a missing folder
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
METRICS_PATH.parent.mkdir(parents=True, exist_ok=True)

# Processed-test-data and submission locations, with their directories created eagerly as well
TEST_PROCESSED_PATH = Path("./processed/test_processed.csv")
SUBMISSION_PATH = Path("./outputs/submission.csv")
TEST_PROCESSED_PATH.parent.mkdir(parents=True, exist_ok=True)
SUBMISSION_PATH.parent.mkdir(parents=True, exist_ok=True)


# Size of the images after preprocessing
# NOTE(review): presumably handed to transforms.Resize as in the training stage - confirm against this cell's main()
IMAGE_SIZE = (224, 224)

class PlantDataset(Dataset):
    """Pairs each image with its pre-processed tabular features (and targets).

    Args:
        dataframe: frame providing the 'id' column and, when ``is_train`` is
            true, the target columns.
        img_dir: ``pathlib.Path`` of the directory containing ``<id>.jpeg`` files.
        transform: optional callable applied to the loaded RGB image.
        is_train: when true, items also carry a target tensor.
        target_columns: names of the target columns; ``None`` means none.
        tabular_features_df: pre-processed features indexed by image id.

    Items are ``(image, features, targets)`` when ``is_train`` is true and
    ``(image, features)`` otherwise; features and targets are float32 tensors.
    """

    def __init__(self, dataframe, img_dir, transform=None, is_train=True, target_columns=None, tabular_features_df=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.transform = transform
        self.is_train = is_train
        self.target_columns = [] if target_columns is None else target_columns
        self.tabular_features_df = tabular_features_df

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        record = self.dataframe.iloc[idx]

        # Cast to int so a float-valued id does not leak a '.0' into the file name.
        sample_id = int(record['id'])

        picture = Image.open(self.img_dir / f"{sample_id}.jpeg").convert('RGB')
        if self.transform:
            picture = self.transform(picture)

        # Features are looked up by id (not by position) to stay aligned with the image.
        feature_values = self.tabular_features_df.loc[sample_id].values.astype(np.float32)
        features = torch.tensor(feature_values)

        if not self.is_train:
            return picture, features

        target_values = record[self.target_columns].values.astype(np.float32)
        return picture, features, torch.tensor(target_values)

def load_metadata(metadata_path):
    """Parse the JSON file at ``metadata_path`` and return the decoded object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(metadata_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def preprocess_data():
    """Load the train/test CSVs and return imputed, standardised tabular features.

    A column is used as a feature when it is present in both CSVs (after the
    target columns are removed from train), is not 'id', and is typed
    'Numeric' in the metadata profiling summary. Missing values are replaced
    by the training median, then features are standardised; both transformers
    are fitted on the training frame and only applied to the test frame.

    Returns:
        tuple: ``(X_train_processed, y_train_processed, X_test_processed,
        numerical_cols, categorical_cols, scaler, numerical_imputer,
        target_columns)``. The three frames are indexed by 'id',
        ``numerical_cols`` is sorted, and ``categorical_cols`` is always empty.

    Raises:
        ValueError: if no shared numeric feature column is found.
    """
    metadata = load_metadata(DATASET_METADATA_PATH)
    target_columns = metadata['task_definition']['target_columns']
    profiled_variables = metadata['profiling_summary']['variables']

    train_df = pd.read_csv(TRAIN_CSV_PATH)
    test_df = pd.read_csv(TEST_CSV_PATH)

    # The test set has no target columns, so it is used as-is.
    X_train = train_df.drop(columns=target_columns)
    y_train = train_df[target_columns]
    X_test = test_df

    # Features must exist in both splits; sorting makes the column order deterministic.
    shared_cols = (set(X_train.columns) & set(X_test.columns)) - {'id'}
    numerical_cols = sorted(
        col for col in shared_cols
        if col in profiled_variables and profiled_variables[col]['type'] == 'Numeric'
    )
    # No categorical handling is implemented; the list is kept for the return contract.
    categorical_cols = []

    if not numerical_cols:
        raise ValueError(
            "No numeric feature columns shared by train and test were found in the metadata profiling summary"
        )

    numerical_imputer = SimpleImputer(strategy='median')
    X_train_numerical_imputed = numerical_imputer.fit_transform(X_train[numerical_cols])
    X_test_numerical_imputed = numerical_imputer.transform(X_test[numerical_cols])

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_numerical_imputed)
    X_test_scaled = scaler.transform(X_test_numerical_imputed)

    # The processed frames hold only the selected feature columns, indexed by 'id'.
    X_train_processed = pd.DataFrame(X_train_scaled, columns=numerical_cols, index=X_train['id'])
    X_test_processed = pd.DataFrame(X_test_scaled, columns=numerical_cols, index=X_test['id'])

    # y_train is already row-aligned with X_train (same source frame, same order),
    # so re-indexing by 'id' is enough; a label-based re-lookup would multiply
    # rows if an id ever appeared more than once.
    y_train_processed = y_train.set_index(train_df['id'])

    return X_train_processed, y_train_processed, X_test_processed, numerical_cols, categorical_cols, scaler, numerical_imputer, target_columns

class MultiModalModel(nn.Module):
    """Regression network that fuses an image branch with a tabular branch.

    The image branch is a ResNet18 whose classification head is replaced by
    an identity layer; the tabular branch is a single hidden layer. Both
    feature vectors are concatenated and mapped to ``num_targets`` outputs.

    Args:
        num_tabular_features: Width of the tabular input vector.
        num_targets: Number of values predicted per sample.
    """

    def __init__(self, num_tabular_features, num_targets):
        super().__init__()

        # Image encoder: ResNet18 with default pre-trained weights; swapping
        # the final fully connected layer for Identity exposes the features
        # that would otherwise feed the classifier.
        backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        backbone.fc = nn.Identity()
        self.resnet = backbone

        # Tabular encoder: one linear layer with ReLU and light dropout.
        tabular_layers = [
            nn.Linear(num_tabular_features, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
        ]
        self.tabular_fc = nn.Sequential(*tabular_layers)

        # Fusion head: input width is 512 (ResNet18 features) + 128 (tabular).
        fusion_layers = [
            nn.Linear(512 + 128, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_targets),
        ]
        self.fusion_fc = nn.Sequential(*fusion_layers)

    def forward(self, image_input, tabular_input):
        """Encode both inputs, concatenate along dim 1, and predict targets."""
        fused = torch.cat(
            [self.resnet(image_input), self.tabular_fc(tabular_input)],
            dim=1,
        )
        return self.fusion_fc(fused)

class _CombinedTestDataset(Dataset):
    """Pair every test image with its already-processed tabular feature row.

    The class lives at module level (not inside ``main``) so that instances
    can be pickled when ``DataLoader`` worker processes are started with a
    start method other than ``fork``.

    Args:
        X_test_processed_df: Processed tabular features, indexed by sample id.
        img_dir: Directory (``pathlib.Path``) containing ``<id>.jpeg`` files.
        transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, X_test_processed_df, img_dir, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        # `ids` and `features` share the DataFrame's row order, so samples are
        # fetched by position. This avoids a label lookup per item and stays
        # one-dimensional even if an id were to appear more than once.
        self.ids = X_test_processed_df.index.values
        self.features = X_test_processed_df.to_numpy(dtype=np.float32)

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        # int() guarantees the file name has no trailing '.0' when the id
        # arrives as a float.
        img_path = self.img_dir / f"{int(self.ids[idx])}.jpeg"
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied into the converted image.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        return image, torch.tensor(self.features[idx])


def main(run_training=False): # Added run_training flag to control execution flow
    """Preprocess the data, predict on the test set, and write the submission.

    Args:
        run_training: Not used by this function (it only performs
            prediction with a previously saved model); accepted so existing
            callers that pass the flag keep working.

    Returns:
        pandas.DataFrame: One row per test sample with an ``id`` column
        followed by one column per entry of ``target_columns``.
    """
    (
        X_train_processed,
        y_train_processed,
        X_test_processed,
        numerical_cols,
        categorical_cols,
        _scaler,
        _numerical_imputer,
        target_columns,
    ) = preprocess_data()

    print("\n--- Preprocessing Summary ---")
    print(f"Shape of X_train_processed: {X_train_processed.shape}")
    print(f"Shape of y_train_processed: {y_train_processed.shape}")
    print(f"Shape of X_test_processed: {X_test_processed.shape}")
    print(f"Numerical columns processed: {numerical_cols[:5]}...")
    print(f"Categorical columns processed: {categorical_cols}") # Expected to be empty for this dataset
    print(f"Missing values in X_train_processed after imputation: {X_train_processed.isnull().sum().sum()}")
    print(f"Missing values in X_test_processed after imputation: {X_test_processed.isnull().sum().sum()}")

    # Save processed test data for potential later use (e.g., if this script was split)
    X_test_processed.to_csv(TEST_PROCESSED_PATH)
    print(f"Processed test features saved to {TEST_PROCESSED_PATH}")

    # Image preprocessing: resize, convert to tensor, normalize per channel.
    image_transform = transforms.Compose([
        transforms.Resize(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # X_test_processed carries the ids as its index and the processed tabular
    # features as its columns, so it is all the dataset needs.
    test_dataset = _CombinedTestDataset(X_test_processed, TEST_IMAGES_PATH, transform=image_transform)

    # shuffle=False keeps predictions in the same order as X_test_processed.
    BATCH_SIZE = 32
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

    # Model Loading and Prediction
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device for prediction: {device}")

    # Layer sizes are taken from the processed frames; they must match the
    # sizes used when the checkpoint at MODEL_PATH was trained, otherwise
    # load_state_dict raises a shape-mismatch error.
    num_tabular_features = X_train_processed.shape[1]
    num_targets = y_train_processed.shape[1]

    model = MultiModalModel(num_tabular_features, num_targets).to(device)

    # NOTE(review): torch.load unpickles the checkpoint, so only load files
    # from a trusted source (consider weights_only=True where the installed
    # PyTorch version supports it).
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))

    model.eval() # Set model to evaluation mode

    test_predictions = []
    # Gradients are not needed for inference.
    with torch.no_grad():
        for images, tabular_data in tqdm(test_dataloader, desc="Predicting on Test Set"):
            images, tabular_data = images.to(device), tabular_data.to(device)
            outputs = model(images, tabular_data)
            test_predictions.append(outputs.cpu().numpy())

    test_predictions = np.vstack(test_predictions)

    # Create submission DataFrame
    submission_df = pd.DataFrame(test_predictions, columns=target_columns)
    submission_df.insert(0, 'id', X_test_processed.index.values) # Add 'id' column from processed test data index

    # Save submission file
    submission_df.to_csv(SUBMISSION_PATH, index=False)
    print(f"Submission file generated and saved to {SUBMISSION_PATH}")

    return submission_df

# Notes on related fixes:
# - Stage 2's `main` function also needs the `tqdm` import and progress bars, and it must
#   convert NumPy floats to built-in floats before JSON serialization. The request here is
#   to fix Stage 3, so Stage 2 is assumed to be fixed already or to be fixed in the same way.
# - `TypeError: Object of type float32 is not JSON serializable` was the primary error in the
#   traceback; it is fixed in Stage 2 by using `float(rmse)`, `float(mae)`, `float(r2)`, and
#   `float(mean_r2_overall)`.
# - `RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x176 and 169x128)` occurred
#   because `preprocess_data` did not explicitly select only `numerical_cols` for the final
#   processed DataFrames, so the model expected 169 input features but received 176.
#   It is fixed by creating `X_train_processed` and `X_test_processed` from `numerical_cols` only.

if __name__ == '__main__':
    # Script entry point: run the prediction pipeline and keep the returned
    # submission DataFrame in `submission_df`.
    # The `run_training` flag is not used by this stage's `main`, which only
    # predicts with a pre-trained model. It is kept for consistency with the
    # Stage 2 `main`, where `run_training=True` trained the model and then
    # saved metrics (the source of the original traceback).
    # For submission, `run_training` should be False.
    submission_df = main(run_training=False)