# Part 3 Inference & Submission Generation

In [11]:
# Install required packages
!pip install lightgbm
!pip install polars==0.20.31

# Core imports
import subprocess
import sys
import polars as pl
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import Booster
import json
import pickle
import os
import gc
import psutil
import time
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Union
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Polars table *display* settings: cap how many rows / columns are printed when a
# DataFrame is shown. (These control printing only; they do not change memory use.)
pl.Config.set_tbl_rows(10)
pl.Config.set_tbl_cols(8)

# Banner printed once the install + import statements above have run without error.
print("Part 3: Optimized Inference & Submission Generation")
print("All required packages loaded successfully!")

Part 3: Optimized Inference & Submission Generation
All required packages loaded successfully!


In [12]:
# Mount Google Drive at /content/drive (only available inside a Google Colab runtime).
# Config.BASE_PATH further down points into this mount, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## CONFIGURATION AND SETUP

In [13]:
class Config:
    """Settings container for the Part 3 inference notebook.

    Values were chosen by the original author for a high-RAM (A100) Colab runtime.
    All attributes are plain class-level constants; a module-level instance named
    ``config`` is created right after the class.
    """

    # ---------------- Filesystem layout ----------------
    # Edit BASE_PATH so it matches your own Google Drive folder structure.
    BASE_PATH = "/content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content"
    DATA_PATH = BASE_PATH + "/otto-data"
    OUTPUT_PATH = BASE_PATH + "/otto-output"

    # ---------------- Batch sizes ----------------
    # Large batches to take advantage of abundant RAM (the earlier values,
    # e.g. 10000, were considered too small for this hardware).
    CHUNK_SIZE = 5_000_000
    PREDICTION_BATCH_SIZE = 1_000_000
    FEATURE_BATCH_SIZE = 5_000_000

    # ---------------- Memory limits ----------------
    # Loose thresholds intended for a machine with roughly 83.5 GB of RAM.
    MAX_MEMORY_GB = 75.0
    CRITICAL_MEMORY_THRESHOLD = 90.0

    # ---------------- Garbage collection cadence ----------------
    # Collect rarely: frequent collection slows the run down.
    GC_FREQUENCY = 50
    FORCE_GC_FREQUENCY = 20  # forced cleanup only after this many chunks

    # ---------------- Feature engineering ----------------
    MAX_FEATURES_TO_USE = 200  # upper bound on the number of features

    # ---------------- Prediction ----------------
    TOP_K_PREDICTIONS = 20
    FALLBACK_POPULAR_ITEMS = 50

    # ---------------- Validation & output ----------------
    REQUIRED_EVENT_TYPES = ["clicks", "carts", "orders"]
    SUBMISSION_COLUMNS = ["session_type", "labels"]
    SAVE_INTERMEDIATE_RESULTS = False  # leave disabled unless debugging

    # ---------------- Memory monitoring ----------------
    MEMORY_CHECK_FREQUENCY = 100_000  # check memory only this often


# Shared instance read by the functions defined in the cells below.
config = Config()

## UTILITY FUNCTIONS

In [14]:
import gc
import psutil
import time
import os
from datetime import datetime
from typing import Dict, List, Optional, Any
import polars as pl

def get_memory_usage() -> float:
    """Return the `used` figure from psutil's virtual-memory snapshot, in units of 1024**3 bytes."""
    bytes_per_gb = 1024 ** 3
    used_bytes = psutil.virtual_memory().used
    return used_bytes / bytes_per_gb

def get_memory_percent() -> float:
    """Return the `percent` figure from psutil's virtual-memory snapshot."""
    snapshot = psutil.virtual_memory()
    return snapshot.percent

def log_with_memory(message: str, level: str = "INFO"):
    """Print `message` prefixed with the wall-clock time, memory usage and `level`.

    Output format: ``[HH:MM:SS] [<used>GB/<percent>%] [<level>] <message>``.
    """
    used_gb = get_memory_usage()
    used_pct = get_memory_percent()
    clock = datetime.now().strftime("%H:%M:%S")
    prefix = f"[{clock}] [{used_gb:.1f}GB/{used_pct:.1f}%] [{level}]"
    print(f"{prefix} {message}")

def simplified_garbage_collection():
    """Run one garbage-collection pass, then pause for a tenth of a second."""
    pause_seconds = 0.1
    gc.collect()
    time.sleep(pause_seconds)

def monitor_memory(operation_name: str, force_cleanup: bool = False):
    """
    Log that `operation_name` has finished (with current memory figures).

    When `force_cleanup` is true, a garbage-collection pass is run afterwards;
    otherwise the function only logs.
    """
    log_with_memory(f"Completed operation: {operation_name}")
    if not force_cleanup:
        return
    log_with_memory("Performing simplified garbage collection...")
    simplified_garbage_collection()

def safe_file_size(filepath: str) -> float:
    """Return the size of `filepath` in MB (bytes / 1024 / 1024); 0.0 if the size cannot be read."""
    bytes_per_mb = 1024 * 1024
    try:
        size_in_bytes = os.path.getsize(filepath)
    except OSError:
        # Missing or inaccessible file: report it as empty rather than failing.
        return 0.0
    return size_in_bytes / bytes_per_mb

def safe_delete(*objects):
    """
    Run a garbage-collection pass once the caller is finished with `objects`.

    IMPORTANT: this function cannot free the objects passed to it. A callee has
    no way to unbind names in its caller's scope; the previous implementation
    executed ``del obj`` on its own loop variable, which released nothing, and
    wrapped that in a bare ``except`` that could never trigger. To actually
    release memory the caller must drop its own references first, e.g.::

        del big_frame
        safe_delete()

    The variadic signature is kept so existing ``safe_delete(a, b)`` calls keep
    working; the arguments are accepted and ignored.
    """
    simplified_garbage_collection()

def ensure_consistent_dtypes(df1: pl.DataFrame, df2: pl.DataFrame, join_keys: List[str]) -> tuple:
    """
    Align the dtypes of shared join-key columns before joining two frames.

    For every key present in both frames whose schema dtypes differ, a warning
    is logged and the column is cast to Int64 in both frames. Keys missing from
    either frame are left alone.

    Returns:
        (df1, df2) with any mismatching key columns cast.
    """
    for key in join_keys:
        if key not in df1.columns or key not in df2.columns:
            continue

        left_dtype = df1.schema[key]
        right_dtype = df2.schema[key]
        if left_dtype != right_dtype:
            log_with_memory(f"Fixing dtype mismatch for '{key}': {left_dtype} vs {right_dtype}", "WARNING")
            # Int64 is used as the common target type for ID columns.
            as_int64 = pl.col(key).cast(pl.Int64)
            df1 = df1.with_columns(as_int64)
            df2 = df2.with_columns(as_int64)

    return df1, df2

def chunk_dataframe(df: pl.DataFrame, chunk_size: int):
    """
    Yield consecutive slices of `df`, each holding at most `chunk_size` rows.

    The last slice is shorter when the row count is not a multiple of `chunk_size`.
    """
    row_count = len(df)
    for offset in range(0, row_count, chunk_size):
        rows_in_chunk = min(chunk_size, row_count - offset)
        yield df.slice(offset, rows_in_chunk)

## INPUT VALIDATION AND LOADING

In [15]:
def _check_input_file(filename: str, validation_results: Dict[str, Union[bool, List[str]]]) -> bool:
    """Record whether `filename` exists under config.OUTPUT_PATH and log its size when it does."""
    filepath = f"{config.OUTPUT_PATH}/{filename}"
    exists = os.path.exists(filepath)
    validation_results[filename] = exists
    if exists:
        file_size = safe_file_size(filepath)
        log_with_memory(f"Found: {filename} ({file_size:.1f} MB)")
    return exists


def validate_input_files() -> Dict[str, Union[bool, List[str]]]:
    """
    Check that the files this notebook reads exist under config.OUTPUT_PATH.

    Three groups are checked: required data files, ranking model files and
    optional files. Every required file must exist and at least one model file
    must exist; optional files are only reported.

    Returns:
        A dict mapping each checked filename to True/False, plus the key
        "available_models" holding the list of event types (e.g. "clicks")
        whose model file was found.

    Raises:
        FileNotFoundError: if any required file is missing, or if no model
            file is present.
    """
    log_with_memory("Validating input files...")

    required_files = {
        "test_candidates.parquet": "Test candidates from Part 2B1",
        "test_clean.parquet": "Clean test data from Part 1",
        "item_stats.parquet": "Item statistics from Part 1",
        "feature_columns.json": "Feature columns from Part 2B3"
    }

    model_files = {
        "ranker_clicks.txt": "Clicks ranking model from Part 2B4",
        "ranker_carts.txt": "Carts ranking model from Part 2B4",
        "ranker_orders.txt": "Orders ranking model from Part 2B4"
    }

    optional_files = {
        "evaluation_results.json": "Model performance metrics from Part 2B4"
    }

    validation_results: Dict[str, Union[bool, List[str]]] = {}

    # Required files: collect what is missing and report it in one go further down.
    missing_files = [
        f"{filename} - {description}"
        for filename, description in required_files.items()
        if not _check_input_file(filename, validation_results)
    ]

    # Model files: a missing model is only a warning, as long as one model exists.
    available_models = []
    for filename, description in model_files.items():
        if _check_input_file(filename, validation_results):
            # "ranker_clicks.txt" -> "clicks"
            available_models.append(filename.replace("ranker_", "").replace(".txt", ""))
        else:
            log_with_memory(f"Missing: {filename} - {description}", "WARNING")

    # Optional files: purely informational.
    for filename in optional_files:
        if not _check_input_file(filename, validation_results):
            log_with_memory(f"Not found: {filename} (optional)", "INFO")

    if missing_files:
        log_with_memory("MISSING REQUIRED FILES:", "ERROR")
        for missing in missing_files:
            log_with_memory(f"  {missing}", "ERROR")
        log_with_memory("", "ERROR")
        log_with_memory("TO FIX THIS:", "ERROR")
        log_with_memory("  1. Run Part 1 (Data Processing) to generate test data and item stats", "ERROR")
        log_with_memory("  2. Run Part 2B1 (Test Candidate Generation) to generate test_candidates.parquet", "ERROR")
        log_with_memory("  3. Run Part 2B3 (Feature Engineering) to generate feature_columns.json", "ERROR")
        log_with_memory("  4. Run Part 2B4 (Model Training) to generate ranking models", "ERROR")
        raise FileNotFoundError("Required input files are missing!")

    if not available_models:
        log_with_memory("NO RANKING MODELS FOUND!", "ERROR")
        log_with_memory("Cannot proceed without at least one trained model", "ERROR")
        raise FileNotFoundError("No ranking models available!")

    log_with_memory(f"Input validation passed! Available models: {available_models}")
    validation_results["available_models"] = available_models

    return validation_results

def load_all_inputs(validation_results: Dict) -> Tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame, List[str], Dict[str, Booster], Dict]:
    """
    Load every input the inference pipeline needs from config.OUTPUT_PATH.

    Args:
        validation_results: result of validate_input_files(). It is accepted for
            pipeline symmetry but not read here; model availability is re-checked
            on disk.

    Returns:
        (test_candidates, test_df, item_stats, feature_columns, ranking_models,
        evaluation_results) where
        - test_candidates / test_df have "session" and "aid" cast to Int64,
        - item_stats has "aid" cast to Int64 when that column exists,
        - feature_columns is whatever feature_columns.json contains,
        - ranking_models maps event type -> LightGBM Booster for each model that loaded,
        - evaluation_results is the parsed evaluation_results.json, or
          {"weighted_average": 0.0} when that optional file is absent or unreadable.

    Raises:
        ValueError: if not a single ranking model could be loaded.
    """
    log_with_memory("Loading all inputs...")

    id_casts = [
        pl.col("session").cast(pl.Int64),
        pl.col("aid").cast(pl.Int64)
    ]

    # Test candidates, with ID columns normalised to Int64 so later joins line up.
    log_with_memory("Loading test candidates...")
    test_candidates = pl.read_parquet(f"{config.OUTPUT_PATH}/test_candidates.parquet")
    test_candidates = test_candidates.with_columns(id_casts)
    log_with_memory(f"Test candidates: {test_candidates.shape}")

    # Clean test events, same ID normalisation.
    log_with_memory("Loading clean test data...")
    test_df = pl.read_parquet(f"{config.OUTPUT_PATH}/test_clean.parquet")
    test_df = test_df.with_columns(id_casts)
    log_with_memory(f"Test data: {test_df.shape}")

    # Item statistics; "aid" is cast only if the column is present.
    log_with_memory("Loading item statistics...")
    item_stats = pl.read_parquet(f"{config.OUTPUT_PATH}/item_stats.parquet")
    if "aid" in item_stats.columns:
        item_stats = item_stats.with_columns(pl.col("aid").cast(pl.Int64))
    log_with_memory(f"Item stats: {item_stats.shape}")

    # Feature column names saved by the feature-engineering step.
    log_with_memory("Loading feature columns...")
    with open(f"{config.OUTPUT_PATH}/feature_columns.json", "r") as f:
        feature_columns = json.load(f)
    log_with_memory(f"Feature columns: {len(feature_columns)} features")

    # One ranking model per event type; a model that fails to load is logged and skipped.
    log_with_memory("Loading trained ranking models...")
    ranking_models = {}
    for event_type in config.REQUIRED_EVENT_TYPES:
        model_path = f"{config.OUTPUT_PATH}/ranker_{event_type}.txt"
        if os.path.exists(model_path):
            try:
                model = Booster(model_file=model_path)
                ranking_models[event_type] = model
                log_with_memory(f"Loaded {event_type} model successfully")
            except Exception as e:
                log_with_memory(f"Failed to load {event_type} model: {e}", "ERROR")
        else:
            log_with_memory(f"No model found for {event_type}", "WARNING")

    if not ranking_models:
        raise ValueError("No valid ranking models loaded!")

    # Evaluation results are optional. Only "file missing/unreadable" (OSError) and
    # "not valid JSON / not a JSON object" (ValueError) fall back to the default;
    # a bare `except:` here would also swallow KeyboardInterrupt and SystemExit.
    evaluation_results = {}
    try:
        with open(f"{config.OUTPUT_PATH}/evaluation_results.json", "r") as f:
            loaded_results = json.load(f)
        if not isinstance(loaded_results, dict):
            raise ValueError("evaluation_results.json does not contain a JSON object")
        evaluation_results = loaded_results
        log_with_memory(f"Loaded evaluation results: {evaluation_results.get('weighted_average', 'N/A')}")
    except (OSError, ValueError):
        log_with_memory("No evaluation results found (optional)", "WARNING")
        evaluation_results = {"weighted_average": 0.0}

    log_with_memory("All inputs loaded successfully!")
    simplified_garbage_collection()

    return test_candidates, test_df, item_stats, feature_columns, ranking_models, evaluation_results

## FEATURE ENGINEERING FOR TEST DATA

In [16]:
def create_test_ranking_features(test_candidates: pl.DataFrame,
                                 test_df: pl.DataFrame,
                                 item_stats: pl.DataFrame) -> pl.DataFrame:
    """
    Build the ranking feature table for the test candidates.

    Three feature sets are computed with Polars expressions (per session, per
    item, per session-item pair) and left-joined onto `test_candidates` in that
    order. A few derived columns are then added and all nulls are filled with 0.

    Returns:
        `test_candidates` enriched with the feature columns.
    """
    log_with_memory("Starting high-performance feature engineering...")

    event_type = pl.col("type")

    def count_events(kind):
        # Number of rows in the group whose event type equals `kind`.
        return event_type.filter(event_type == kind).count()

    def safe_ratio(numerator, denominator):
        # Float32 ratio; the denominator is clipped to >= 1 to avoid dividing by zero.
        return pl.col(numerator).cast(pl.Float32) / pl.col(denominator).clip(1)

    def dense_rank_desc(column):
        # Dense rank where the largest value gets rank 1.
        return pl.col(column).rank("dense", descending=True)

    # STEP 1: per-session aggregates, followed by two columns derived from them.
    log_with_memory("Computing session-level features...")
    session_aggregates = [
        pl.col("aid").count().alias("session_length"),
        pl.col("aid").n_unique().alias("unique_items_in_session"),
        count_events("clicks").alias("num_clicks_in_session"),
        count_events("carts").alias("num_carts_in_session"),
        count_events("orders").alias("num_orders_in_session"),
        (pl.col("ts").max() - pl.col("ts").min()).alias("session_duration_ms"),
    ]
    session_derived = [
        safe_ratio("unique_items_in_session", "session_length").alias("session_diversity"),
        (pl.col("session_duration_ms") / 1000.0).alias("session_duration_s"),
    ]
    session_features = (
        test_df.group_by("session")
        .agg(session_aggregates)
        .with_columns(session_derived)
    )

    # STEP 2: per-item rates and popularity ranks derived from the item statistics.
    log_with_memory("Computing item-level features...")
    item_rate_columns = [
        safe_ratio("carts", "clicks").alias("item_cart_rate"),
        safe_ratio("orders", "clicks").alias("item_conversion_rate"),
        safe_ratio("orders", "carts").alias("item_buy_rate"),
    ]
    item_rank_columns = [
        dense_rank_desc(column).alias(f"{column}_rank")
        for column in ("clicks", "carts", "orders")
    ]
    item_features = item_stats.with_columns(item_rate_columns + item_rank_columns)

    # STEP 3: how often each item appears within each session.
    log_with_memory("Computing session-item interaction features...")
    pair_aggregates = [
        count_events("clicks").alias("item_clicks_in_session"),
        event_type.count().alias("item_interactions_in_session"),
    ]
    session_item_features = test_df.group_by(["session", "aid"]).agg(pair_aggregates)

    # STEP 4: left-join every feature set onto the candidates (dtypes aligned first).
    log_with_memory("Joining all feature sets...")
    test_candidates, session_features = ensure_consistent_dtypes(test_candidates, session_features, ["session"])
    test_features = test_candidates.join(session_features, on="session", how="left")

    test_features, item_features = ensure_consistent_dtypes(test_features, item_features, ["aid"])
    test_features = test_features.join(item_features, on="aid", how="left")

    pair_keys = ["session", "aid"]
    test_features, session_item_features = ensure_consistent_dtypes(test_features, session_item_features, pair_keys)
    test_features = test_features.join(session_item_features, on=pair_keys, how="left")

    # STEP 5: derived columns, then replace every remaining null with 0.
    log_with_memory("Creating final derived features and filling null values...")

    # Fall back to 1 when the maximum is null or 0 so the division below is safe.
    max_interactions = item_stats.select(pl.col("total_interactions").max()).item() or 1

    item_popularity = pl.col("total_interactions").cast(pl.Float32) / max_interactions
    # NOTE(review): the carts*6 / orders*3 weights are reproduced as-is; they
    # presumably have to match the feature definition used at training time.
    type_weighted_score = (
        pl.when(event_type == "clicks").then(pl.col("clicks"))
        .when(event_type == "carts").then(pl.col("carts") * 6)
        .otherwise(pl.col("orders") * 3)
    )
    derived_columns = [
        item_popularity.alias("item_popularity"),
        safe_ratio("num_clicks_in_session", "session_length").alias("session_click_rate"),
        type_weighted_score.alias("type_weighted_score"),
    ]
    test_features = test_features.with_columns(derived_columns).fill_null(0)

    log_with_memory(f"Feature engineering completed. Shape: {test_features.shape}")
    simplified_garbage_collection()
    return test_features

## PREDICTION GENERATION

In [17]:
def generate_all_predictions(test_features: pl.DataFrame,
                             ranking_models: Dict[str, Booster],
                             feature_columns: List[str],
                             item_stats: pl.DataFrame) -> pl.DataFrame:
    """
    Score the candidates of every event type and build per-session label strings.

    For each required event type that has a model, the matching candidate rows
    are scored in one batch, sorted by score, and the first
    config.TOP_K_PREDICTIONS item ids per session are joined into a
    space-separated string. Any (session, type) pair left without a prediction
    receives the most-ordered items from `item_stats` instead.

    `feature_columns` is accepted but not read here: the feature list comes from
    each model's own `feature_name()`.

    Returns:
        A frame with one row per (session, type) pair and a "labels" column.

    Raises:
        ValueError: if no event type produced any predictions.
    """
    log_with_memory("Starting high-performance prediction generation...")

    per_type_frames = []

    # Fallback item ids (most ordered first), materialised as a plain Python list.
    by_orders = item_stats.sort("orders", descending=True)
    popular_items_fallback = by_orders.head(config.TOP_K_PREDICTIONS)["aid"].to_list()

    for event_type in config.REQUIRED_EVENT_TYPES:
        log_with_memory(f"Processing event type: {event_type}")

        if event_type not in ranking_models:
            log_with_memory(f"No model found for {event_type}. Skipping.", "WARNING")
            continue
        ranker = ranking_models[event_type]

        # Candidate rows belonging to this event type only.
        candidates = test_features.filter(pl.col("type") == event_type)
        candidate_count = len(candidates)
        if candidate_count == 0:
            log_with_memory(f"No candidates found for {event_type}.", "INFO")
            continue

        log_with_memory(f"Predicting on {candidate_count:,} candidates for {event_type}...")

        # Any feature the model expects but the frame lacks is added as a constant 0 column.
        model_features = ranker.feature_name()
        missing_cols = set(model_features) - set(candidates.columns)
        if missing_cols:
            log_with_memory(f"Adding {len(missing_cols)} missing columns for prediction.", "WARNING")
            zero_columns = [pl.lit(0).alias(col) for col in missing_cols]
            candidates = candidates.with_columns(zero_columns)

        # Columns are selected in the model's own feature order and handed over as NumPy.
        feature_matrix = candidates.select(model_features).to_numpy()
        scores = ranker.predict(feature_matrix)

        scored = candidates.with_columns(pl.Series(name="score", values=scores))

        # Best-scoring items first, then keep the first K item ids per session.
        top_items = pl.col("aid").head(config.TOP_K_PREDICTIONS)
        top_per_session = (
            scored
            .sort("score", descending=True)
            .group_by("session", maintain_order=True)
            .agg(top_items)
        )

        # Item ids are integers, so each list element is cast to string before joining.
        labels_expr = pl.col("aid").list.eval(pl.element().cast(pl.String)).list.join(" ")
        event_submission = top_per_session.with_columns([
            labels_expr.alias("labels"),
            pl.lit(event_type).alias("type"),
        ]).select(["session", "type", "labels"])

        per_type_frames.append(event_submission)

        # Memory cleanup hook.
        safe_delete(candidates, feature_matrix, scores, scored, top_per_session)

    log_with_memory("Combining predictions from all event types...")
    if not per_type_frames:
        raise ValueError("No predictions were generated. Check models and input data.")

    combined_predictions = pl.concat(per_type_frames)

    # Every session present in the features, crossed with every required event type,
    # gives the full set of rows the output must contain.
    session_ids = test_features.select("session").unique()
    event_types = pl.DataFrame({"type": config.REQUIRED_EVENT_TYPES})
    all_session_types = session_ids.join(event_types, how="cross")

    # Pairs without a model prediction come out of the left join as null labels
    # and are filled with the popular-item fallback.
    fallback_labels = " ".join(map(str, popular_items_fallback))
    final_predictions = (
        all_session_types
        .join(combined_predictions, on=["session", "type"], how="left")
        .with_columns(pl.col("labels").fill_null(fallback_labels))
    )

    log_with_memory("Prediction generation complete.")
    return final_predictions

## SUBMISSION FILE CREATION

In [18]:
def create_submission_file(predictions: pl.DataFrame) -> pl.DataFrame:
    """
    Convert the predictions frame into the two-column submission layout.

    Args:
        predictions: frame with "session", "type" and "labels" columns, where
            "labels" is already a space-separated string.

    Returns:
        A frame with columns ["session_type", "labels"], where "session_type"
        is "<session>_<type>". Row order follows `predictions`.

    Raises:
        ValueError: if any of "session", "type" or "labels" is missing.
    """
    log_with_memory("Creating final submission file...")

    # "type" is needed to build session_type below, so it is validated together
    # with the other two columns instead of failing later inside Polars.
    required_columns = ("session", "type", "labels")
    missing_columns = [name for name in required_columns if name not in predictions.columns]
    if missing_columns:
        raise ValueError(f"Predictions DataFrame is missing required columns: {missing_columns}")

    # Vectorized creation of the 'session_type' column.
    session_type = pl.col("session").cast(pl.String) + "_" + pl.col("type")
    submission_df = predictions.with_columns(
        session_type.alias("session_type")
    ).select(["session_type", "labels"])

    log_with_memory(f"Submission file created with {len(submission_df):,} rows.")
    return submission_df

## SUBMISSION VALIDATION

In [19]:
def validate_submission_comprehensive(submission_df: pl.DataFrame,
                                    test_df: pl.DataFrame) -> bool:
    """
    Validate a submission frame against the test sessions.

    Checks, in order:
      1. the columns are exactly config.SUBMISSION_COLUMNS (in any order),
      2. every "session_type" value has the form "<int>_<event type>",
      3. every session in `test_df` appears in the submission,
      4. every submitted session has exactly the required event types,
      5. every row has exactly config.TOP_K_PREDICTIONS labels, and the labels
         of the first 100 rows all parse as integers,
      6. the row count equals sessions x event types.

    Columns are read by name, so the result does not depend on column order.
    Malformed "session_type" values and null labels are reported as validation
    failures instead of raising.

    Returns:
        True when every check passes, False (after logging the reason) otherwise.
    """
    log_with_memory("Performing comprehensive submission validation...")

    # 1. Required columns (order-insensitive).
    required_cols = config.SUBMISSION_COLUMNS
    if set(submission_df.columns) != set(required_cols):
        log_with_memory(f"ERROR: Incorrect columns. Expected {required_cols}, Found {submission_df.columns}", "ERROR")
        return False
    log_with_memory("Required columns present")

    # Pull both columns out by name once; the checks below are plain Python loops.
    session_type_values = submission_df["session_type"].to_list()
    label_values = submission_df["labels"].to_list()

    # 2. Parse "<session>_<event type>" in a single pass.
    submission_sessions = set()   # integer session ids, for the coverage check
    session_types_count = {}      # session id text -> set of event types seen
    for row_idx, session_type in enumerate(session_type_values):
        session_key, separator, event_type = (session_type or "").partition("_")
        try:
            session_id = int(session_key) if separator else None
        except ValueError:
            session_id = None
        if session_id is None:
            log_with_memory(f"ERROR: Row {row_idx}: Malformed session_type {session_type!r}", "ERROR")
            return False
        submission_sessions.add(session_id)
        session_types_count.setdefault(session_key, set()).add(event_type)

    # 3. Every test session must be covered.
    test_sessions = set(test_df["session"].unique().to_list())
    missing_sessions = test_sessions - submission_sessions
    if missing_sessions:
        log_with_memory(f"ERROR: Missing {len(missing_sessions)} test sessions", "ERROR")
        return False
    log_with_memory(f"All {len(test_sessions):,} test sessions covered")

    # 4. Every session must carry exactly the required event types.
    required_types = set(config.REQUIRED_EVENT_TYPES)
    incomplete_sessions = [
        session_key
        for session_key, seen_types in session_types_count.items()
        if seen_types != required_types
    ]
    if incomplete_sessions:
        log_with_memory(f"ERROR: {len(incomplete_sessions)} sessions missing event types", "ERROR")
        log_with_memory(f"First few: {incomplete_sessions[:5]}", "ERROR")
        return False
    log_with_memory("All sessions have all event types")

    # 5. Label count on every row; integer format on a sample of the first 100 rows.
    label_errors = 0
    for row_idx, labels in enumerate(label_values):
        label_list = (labels or "").split()  # a null label counts as zero labels

        if len(label_list) != config.TOP_K_PREDICTIONS:
            label_errors += 1
            if label_errors <= 5:  # only the first few are logged individually
                log_with_memory(f"ERROR: Row {row_idx}: Expected {config.TOP_K_PREDICTIONS} labels, got {len(label_list)}", "ERROR")

        if row_idx < 100:
            try:
                [int(x) for x in label_list]
            except ValueError:
                log_with_memory(f"ERROR: Row {row_idx}: Non-integer labels found", "ERROR")
                return False

    if label_errors > 0:
        log_with_memory(f"ERROR: {label_errors} rows with incorrect label count", "ERROR")
        return False
    log_with_memory(f"All rows have exactly {config.TOP_K_PREDICTIONS} labels")

    # 6. Exactly one row per (test session, event type) pair.
    total_expected_rows = len(test_sessions) * len(config.REQUIRED_EVENT_TYPES)
    if len(submission_df) != total_expected_rows:
        log_with_memory(f"ERROR: Expected {total_expected_rows} rows, got {len(submission_df)}", "ERROR")
        return False

    log_with_memory("Submission validation passed!")
    return True

## MAIN EXECUTION

In [20]:
def main_high_performance():
    """
    Run the whole Part 3 pipeline: validate inputs, load them, build features,
    predict, build the submission frame, sanity-check it and write it to
    ``{config.OUTPUT_PATH}/submission.csv``.

    Large intermediate frames are released with ``del`` as soon as the next step
    no longer needs them. (A helper function cannot unbind this function's local
    variables, so the release has to happen here.)

    Raises:
        Any exception from a pipeline step is logged with its traceback and
        re-raised unchanged. A ValueError is raised when the final sanity check
        on the submission frame fails.
    """
    start_time = time.time()
    log_with_memory("=" * 80)
    log_with_memory("OTTO PART 3: HIGH-PERFORMANCE INFERENCE & SUBMISSION GENERATION")
    log_with_memory("=" * 80)

    try:
        # Step 1: Validate inputs
        log_with_memory("Step 1: Validating input files...")
        validation_results = validate_input_files()

        # Step 2: Load all inputs (the evaluation results are not needed here)
        log_with_memory("Step 2: Loading all required inputs...")
        test_candidates, test_df, item_stats, feature_columns, ranking_models, _ = load_all_inputs(validation_results)

        log_with_memory(f"Loaded {len(feature_columns)} feature columns and {len(ranking_models)} models.")
        log_with_memory(f"Test candidates shape: {test_candidates.shape}")
        simplified_garbage_collection()

        # Step 3: Create features
        log_with_memory("Step 3: Creating test features...")
        test_features = create_test_ranking_features(test_candidates, test_df, item_stats)

        # Drop the raw frames before prediction; only test_features is needed from here on.
        del test_candidates, test_df
        simplified_garbage_collection()

        # Step 4: Generate all predictions in-memory
        log_with_memory("Step 4: Generating all predictions...")
        all_predictions = generate_all_predictions(test_features, ranking_models, feature_columns, item_stats)

        # Features and item statistics are no longer needed once predictions exist.
        del test_features, item_stats
        simplified_garbage_collection()

        # Step 5: Create the submission frame
        log_with_memory("Step 5: Creating submission file...")
        submission = create_submission_file(all_predictions)

        del all_predictions
        simplified_garbage_collection()

        # Step 6: Lightweight structural check, then save
        log_with_memory("Step 6: Validating and saving final submission...")
        is_valid = (
            len(submission.columns) == 2 and
            "session_type" in submission.columns and
            "labels" in submission.columns and
            submission.height > 0
        )
        log_with_memory(f"Submission validation: {'PASSED' if is_valid else 'FAILED'}")

        if not is_valid:
            raise ValueError("Submission validation failed. Check the generated file.")

        final_submission_path = f"{config.OUTPUT_PATH}/submission.csv"
        submission.write_csv(final_submission_path)

        end_time = time.time()
        runtime_minutes = (end_time - start_time) / 60

        log_with_memory("=" * 60)
        log_with_memory("HIGH-PERFORMANCE OTTO SOLUTION COMPLETED")
        log_with_memory("=" * 60)
        log_with_memory(f"Total Runtime: {runtime_minutes:.2f} minutes")
        log_with_memory(f"Submission Rows: {len(submission):,}")
        # This is a reading taken now, at the end of the run; no peak is tracked.
        log_with_memory(f"Current Memory Usage: {get_memory_usage():.1f}GB")
        log_with_memory(f"Final submission saved to: {final_submission_path}")

    except Exception as e:
        log_with_memory(f"An error occurred during execution: {e}", "CRITICAL")
        import traceback
        traceback.print_exc()
        # Bare `raise` re-raises the active exception with its original traceback.
        raise

# Main execution call: run the full pipeline when this cell (or the exported script)
# is executed directly rather than imported as a module.
if __name__ == "__main__":
    main_high_performance()

# Note: The original submission validation and creation functions can be removed
# as they have been replaced by faster, more direct versions.
# The code in this cell should replace all functions under the "MAIN EXECUTION" heading.

[12:32:26] [2.3GB/3.7%] [INFO] OTTO PART 3: HIGH-PERFORMANCE INFERENCE & SUBMISSION GENERATION
[12:32:26] [2.3GB/3.7%] [INFO] Step 1: Validating input files...
[12:32:26] [2.3GB/3.7%] [INFO] Validating input files...
[12:32:26] [2.3GB/3.7%] [INFO] Found: test_candidates.parquet (72.0 MB)
[12:32:26] [2.3GB/3.7%] [INFO] Found: test_clean.parquet (55.3 MB)
[12:32:26] [2.3GB/3.7%] [INFO] Found: item_stats.parquet (0.0 MB)
[12:32:26] [2.3GB/3.7%] [INFO] Found: feature_columns.json (0.0 MB)
[12:32:26] [2.3GB/3.7%] [INFO] Found: ranker_clicks.txt (0.0 MB)
[12:32:26] [2.3GB/3.7%] [INFO] Found: ranker_carts.txt (0.0 MB)
[12:32:26] [2.3GB/3.7%] [INFO] Found: ranker_orders.txt (0.0 MB)
[12:32:26] [2.3GB/3.7%] [INFO] Found: evaluation_results.json (0.0 MB)
[12:32:26] [2.3GB/3.7%] [INFO] Input validation passed! Available models: ['clicks', 'carts', 'orders']
[12:32:26] [2.3GB/3.7%] [INFO] Step 2: Loading all required inputs...
[12:32:26] [2.3GB/3.7%] [INFO] Loading all inputs...
[12:32:26] [2.3GB/