# Part 2B3 Feature Engineering for Ranking

In [1]:
# INSTALL AND IMPORT DEPENDENCIES
import subprocess
import sys

# Install the pinned polars release with pip, using the interpreter that runs
# this notebook. check=True raises CalledProcessError if pip exits non-zero.
# This must run before the `import polars` below.
subprocess.run([sys.executable, "-m", "pip", "install", "polars==0.20.31"], check=True)

import polars as pl
import pandas as pd
import numpy as np
import pickle
import json
import os
import gc
import time
import psutil
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Set, Optional, Union
from collections import defaultdict, Counter
import warnings
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# MOUNT GOOGLE DRIVE
# Config.DATA_PATH / Config.OUTPUT_PATH live under /content/drive, so the
# mount has to succeed before any file is read or written.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## ENHANCED CONFIGURATION

In [2]:
class Config:
    """Central settings for the Part 2B3 feature-engineering notebook.

    Every setting is a plain class attribute. The module-level ``config``
    instance created right after the class is what the notebook code reads.
    """

    # --- Google Drive locations -------------------------------------------
    DATA_PATH: str = '/content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-data'
    OUTPUT_PATH: str = '/content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-output'

    # --- Feature engineering parameters -----------------------------------
    # Look-back window (days) for session features
    MAX_SESSION_HISTORY_DAYS: int = 7
    # Minimum interactions for item features
    MIN_ITEM_INTERACTIONS: int = 5
    # Top items for popularity features
    TOP_K_ITEMS: int = 1000

    # --- Memory management -------------------------------------------------
    # Passed to pl.Config.set_streaming_chunk_size further down
    CHUNK_SIZE: int = 5000
    # Memory usage threshold expressed as a fraction
    MEMORY_THRESHOLD: float = 0.80

    # --- Feature family switches ------------------------------------------
    ENABLE_TEMPORAL_FEATURES: bool = True       # time-based features
    ENABLE_STATISTICAL_FEATURES: bool = True    # statistical aggregations
    ENABLE_INTERACTION_FEATURES: bool = True    # cross-feature interactions
    ENABLE_SESSION_HISTORY: bool = True         # session history features

    # --- Quality thresholds -------------------------------------------------
    MIN_FEATURE_CORRELATION: float = 0.01       # minimum useful correlation
    MIN_FEATURE_VARIANCE: float = 0.001         # minimum feature variance


# Shared settings object used throughout the notebook
config = Config()

# Configure Polars for enhanced performance
# Global, process-wide Polars settings; they apply to every frame created later.
pl.enable_string_cache()
# Streaming chunk size is taken from Config.CHUNK_SIZE.
pl.Config.set_streaming_chunk_size(config.CHUNK_SIZE)
# The next two settings only affect how frames are printed, not how they are computed.
pl.Config.set_fmt_str_lengths(50)
pl.Config.set_tbl_rows(10)

## ENHANCED UTILITY FUNCTIONS

In [3]:
def get_memory_usage():
    """Return the resident set size (RSS) of the current process, in GB."""
    bytes_per_gb = 1024**3
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / bytes_per_gb

def log(message: str):
    """Print ``message`` to stdout prefixed with a local ``[YYYY-MM-DD HH:MM:SS]`` timestamp.

    Memory usage is deliberately not sampled here: the value was previously
    computed on every call but never printed. Use ``log_memory`` when the
    memory figure should appear in the output.

    Args:
        message: Text to print after the timestamp prefix.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] {message}")

def log_memory(operation_name: str, logger_func=log):
    """Emit ``operation_name`` followed by the current process memory in MB.

    Args:
        operation_name: Label placed in front of the memory figure.
        logger_func: Callable that receives the finished message
            (defaults to ``log``).
    """
    usage_mb = get_memory_usage() * 1024
    message = f"{operation_name} [Memory: {usage_mb:.1f} MB]"
    logger_func(message)

def force_garbage_collection():
    """Run three consecutive ``gc.collect()`` passes, then pause for 0.1 s."""
    remaining_passes = 3
    while remaining_passes > 0:
        gc.collect()
        remaining_passes -= 1
    time.sleep(0.1)

def safe_division(numerator, denominator, default=0.0):
    """Return ``numerator / denominator``, or ``default`` when the denominator is zero."""
    if denominator != 0:
        return numerator / denominator
    return default

def ensure_consistent_dtypes(df1: pl.DataFrame, df2: pl.DataFrame, join_keys: List[str]) -> tuple:
    """
    Align the dtypes of shared join-key columns so the two frames can be joined.

    For every key in ``join_keys`` that exists in both frames and whose dtypes
    differ, both columns are cast to a common target type:

    * ``session`` and ``aid`` are always cast to Int64;
    * any other key is cast to Float64 when either side is a float (so
      fractional key values are not truncated by an integer cast), to Int64
      when the mismatch is between integer types, and to Float64 otherwise.

    Args:
        df1: Left frame.
        df2: Right frame.
        join_keys: Column names intended to be used as join keys.

    Returns:
        ``(df1, df2)`` with the mismatched key columns cast. If any step
        raises, the error is logged and the frames are returned in whatever
        state they had reached at that point (best effort, never raises).
    """
    try:
        for key in join_keys:
            # Keys missing from either side cannot be aligned; skip them
            if key not in df1.columns or key not in df2.columns:
                continue

            dtype1 = df1.schema[key]
            dtype2 = df2.schema[key]
            if dtype1 == dtype2:
                continue

            log(f"      Fixing dtype mismatch for '{key}': {dtype1} vs {dtype2}")

            if key in ("session", "aid"):
                # Identifier columns are standardised on Int64 across the notebook
                target_dtype = pl.Int64
            else:
                dtype_names = (str(dtype1), str(dtype2))
                if any("Float" in name for name in dtype_names):
                    # Int vs Float: Float64 is the wider type; casting the
                    # float side to Int64 would truncate fractional values
                    target_dtype = pl.Float64
                elif any("Int" in name for name in dtype_names):
                    target_dtype = pl.Int64
                else:
                    target_dtype = pl.Float64

            # Cast both DataFrames
            df1 = df1.with_columns(pl.col(key).cast(target_dtype))
            df2 = df2.with_columns(pl.col(key).cast(target_dtype))

            log(f"      Cast both to {target_dtype}")

        return df1, df2
    except Exception as e:
        log(f"      Error ensuring consistent dtypes: {e}")
        return df1, df2

## ENHANCED INPUT VALIDATION AND LOADING

In [4]:
def _cast_id_columns(frame, columns=("session", "aid")):
    """Cast whichever of ``columns`` exist in ``frame`` to Int64 (DataFrame or LazyFrame)."""
    present = [name for name in columns if name in frame.columns]
    if not present:
        return frame
    return frame.with_columns([pl.col(name).cast(pl.Int64) for name in present])


def _format_count(value) -> str:
    """Format ``value`` with thousands separators, falling back to ``str`` for non-numbers."""
    try:
        return f"{value:,}"
    except (TypeError, ValueError):
        return str(value)


def validate_and_load_enhanced_inputs():
    """
    Check that the required input files exist under ``config.OUTPUT_PATH`` and load them.

    Loads ``val_data.parquet``, ``train_val_splits.pkl`` and
    ``item_stats.parquet``; ``train_features.parquet`` is loaded only when it
    exists and ``config.ENABLE_SESSION_HISTORY`` is set (lazily via
    ``pl.scan_parquet`` when the file is larger than 2000 MB, eagerly
    otherwise). ``session`` / ``aid`` columns are cast to Int64 on every
    loaded frame, including the lazily scanned one.

    Returns:
        Tuple ``(val_data, split_info, item_stats, train_data, validation_results)``
        where ``train_data`` is a DataFrame, a LazyFrame, or ``None`` and
        ``validation_results`` is a dict of summary statistics.

    Raises:
        FileNotFoundError: If a required file is missing.
        Exception: Any error raised while loading the required inputs is
            logged and re-raised unchanged. Failures while loading the
            optional training data are logged and tolerated.
    """
    log("Validating enhanced input files...")

    # Check required files
    required_files = {
        "val_data.parquet": "Validation data from FIXED Part 2B2",
        "train_val_splits.pkl": "Train/validation splits from FIXED Part 2B2",
        "item_stats.parquet": "Item statistics from Part 1"
    }

    optional_files = {
        "train_features.parquet": "Training data for session history (optional but recommended)"
    }

    # Validate required files
    for filename, description in required_files.items():
        filepath = f"{config.OUTPUT_PATH}/{filename}"
        if not os.path.exists(filepath):
            log(f"ERROR: Missing {filename} - {description}")
            raise FileNotFoundError(f"Missing required file: {filename}")

        file_size = os.path.getsize(filepath) / (1024*1024)
        log(f"   {filename} - {file_size:.1f} MB")

    # Check optional files
    for filename, description in optional_files.items():
        filepath = f"{config.OUTPUT_PATH}/{filename}"
        if os.path.exists(filepath):
            file_size = os.path.getsize(filepath) / (1024*1024)
            log(f"   {filename} - {file_size:.1f} MB (optional)")
        else:
            log(f"   {filename} - Not available ({description})")

    log("Required input files validated!")

    # Load data with enhanced error handling
    log("\nLoading enhanced input data...")

    try:
        # Load validation data
        log("  Loading validation data...")
        val_data = pl.read_parquet(f"{config.OUTPUT_PATH}/val_data.parquet")

        # Ensure session/aid are Int64 from the start so later joins agree on dtype
        val_data = _cast_id_columns(val_data)

        log(f"    Validation data: {val_data.shape} ({val_data.estimated_size('mb'):.1f} MB)")

        # Validate validation data quality
        positive_count = val_data.filter(pl.col("label") == 1).height
        positive_rate = positive_count / len(val_data) * 100 if len(val_data) > 0 else 0
        unique_sessions = val_data['session'].n_unique()
        unique_items = val_data['aid'].n_unique()

        log(f"    Data quality: {len(val_data):,} samples, {positive_count:,} positive ({positive_rate:.2f}%)")
        log(f"    Diversity: {unique_sessions:,} sessions, {unique_items:,} items")

        if positive_rate < 0.1:
            log(f"    WARNING: Low positive rate {positive_rate:.2f}% - may affect feature correlations")

        log_memory("After loading validation data")

        # Load train/validation split info
        # NOTE(review): pickle.load can execute arbitrary code; this is only
        # acceptable because the file is expected to be produced by an earlier
        # part of this pipeline - do not point it at untrusted files.
        log("  Loading train/validation split info...")
        with open(f"{config.OUTPUT_PATH}/train_val_splits.pkl", "rb") as f:
            split_info = pickle.load(f)

        available_keys = list(split_info.keys())
        log(f"    Available keys in split_info: {available_keys}")

        val_sessions = split_info.get('val', {}).get('sessions', 'unknown')
        train_sessions = split_info.get('train', {}).get('sessions', 'unknown')
        # The 'unknown' default is a string, which cannot take the ',' format
        # spec; _format_count keeps this log line from aborting the load.
        log(
            f"    Split info - Train sessions: {_format_count(train_sessions)}, "
            f"Val sessions: {_format_count(val_sessions)}"
        )

        # Load item statistics
        log("  Loading item statistics...")
        item_stats = pl.read_parquet(f"{config.OUTPUT_PATH}/item_stats.parquet")
        # Ensure aid column is Int64
        item_stats = _cast_id_columns(item_stats, ("aid",))
        log(f"    Item stats: {item_stats.shape} ({item_stats.estimated_size('mb'):.1f} MB)")

        # Load training data for session history (optional)
        train_data = None
        train_data_available = False

        train_features_path = f"{config.OUTPUT_PATH}/train_features.parquet"
        if os.path.exists(train_features_path) and config.ENABLE_SESSION_HISTORY:
            try:
                log("  Loading training data for session history...")
                file_size_mb = os.path.getsize(train_features_path) / (1024*1024)
                log(f"    Training data file size: {file_size_mb:.1f} MB")

                if file_size_mb > 2000:  # Large file, use lazy loading
                    log("    Using lazy loading for large training dataset...")
                    # Apply the same Int64 casts as the eager path; on a
                    # LazyFrame they are only recorded in the query plan.
                    train_data = _cast_id_columns(pl.scan_parquet(train_features_path))
                    train_data_available = True
                    log(f"    Training data loaded lazily for session history features")
                else:
                    log("    Loading training data fully...")
                    train_data = _cast_id_columns(pl.read_parquet(train_features_path))
                    train_data_available = True
                    log(f"    Training data: {train_data.shape} ({train_data.estimated_size('mb'):.1f} MB)")

                log_memory("After loading training data")

            except Exception as e:
                # Training data is optional: log and continue without it
                log(f"    Warning: Could not load training data for session history: {e}")
                train_data = None
                train_data_available = False
        else:
            log("  Training data not available - session history features will be limited")

        # Create comprehensive validation results
        validation_results = {
            "timestamp": datetime.now().isoformat(),
            "val_data_samples": len(val_data),
            "val_sessions": unique_sessions,
            "val_items": unique_items,
            "val_positive_samples": positive_count,
            "val_positive_rate": positive_rate,
            "train_data_available": train_data_available,
            "item_stats_count": len(item_stats),
            "train_sessions": train_sessions,
            "split_info_keys": available_keys
        }

        log("Enhanced input validation completed successfully!")
        return val_data, split_info, item_stats, train_data, validation_results

    except Exception as e:
        log(f"Error loading input data: {e}")
        # Bare raise keeps the original traceback intact
        raise

# Load and validate inputs
# Binds the five module-level names used by the rest of the notebook;
# train_data may be None when the optional training file could not be loaded.
val_data, split_info, item_stats, train_data, validation_results = validate_and_load_enhanced_inputs()

# Force garbage collection after loading
force_garbage_collection()
# Report the process memory once collection has run.
log_memory("After initial data loading and GC")

[2025-08-08 03:59:06] Validating enhanced input files...
[2025-08-08 03:59:06]    val_data.parquet - 24.4 MB
[2025-08-08 03:59:06]    train_val_splits.pkl - 0.0 MB
[2025-08-08 03:59:06]    item_stats.parquet - 0.0 MB
[2025-08-08 03:59:06]    train_features.parquet - 3893.1 MB (optional)
[2025-08-08 03:59:06] Required input files validated!
[2025-08-08 03:59:06] 
Loading enhanced input data...
[2025-08-08 03:59:06]   Loading validation data...
[2025-08-08 03:59:06]     Validation data: (6617259, 4) (187.3 MB)
[2025-08-08 03:59:07]     Data quality: 6,617,259 samples, 55,688 positive (0.84%)
[2025-08-08 03:59:07]     Diversity: 25,000 sessions, 543,691 items
[2025-08-08 03:59:07] After loading validation data [Memory: 466.3 MB]
[2025-08-08 03:59:07]   Loading train/validation split info...
[2025-08-08 03:59:07]     Available keys in split_info: ['creation_timestamp', 'validation_days', 'val_cutoff_timestamp', 'total_timespan_days', 'train', 'val']
[2025-08-08 03:59:07]     Split info - T

## ENHANCED FEATURE ENGINEERING SYSTEM

In [5]:
class EnhancedFeatureEngineer:
    """
    Comprehensive feature engineering system with 40+ sophisticated features (COMPLETE FIXED VERSION)
    """

    def __init__(self, item_stats: pl.DataFrame, train_data=None):
        """
        Store the inputs and build per-item lookup dictionaries.

        Args:
            item_stats: Item statistics frame; each row must expose an ``aid`` value.
            train_data: Optional training data kept for later use by the
                feature methods (``None`` when unavailable).
        """
        # Bookkeeping filled in by the create_* methods
        self.feature_columns = []
        self.feature_stats = {}

        self.item_stats = item_stats
        self.train_data = train_data

        # Lookup tables keyed by aid for fast access
        popularity_by_aid = self.item_popularity = {}
        stats_by_aid = self.item_stats_dict = {}

        has_rows = len(item_stats) > 0
        if has_rows:
            for record in item_stats.iter_rows(named=True):
                item_id = record['aid']
                # Popularity prefers 'total_interactions', then 'clicks', then 0
                clicks_or_zero = record.get('clicks', 0)
                popularity_by_aid[item_id] = record.get('total_interactions', clicks_or_zero)
                stats_by_aid[item_id] = record

        log(f"Feature engineer initialized with {len(self.item_stats_dict):,} items")

    def create_session_features(self, df: pl.DataFrame) -> pl.DataFrame:
        """
        Aggregate ``df`` to one row per session with count and ratio features.

        Reads the ``session``, ``aid`` and ``type`` columns of ``df``. The
        names of all produced features are appended to
        ``self.feature_columns``. If the full computation raises, the error is
        logged and a reduced feature set (session length, distinct items and
        per-type counts) is computed and returned instead.
        """
        log("    Creating session-level features...")

        # (event type, count column, distinct-item column)
        per_type_columns = (
            ("clicks", "num_clicks", "unique_clicked_items"),
            ("carts", "num_carts", "unique_carted_items"),
            ("orders", "num_orders", "unique_ordered_items"),
        )

        def events_of(column, event_type):
            # Values of `column` restricted to rows of the given event type
            return pl.col(column).filter(pl.col("type") == event_type)

        def as_float(column):
            return pl.col(column).cast(pl.Float64)

        def at_least_one(column):
            # Denominator guard: never divide by less than 1
            return pl.col(column).clip(lower_bound=1)

        try:
            # Basic counts, then per-type counts, then per-type distinct items
            aggregations = [
                pl.col("aid").count().alias("session_length"),
                pl.col("aid").n_unique().alias("unique_items"),
                pl.col("type").n_unique().alias("unique_types"),
            ]
            aggregations += [
                events_of("aid", event_type).count().alias(count_name)
                for event_type, count_name, _ in per_type_columns
            ]
            aggregations += [
                events_of("aid", event_type).n_unique().alias(unique_name)
                for event_type, _, unique_name in per_type_columns
            ]

            total_actions = pl.col("num_clicks") + pl.col("num_carts") + pl.col("num_orders")
            derived = [
                # Conversion rates
                (as_float("num_carts") / at_least_one("num_clicks").cast(pl.Float64)).alias("cart_conversion_rate"),
                (as_float("num_orders") / at_least_one("num_clicks").cast(pl.Float64)).alias("order_conversion_rate"),
                (as_float("num_orders") / at_least_one("num_carts").cast(pl.Float64)).alias("cart_to_order_rate"),
                # Item interaction patterns
                (as_float("session_length") / at_least_one("unique_items")).alias("avg_interactions_per_item"),
                (as_float("unique_items") / at_least_one("session_length")).alias("item_diversity_ratio"),
                # Behavioural metrics
                (total_actions.cast(pl.Float64) / at_least_one("session_length")).alias("action_intensity"),
                (as_float("unique_types") / 3.0).alias("type_diversity_ratio"),
            ]

            per_session = (
                df.group_by("session")
                .agg(aggregations)
                # Keep the join key as Int64
                .with_columns(pl.col("session").cast(pl.Int64))
                .with_columns(derived)
            )

            produced = [
                "session_length", "unique_items", "unique_types",
                "num_clicks", "num_carts", "num_orders",
                "unique_clicked_items", "unique_carted_items", "unique_ordered_items",
                "cart_conversion_rate", "order_conversion_rate", "cart_to_order_rate",
                "avg_interactions_per_item", "item_diversity_ratio",
                "action_intensity", "type_diversity_ratio"
            ]
            self.feature_columns.extend(produced)
            log(f"      Added {len(produced)} session features")

            return per_session

        except Exception as e:
            log(f"      Error creating session features: {e}")
            # Reduced feature set used when the full computation fails
            fallback_aggs = [
                pl.col("aid").count().alias("session_length"),
                pl.col("aid").n_unique().alias("unique_items"),
            ]
            fallback_aggs += [
                events_of("type", event_type).count().alias(count_name)
                for event_type, count_name, _ in per_type_columns
            ]
            reduced = (
                df.group_by("session")
                .agg(fallback_aggs)
                .with_columns(pl.col("session").cast(pl.Int64))
            )
            self.feature_columns.extend(["session_length", "unique_items", "num_clicks", "num_carts", "num_orders"])
            return reduced

    def create_item_features(self, df: pl.DataFrame) -> pl.DataFrame:
        """
        Build one row of item-level features per distinct ``aid`` in ``df``.

        Raw counts come from the lookup dictionaries built in ``__init__``
        (items missing from them get zero counts and ``item_unique_users`` 1).
        Derived columns add conversion rates, log-scaled popularity and a
        1-5 popularity bucket whose thresholds are derived from the mean and
        standard deviation of ``item_popularity``.

        Args:
            df: Frame with an ``aid`` column.

        Returns:
            Frame keyed by ``aid`` (Int64). When ``df`` contains no items an
            empty frame with typed ``aid`` / ``item_popularity`` columns is
            returned. If the main computation raises, the error is logged and
            a minimal frame (``aid``, ``item_popularity``,
            ``item_basic_feature``) is returned instead.

        Side effects:
            Appends the produced feature names to ``self.feature_columns``.
        """
        log("    Creating item-level features...")

        try:
            # Get unique items in the dataset, with the join key as Int64
            unique_items = df.select("aid").unique()
            unique_items = unique_items.with_columns(pl.col("aid").cast(pl.Int64))

            # Create item features using the pre-computed stats
            item_features_data = []

            for aid in unique_items['aid'].to_list():
                stats = self.item_stats_dict.get(aid, {})

                item_features_data.append({
                    "aid": aid,
                    "item_popularity": self.item_popularity.get(aid, 0),
                    "item_clicks": stats.get('clicks', 0),
                    "item_carts": stats.get('carts', 0),
                    "item_orders": stats.get('orders', 0),
                    "item_total_interactions": stats.get('total_interactions', 0),
                    "item_unique_users": stats.get('unique_users', 1),  # Default to 1 to avoid division by zero
                })

            if not item_features_data:
                log("      Warning: No item features data available")
                # Typed empty frame: an untyped one would not share the Int64
                # join-key dtype used everywhere else for `aid`
                return pl.DataFrame(schema={"aid": pl.Int64, "item_popularity": pl.Int64})

            item_features = pl.DataFrame(item_features_data)
            item_features = item_features.with_columns(pl.col("aid").cast(pl.Int64))

            # Calculate derived item features; denominators are clipped to >= 1
            item_features = item_features.with_columns([
                # Conversion rates
                (pl.col("item_carts").cast(pl.Float64) / pl.col("item_clicks").clip(lower_bound=1).cast(pl.Float64)).alias("item_click_to_cart_rate"),
                (pl.col("item_orders").cast(pl.Float64) / pl.col("item_clicks").clip(lower_bound=1).cast(pl.Float64)).alias("item_click_to_order_rate"),
                (pl.col("item_orders").cast(pl.Float64) / pl.col("item_carts").clip(lower_bound=1).cast(pl.Float64)).alias("item_cart_to_order_rate"),

                # Engagement metrics
                (pl.col("item_total_interactions").cast(pl.Float64) / pl.col("item_unique_users").clip(lower_bound=1).cast(pl.Float64)).alias("item_avg_interactions_per_user"),

                # Popularity metrics. log1p already computes ln(1 + x), so no
                # extra "+ 1" is applied; both log features use the same scale.
                pl.col("item_popularity").log1p().alias("item_popularity_log"),
                pl.col("item_total_interactions").log1p().alias("item_total_interactions_log")
            ])

            # Popularity buckets from manual thresholds (instead of qcut)
            try:
                popularity_stats = item_features.select([
                    pl.col("item_popularity").min().alias("min_pop"),
                    pl.col("item_popularity").max().alias("max_pop"),
                    pl.col("item_popularity").mean().alias("mean_pop"),
                    pl.col("item_popularity").std().alias("std_pop")
                ]).to_dicts()[0]

                # Null statistics (e.g. std of a single row) fall back to defaults
                max_pop = popularity_stats["max_pop"] or 0
                mean_pop = popularity_stats["mean_pop"] or 0
                std_pop = popularity_stats["std_pop"] or 1

                # Thresholds are forced to be strictly increasing and >= 1
                if max_pop > 0 and std_pop > 0:
                    threshold_1 = max(1, int(mean_pop - std_pop))
                    threshold_2 = max(threshold_1 + 1, int(mean_pop - 0.5 * std_pop))
                    threshold_3 = max(threshold_2 + 1, int(mean_pop))
                    threshold_4 = max(threshold_3 + 1, int(mean_pop + 0.5 * std_pop))
                else:
                    threshold_1, threshold_2, threshold_3, threshold_4 = 1, 2, 5, 10

                # Create popularity buckets
                item_features = item_features.with_columns([
                    pl.when(pl.col("item_popularity") <= threshold_1)
                    .then(pl.lit(1))
                    .when(pl.col("item_popularity") <= threshold_2)
                    .then(pl.lit(2))
                    .when(pl.col("item_popularity") <= threshold_3)
                    .then(pl.lit(3))
                    .when(pl.col("item_popularity") <= threshold_4)
                    .then(pl.lit(4))
                    .otherwise(pl.lit(5))
                    .alias("item_popularity_bucket")
                ])

            except Exception as e:
                log(f"      Warning: Could not create popularity buckets: {e}")
                # Simple fallback bucketing with fixed thresholds
                item_features = item_features.with_columns([
                    pl.when(pl.col("item_popularity") == 0).then(pl.lit(1))
                    .when(pl.col("item_popularity") <= 5).then(pl.lit(2))
                    .when(pl.col("item_popularity") <= 20).then(pl.lit(3))
                    .when(pl.col("item_popularity") <= 100).then(pl.lit(4))
                    .otherwise(pl.lit(5))
                    .alias("item_popularity_bucket")
                ])

            # Add to feature list
            new_features = [
                "item_popularity", "item_clicks", "item_carts", "item_orders",
                "item_total_interactions", "item_unique_users",
                "item_click_to_cart_rate", "item_click_to_order_rate", "item_cart_to_order_rate",
                "item_avg_interactions_per_user", "item_popularity_log", "item_total_interactions_log",
                "item_popularity_bucket"
            ]

            self.feature_columns.extend(new_features)
            log(f"      Added {len(new_features)} item features")

            return item_features

        except Exception as e:
            log(f"      Error creating item features: {e}")
            # Minimal fallback: popularity lookup only
            unique_items = df.select("aid").unique()
            unique_items = unique_items.with_columns(pl.col("aid").cast(pl.Int64))
            minimal_features = unique_items.with_columns([
                pl.col("aid").map_elements(
                    lambda x: self.item_popularity.get(x, 0),
                    return_dtype=pl.Int64
                ).alias("item_popularity"),
                pl.lit(1).alias("item_basic_feature")
            ])
            self.feature_columns.extend(["item_popularity", "item_basic_feature"])
            return minimal_features

    def create_interaction_features(self, df: pl.DataFrame) -> pl.DataFrame:
        """
        Aggregate ``df`` to one row per (session, aid) pair.

        Produces per-type interaction counts, their share of the pair's total
        interactions, 0/1 indicator columns, and the first/last interaction
        type in row order. Feature names are appended to
        ``self.feature_columns``. If the full computation raises, the error is
        logged and only the four count columns are computed and returned.
        """
        log("    Creating interaction features...")

        total_col = "item_total_interactions_in_session"

        def count_of(event_type):
            # Number of rows of this event type within the group
            return pl.col("type").filter(pl.col("type") == event_type).count()

        def share_of_total(count_column):
            # Count divided by the pair's total, with the total floored at 1
            return pl.col(count_column).cast(pl.Float64) / pl.col(total_col).clip(lower_bound=1)

        def indicator(condition, name):
            # Boolean expression -> Int32 0/1 column
            return condition.cast(pl.Int32).alias(name)

        def with_int64_keys(frame):
            # Keep both join keys as Int64
            return frame.with_columns([
                pl.col("session").cast(pl.Int64),
                pl.col("aid").cast(pl.Int64)
            ])

        try:
            aggregations = [
                # Interaction counts within the session
                count_of("clicks").alias("item_clicks_in_session"),
                count_of("carts").alias("item_carts_in_session"),
                count_of("orders").alias("item_orders_in_session"),
                # Interaction patterns
                pl.col("type").count().alias("item_total_interactions_in_session"),
                pl.col("type").n_unique().alias("item_interaction_types"),
                # Sequence features
                pl.col("type").first().alias("item_first_interaction"),
                pl.col("type").last().alias("item_last_interaction"),
            ]
            per_pair = with_int64_keys(df.group_by(["session", "aid"]).agg(aggregations))

            derived = [
                # Interaction ratios
                share_of_total("item_clicks_in_session").alias("item_click_ratio_in_session"),
                share_of_total("item_carts_in_session").alias("item_cart_ratio_in_session"),
                share_of_total("item_orders_in_session").alias("item_order_ratio_in_session"),
                # Binary indicators
                indicator(pl.col("item_total_interactions_in_session") > 1, "item_repeated_interaction"),
                indicator(pl.col("item_interaction_types") > 1, "item_multi_type_interaction"),
                indicator(pl.col("item_carts_in_session") > 0, "item_has_cart"),
                indicator(pl.col("item_orders_in_session") > 0, "item_has_order"),
                # Sequence indicators
                indicator(pl.col("item_last_interaction") == "orders", "item_ends_with_order"),
                indicator(pl.col("item_first_interaction") == "clicks", "item_starts_with_click"),
            ]
            per_pair = per_pair.with_columns(derived)

            produced = [
                "item_clicks_in_session", "item_carts_in_session", "item_orders_in_session",
                "item_total_interactions_in_session", "item_interaction_types",
                "item_click_ratio_in_session", "item_cart_ratio_in_session", "item_order_ratio_in_session",
                "item_repeated_interaction", "item_multi_type_interaction",
                "item_has_cart", "item_has_order", "item_ends_with_order", "item_starts_with_click"
            ]
            self.feature_columns.extend(produced)
            log(f"      Added {len(produced)} interaction features")

            return per_pair

        except Exception as e:
            log(f"      Error creating interaction features: {e}")
            # Reduced feature set used when the full computation fails
            fallback_aggs = [
                pl.col("type").count().alias("item_total_interactions_in_session"),
                count_of("clicks").alias("item_clicks_in_session"),
                count_of("carts").alias("item_carts_in_session"),
                count_of("orders").alias("item_orders_in_session"),
            ]
            reduced = with_int64_keys(df.group_by(["session", "aid"]).agg(fallback_aggs))
            self.feature_columns.extend(["item_total_interactions_in_session", "item_clicks_in_session", "item_carts_in_session", "item_orders_in_session"])
            return reduced

    def create_temporal_features(self, df: pl.DataFrame) -> pl.DataFrame:
        """
        Derive per-session duration and pace features from ``self.train_data``.

        The features are computed from the ``session`` and ``ts`` columns of
        the stored training data (LazyFrame or DataFrame); ``df`` itself is
        not read. Returns an empty frame when temporal features are disabled
        in ``config``, when no training data is stored, or when the
        computation raises (the error is logged as a warning). On success the
        feature names are appended to ``self.feature_columns``.
        """
        if not config.ENABLE_TEMPORAL_FEATURES:
            return pl.DataFrame()

        log("    Creating temporal features...")

        if self.train_data is None:
            log("      No training data available for temporal features")
            return pl.DataFrame()

        try:
            # Expression lists shared by the lazy and the eager pipeline
            per_session_aggs = [
                pl.col("ts").min().alias("session_start_ts"),
                pl.col("ts").max().alias("session_end_ts"),
                pl.col("ts").count().alias("session_events"),
            ]
            post_agg_columns = [
                (pl.col("session_end_ts") - pl.col("session_start_ts")).alias("session_duration_ms"),
                # Keep the join key as Int64
                pl.col("session").cast(pl.Int64)
            ]

            is_lazy = hasattr(self.train_data, 'collect')  # LazyFrame exposes collect()
            if is_lazy:
                log("      Processing temporal features from lazy training data...")
                session_temporal = (
                    self.train_data
                    .group_by("session")
                    .agg(per_session_aggs)
                    .with_columns(post_agg_columns)
                    .collect()
                )
            else:
                log("      Processing temporal features from regular training data...")
                session_temporal = (
                    self.train_data
                    .group_by("session")
                    .agg(per_session_aggs)
                    .with_columns(post_agg_columns)
                )

            duration_ms = pl.col("session_duration_ms").cast(pl.Float64)
            event_count = pl.col("session_events").cast(pl.Float64)

            session_temporal = session_temporal.with_columns([
                # Session duration in coarser units
                (duration_ms / 1000).alias("session_duration_seconds"),
                (duration_ms / (1000 * 60)).alias("session_duration_minutes"),
                (duration_ms / (1000 * 60 * 60)).alias("session_duration_hours"),

                # Session pace; the time denominator is floored at 1
                (event_count / (duration_ms / 1000).clip(lower_bound=1)).alias("session_events_per_second"),
                (event_count / (duration_ms / (1000 * 60)).clip(lower_bound=1)).alias("session_events_per_minute"),
            ])

            produced = [
                "session_duration_ms", "session_duration_seconds", "session_duration_minutes", "session_duration_hours",
                "session_events_per_second", "session_events_per_minute"
            ]
            self.feature_columns.extend(produced)
            log(f"      Added {len(produced)} temporal features")

            return session_temporal.select(["session"] + produced)

        except Exception as e:
            log(f"      Warning: Could not create temporal features: {e}")
            return pl.DataFrame()

    def create_statistical_features(self, df: pl.DataFrame, session_features: pl.DataFrame, item_features: pl.DataFrame) -> pl.DataFrame:
        """
        Summarise, per session, the item-level statistics of the items it touched.

        The rows of ``df`` are left-joined to ``item_features`` on ``aid`` and
        grouped by ``session``. Mean/max/min/std (plus median for popularity)
        are computed for ``item_popularity`` and ``item_total_interactions``
        when those columns are available, followed by a few derived ratios.

        Args:
            df: Event rows; must contain ``session`` and ``aid``.
            session_features: Kept for signature compatibility; not read here.
            item_features: Item-level frame keyed by ``aid``.

        Returns:
            One row per session with an Int64 ``session`` column plus the
            statistical columns, nulls replaced by 0. An empty DataFrame is
            returned when the feature family is disabled, when fewer than two
            of the expected item columns exist, or when any step raises.

        Side effects:
            Appends the created column names to ``self.feature_columns``.
        """
        if not config.ENABLE_STATISTICAL_FEATURES:
            return pl.DataFrame()

        log("    Creating statistical features...")

        try:
            # Only use the item columns that really exist in item_features
            wanted_cols = ("aid", "item_popularity", "item_total_interactions")
            present_cols = [name for name in wanted_cols if name in item_features.columns]

            if len(present_cols) < 2:
                log("      Warning: Insufficient item data for statistical features")
                return pl.DataFrame()

            # Both sides of the join must agree on the key dtype
            aid_as_int64 = pl.col("aid").cast(pl.Int64)
            events = df.with_columns(aid_as_int64)
            item_lookup = item_features.with_columns(aid_as_int64)
            enriched = events.join(item_lookup.select(present_cols), on="aid", how="left")

            # Collect the per-session aggregations for whichever columns arrived
            aggregations = []

            if "item_popularity" in enriched.columns:
                popularity = pl.col("item_popularity")
                aggregations += [
                    popularity.mean().alias("avg_item_popularity"),
                    popularity.max().alias("max_item_popularity"),
                    popularity.min().alias("min_item_popularity"),
                    popularity.std().alias("std_item_popularity"),
                    popularity.median().alias("median_item_popularity"),
                ]

            if "item_total_interactions" in enriched.columns:
                interactions = pl.col("item_total_interactions")
                aggregations += [
                    interactions.mean().alias("avg_item_interactions"),
                    interactions.max().alias("max_item_interactions"),
                    interactions.min().alias("min_item_interactions"),
                    interactions.std().alias("std_item_interactions"),
                ]

            if not aggregations:
                log("      Warning: No valid columns for statistical features")
                return pl.DataFrame()

            # Aggregate per session and normalise the key dtype for later joins
            per_session = (
                enriched
                .group_by("session")
                .agg(aggregations)
                .with_columns(pl.col("session").cast(pl.Int64))
            )

            # Derived ratios, each added only when its inputs were produced above
            produced = set(per_session.columns)
            ratio_exprs = []

            if {"max_item_popularity", "min_item_popularity"} <= produced:
                ratio_exprs.append(
                    (pl.col("max_item_popularity") - pl.col("min_item_popularity")).alias("item_popularity_range")
                )

            if {"avg_item_popularity", "max_item_popularity"} <= produced:
                # Denominator is floored at 1 before dividing
                max_floor = pl.col("max_item_popularity").clip(lower_bound=1).cast(pl.Float64)
                ratio_exprs.append(
                    (pl.col("avg_item_popularity").cast(pl.Float64) / max_floor).alias("avg_to_max_popularity_ratio")
                )

            if {"std_item_popularity", "avg_item_popularity"} <= produced:
                avg_floor = pl.col("avg_item_popularity").clip(lower_bound=1).cast(pl.Float64)
                ratio_exprs.append(
                    (pl.col("std_item_popularity").cast(pl.Float64) / avg_floor).alias("popularity_coefficient_variation")
                )

            if ratio_exprs:
                per_session = per_session.with_columns(ratio_exprs)

            # Replace nulls (e.g. from unmatched items) with 0
            per_session = per_session.fill_null(0)

            # Everything except the join key counts as a feature
            created = [name for name in per_session.columns if name != "session"]
            self.feature_columns.extend(created)
            log(f"      Added {len(created)} statistical features")

            return per_session

        except Exception as e:
            log(f"      Error creating statistical features: {e}")
            return pl.DataFrame()

    def engineer_all_features(self, df: pl.DataFrame) -> pl.DataFrame:
        """
        Build every feature family and join it onto ``df``.

        The session, item, interaction, temporal and statistical feature
        frames are created by the corresponding ``create_*`` methods and
        left-joined onto the input rows (on ``session``, ``aid`` or both).
        Nulls in the tracked feature columns are then replaced by 0.

        If any step of the main path raises, a reduced set of seven basic
        features is computed directly from ``df``; if that also raises, four
        trivial columns derived from ``type`` are returned instead.

        Args:
            df: Event rows; must contain ``session``, ``aid`` and ``type``.

        Returns:
            ``df`` (with ``session``/``aid`` cast to Int64 when the cast
            succeeds) extended by the feature columns.

        Side effects:
            Resets ``self.feature_columns`` and leaves it holding exactly the
            feature columns that are present in the returned frame, without
            duplicates.
        """
        log("  Creating comprehensive feature set...")

        # Reset feature tracking
        self.feature_columns = []

        try:
            # Join keys must be Int64 on every frame involved
            df = df.with_columns([
                pl.col("session").cast(pl.Int64),
                pl.col("aid").cast(pl.Int64)
            ])

            # Create all feature types; each create_* call extends self.feature_columns
            session_features = self.create_session_features(df)
            item_features = self.create_item_features(df)
            interaction_features = self.create_interaction_features(df)
            temporal_features = self.create_temporal_features(df)
            statistical_features = self.create_statistical_features(df, session_features, item_features)

            log("  Joining all features...")

            # Start with original data
            result = df

            # Session-level features
            if len(session_features) > 0:
                result, session_features = ensure_consistent_dtypes(result, session_features, ["session"])
                result = result.join(session_features, on="session", how="left")
                log(f"    Joined session features: {len(session_features)} rows")

            # Item-level features: only "aid" and the "item_"-prefixed columns are joined
            if len(item_features) > 0:
                item_feature_cols = [col for col in item_features.columns if col.startswith("item_") or col == "aid"]
                if len(item_feature_cols) > 1:  # More than just "aid"
                    result, item_features = ensure_consistent_dtypes(result, item_features, ["aid"])
                    result = result.join(item_features.select(item_feature_cols), on="aid", how="left")
                    log(f"    Joined item features: {len(item_features)} rows")

            # Session x item interaction features
            if len(interaction_features) > 0:
                result, interaction_features = ensure_consistent_dtypes(result, interaction_features, ["session", "aid"])
                result = result.join(interaction_features, on=["session", "aid"], how="left")
                log(f"    Joined interaction features: {len(interaction_features)} rows")

            # Temporal features
            if len(temporal_features) > 0:
                result, temporal_features = ensure_consistent_dtypes(result, temporal_features, ["session"])
                result = result.join(temporal_features, on="session", how="left")
                log(f"    Joined temporal features: {len(temporal_features)} rows")

            # Statistical features
            if len(statistical_features) > 0:
                result, statistical_features = ensure_consistent_dtypes(result, statistical_features, ["session"])
                result = result.join(statistical_features, on="session", how="left")
                log(f"    Joined statistical features: {len(statistical_features)} rows")

            # Keep the tracked list in sync with the frame we return: a
            # create_* method may have registered a column that was not joined
            # (e.g. an item column without the "item_" prefix) or registered
            # the same name twice. Callers persist this list, so it must only
            # name columns that exist, each once, in first-seen order.
            joined_columns = set(result.columns)
            self.feature_columns = [
                col for col in dict.fromkeys(self.feature_columns) if col in joined_columns
            ]

            # Fill remaining null values in the feature columns
            if self.feature_columns:
                result = result.with_columns([
                    pl.col(col).fill_null(0) for col in self.feature_columns
                ])

            log(f"  Feature engineering completed: {len(self.feature_columns)} features created")

            return result

        except Exception as e:
            log(f"  Error in feature engineering: {e}")
            log(f"  Full error details: {str(e)}")

            # Fallback: compute a small set of basic features directly from df
            log("  Creating meaningful fallback features...")
            try:
                # Ensure dtype consistency for main df
                df = df.with_columns([
                    pl.col("session").cast(pl.Int64),
                    pl.col("aid").cast(pl.Int64)
                ])

                # Basic session features: event counts per session, overall and per type
                basic_session = df.group_by("session").agg([
                    pl.col("aid").count().alias("session_length"),
                    pl.col("aid").n_unique().alias("unique_items"),
                    pl.col("type").filter(pl.col("type") == "clicks").count().alias("num_clicks"),
                    pl.col("type").filter(pl.col("type") == "carts").count().alias("num_carts"),
                    pl.col("type").filter(pl.col("type") == "orders").count().alias("num_orders")
                ]).with_columns(pl.col("session").cast(pl.Int64))

                # Basic item features: look up total_interactions in
                # self.item_stats_dict, defaulting to 1 for unknown items
                basic_item = df.select("aid").unique().with_columns([
                    pl.col("aid").cast(pl.Int64),
                    pl.col("aid").map_elements(
                        lambda x: self.item_stats_dict.get(x, {}).get('total_interactions', 1),
                        return_dtype=pl.Int64
                    ).alias("item_popularity")
                ])

                # Basic interaction features: events per (session, aid) pair
                basic_interaction = df.group_by(["session", "aid"]).agg([
                    pl.col("type").count().alias("interactions_count")
                ]).with_columns([
                    pl.col("session").cast(pl.Int64),
                    pl.col("aid").cast(pl.Int64)
                ])

                # Join fallback features with dtype consistency
                result = df

                result, basic_session = ensure_consistent_dtypes(result, basic_session, ["session"])
                result = result.join(basic_session, on="session", how="left")

                result, basic_item = ensure_consistent_dtypes(result, basic_item, ["aid"])
                result = result.join(basic_item, on="aid", how="left")

                result, basic_interaction = ensure_consistent_dtypes(result, basic_interaction, ["session", "aid"])
                result = result.join(basic_interaction, on=["session", "aid"], how="left")

                # Fill nulls and replace whatever the failed main path had tracked
                result = result.fill_null(0)
                self.feature_columns = [
                    "session_length", "unique_items", "num_clicks", "num_carts", "num_orders",
                    "item_popularity", "interactions_count"
                ]

                log(f"  Fallback completed with {len(self.feature_columns)} meaningful features")

            except Exception as fallback_error:
                log(f"  Fallback also failed: {fallback_error}")
                # Ultimate fallback: a constant plus one indicator per event type
                basic_features = df.with_columns([
                    pl.lit(1).alias("basic_feature_constant"),
                    (pl.col("type") == "clicks").cast(pl.Int32).alias("is_click"),
                    (pl.col("type") == "carts").cast(pl.Int32).alias("is_cart"),
                    (pl.col("type") == "orders").cast(pl.Int32).alias("is_order")
                ])
                self.feature_columns = ["basic_feature_constant", "is_click", "is_cart", "is_order"]
                result = basic_features

            return result

## FEATURE ENGINEERING EXECUTION WITH COMPLETE ERROR HANDLING

In [6]:
# Run the feature engineer on the validation data. If the main path fails (or
# its output does not pass the sanity checks) fall back to a hand-built feature
# set, and finally to four trivial columns derived from "type".
log("Creating enhanced feature engineering system...")

try:
    feature_engineer = EnhancedFeatureEngineer(item_stats, train_data)

    log("Engineering comprehensive features...")
    val_data_features = feature_engineer.engineer_all_features(val_data)

    # The engineer keeps track of the feature column names it created
    feature_columns = feature_engineer.feature_columns
    log(f"Total features created: {len(feature_columns)}")

    # Sanity checks: there must be rows and tracked feature names
    if len(val_data_features) == 0:
        raise ValueError("No features were created - feature engineering failed")
    if len(feature_columns) == 0:
        raise ValueError("No feature columns were tracked - feature engineering failed")

    # The base columns must have survived the joins
    required_columns = ["session", "aid", "type", "label"]
    _output_columns = val_data_features.columns
    missing_columns = [col for col in required_columns if col not in _output_columns]
    if missing_columns:
        raise ValueError(f"Missing required columns after feature engineering: {missing_columns}")

    log(f"Feature engineering validation passed:")
    log(f"  - Output shape: {val_data_features.shape}")
    log(f"  - Features created: {len(feature_columns)}")
    # No column is missing at this point, so this reports True
    log(f"  - Required columns present: {not missing_columns}")

except Exception as e:
    log(f"Error in feature engineering: {e}")
    log("Attempting comprehensive fallback feature engineering...")

    try:
        log("Creating comprehensive fallback features...")

        # Event types counted in the session-level and interaction-level frames
        _event_kinds = ("clicks", "carts", "orders")

        # Join keys must be Int64
        val_data = val_data.with_columns([
            pl.col("session").cast(pl.Int64),
            pl.col("aid").cast(pl.Int64)
        ])

        # --- Session-level counts and rates --------------------------------
        _clicks_floor = pl.col("num_clicks").clip(lower_bound=1)
        session_basic = (
            val_data.group_by("session")
            .agg(
                [
                    pl.col("aid").count().alias("session_length"),
                    pl.col("aid").n_unique().alias("unique_items"),
                ]
                + [
                    pl.col("type").filter(pl.col("type") == kind).count().alias(f"num_{kind}")
                    for kind in _event_kinds
                ]
                + [pl.col("type").n_unique().alias("unique_types")]
            )
            .with_columns([
                pl.col("session").cast(pl.Int64),
                # Rates relative to clicks; the denominator is floored at 1
                (pl.col("num_carts").cast(pl.Float64) / _clicks_floor).alias("cart_rate"),
                (pl.col("num_orders").cast(pl.Float64) / _clicks_floor).alias("order_rate"),
                (pl.col("unique_items").cast(pl.Float64) / pl.col("session_length").clip(lower_bound=1)).alias("item_diversity")
            ])
        )

        # --- Item-level counts taken from item_stats ------------------------
        item_basic = None
        if len(item_stats) > 0:
            unique_aids = val_data.select("aid").unique().with_columns(pl.col("aid").cast(pl.Int64))
            item_stats_typed = item_stats.with_columns(pl.col("aid").cast(pl.Int64))

            # Pick the item columns and give them their feature names in one step
            _item_lookup = item_stats_typed.select([
                pl.col("aid"),
                pl.col("clicks").alias("item_clicks"),
                pl.col("carts").alias("item_carts"),
                pl.col("orders").alias("item_orders"),
                pl.col("total_interactions").alias("item_popularity"),
            ])
            _item_clicks_floor = pl.col("item_clicks").clip(lower_bound=1)

            item_basic = (
                unique_aids.join(_item_lookup, on="aid", how="left")
                .fill_null(0)
                .with_columns([
                    (pl.col("item_carts").cast(pl.Float64) / _item_clicks_floor).alias("item_cart_rate"),
                    (pl.col("item_orders").cast(pl.Float64) / _item_clicks_floor).alias("item_order_rate"),
                    pl.col("item_popularity").log1p().alias("item_popularity_log")
                ])
            )

        # --- Per (session, aid) interaction counts and flags ----------------
        interaction_basic = (
            val_data.group_by(["session", "aid"])
            .agg(
                [pl.col("type").count().alias("interactions_count")]
                + [
                    pl.col("type").filter(pl.col("type") == kind).count().alias(f"{kind}_count")
                    for kind in _event_kinds
                ]
                + [pl.col("type").n_unique().alias("interaction_types")]
            )
            .with_columns([
                pl.col("session").cast(pl.Int64),
                pl.col("aid").cast(pl.Int64),
                (pl.col("interactions_count") > 1).cast(pl.Int32).alias("repeated_interaction"),
                (pl.col("carts_count") > 0).cast(pl.Int32).alias("has_cart"),
                (pl.col("orders_count") > 0).cast(pl.Int32).alias("has_order")
            ])
        )

        # --- Attach everything to the validation rows ------------------------
        val_data_features = val_data.join(session_basic, on="session", how="left")
        if item_basic is not None:
            val_data_features = val_data_features.join(item_basic, on="aid", how="left")
        val_data_features = val_data_features.join(interaction_basic, on=["session", "aid"], how="left")

        # Names of the fallback features, in the order they were built
        feature_columns = [
            "session_length", "unique_items", "num_clicks", "num_carts", "num_orders", "unique_types",
            "cart_rate", "order_rate", "item_diversity",
            "interactions_count", "clicks_count", "carts_count", "orders_count", "interaction_types",
            "repeated_interaction", "has_cart", "has_order"
        ]
        if item_basic is not None:
            feature_columns += [
                "item_clicks", "item_carts", "item_orders", "item_popularity",
                "item_cart_rate", "item_order_rate", "item_popularity_log"
            ]

        # Keep only the names that made it into the frame and zero-fill their nulls
        feature_cols_present = [col for col in feature_columns if col in val_data_features.columns]
        val_data_features = val_data_features.with_columns(
            [pl.col(col).fill_null(0) for col in feature_cols_present]
        )

        feature_columns = feature_cols_present
        log(f"Comprehensive fallback feature engineering completed: {len(feature_columns)} meaningful features")

    except Exception as fallback_error:
        log(f"Comprehensive fallback feature engineering also failed: {fallback_error}")
        # Last resort: a constant plus one indicator column per event type
        feature_columns = ["constant_feature", "is_click", "is_cart", "is_order"]
        val_data_features = val_data.with_columns([
            pl.lit(1).alias("constant_feature"),
            (pl.col("type") == "clicks").cast(pl.Int32).alias("is_click"),
            (pl.col("type") == "carts").cast(pl.Int32).alias("is_cart"),
            (pl.col("type") == "orders").cast(pl.Int32).alias("is_order")
        ])
        log(f"Using final fallback: {len(feature_columns)} basic meaningful features")

force_garbage_collection()
log_memory("After feature engineering")

[2025-08-08 03:59:07] Creating enhanced feature engineering system...
[2025-08-08 03:59:07] Feature engineer initialized with 1,000 items
[2025-08-08 03:59:07] Engineering comprehensive features...
[2025-08-08 03:59:07]   Creating comprehensive feature set...
[2025-08-08 03:59:07]     Creating session-level features...
[2025-08-08 03:59:08]       Added 16 session features
[2025-08-08 03:59:08]     Creating item-level features...
[2025-08-08 03:59:10]       Added 13 item features
[2025-08-08 03:59:10]     Creating interaction features...
[2025-08-08 03:59:15]       Added 14 interaction features
[2025-08-08 03:59:15]     Creating temporal features...
[2025-08-08 03:59:15]       Processing temporal features from lazy training data...
[2025-08-08 03:59:32]       Added 6 temporal features
[2025-08-08 03:59:32]     Creating statistical features...
[2025-08-08 03:59:33]       Added 12 statistical features
[2025-08-08 03:59:33]   Joining all features...
[2025-08-08 03:59:33]     Joined session

## ENHANCED FEATURE ANALYSIS

In [7]:
def analyze_enhanced_features(val_data_features: pl.DataFrame,
                            feature_columns: List[str],
                            validation_results: Dict,
                            max_features: Optional[int] = 20) -> Dict:
    """
    Compute summary statistics and label correlations for engineered features.

    Args:
        val_data_features: Frame holding the feature columns and a ``label`` column.
        feature_columns: Names of the feature columns to consider.
        validation_results: Passed through unchanged into the returned dict.
        max_features: Only the first ``max_features`` entries of
            ``feature_columns`` get per-feature statistics (``None`` analyses
            all of them). The category counts and ``total_features`` always
            use the full list.

    Returns:
        On success, a dict with the keys ``analysis_timestamp``,
        ``feature_statistics``, ``correlation_results``,
        ``top_correlated_features``, ``feature_categories``,
        ``quality_assessment``, ``memory_usage_mb`` and ``validation_results``.
        If the inputs are empty, the label column is missing, or an
        unexpected exception occurs, a dict containing an ``error`` key is
        returned instead; the function does not raise for those cases.
    """
    log("Performing enhanced feature analysis...")

    try:
        # Verify inputs
        if len(val_data_features) == 0:
            log("  Warning: Empty feature dataset")
            return {"error": "Empty feature dataset", "timestamp": datetime.now().isoformat()}

        if len(feature_columns) == 0:
            log("  Warning: No feature columns to analyze")
            return {"error": "No feature columns", "timestamp": datetime.now().isoformat()}

        if "label" not in val_data_features.columns:
            log("  Warning: No label column for correlation analysis")
            return {"error": "No label column", "timestamp": datetime.now().isoformat()}

        # Basic feature statistics
        feature_stats = {}
        correlation_results = {}

        # Analyze each feature
        log("  Calculating feature statistics...")
        valid_features = 0

        # Only a prefix of the list is analysed, to bound the runtime
        for feature in feature_columns[:max_features]:
            if feature not in val_data_features.columns:
                log(f"    Warning: Feature {feature} not found in dataset")
                continue

            try:
                feature_data = val_data_features.select([feature, "label"])

                # Basic statistics computed in a single pass
                stats_query = feature_data.select([
                    pl.col(feature).mean().alias("mean"),
                    pl.col(feature).std().alias("std"),
                    pl.col(feature).min().alias("min"),
                    pl.col(feature).max().alias("max"),
                    pl.col(feature).null_count().alias("null_count")
                ])

                stats = stats_query.to_dicts()[0]

                # Correlation with the label; stays 0.0 if it cannot be computed
                correlation = 0.0
                try:
                    # A constant feature has no defined correlation, so skip it
                    feature_std = stats.get("std")
                    if feature_std is not None and feature_std > 0:
                        # Sample data for correlation if too large (fixed seed for repeatability)
                        if len(feature_data) > 100000:
                            feature_data_sample = feature_data.sample(n=50000, seed=42)
                        else:
                            feature_data_sample = feature_data

                        correlation_data = feature_data_sample.to_pandas()
                        if len(correlation_data) > 1:
                            corr_result = correlation_data[feature].corr(correlation_data["label"])
                            if not pd.isna(corr_result):
                                correlation = float(corr_result)
                except Exception as corr_error:
                    log(f"    Warning: Could not calculate correlation for {feature}: {corr_error}")
                    correlation = 0.0

                feature_stats[feature] = {
                    "mean": float(stats["mean"]) if stats["mean"] is not None else 0.0,
                    "std": float(stats["std"]) if stats["std"] is not None else 0.0,
                    "min": float(stats["min"]) if stats["min"] is not None else 0.0,
                    "max": float(stats["max"]) if stats["max"] is not None else 0.0,
                    "null_count": int(stats["null_count"]) if stats["null_count"] is not None else 0,
                    "correlation": correlation
                }

                correlation_results[feature] = correlation
                valid_features += 1

            except Exception as feature_error:
                # Record zeroed statistics so the feature still appears in the report
                log(f"    Warning: Could not analyze feature {feature}: {feature_error}")
                feature_stats[feature] = {
                    "mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0,
                    "null_count": 0, "correlation": 0.0
                }
                correlation_results[feature] = 0.0

        log(f"  Successfully analyzed {valid_features} out of {len(feature_columns)} features")

        # Find top correlated features (ranked by absolute correlation)
        if correlation_results:
            sorted_correlations = sorted(correlation_results.items(),
                                       key=lambda x: abs(x[1]), reverse=True)
            top_correlated_features = sorted_correlations[:10]

            log(f"  Top correlated features:")
            for i, (feature, corr) in enumerate(top_correlated_features[:5]):
                log(f"    {i+1}. {feature}: {corr:.4f}")
        else:
            top_correlated_features = []
            log("  No correlation results available")

        # Feature categories analysis: name-based buckets over the full list.
        # The buckets are not mutually exclusive, so the counts may overlap.
        interaction_markers = ["in_session", "has_", "repeated", "multi_type"]
        interaction_counts = ["interactions_count", "clicks_count", "carts_count", "orders_count"]
        feature_categories = {
            "session": len([f for f in feature_columns if f.startswith(("session_", "num_", "unique_", "cart_", "order_", "item_diversity", "action_"))]),
            "item": len([f for f in feature_columns if f.startswith("item_") and not any(x in f for x in ["in_session", "has_", "repeated", "multi_type", "_count"])]),
            "interaction": len([
                f for f in feature_columns
                if (f.startswith("item_") and any(x in f for x in interaction_markers))
                or f in interaction_counts
            ]),
            "temporal": len([f for f in feature_columns if any(x in f for x in ["duration", "per_second", "per_minute", "start_", "end_"])]),
            "statistical": len([f for f in feature_columns if f.startswith(("avg_", "max_", "min_", "std_", "median_", "range"))])
        }

        log(f"  Feature categories:")
        for category, count in feature_categories.items():
            if count > 0:
                log(f"    {category}: {count} features")

        # Quality assessment
        high_correlation_features = [f for f, c in correlation_results.items()
                                   if abs(c) > config.MIN_FEATURE_CORRELATION]
        # NOTE(review): this compares the standard deviation against
        # MIN_FEATURE_VARIANCE - confirm that is the intended threshold.
        valid_variance_features = [f for f, stats in feature_stats.items()
                                 if stats["std"] > config.MIN_FEATURE_VARIANCE]

        abs_correlations = [abs(c) for c in correlation_results.values()]
        max_correlation = max(abs_correlations) if abs_correlations else 0.0
        # Plain float so the report does not carry numpy scalar types
        avg_correlation = float(np.mean(abs_correlations)) if abs_correlations else 0.0

        quality_assessment = {
            "total_features": len(feature_columns),
            "features_with_data": len(feature_stats),
            "valid_features_analyzed": valid_features,
            "high_correlation_features": len(high_correlation_features),
            "valid_variance_features": len(valid_variance_features),
            "max_correlation": max_correlation,
            "avg_correlation": avg_correlation
        }

        log(f"  Quality assessment:")
        log(f"    Valid features analyzed: {valid_features}/{len(feature_columns)}")
        log(f"    Features with high correlation (>{config.MIN_FEATURE_CORRELATION}): {len(high_correlation_features)}")
        log(f"    Features with valid variance (>{config.MIN_FEATURE_VARIANCE}): {len(valid_variance_features)}")
        log(f"    Max correlation: {max_correlation:.4f}")
        log(f"    Avg correlation: {avg_correlation:.4f}")

        # Memory usage calculation (best effort; 0.0 when it cannot be estimated).
        # Catch Exception rather than everything so KeyboardInterrupt/SystemExit
        # still propagate.
        try:
            memory_usage_mb = val_data_features.estimated_size('mb')
        except Exception:
            memory_usage_mb = 0.0

        # Compile comprehensive analysis
        feature_analysis = {
            "analysis_timestamp": datetime.now().isoformat(),
            "feature_statistics": feature_stats,
            "correlation_results": correlation_results,
            "top_correlated_features": top_correlated_features,
            "feature_categories": feature_categories,
            "quality_assessment": quality_assessment,
            "memory_usage_mb": memory_usage_mb,
            "validation_results": validation_results
        }

        log("Enhanced feature analysis completed!")
        return feature_analysis

    except Exception as e:
        log(f"Error in feature analysis: {e}")
        return {
            "error": str(e),
            "timestamp": datetime.now().isoformat(),
            "feature_count": len(feature_columns),
            "data_shape": val_data_features.shape if hasattr(val_data_features, 'shape') else "unknown"
        }

# Run the feature analysis. Whatever happens, `feature_analysis` ends up as a
# dict with the same top-level keys, so later cells can rely on its layout.
def _placeholder_feature_analysis(category_key: str, error_text: str) -> Dict:
    """Return an empty analysis record that carries only counts and the error text."""
    return {
        "analysis_timestamp": datetime.now().isoformat(),
        "feature_statistics": {},
        "correlation_results": {},
        "top_correlated_features": [],
        "feature_categories": {category_key: len(feature_columns)},
        "quality_assessment": {
            "total_features": len(feature_columns),
            "features_with_data": 0,
            "max_correlation": 0.0,
            "avg_correlation": 0.0,
        },
        "memory_usage_mb": 0.0,
        "validation_results": validation_results,
        "error": error_text,
    }


try:
    feature_analysis = analyze_enhanced_features(val_data_features, feature_columns, validation_results)

    # The analysis reports its own failures through an "error" key
    if "error" in feature_analysis:
        log(f"Feature analysis had issues: {feature_analysis['error']}")
        # Swap the partial result for a minimal, fully-keyed record
        feature_analysis = _placeholder_feature_analysis(
            "total",
            feature_analysis.get("error", "Unknown analysis error"),
        )

except Exception as e:
    log(f"Critical error in feature analysis: {e}")
    # Emergency record when the analysis call itself raised
    feature_analysis = _placeholder_feature_analysis(
        "emergency",
        f"Critical analysis failure: {str(e)}",
    )

force_garbage_collection()
log_memory("After feature analysis")

[2025-08-08 03:59:37] Performing enhanced feature analysis...
[2025-08-08 03:59:37]   Calculating feature statistics...
[2025-08-08 03:59:38]   Successfully analyzed 20 out of 61 features
[2025-08-08 03:59:38]   Top correlated features:
[2025-08-08 03:59:38]     1. item_carts: -0.0130
[2025-08-08 03:59:38]     2. item_popularity: -0.0117
[2025-08-08 03:59:38]     3. item_clicks: -0.0114
[2025-08-08 03:59:38]     4. item_orders: -0.0110
[2025-08-08 03:59:38]     5. unique_items: -0.0070
[2025-08-08 03:59:38]   Feature categories:
[2025-08-08 03:59:38]     session: 20 features
[2025-08-08 03:59:38]     item: 18 features
[2025-08-08 03:59:38]     interaction: 11 features
[2025-08-08 03:59:38]     temporal: 6 features
[2025-08-08 03:59:38]     statistical: 11 features
[2025-08-08 03:59:38]   Quality assessment:
[2025-08-08 03:59:38]     Valid features analyzed: 20/61
[2025-08-08 03:59:38]     Features with high correlation (>0.01): 4
[2025-08-08 03:59:38]     Features with valid variance (

## ENHANCED OUTPUT SAVING

In [8]:
def save_enhanced_feature_outputs(val_data_features: pl.DataFrame,
                                feature_columns: List[str],
                                feature_analysis: Dict) -> Dict:
    """
    Save all enhanced feature engineering outputs with robust error handling.

    Five artifacts are written into ``config.OUTPUT_PATH``:

    1. ``val_data_features.parquet``       - the feature DataFrame itself
    2. ``feature_columns.json``            - the list of feature column names
    3. ``feature_statistics.pkl``          - the full ``feature_analysis`` dict
    4. ``feature_importance_analysis.pkl`` - correlation / quality subset of it
    5. ``part_2b3_enhanced_summary.pkl``   - a summary report of this notebook

    Artifacts 2-5 are best-effort: a failure is logged and the remaining
    artifacts are still attempted. The parquet file is retried once without
    compression; if that retry also fails, the whole call is reported as an
    error.

    Args:
        val_data_features: DataFrame written to the parquet file.
        feature_columns: Feature column names, dumped to JSON and counted in
            the summary.
        feature_analysis: Analysis dict; read with ``.get`` so missing keys
            fall back to empty defaults.

    Returns:
        Dict mapping ``features_path``, ``feature_cols_path``,
        ``feature_stats_path``, ``importance_path`` and ``summary_path`` to
        the written file paths. Only artifacts that were actually written
        appear. On an unexpected failure the dict is ``{"error": message}``.
    """
    log("Saving enhanced feature engineering outputs...")
    log_memory("Before saving outputs")

    try:
        # Ensure output directory exists
        os.makedirs(config.OUTPUT_PATH, exist_ok=True)

        output_paths = {}
        # Names of best-effort artifacts that could not be written, so the
        # closing log line does not claim success when something failed.
        failed_outputs = []

        # 1. Save validation data with features (main output) - use compression
        features_path = f"{config.OUTPUT_PATH}/val_data_features.parquet"
        try:
            val_data_features.write_parquet(features_path, compression="snappy")
            file_size = os.path.getsize(features_path) / (1024*1024)
            log(f"   val_data_features.parquet saved ({file_size:.1f} MB)")
            output_paths["features_path"] = features_path
        except Exception as e:
            log(f"   Error saving val_data_features.parquet: {e}")
            # Retry genuinely uncompressed. Calling write_parquet() with no
            # compression argument would use Polars' default codec (zstd),
            # which is neither "without compression" nor guaranteed to avoid
            # a codec-related failure. An error here propagates to the outer
            # handler because the main output is mandatory.
            val_data_features.write_parquet(features_path, compression="uncompressed")
            output_paths["features_path"] = features_path
            log(f"   val_data_features.parquet saved (without compression)")

        # 2. Save feature column names
        feature_cols_path = f"{config.OUTPUT_PATH}/feature_columns.json"
        try:
            with open(feature_cols_path, "w", encoding="utf-8") as f:
                json.dump(feature_columns, f, indent=2)
            log(f"   feature_columns.json saved ({len(feature_columns)} features)")
            output_paths["feature_cols_path"] = feature_cols_path
        except Exception as e:
            log(f"   Error saving feature_columns.json: {e}")
            failed_outputs.append("feature_columns.json")

        # 3. Save feature statistics (use pickle to avoid serialization issues)
        feature_stats_path = f"{config.OUTPUT_PATH}/feature_statistics.pkl"
        try:
            with open(feature_stats_path, "wb") as f:
                pickle.dump(feature_analysis, f)
            log(f"   feature_statistics.pkl saved")
            output_paths["feature_stats_path"] = feature_stats_path
        except Exception as e:
            log(f"   Error saving feature_statistics.pkl: {e}")
            failed_outputs.append("feature_statistics.pkl")

        # 4. Save feature importance analysis (use pickle)
        importance_path = f"{config.OUTPUT_PATH}/feature_importance_analysis.pkl"
        try:
            importance_data = {
                "feature_correlations": feature_analysis.get("correlation_results", {}),
                "top_features": feature_analysis.get("top_correlated_features", []),
                "feature_statistics": feature_analysis.get("feature_statistics", {}),
                "quality_metrics": feature_analysis.get("quality_assessment", {}),
                "feature_categories": feature_analysis.get("feature_categories", {})
            }
            with open(importance_path, "wb") as f:
                pickle.dump(importance_data, f)
            log(f"   feature_importance_analysis.pkl saved")
            output_paths["importance_path"] = importance_path
        except Exception as e:
            log(f"   Error saving feature_importance_analysis.pkl: {e}")
            failed_outputs.append("feature_importance_analysis.pkl")

        # 5. Save enhanced summary report
        summary_path = f"{config.OUTPUT_PATH}/part_2b3_enhanced_summary.pkl"
        try:
            validation_results = feature_analysis.get("validation_results", {})
            quality_assessment = feature_analysis.get("quality_assessment", {})

            summary = {
                "notebook": "Part 2B3: Enhanced Feature Engineering for Ranking",
                "completion_timestamp": datetime.now().isoformat(),
                "version": "COMPLETE FIXED - Comprehensive features with data type consistency",
                "critical_fixes_applied": [
                    "Fixed session column dtype mismatch (i64 vs u32)",
                    "Added ensure_consistent_dtypes function for all joins",
                    "Improved fallback mechanism with meaningful features",
                    "Enhanced lazy vs regular DataFrame handling",
                    "Better error reporting and recovery",
                    "Explicit Int64 casting for all join keys",
                    "Comprehensive error handling with multiple fallback levels"
                ],
                "improvements": [
                    f"Created {len(feature_columns)} meaningful features",
                    "Added advanced session behavior features",
                    "Added comprehensive item interaction patterns",
                    "Added temporal and statistical aggregations",
                    "Improved correlation analysis",
                    "Enhanced session history utilization",
                    "Fixed qcut duplicate value issues",
                    "Added robust error handling"
                ],
                "inputs_used": {
                    "val_data.parquet": f"{validation_results.get('val_data_samples', 0):,} validation samples",
                    "item_stats.parquet": f"{validation_results.get('item_stats_count', 0):,} items",
                    "train_features.parquet": "Session history" if validation_results.get('train_data_available', False) else "Not available"
                },
                "outputs_generated": {
                    "val_data_features.parquet": f"{len(val_data_features):,} samples with {len(feature_columns)} features",
                    "feature_columns.json": f"{len(feature_columns)} feature names",
                    "feature_statistics.pkl": "Comprehensive feature analysis",
                    "feature_importance_analysis.pkl": "Enhanced feature importance and correlations"
                },
                "key_metrics": {
                    "total_features": len(feature_columns),
                    "memory_usage_mb": feature_analysis.get("memory_usage_mb", 0),
                    "max_correlation": quality_assessment.get("max_correlation", 0),
                    "high_correlation_features": quality_assessment.get("high_correlation_features", 0),
                    "feature_categories": feature_analysis.get("feature_categories", {})
                },
                "quality_assessment": quality_assessment,
                "next_step": "Run Part 2B4: Model Training & Evaluation with enhanced features"
            }

            with open(summary_path, "wb") as f:
                pickle.dump(summary, f)
            log(f"   part_2b3_enhanced_summary.pkl saved")
            output_paths["summary_path"] = summary_path
        except Exception as e:
            log(f"   Error saving summary: {e}")
            failed_outputs.append("part_2b3_enhanced_summary.pkl")

        # Only claim full success when every artifact was written.
        if failed_outputs:
            log(f"Enhanced feature engineering outputs saved with {len(failed_outputs)} failure(s): {', '.join(failed_outputs)}")
        else:
            log("Enhanced feature engineering outputs saved successfully!")
        log_memory("After saving outputs")

        return output_paths

    except Exception as e:
        # Top-level boundary: directory creation or the mandatory parquet
        # write failed. Report it to the caller instead of raising.
        log(f"Error saving outputs: {e}")
        return {"error": str(e)}

# Save all outputs. The returned dict maps artifact keys (e.g. "features_path")
# to the written file paths, or holds a single "error" key if saving failed.
output_paths = save_enhanced_feature_outputs(val_data_features, feature_columns, feature_analysis)

[2025-08-08 03:59:39] Saving enhanced feature engineering outputs...
[2025-08-08 03:59:39] Before saving outputs [Memory: 5129.5 MB]
[2025-08-08 04:00:30]    val_data_features.parquet saved (113.9 MB)
[2025-08-08 04:00:30]    feature_columns.json saved (61 features)
[2025-08-08 04:00:30]    feature_statistics.pkl saved
[2025-08-08 04:00:30]    feature_importance_analysis.pkl saved
[2025-08-08 04:00:30]    part_2b3_enhanced_summary.pkl saved
[2025-08-08 04:00:30] Enhanced feature engineering outputs saved successfully!
[2025-08-08 04:00:30] After saving outputs [Memory: 3994.2 MB]


## ENHANCED FINAL SUMMARY

In [9]:
# Final report cell: prints the key results, the quality verdict, the saved
# output files and the readiness status for Part 2B4, then frees the large
# intermediate objects.
log("\n" + "="*80)
log("PART 2B3 COMPLETED: ENHANCED FEATURE ENGINEERING FOR RANKING")
log("="*80)

# Display key results
quality_assessment = feature_analysis.get("quality_assessment", {})
top_correlated = feature_analysis.get("top_correlated_features", [])

log(f"\nKEY RESULTS (COMPLETE FIXED VERSION):")
log(f"  Total features created: {len(feature_columns)}")
log(f"  Validation samples: {len(val_data_features):,}")
log(f"  Memory usage: {feature_analysis.get('memory_usage_mb', 0):.1f} MB")
log(f"  Session history available: {'yes' if validation_results.get('train_data_available', False) else 'no'}")

if top_correlated:
    # Entries are (feature_name, correlation) pairs; the headline shows the
    # magnitude, the detailed list below keeps the sign.
    top_feature, top_corr = top_correlated[0]
    log(f"  Top feature correlation: {abs(top_corr):.4f} ({top_feature})")
else:
    log(f"  Top feature correlation: 0.0000")

# Feature breakdown
feature_categories = feature_analysis.get("feature_categories", {})
log(f"\nFEATURE BREAKDOWN:")
for category, count in feature_categories.items():
    if count > 0:
        log(f"  {category.title()}: {count} features")

# Top correlated features
if top_correlated:
    log(f"\nTOP CORRELATED FEATURES:")
    for i, (feature, corr) in enumerate(top_correlated[:5]):
        log(f"  {i+1}. {feature}: {corr:.4f}")

# Quality assessment
log(f"\nQUALITY ASSESSMENT:")
# The "≥" sign was previously mis-encoded in this message.
log(f"  Feature count target (≥20): {' PASS' if len(feature_columns) >= 20 else ' FAIL'} ({len(feature_columns)})")
log(f"  High correlation features (>{config.MIN_FEATURE_CORRELATION}): {quality_assessment.get('high_correlation_features', 0)}")
log(f"  Max correlation: {quality_assessment.get('max_correlation', 0):.4f}")
log(f"  Avg correlation: {quality_assessment.get('avg_correlation', 0):.4f}")

# Determine overall quality
feature_quality_ok = len(feature_columns) >= 20
correlation_ok = quality_assessment.get('max_correlation', 0) > config.MIN_FEATURE_CORRELATION
memory_ok = feature_analysis.get('memory_usage_mb', 0) < 8000

log(f"  Memory usage (<8GB): {' PASS' if memory_ok else ' FAIL'} ({feature_analysis.get('memory_usage_mb', 0):.1f} MB)")
log(f"  Meaningful features: {' PASS' if len(feature_columns) > 4 else ' FAIL'}")

# Overall quality determination
if feature_quality_ok and correlation_ok and memory_ok:
    overall_quality = "EXCELLENT"
elif feature_quality_ok and memory_ok:
    overall_quality = "GOOD"
elif len(feature_columns) >= 10 and memory_ok:
    overall_quality = "ACCEPTABLE"
else:
    overall_quality = "NEEDS_IMPROVEMENT"

log(f"\nOverall Quality: {overall_quality}")

# Output files summary
log(f"\nOUTPUT FILES GENERATED:")
if "features_path" in output_paths:
    log(f"  val_data_features.parquet ({os.path.getsize(output_paths['features_path'])/(1024*1024):.1f} MB)")
if "feature_cols_path" in output_paths:
    log(f"  feature_columns.json")
if "feature_stats_path" in output_paths:
    log(f"  feature_statistics.pkl")
if "importance_path" in output_paths:
    log(f"  feature_importance_analysis.pkl")
if "summary_path" in output_paths:
    log(f"  part_2b3_enhanced_summary.pkl")

# Surface save failures instead of always claiming every file was written:
# the save step returns {"error": ...} on a fatal failure and simply omits
# the key of any artifact it could not write.
expected_output_keys = ("features_path", "feature_cols_path", "feature_stats_path",
                        "importance_path", "summary_path")
missing_output_keys = [key for key in expected_output_keys if key not in output_paths]
if "error" in output_paths:
    log(f"  WARNING: saving outputs failed: {output_paths['error']}")
elif missing_output_keys:
    log(f"  WARNING: outputs not saved: {', '.join(missing_output_keys)}")
    log(f"  Saved files are in: {config.OUTPUT_PATH}")
else:
    log(f"  All files saved to: {config.OUTPUT_PATH}")

# Check for errors and provide recommendations
if "error" in feature_analysis:
    log(f"\nWARNING: {feature_analysis['error']}")

if overall_quality == "NEEDS_IMPROVEMENT":
    log(f"\nRECOMMENDATIONS FOR IMPROVEMENT:")
    log(f"  - Check data quality in Part 2B2")
    log(f"  - Verify item_stats.parquet has sufficient data")
    log(f"  - Ensure train_features.parquet is available for temporal features")

# Status for next step
if overall_quality in ["EXCELLENT", "GOOD", "ACCEPTABLE"] and len(feature_columns) >= 10:
    log(f"\n READY FOR PART 2B4: Model Training & Evaluation")
    log(f"  Features: {len(feature_columns)} meaningful features created")
    log(f"  Quality: {overall_quality}")
    log(f"  Data: {len(val_data_features):,} samples ready for training")
else:
    log(f"\n PART 2B4 READINESS: MARGINAL")
    log(f"  You can proceed but results may be limited")
    log(f"  Consider improving feature engineering if possible")

# Final cleanup
log(f"\nPerforming final cleanup...")
try:
    # Drop each large object independently: a single `del a, b, c` raises
    # NameError as soon as one name is missing (e.g. when the cell is re-run)
    # and would skip the garbage collection below.
    for _cleanup_name in ("val_data", "item_stats", "train_data", "feature_engineer"):
        globals().pop(_cleanup_name, None)
    force_garbage_collection()
    final_memory = get_memory_usage()
    # NOTE(review): the value is labelled "GB" here, while the log_memory
    # output in this notebook reports MB - confirm the unit returned by
    # get_memory_usage().
    log(f"Memory cleanup completed - Final memory usage: {final_memory:.1f} GB")
except Exception as e:
    log(f"Cleanup warning: {e}")

log(f"\nPart 2B3 Complete Fixed Version finished successfully!")
log(f"Ready for Part 2B4 with {len(feature_columns)}-feature dataset!")
log("="*80)

[2025-08-08 04:00:30] 
[2025-08-08 04:00:30] PART 2B3 COMPLETED: ENHANCED FEATURE ENGINEERING FOR RANKING
[2025-08-08 04:00:30] 
KEY RESULTS (COMPLETE FIXED VERSION):
[2025-08-08 04:00:30]   Total features created: 61
[2025-08-08 04:00:30]   Validation samples: 6,617,259
[2025-08-08 04:00:30]   Memory usage: 3162.6 MB
[2025-08-08 04:00:30]   Session history available: yes
[2025-08-08 04:00:30]   Top feature correlation: 0.0130 (item_carts)
[2025-08-08 04:00:30] 
FEATURE BREAKDOWN:
[2025-08-08 04:00:30]   Session: 20 features
[2025-08-08 04:00:30]   Item: 18 features
[2025-08-08 04:00:30]   Interaction: 11 features
[2025-08-08 04:00:30]   Temporal: 6 features
[2025-08-08 04:00:30]   Statistical: 11 features
[2025-08-08 04:00:30] 
TOP CORRELATED FEATURES:
[2025-08-08 04:00:30]   1. item_carts: -0.0130
[2025-08-08 04:00:30]   2. item_popularity: -0.0117
[2025-08-08 04:00:30]   3. item_clicks: -0.0114
[2025-08-08 04:00:30]   4. item_orders: -0.0110
[2025-08-08 04:00:30]   5. unique_items: 