# Part 2A3 Click to Buy & Buy to Buy Matrix Generation

In [1]:
# Install required packages
!pip install polars==0.20.31
!pip install psutil

import polars as pl
import pandas as pd
import numpy as np
import gc
import os
import pickle
import json
import time
import psutil
from typing import Dict, List, Tuple, Optional
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Configuration
class Config:
    DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-data'
    OUTPUT_PATH = '/content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-output'

    # Matrix generation parameters
    MAX_CANDIDATES_PER_ITEM = 40      # Maximum candidates to store per source item
    CLICK_TO_BUY_TIME_WINDOW_DAYS = 14  # Time window for click-to-buy relationships
    BUY_TO_BUY_TIME_WINDOW_DAYS = 7   # Time window for buy-to-buy relationships

    # Memory management
    MEMORY_CHECK_INTERVAL = 20        # Check memory every N chunks
    CHUNK_SIZE_MULTIPLIER = 2         # Can use larger chunks than click-to-click

config = Config()

## LOGGING SETUP

In [4]:
def setup_logging_and_monitoring():
    """Setup comprehensive logging and memory monitoring with emergency handling"""
    log_file = f"{config.OUTPUT_PATH}/buy_matrices_generation_log.txt"
    memory_log = []

    def log_message(message):
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_entry = f"[{timestamp}] {message}"
        print(log_entry)

        # Also write to file
        with open(log_file, "a") as f:
            f.write(log_entry + "\n")

    def check_memory_usage():
        """Check current memory usage and log critical levels"""
        memory = psutil.virtual_memory()
        memory_pct = memory.percent
        available_gb = memory.available / (1024**3)

        memory_entry = {
            "timestamp": datetime.now().isoformat(),
            "memory_percent": memory_pct,
            "available_gb": available_gb,
            "used_gb": memory.used / (1024**3)
        }
        memory_log.append(memory_entry)

        if memory_pct > 90:
            log_message(f"CRITICAL MEMORY WARNING: {memory_pct:.1f}% used, {available_gb:.1f} GB available")
            return "CRITICAL"
        elif memory_pct > 80:
            log_message(f"HIGH MEMORY USAGE: {memory_pct:.1f}% used, {available_gb:.1f} GB available")
            return "HIGH"
        elif memory_pct > 60:
            log_message(f"Memory usage: {memory_pct:.1f}% used, {available_gb:.1f} GB available")
            return "NORMAL"
        else:
            return "LOW"

    return log_message, check_memory_usage, memory_log

log, check_memory, memory_log = setup_logging_and_monitoring()

log("="*80)
log("OTTO PART 2A3: CLICK-TO-BUY & BUY-TO-BUY MATRIX GENERATION STARTED")
log("="*80)

# Initial memory check
initial_memory_status = check_memory()
log(f"Initial memory status: {initial_memory_status}")

[2025-08-07 18:41:43] OTTO PART 2A3: CLICK-TO-BUY & BUY-TO-BUY MATRIX GENERATION STARTED
[2025-08-07 18:41:43] Initial memory status: LOW


## INPUT VALIDATION AND LOADING

In [5]:
def validate_and_load_inputs():
    """
    Validate and load all required inputs

    Returns:
        tuple: (prepared_data, chunking_strategy, session_analysis, validation_results)
    """
    log(" Validating and loading inputs...")

    # Required input files
    required_files = {
        "covisit_data_prepared.parquet": "Optimized training data from Part 2A1",
        "chunking_strategy.json": "Memory management configuration from Part 2A1",
        "session_analysis.json": "Session analysis results from Part 2A1"
    }

    # Check if files exist
    missing_files = []
    for filename, description in required_files.items():
        filepath = f"{config.OUTPUT_PATH}/{filename}"
        if not os.path.exists(filepath):
            missing_files.append(f" {filename} - {description}")
        else:
            file_size = os.path.getsize(filepath) / (1024*1024)  # MB
            log(f" {filename} - {file_size:.1f} MB")

    if missing_files:
        log(" MISSING REQUIRED INPUT FILES:")
        for missing in missing_files:
            log(f"   {missing}")
        raise FileNotFoundError("Required input files are missing!")

    log(" All required input files found!")

    # Load data
    log("\n Loading input data...")

    try:
        # Load prepared data
        log("   Loading optimized training data...")
        prepared_data = pl.read_parquet(f"{config.OUTPUT_PATH}/covisit_data_prepared.parquet")
        log(f"    Prepared data: {prepared_data.shape} ({prepared_data.estimated_size('mb'):.1f} MB)")

        # Load chunking strategy
        log("   Loading chunking strategy...")
        with open(f"{config.OUTPUT_PATH}/chunking_strategy.json", "r") as f:
            chunking_strategy = json.load(f)
        log(f"    Chunking strategy loaded")

        # Load session analysis
        log("   Loading session analysis...")
        with open(f"{config.OUTPUT_PATH}/session_analysis.json", "r") as f:
            session_analysis = json.load(f)
        log(f"    Session analysis loaded")

        # Validate data for buy matrix generation
        log("   Validating data for buy matrix generation...")

        # Check event type distribution
        event_dist = prepared_data.group_by("type").agg([
            pl.count().alias("count"),
            pl.col("session").n_unique().alias("unique_sessions")
        ])

        log(f"    Event distribution validation:")
        event_stats = {}
        for row in event_dist.iter_rows():
            event_type, count, sessions = row
            event_stats[event_type] = {"count": count, "sessions": sessions}
            log(f"      {event_type}: {count:,} events, {sessions:,} sessions")

        # Check for conversion opportunities
        click_sessions = event_stats.get("clicks", {}).get("sessions", 0)
        cart_sessions = event_stats.get("carts", {}).get("sessions", 0)
        order_sessions = event_stats.get("orders", {}).get("sessions", 0)

        # Sessions with multiple event types (conversion opportunities)
        mixed_sessions = prepared_data.group_by("session").agg([
            pl.col("type").n_unique().alias("unique_types")
        ]).filter(pl.col("unique_types") > 1).height

        log(f"    Conversion opportunities:")
        log(f"      Sessions with multiple event types: {mixed_sessions:,}")
        log(f"      Potential click-to-cart conversions: {min(click_sessions, cart_sessions):,}")
        log(f"      Potential click-to-order conversions: {min(click_sessions, order_sessions):,}")

        # Determine chunk sizes
        base_chunk_size = chunking_strategy.get("chunk_sizes", {}).get("click_to_buy", 15000)
        click_to_buy_chunk_size = int(base_chunk_size * config.CHUNK_SIZE_MULTIPLIER)
        buy_to_buy_chunk_size = int(base_chunk_size * config.CHUNK_SIZE_MULTIPLIER * 1.5)  # Even larger for buy-to-buy

        log(f"    Chunk sizes:")
        log(f"      Click-to-buy: {click_to_buy_chunk_size:,} sessions")
        log(f"      Buy-to-buy: {buy_to_buy_chunk_size:,} sessions")

        validation_results = {
            "timestamp": datetime.now().isoformat(),
            "event_statistics": event_stats,
            "mixed_sessions": mixed_sessions,
            "chunk_sizes": {
                "click_to_buy": click_to_buy_chunk_size,
                "buy_to_buy": buy_to_buy_chunk_size
            },
            "time_windows": {
                "click_to_buy_days": config.CLICK_TO_BUY_TIME_WINDOW_DAYS,
                "buy_to_buy_days": config.BUY_TO_BUY_TIME_WINDOW_DAYS
            }
        }

        log(" Input validation completed successfully!")
        return prepared_data, chunking_strategy, session_analysis, validation_results

    except Exception as e:
        log(f" Error loading input data: {e}")
        raise e

# Load and validate inputs
prepared_data, chunking_strategy, session_analysis, validation_results = validate_and_load_inputs()

[2025-08-07 18:41:43]  Validating and loading inputs...
[2025-08-07 18:41:43]  covisit_data_prepared.parquet - 1605.4 MB
[2025-08-07 18:41:43]  chunking_strategy.json - 0.0 MB
[2025-08-07 18:41:43]  session_analysis.json - 0.0 MB
[2025-08-07 18:41:43]  All required input files found!
[2025-08-07 18:41:43] 
 Loading input data...
[2025-08-07 18:41:43]    Loading optimized training data...
[2025-08-07 18:41:52]     Prepared data: (216384937, 7) (5159.0 MB)
[2025-08-07 18:41:52]    Loading chunking strategy...
[2025-08-07 18:41:52]     Chunking strategy loaded
[2025-08-07 18:41:52]    Loading session analysis...
[2025-08-07 18:41:52]     Session analysis loaded
[2025-08-07 18:41:52]    Validating data for buy matrix generation...
[2025-08-07 18:42:07]     Event distribution validation:
[2025-08-07 18:42:07]       clicks: 194,625,054 events, 12,899,779 sessions
[2025-08-07 18:42:07]       carts: 16,887,925 events, 3,810,706 sessions
[2025-08-07 18:42:07]       orders: 4,871,958 events, 1,6

## CLICK-TO-BUY MATRIX GENERATOR

In [6]:
class ClickToBuyMatrixGenerator:
    """
    Memory-efficient click-to-buy co-visitation matrix generator with progressive saving
    """

    def __init__(self, chunk_size: int, time_window_days: int):
        self.chunk_size = chunk_size
        self.time_window_ms = time_window_days * 24 * 60 * 60 * 1000  # Convert to milliseconds
        self.covisitation_counts = {}
        self.temporal_stats = {"valid_transitions": 0, "invalid_transitions": 0, "total_sessions": 0}

        # Memory management
        self.processed_chunks = 0
        self.start_time = time.time()
        self.last_save_time = time.time()
        self.max_pairs_in_memory = 2000000  # 2M pairs before cleanup
        self.save_interval_chunks = 15      # Save every 15 chunks

        log("  Initializing memory-efficient click-to-buy matrix generator...")
        log(f"   Chunk size: {self.chunk_size:,} sessions")
        log(f"   Time window: {time_window_days} days ({self.time_window_ms:,} ms)")
        log(f"   Max pairs in memory: {self.max_pairs_in_memory:,}")
        log(f"   Save interval: {self.save_interval_chunks} chunks")

    def estimate_memory_usage(self) -> float:
        """Estimate current memory usage in MB"""
        try:
            total_pairs = sum(len(targets) for targets in self.covisitation_counts.values())
            # Each pair takes ~50 bytes (item IDs + count + overhead)
            estimated_mb = total_pairs * 50 / (1024 * 1024)
            return estimated_mb
        except:
            return 0

    def cleanup_memory(self, force_aggressive: bool = False):
        """Clean up memory by keeping only top candidates"""
        log(f"   Performing click-to-buy memory cleanup (aggressive={force_aggressive})...")

        before_size = len(self.covisitation_counts)
        before_memory = self.estimate_memory_usage()

        # Keep only top candidates per source item
        max_candidates = config.MAX_CANDIDATES_PER_ITEM if not force_aggressive else 20

        cleaned_counts = {}
        for source_aid, targets in self.covisitation_counts.items():
            if len(targets) > max_candidates:
                sorted_targets = sorted(targets.items(), key=lambda x: x[1], reverse=True)
                top_targets = dict(sorted_targets[:max_candidates])
                cleaned_counts[source_aid] = top_targets
            else:
                cleaned_counts[source_aid] = targets

        self.covisitation_counts = cleaned_counts
        gc.collect()

        after_memory = self.estimate_memory_usage()
        log(f"   CTB memory cleanup: {before_memory:.1f}MB → {after_memory:.1f}MB")

    def save_intermediate_results(self, force_save: bool = False):
        """Save intermediate results and clean up memory"""
        current_time = time.time()

        if force_save or (self.processed_chunks % self.save_interval_chunks == 0 and self.processed_chunks > 0):
            log(f"   Saving CTB intermediate results (chunk {self.processed_chunks})...")

            try:
                temp_path = f"{config.OUTPUT_PATH}/ctb_matrix_temp_chunk_{self.processed_chunks}.pkl"

                # Convert to final format before saving
                temp_matrix = {}
                for source_aid, targets in self.covisitation_counts.items():
                    if targets:
                        sorted_targets = sorted(targets.items(), key=lambda x: x[1], reverse=True)
                        top_targets = sorted_targets[:config.MAX_CANDIDATES_PER_ITEM]
                        temp_matrix[source_aid] = top_targets

                with open(temp_path, "wb") as f:
                    pickle.dump({
                        "partial_matrix": temp_matrix,
                        "processed_chunks": self.processed_chunks,
                        "temporal_stats": self.temporal_stats,
                        "timestamp": datetime.now().isoformat()
                    }, f)

                file_size = os.path.getsize(temp_path) / (1024*1024)
                log(f"   CTB intermediate results saved: {temp_path} ({file_size:.1f} MB)")
                self.last_save_time = current_time

                # Clear memory after saving
                self.covisitation_counts.clear()
                gc.collect()
                log(f"   CTB memory cleared after save")

            except Exception as e:
                log(f"   Failed to save CTB intermediate results: {e}")

    def process_session_chunk(self, session_chunk: List[int], data: pl.DataFrame) -> Dict:
        """Process a chunk of sessions for click-to-buy relationships with memory limits"""
        chunk_covisitations = {}

        # Filter data for this chunk
        chunk_data = data.filter(pl.col("session").is_in(session_chunk))

        if len(chunk_data) == 0:
            return chunk_covisitations

        # Process each session with limits
        session_groups = chunk_data.group_by("session").agg([
            pl.col("aid").alias("aids"),
            pl.col("ts").alias("timestamps"),
            pl.col("type").alias("types")
        ])

        pairs_in_chunk = 0
        max_pairs_per_chunk = 500000  # Limit pairs per chunk

        for row in session_groups.iter_rows():
            session_id, aids, timestamps, types = row
            self.temporal_stats["total_sessions"] += 1

            if len(aids) < 2:
                continue

            # Limit session size to prevent memory explosion
            if len(aids) > 50:  # Skip very long sessions
                continue

            # Create event list with timestamps
            events = list(zip(aids, timestamps, types))
            events.sort(key=lambda x: x[1])  # Sort by timestamp

            # Find click-to-buy relationships with limits
            for i in range(len(events)):
                for j in range(i + 1, len(events)):
                    if pairs_in_chunk >= max_pairs_per_chunk:
                        break

                    aid1, ts1, type1 = events[i]
                    aid2, ts2, type2 = events[j]

                    # Check for click-to-buy pattern
                    if type1 == "clicks" and type2 in ["carts", "orders"]:
                        time_diff = ts2 - ts1

                        # Check temporal constraint
                        if 0 <= time_diff <= self.time_window_ms:
                            self.temporal_stats["valid_transitions"] += 1

                            if aid1 != aid2:  # Different items
                                chunk_covisitations[(aid1, aid2)] = chunk_covisitations.get((aid1, aid2), 0) + 1
                                pairs_in_chunk += 1
                        else:
                            self.temporal_stats["invalid_transitions"] += 1

                if pairs_in_chunk >= max_pairs_per_chunk:
                    break

        return chunk_covisitations

    def merge_chunk_results(self, chunk_covisitations: Dict):
        """Merge chunk results with memory management"""
        for (aid1, aid2), count in chunk_covisitations.items():
            if aid1 not in self.covisitation_counts:
                self.covisitation_counts[aid1] = {}
            self.covisitation_counts[aid1][aid2] = self.covisitation_counts[aid1].get(aid2, 0) + count

    def generate_matrix(self, data: pl.DataFrame) -> Dict:
        """Generate complete click-to-buy co-visitation matrix with memory management"""
        log("  Starting memory-efficient click-to-buy matrix generation...")

        # Get sessions with both clicks and purchases
        session_type_counts = data.group_by("session").agg([
            pl.col("type").filter(pl.col("type") == "clicks").count().alias("clicks"),
            pl.col("type").filter(pl.col("type").is_in(["carts", "orders"])).count().alias("purchases")
        ]).filter((pl.col("clicks") > 0) & (pl.col("purchases") > 0))

        conversion_sessions = session_type_counts["session"].to_list()
        total_sessions = len(conversion_sessions)

        log(f"    Processing {total_sessions:,} sessions with conversion opportunities")
        log(f"    Using {self.chunk_size:,} sessions per chunk")

        num_chunks = (total_sessions + self.chunk_size - 1) // self.chunk_size
        log(f"    Total chunks to process: {num_chunks}")

        # Process sessions in chunks with memory management
        for chunk_idx in range(num_chunks):
            chunk_start_time = time.time()

            # Memory check with emergency handling
            if chunk_idx % config.MEMORY_CHECK_INTERVAL == 0:
                memory_status = check_memory()
                if memory_status == "CRITICAL":
                    log(f"CRITICAL MEMORY - EMERGENCY CTB SAVE")
                    self.save_intermediate_results(force_save=True)
                    break

            # Get session chunk
            start_idx = chunk_idx * self.chunk_size
            end_idx = min(start_idx + self.chunk_size, total_sessions)
            session_chunk = conversion_sessions[start_idx:end_idx]

            log(f"    Processing CTB chunk {chunk_idx + 1}/{num_chunks} ({end_idx/total_sessions*100:.1f}%)")

            try:
                # Process chunk
                chunk_covisitations = self.process_session_chunk(session_chunk, data)
                self.merge_chunk_results(chunk_covisitations)
                self.processed_chunks += 1

                chunk_time = time.time() - chunk_start_time
                pairs_found = len(chunk_covisitations)
                current_memory = self.estimate_memory_usage()

                log(f"       CTB chunk {chunk_idx + 1}: {pairs_found:,} pairs, {chunk_time:.1f}s, ~{current_memory:.0f}MB")

                # Memory management
                if (current_memory > 3000 or  # 3GB limit
                    self.processed_chunks % self.save_interval_chunks == 0):
                    self.save_intermediate_results()

                # Cleanup
                del chunk_covisitations
                gc.collect()

            except Exception as e:
                log(f"       Error processing CTB chunk {chunk_idx + 1}: {e}")
                continue

        # Final processing - collect all intermediate results
        log("    Collecting CTB intermediate results...")
        final_matrix = self._merge_intermediate_results()

        total_time = time.time() - self.start_time
        log(f"  Click-to-buy matrix generation completed!")
        log(f"     Total time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
        log(f"     Final matrix size: {len(final_matrix):,} source items")

        return final_matrix

    def _merge_intermediate_results(self) -> Dict:
        """Merge all intermediate result files into final matrix"""
        log("    Merging CTB intermediate results...")

        final_matrix = {}
        temp_files = []

        # Find all temporary files
        import glob
        temp_pattern = f"{config.OUTPUT_PATH}/ctb_matrix_temp_chunk_*.pkl"
        temp_files = glob.glob(temp_pattern)

        log(f"    Found {len(temp_files)} CTB intermediate files to merge")

        # Merge current memory state
        if self.covisitation_counts:
            log("    Adding current CTB memory state...")
            for source_aid, targets in self.covisitation_counts.items():
                if targets:
                    sorted_targets = sorted(targets.items(), key=lambda x: x[1], reverse=True)
                    top_targets = sorted_targets[:config.MAX_CANDIDATES_PER_ITEM]
                    final_matrix[source_aid] = top_targets

        # Merge intermediate files
        for temp_file in temp_files:
            try:
                log(f"    Merging {os.path.basename(temp_file)}...")
                with open(temp_file, "rb") as f:
                    temp_data = pickle.load(f)
                    temp_matrix = temp_data.get("partial_matrix", {})

                    # Update temporal stats
                    if "temporal_stats" in temp_data:
                        temp_stats = temp_data["temporal_stats"]
                        for key in self.temporal_stats:
                            if key in temp_stats:
                                self.temporal_stats[key] += temp_stats[key]

                # Merge into final matrix
                for source_aid, candidates in temp_matrix.items():
                    if source_aid in final_matrix:
                        # Merge candidates
                        existing_dict = dict(final_matrix[source_aid])
                        for target_aid, score in candidates:
                            existing_dict[target_aid] = existing_dict.get(target_aid, 0) + score

                        # Keep top candidates
                        sorted_candidates = sorted(existing_dict.items(), key=lambda x: x[1], reverse=True)
                        final_matrix[source_aid] = sorted_candidates[:config.MAX_CANDIDATES_PER_ITEM]
                    else:
                        final_matrix[source_aid] = candidates[:config.MAX_CANDIDATES_PER_ITEM]

                # Clean up temp file
                os.remove(temp_file)
                log(f"    Cleaned up {os.path.basename(temp_file)}")

            except Exception as e:
                log(f"    Error processing CTB {temp_file}: {e}")

        log(f"    Final CTB matrix merged: {len(final_matrix):,} source items")
        return final_matrix

## BUY-TO-BUY MATRIX GENERATOR

In [7]:
class BuyToBuyMatrixGenerator:
    """
    Memory-efficient buy-to-buy co-visitation matrix generator with progressive saving
    """

    def __init__(self, chunk_size: int, time_window_days: int):
        self.chunk_size = chunk_size
        self.time_window_ms = time_window_days * 24 * 60 * 60 * 1000
        self.covisitation_counts = {}

        # Memory management
        self.processed_chunks = 0
        self.start_time = time.time()
        self.last_save_time = time.time()
        self.max_pairs_in_memory = 2000000  # 2M pairs before cleanup
        self.save_interval_chunks = 15      # Save every 15 chunks

        log("  Initializing memory-efficient buy-to-buy matrix generator...")
        log(f"   Chunk size: {self.chunk_size:,} sessions")
        log(f"   Time window: {time_window_days} days")
        log(f"   Max pairs in memory: {self.max_pairs_in_memory:,}")
        log(f"   Save interval: {self.save_interval_chunks} chunks")

    def estimate_memory_usage(self) -> float:
        """Estimate current memory usage in MB"""
        try:
            total_pairs = sum(len(targets) for targets in self.covisitation_counts.values())
            estimated_mb = total_pairs * 50 / (1024 * 1024)
            return estimated_mb
        except:
            return 0

    def cleanup_memory(self, force_aggressive: bool = False):
        """Clean up memory by keeping only top candidates"""
        log(f"   Performing buy-to-buy memory cleanup (aggressive={force_aggressive})...")

        before_memory = self.estimate_memory_usage()
        max_candidates = config.MAX_CANDIDATES_PER_ITEM if not force_aggressive else 20

        cleaned_counts = {}
        for source_aid, targets in self.covisitation_counts.items():
            if len(targets) > max_candidates:
                sorted_targets = sorted(targets.items(), key=lambda x: x[1], reverse=True)
                top_targets = dict(sorted_targets[:max_candidates])
                cleaned_counts[source_aid] = top_targets
            else:
                cleaned_counts[source_aid] = targets

        self.covisitation_counts = cleaned_counts
        gc.collect()

        after_memory = self.estimate_memory_usage()
        log(f"   BTB memory cleanup: {before_memory:.1f}MB → {after_memory:.1f}MB")

    def save_intermediate_results(self, force_save: bool = False):
        """Save intermediate results and clean up memory"""
        current_time = time.time()

        if force_save or (self.processed_chunks % self.save_interval_chunks == 0 and self.processed_chunks > 0):
            log(f"   Saving BTB intermediate results (chunk {self.processed_chunks})...")

            try:
                temp_path = f"{config.OUTPUT_PATH}/btb_matrix_temp_chunk_{self.processed_chunks}.pkl"

                # Convert to final format before saving
                temp_matrix = {}
                for source_aid, targets in self.covisitation_counts.items():
                    if targets:
                        sorted_targets = sorted(targets.items(), key=lambda x: x[1], reverse=True)
                        top_targets = sorted_targets[:config.MAX_CANDIDATES_PER_ITEM]
                        temp_matrix[source_aid] = top_targets

                with open(temp_path, "wb") as f:
                    pickle.dump({
                        "partial_matrix": temp_matrix,
                        "processed_chunks": self.processed_chunks,
                        "timestamp": datetime.now().isoformat()
                    }, f)

                file_size = os.path.getsize(temp_path) / (1024*1024)
                log(f"   BTB intermediate results saved: {temp_path} ({file_size:.1f} MB)")
                self.last_save_time = current_time

                # Clear memory after saving
                self.covisitation_counts.clear()
                gc.collect()
                log(f"   BTB memory cleared after save")

            except Exception as e:
                log(f"   Failed to save BTB intermediate results: {e}")

    def process_session_chunk(self, session_chunk: List[int], buy_data: pl.DataFrame) -> Dict:
        """Process a chunk of sessions for buy-to-buy relationships with memory limits"""
        chunk_covisitations = {}

        # Filter data for this chunk
        chunk_data = buy_data.filter(pl.col("session").is_in(session_chunk))

        if len(chunk_data) == 0:
            return chunk_covisitations

        # Process each session with limits
        session_groups = chunk_data.group_by("session").agg([
            pl.col("aid").alias("aids"),
            pl.col("ts").alias("timestamps")
        ])

        pairs_in_chunk = 0
        max_pairs_per_chunk = 500000  # Limit pairs per chunk

        for row in session_groups.iter_rows():
            session_id, aids, timestamps = row

            if len(aids) < 2:  # Need at least 2 purchase events
                continue

            # Limit session size to prevent memory explosion
            if len(aids) > 30:  # Skip very long purchase sessions
                continue

            # Create aid-timestamp pairs
            events = list(zip(aids, timestamps))
            events.sort(key=lambda x: x[1])  # Sort by timestamp

            # Generate buy-to-buy pairs within time window with limits
            for i in range(len(events)):
                for j in range(i + 1, len(events)):
                    if pairs_in_chunk >= max_pairs_per_chunk:
                        break

                    aid1, ts1 = events[i]
                    aid2, ts2 = events[j]

                    # Check time window
                    if (ts2 - ts1) <= self.time_window_ms and aid1 != aid2:
                        # Add both directions
                        chunk_covisitations[(aid1, aid2)] = chunk_covisitations.get((aid1, aid2), 0) + 1
                        chunk_covisitations[(aid2, aid1)] = chunk_covisitations.get((aid2, aid1), 0) + 1
                        pairs_in_chunk += 2

                if pairs_in_chunk >= max_pairs_per_chunk:
                    break

        return chunk_covisitations

    def merge_chunk_results(self, chunk_covisitations: Dict):
        """Merge chunk results with memory management"""
        for (aid1, aid2), count in chunk_covisitations.items():
            if aid1 not in self.covisitation_counts:
                self.covisitation_counts[aid1] = {}
            self.covisitation_counts[aid1][aid2] = self.covisitation_counts[aid1].get(aid2, 0) + count

    def generate_matrix(self, data: pl.DataFrame) -> Dict:
        """Generate complete buy-to-buy co-visitation matrix with memory management"""
        log("  Starting memory-efficient buy-to-buy matrix generation...")

        # Filter for purchase events (carts and orders)
        buy_data = data.filter(pl.col("type").is_in(["carts", "orders"])).sort(["session", "ts"])

        # Get sessions with multiple purchase events
        purchase_session_counts = buy_data.group_by("session").agg([
            pl.count().alias("purchase_count")
        ]).filter(pl.col("purchase_count") > 1)

        multi_purchase_sessions = purchase_session_counts["session"].to_list()
        total_sessions = len(multi_purchase_sessions)

        log(f"    Processing {total_sessions:,} sessions with multiple purchases")
        log(f"    Purchase events: {len(buy_data):,}")

        num_chunks = (total_sessions + self.chunk_size - 1) // self.chunk_size
        log(f"    Total chunks to process: {num_chunks}")

        # Process sessions in chunks with memory management
        for chunk_idx in range(num_chunks):
            chunk_start_time = time.time()

            # Memory check with emergency handling
            if chunk_idx % config.MEMORY_CHECK_INTERVAL == 0:
                memory_status = check_memory()
                if memory_status == "CRITICAL":
                    log(f"CRITICAL MEMORY - EMERGENCY BTB SAVE")
                    self.save_intermediate_results(force_save=True)
                    break

            # Get session chunk
            start_idx = chunk_idx * self.chunk_size
            end_idx = min(start_idx + self.chunk_size, total_sessions)
            session_chunk = multi_purchase_sessions[start_idx:end_idx]

            log(f"    Processing BTB chunk {chunk_idx + 1}/{num_chunks} ({end_idx/total_sessions*100:.1f}%)")

            try:
                # Process chunk
                chunk_covisitations = self.process_session_chunk(session_chunk, buy_data)
                self.merge_chunk_results(chunk_covisitations)
                self.processed_chunks += 1

                chunk_time = time.time() - chunk_start_time
                pairs_found = len(chunk_covisitations)
                current_memory = self.estimate_memory_usage()

                log(f"       BTB chunk {chunk_idx + 1}: {pairs_found:,} pairs, {chunk_time:.1f}s, ~{current_memory:.0f}MB")

                # Memory management
                if (current_memory > 3000 or  # 3GB limit
                    self.processed_chunks % self.save_interval_chunks == 0):
                    self.save_intermediate_results()

                # Cleanup
                del chunk_covisitations
                gc.collect()

            except Exception as e:
                log(f"       Error processing BTB chunk {chunk_idx + 1}: {e}")
                continue

        # Final processing - collect all intermediate results
        log("    Collecting BTB intermediate results...")
        final_matrix = self._merge_intermediate_results()

        total_time = time.time() - self.start_time
        log(f"  Buy-to-buy matrix generation completed!")
        log(f"     Total time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
        log(f"     Final matrix size: {len(final_matrix):,} source items")

        return final_matrix

    def _merge_intermediate_results(self) -> Dict:
        """Merge all intermediate result files into final matrix"""
        log("    Merging BTB intermediate results...")

        final_matrix = {}
        temp_files = []

        # Find all temporary files
        import glob
        temp_pattern = f"{config.OUTPUT_PATH}/btb_matrix_temp_chunk_*.pkl"
        temp_files = glob.glob(temp_pattern)

        log(f"    Found {len(temp_files)} BTB intermediate files to merge")

        # Merge current memory state
        if self.covisitation_counts:
            log("    Adding current BTB memory state...")
            for source_aid, targets in self.covisitation_counts.items():
                if targets:
                    sorted_targets = sorted(targets.items(), key=lambda x: x[1], reverse=True)
                    top_targets = sorted_targets[:config.MAX_CANDIDATES_PER_ITEM]
                    final_matrix[source_aid] = top_targets

        # Merge intermediate files
        for temp_file in temp_files:
            try:
                log(f"    Merging {os.path.basename(temp_file)}...")
                with open(temp_file, "rb") as f:
                    temp_data = pickle.load(f)
                    temp_matrix = temp_data.get("partial_matrix", {})

                # Merge into final matrix
                for source_aid, candidates in temp_matrix.items():
                    if source_aid in final_matrix:
                        # Merge candidates
                        existing_dict = dict(final_matrix[source_aid])
                        for target_aid, score in candidates:
                            existing_dict[target_aid] = existing_dict.get(target_aid, 0) + score

                        # Keep top candidates
                        sorted_candidates = sorted(existing_dict.items(), key=lambda x: x[1], reverse=True)
                        final_matrix[source_aid] = sorted_candidates[:config.MAX_CANDIDATES_PER_ITEM]
                    else:
                        final_matrix[source_aid] = candidates[:config.MAX_CANDIDATES_PER_ITEM]

                # Clean up temp file
                os.remove(temp_file)
                log(f"    Cleaned up {os.path.basename(temp_file)}")

            except Exception as e:
                log(f"    Error processing BTB {temp_file}: {e}")

        log(f"    Final BTB matrix merged: {len(final_matrix):,} source items")
        return final_matrix

## MATRIX GENERATION EXECUTION

In [8]:
# Prepare data for matrix generation with memory optimization
log("\n Preparing data for matrix generation...")

# Memory check before starting
initial_memory = check_memory()
log(f"Initial memory status: {initial_memory}")

# Clear any unnecessary data early (but keep prepared_data for now)
if 'chunking_strategy' in locals():
    del chunking_strategy
if 'session_analysis' in locals():
    del session_analysis
gc.collect()

# Initialize generators with enhanced memory management
click_to_buy_chunk_size = validation_results["chunk_sizes"]["click_to_buy"]
buy_to_buy_chunk_size = validation_results["chunk_sizes"]["buy_to_buy"]

log("\n Initializing memory-efficient matrix generators...")

click_to_buy_generator = ClickToBuyMatrixGenerator(
    click_to_buy_chunk_size,
    config.CLICK_TO_BUY_TIME_WINDOW_DAYS
)

buy_to_buy_generator = BuyToBuyMatrixGenerator(
    buy_to_buy_chunk_size,
    config.BUY_TO_BUY_TIME_WINDOW_DAYS
)

# Initialize success flags
click_to_buy_success = False
buy_to_buy_success = False
click_to_buy_matrix = {}
buy_to_buy_matrix = {}
ctb_generation_time = 0
btb_generation_time = 0

# Generate click-to-buy matrix with comprehensive error handling
log("\n" + "="*60)
log(" GENERATING CLICK-TO-BUY MATRIX")
log("="*60)

try:
    ctb_start_time = time.time()

    # Pre-generation memory check
    pre_ctb_memory = check_memory()
    log(f"Pre-CTB generation memory: {pre_ctb_memory}")

    click_to_buy_matrix = click_to_buy_generator.generate_matrix(prepared_data)

    ctb_generation_time = time.time() - ctb_start_time

    if click_to_buy_matrix and len(click_to_buy_matrix) > 0:
        click_to_buy_success = True
        log(f" Click-to-buy matrix generated successfully!")
        log(f"   Source items: {len(click_to_buy_matrix):,}")
        log(f"   Generation time: {ctb_generation_time:.1f} seconds")
    else:
        log("  Click-to-buy matrix generation completed but resulted in empty matrix")
        click_to_buy_success = False

except MemoryError as e:
    log(f" MEMORY ERROR during click-to-buy generation: {e}")
    log("   Attempting to recover partial results...")
    click_to_buy_success = False

    # Try to get partial results from intermediate saves
    try:
        import glob
        temp_files = glob.glob(f"{config.OUTPUT_PATH}/ctb_matrix_temp_chunk_*.pkl")
        if temp_files:
            log(f"   Found {len(temp_files)} CTB intermediate files - attempting recovery...")
            latest_file = max(temp_files, key=os.path.getctime)
            with open(latest_file, "rb") as f:
                temp_data = pickle.load(f)
                click_to_buy_matrix = temp_data.get("partial_matrix", {})
            log(f"   Recovered partial CTB matrix: {len(click_to_buy_matrix):,} items")
        else:
            click_to_buy_matrix = {}
    except Exception as recovery_error:
        log(f"   CTB recovery failed: {recovery_error}")
        click_to_buy_matrix = {}

except Exception as e:
    log(f" Unexpected error during click-to-buy generation: {e}")
    click_to_buy_success = False

    # Try to save current state
    try:
        if hasattr(click_to_buy_generator, 'covisitation_counts') and click_to_buy_generator.covisitation_counts:
            click_to_buy_generator.save_intermediate_results(force_save=True)
            # Convert current state to matrix format
            current_matrix = {}
            for source_aid, targets in click_to_buy_generator.covisitation_counts.items():
                if targets:
                    sorted_targets = sorted(targets.items(), key=lambda x: x[1], reverse=True)
                    current_matrix[source_aid] = sorted_targets[:config.MAX_CANDIDATES_PER_ITEM]
            click_to_buy_matrix = current_matrix
            log(f"   Saved partial CTB results: {len(click_to_buy_matrix):,} items")
        else:
            click_to_buy_matrix = {}
    except Exception as save_error:
        log(f"   Failed to save partial CTB results: {save_error}")
        click_to_buy_matrix = {}

finally:
    # Clean up click-to-buy generator memory
    try:
        if hasattr(click_to_buy_generator, 'covisitation_counts'):
            click_to_buy_generator.covisitation_counts.clear()
        gc.collect()
        log("   CTB generator memory cleaned up")
    except Exception as cleanup_error:
        log(f"   CTB cleanup error: {cleanup_error}")

# Post-CTB memory check
post_ctb_memory = check_memory()
log(f"Post-CTB generation memory: {post_ctb_memory}")

# Generate buy-to-buy matrix with comprehensive error handling
log("\n" + "="*60)
log(" GENERATING BUY-TO-BUY MATRIX")
log("="*60)

try:
    btb_start_time = time.time()

    # Pre-generation memory check
    pre_btb_memory = check_memory()
    log(f"Pre-BTB generation memory: {pre_btb_memory}")

    buy_to_buy_matrix = buy_to_buy_generator.generate_matrix(prepared_data)

    btb_generation_time = time.time() - btb_start_time

    if buy_to_buy_matrix and len(buy_to_buy_matrix) > 0:
        buy_to_buy_success = True
        log(f" Buy-to-buy matrix generated successfully!")
        log(f"   Source items: {len(buy_to_buy_matrix):,}")
        log(f"   Generation time: {btb_generation_time:.1f} seconds")
    else:
        log("  Buy-to-buy matrix generation completed but resulted in empty matrix")
        buy_to_buy_success = False

except MemoryError as e:
    log(f" MEMORY ERROR during buy-to-buy generation: {e}")
    log("   Attempting to recover partial results...")
    buy_to_buy_success = False

    # Try to get partial results from intermediate saves
    try:
        import glob
        temp_files = glob.glob(f"{config.OUTPUT_PATH}/btb_matrix_temp_chunk_*.pkl")
        if temp_files:
            log(f"   Found {len(temp_files)} BTB intermediate files - attempting recovery...")
            latest_file = max(temp_files, key=os.path.getctime)
            with open(latest_file, "rb") as f:
                temp_data = pickle.load(f)
                buy_to_buy_matrix = temp_data.get("partial_matrix", {})
            log(f"   Recovered partial BTB matrix: {len(buy_to_buy_matrix):,} items")
        else:
            buy_to_buy_matrix = {}
    except Exception as recovery_error:
        log(f"   BTB recovery failed: {recovery_error}")
        buy_to_buy_matrix = {}

except Exception as e:
    log(f" Unexpected error during buy-to-buy generation: {e}")
    buy_to_buy_success = False

    # Try to save current state
    try:
        if hasattr(buy_to_buy_generator, 'covisitation_counts') and buy_to_buy_generator.covisitation_counts:
            buy_to_buy_generator.save_intermediate_results(force_save=True)
            # Convert current state to matrix format
            current_matrix = {}
            for source_aid, targets in buy_to_buy_generator.covisitation_counts.items():
                if targets:
                    sorted_targets = sorted(targets.items(), key=lambda x: x[1], reverse=True)
                    current_matrix[source_aid] = sorted_targets[:config.MAX_CANDIDATES_PER_ITEM]
            buy_to_buy_matrix = current_matrix
            log(f"   Saved partial BTB results: {len(buy_to_buy_matrix):,} items")
        else:
            buy_to_buy_matrix = {}
    except Exception as save_error:
        log(f"   Failed to save partial BTB results: {save_error}")
        buy_to_buy_matrix = {}

finally:
    # Clean up buy-to-buy generator memory
    try:
        if hasattr(buy_to_buy_generator, 'covisitation_counts'):
            buy_to_buy_generator.covisitation_counts.clear()
        gc.collect()
        log("   BTB generator memory cleaned up")
    except Exception as cleanup_error:
        log(f"   BTB cleanup error: {cleanup_error}")

# Post-BTB memory check
post_btb_memory = check_memory()
log(f"Post-BTB generation memory: {post_btb_memory}")

# Generation results summary
log(f"\n MATRIX GENERATION SUMMARY:")
log(f"   Click-to-buy success: {click_to_buy_success}")
log(f"   Buy-to-buy success: {buy_to_buy_success}")
log(f"   CTB matrix size: {len(click_to_buy_matrix):,} source items")
log(f"   BTB matrix size: {len(buy_to_buy_matrix):,} source items")
log(f"   Total generation time: {ctb_generation_time + btb_generation_time:.1f} seconds")

# Final validation
both_matrices_successful = click_to_buy_success and buy_to_buy_success
if both_matrices_successful:
    log(" Both matrices generated successfully!")
elif click_to_buy_success or buy_to_buy_success:
    log("  Partial success - at least one matrix generated")
else:
    log(" Matrix generation failed - check logs for errors")

# Keep prepared_data for conversion analysis - will be cleaned up later
log(" Keeping prepared_data in memory for conversion analysis...")

[2025-08-07 18:42:17] 
 Preparing data for matrix generation...
[2025-08-07 18:42:17] Initial memory status: LOW
[2025-08-07 18:42:17] 
 Initializing memory-efficient matrix generators...
[2025-08-07 18:42:17]   Initializing memory-efficient click-to-buy matrix generator...
[2025-08-07 18:42:17]    Chunk size: 100,000 sessions
[2025-08-07 18:42:17]    Time window: 14 days (1,209,600,000 ms)
[2025-08-07 18:42:17]    Max pairs in memory: 2,000,000
[2025-08-07 18:42:17]    Save interval: 15 chunks
[2025-08-07 18:42:17]   Initializing memory-efficient buy-to-buy matrix generator...
[2025-08-07 18:42:17]    Chunk size: 150,000 sessions
[2025-08-07 18:42:17]    Time window: 7 days
[2025-08-07 18:42:17]    Max pairs in memory: 2,000,000
[2025-08-07 18:42:17]    Save interval: 15 chunks
[2025-08-07 18:42:17] 
[2025-08-07 18:42:17]  GENERATING CLICK-TO-BUY MATRIX
[2025-08-07 18:42:17] Pre-CTB generation memory: LOW
[2025-08-07 18:42:17]   Starting memory-efficient click-to-buy matrix generation

## MATRIX ANALYSIS AND VALIDATION

In [9]:
def analyze_buy_matrices(click_to_buy_matrix: Dict,
                        buy_to_buy_matrix: Dict,
                        click_to_buy_generator: ClickToBuyMatrixGenerator) -> Dict:
    """
    Comprehensive analysis of both buy matrices

    Args:
        click_to_buy_matrix: Generated click-to-buy matrix
        buy_to_buy_matrix: Generated buy-to-buy matrix
        click_to_buy_generator: Generator with temporal stats

    Returns:
        dict: Analysis results
    """
    log(" Analyzing generated buy matrices...")

    analysis_results = {
        "analysis_timestamp": datetime.now().isoformat(),
        "click_to_buy_analysis": {},
        "buy_to_buy_analysis": {},
        "temporal_analysis": {},
        "comparison_analysis": {}
    }

    # Analyze click-to-buy matrix
    if click_to_buy_matrix:
        ctb_source_items = len(click_to_buy_matrix)
        ctb_total_pairs = sum(len(candidates) for candidates in click_to_buy_matrix.values())
        ctb_avg_candidates = ctb_total_pairs / ctb_source_items if ctb_source_items > 0 else 0

        analysis_results["click_to_buy_analysis"] = {
            "source_items": ctb_source_items,
            "total_pairs": ctb_total_pairs,
            "avg_candidates_per_item": ctb_avg_candidates,
            "generation_successful": click_to_buy_success
        }

        log(f"    Click-to-buy matrix:")
        log(f"      Source items: {ctb_source_items:,}")
        log(f"      Total pairs: {ctb_total_pairs:,}")
        log(f"      Avg candidates: {ctb_avg_candidates:.1f}")

    # Analyze buy-to-buy matrix
    if buy_to_buy_matrix:
        btb_source_items = len(buy_to_buy_matrix)
        btb_total_pairs = sum(len(candidates) for candidates in buy_to_buy_matrix.values())
        btb_avg_candidates = btb_total_pairs / btb_source_items if btb_source_items > 0 else 0

        analysis_results["buy_to_buy_analysis"] = {
            "source_items": btb_source_items,
            "total_pairs": btb_total_pairs,
            "avg_candidates_per_item": btb_avg_candidates,
            "generation_successful": buy_to_buy_success
        }

        log(f"    Buy-to-buy matrix:")
        log(f"      Source items: {btb_source_items:,}")
        log(f"      Total pairs: {btb_total_pairs:,}")
        log(f"      Avg candidates: {btb_avg_candidates:.1f}")

    # Temporal analysis for click-to-buy
    if hasattr(click_to_buy_generator, 'temporal_stats'):
        temporal_stats = click_to_buy_generator.temporal_stats
        total_transitions = temporal_stats["valid_transitions"] + temporal_stats["invalid_transitions"]

        if total_transitions > 0:
            valid_pct = temporal_stats["valid_transitions"] / total_transitions * 100

            analysis_results["temporal_analysis"] = {
                "valid_transitions": temporal_stats["valid_transitions"],
                "invalid_transitions": temporal_stats["invalid_transitions"],
                "total_transitions": total_transitions,
                "valid_percentage": valid_pct,
                "sessions_processed": temporal_stats["total_sessions"]
            }

            log(f"     Temporal constraint analysis:")
            log(f"      Valid transitions: {temporal_stats['valid_transitions']:,} ({valid_pct:.1f}%)")
            log(f"      Invalid transitions: {temporal_stats['invalid_transitions']:,}")
            log(f"      Sessions processed: {temporal_stats['total_sessions']:,}")

    # Comparison analysis
    if click_to_buy_matrix and buy_to_buy_matrix:
        # Find overlapping source items
        ctb_items = set(click_to_buy_matrix.keys())
        btb_items = set(buy_to_buy_matrix.keys())
        overlap = len(ctb_items.intersection(btb_items))

        analysis_results["comparison_analysis"] = {
            "ctb_unique_items": len(ctb_items - btb_items),
            "btb_unique_items": len(btb_items - ctb_items),
            "overlapping_items": overlap,
            "total_unique_items": len(ctb_items.union(btb_items))
        }

        log(f"    Matrix comparison:")
        log(f"      Items only in click-to-buy: {len(ctb_items - btb_items):,}")
        log(f"      Items only in buy-to-buy: {len(btb_items - ctb_items):,}")
        log(f"      Overlapping items: {overlap:,}")

    log(" Matrix analysis completed!")
    return analysis_results

# Perform analysis
matrix_analysis = analyze_buy_matrices(click_to_buy_matrix, buy_to_buy_matrix, click_to_buy_generator)

[2025-08-07 18:47:51]  Analyzing generated buy matrices...
[2025-08-07 18:47:51]     Click-to-buy matrix:
[2025-08-07 18:47:51]       Source items: 841,226
[2025-08-07 18:47:51]       Total pairs: 6,851,523
[2025-08-07 18:47:51]       Avg candidates: 8.1
[2025-08-07 18:47:51]     Buy-to-buy matrix:
[2025-08-07 18:47:51]       Source items: 311,156
[2025-08-07 18:47:51]       Total pairs: 3,483,335
[2025-08-07 18:47:51]       Avg candidates: 11.2
[2025-08-07 18:47:51]      Temporal constraint analysis:
[2025-08-07 18:47:51]       Valid transitions: 49,261,768 (93.6%)
[2025-08-07 18:47:51]       Invalid transitions: 3,378,105
[2025-08-07 18:47:51]       Sessions processed: 8,346,669
[2025-08-07 18:47:51]     Matrix comparison:
[2025-08-07 18:47:52]       Items only in click-to-buy: 590,708
[2025-08-07 18:47:52]       Items only in buy-to-buy: 60,638
[2025-08-07 18:47:52]       Overlapping items: 250,518
[2025-08-07 18:47:52]  Matrix analysis completed!


## CONVERSION PATTERN ANALYSIS

In [10]:
def analyze_conversion_patterns_safe(data: pl.DataFrame = None) -> Dict:
    """
    Analyze click-to-buy conversion patterns with robust error handling

    Args:
        data: Training data (optional, will attempt to reload if not provided)

    Returns:
        dict: Conversion pattern analysis results
    """
    log(" Analyzing conversion patterns...")

    # Check if data is available
    if data is None:
        log("   No data provided - attempting to reload prepared data...")
        try:
            data = pl.read_parquet(f"{config.OUTPUT_PATH}/covisit_data_prepared.parquet")
            log(f"   Successfully reloaded data: {data.shape}")
        except Exception as reload_error:
            log(f"   Failed to reload data: {reload_error}")
            log("   Returning minimal conversion analysis...")
            return {
                "analysis_timestamp": datetime.now().isoformat(),
                "error": "Could not load data for conversion analysis",
                "sample_size": 0,
                "conversion_rates": {
                    "click_to_cart_percent": 0.0,
                    "click_to_order_percent": 0.0,
                    "cart_to_order_percent": 0.0
                },
                "session_counts": {
                    "sessions_with_clicks": 0,
                    "sessions_with_carts": 0,
                    "sessions_with_orders": 0
                },
                "conversion_timing": {
                    "avg_hours_to_conversion": 0.0,
                    "median_hours_to_conversion": 0.0,
                    "conversion_samples": 0
                }
            }

    try:
        # Sample sessions for analysis (reduced sample size to save memory)
        log("   Sampling sessions for analysis...")
        unique_sessions = data.select("session").unique()
        sample_size = min(5000, len(unique_sessions))  # Reduced from 10000 to 5000

        sample_sessions = unique_sessions.sample(sample_size, seed=42)["session"].to_list()
        sample_data = data.filter(pl.col("session").is_in(sample_sessions))
        log(f"   Using sample of {sample_size:,} sessions")

        # Conversion funnel analysis
        log("   Calculating conversion funnel...")
        session_conversion = sample_data.group_by("session").agg([
            pl.col("type").filter(pl.col("type") == "clicks").count().alias("clicks"),
            pl.col("type").filter(pl.col("type") == "carts").count().alias("carts"),
            pl.col("type").filter(pl.col("type") == "orders").count().alias("orders"),
            pl.col("aid").n_unique().alias("unique_items")
        ])

        # Calculate conversion rates
        sessions_with_clicks = session_conversion.filter(pl.col("clicks") > 0).height
        sessions_with_carts = session_conversion.filter(pl.col("carts") > 0).height
        sessions_with_orders = session_conversion.filter(pl.col("orders") > 0).height

        click_to_cart_rate = sessions_with_carts / sessions_with_clicks * 100 if sessions_with_clicks > 0 else 0
        click_to_order_rate = sessions_with_orders / sessions_with_clicks * 100 if sessions_with_clicks > 0 else 0
        cart_to_order_rate = sessions_with_orders / sessions_with_carts * 100 if sessions_with_carts > 0 else 0

        # Time to conversion analysis (smaller sample to save memory and time)
        log("   Analyzing conversion timing...")
        conversion_times = []

        # Convert to pandas for time analysis (smaller sample)
        time_analysis_sample = sample_sessions[:500]  # Even smaller sample for timing analysis
        time_sample_data = sample_data.filter(pl.col("session").is_in(time_analysis_sample))
        sample_pd = time_sample_data.to_pandas()

        for session_id in time_analysis_sample:
            try:
                session_events = sample_pd[sample_pd['session'] == session_id].sort_values('ts')

                if len(session_events) > 1:
                    clicks_data = session_events[session_events['type'] == 'clicks']
                    purchases_data = session_events[session_events['type'].isin(['carts', 'orders'])]

                    if len(clicks_data) > 0 and len(purchases_data) > 0:
                        first_click = clicks_data['ts'].min()
                        first_purchase = purchases_data['ts'].min()

                        if pd.notna(first_click) and pd.notna(first_purchase) and first_purchase > first_click:
                            time_to_conversion = (first_purchase - first_click) / (1000 * 60 * 60)  # Hours
                            if 0 < time_to_conversion < 24 * 30:  # Reasonable range: 0-30 days
                                conversion_times.append(time_to_conversion)
            except Exception as session_error:
                # Skip problematic sessions
                continue

        # Conversion time statistics
        if conversion_times:
            try:
                avg_conversion_hours = np.mean(conversion_times)
                median_conversion_hours = np.median(conversion_times)
            except Exception as stats_error:
                log(f"   Error calculating conversion time stats: {stats_error}")
                avg_conversion_hours = median_conversion_hours = 0
        else:
            avg_conversion_hours = median_conversion_hours = 0

        log(f"    Conversion analysis results:")
        log(f"      Click-to-cart rate: {click_to_cart_rate:.1f}%")
        log(f"      Click-to-order rate: {click_to_order_rate:.1f}%")
        log(f"      Cart-to-order rate: {cart_to_order_rate:.1f}%")
        log(f"      Avg time to conversion: {avg_conversion_hours:.1f} hours")
        log(f"      Timing samples analyzed: {len(conversion_times):,}")

        conversion_analysis = {
            "analysis_timestamp": datetime.now().isoformat(),
            "sample_size": sample_size,
            "conversion_rates": {
                "click_to_cart_percent": float(click_to_cart_rate),
                "click_to_order_percent": float(click_to_order_rate),
                "cart_to_order_percent": float(cart_to_order_rate)
            },
            "session_counts": {
                "sessions_with_clicks": int(sessions_with_clicks),
                "sessions_with_carts": int(sessions_with_carts),
                "sessions_with_orders": int(sessions_with_orders)
            },
            "conversion_timing": {
                "avg_hours_to_conversion": float(avg_conversion_hours),
                "median_hours_to_conversion": float(median_conversion_hours),
                "conversion_samples": len(conversion_times)
            }
        }

        log(" Conversion pattern analysis completed!")

        # Clean up sample data to save memory
        try:
            del sample_data, sample_pd, time_sample_data
            gc.collect()
            log("   Sample data cleaned up")
        except Exception as cleanup_error:
            log(f"   Cleanup warning: {cleanup_error}")

        return conversion_analysis

    except Exception as e:
        log(f" Error during conversion analysis: {e}")
        log("   Returning fallback conversion analysis...")

        # Return fallback analysis
        return {
            "analysis_timestamp": datetime.now().isoformat(),
            "error": f"Conversion analysis failed: {str(e)}",
            "sample_size": 0,
            "conversion_rates": {
                "click_to_cart_percent": 0.0,
                "click_to_order_percent": 0.0,
                "cart_to_order_percent": 0.0
            },
            "session_counts": {
                "sessions_with_clicks": 0,
                "sessions_with_carts": 0,
                "sessions_with_orders": 0
            },
            "conversion_timing": {
                "avg_hours_to_conversion": 0.0,
                "median_hours_to_conversion": 0.0,
                "conversion_samples": 0
            }
        }

# Analyze conversion patterns with robust error handling
try:
    # Check if prepared_data is still available in the current scope
    if 'prepared_data' in globals() and prepared_data is not None:
        log(" Using existing prepared_data for conversion analysis...")
        conversion_analysis = analyze_conversion_patterns_safe(prepared_data)
    else:
        log(" prepared_data not available in global scope - attempting to reload...")
        conversion_analysis = analyze_conversion_patterns_safe(None)

except Exception as analysis_error:
    log(f" Critical error in conversion analysis: {analysis_error}")
    # Create minimal fallback analysis
    conversion_analysis = {
        "analysis_timestamp": datetime.now().isoformat(),
        "error": f"Critical conversion analysis failure: {str(analysis_error)}",
        "sample_size": 0,
        "conversion_rates": {
            "click_to_cart_percent": 0.0,
            "click_to_order_percent": 0.0,
            "cart_to_order_percent": 0.0
        },
        "session_counts": {
            "sessions_with_clicks": 0,
            "sessions_with_carts": 0,
            "sessions_with_orders": 0
        },
        "conversion_timing": {
            "avg_hours_to_conversion": 0.0,
            "median_hours_to_conversion": 0.0,
            "conversion_samples": 0
        }
    }

# Now clean up prepared_data to free memory
try:
    if 'prepared_data' in globals():
        del prepared_data
        gc.collect()
        log(" prepared_data cleared from memory after conversion analysis")
except Exception as cleanup_error:
    log(f"   Cleanup warning: {cleanup_error}")

[2025-08-07 18:47:52]  Using existing prepared_data for conversion analysis...
[2025-08-07 18:47:52]  Analyzing conversion patterns...
[2025-08-07 18:47:52]    Sampling sessions for analysis...
[2025-08-07 18:47:57]    Using sample of 5,000 sessions
[2025-08-07 18:47:57]    Calculating conversion funnel...
[2025-08-07 18:47:57]    Analyzing conversion timing...
[2025-08-07 18:47:58]     Conversion analysis results:
[2025-08-07 18:47:58]       Click-to-cart rate: 29.6%
[2025-08-07 18:47:58]       Click-to-order rate: 13.0%
[2025-08-07 18:47:58]       Cart-to-order rate: 43.8%
[2025-08-07 18:47:58]       Avg time to conversion: 59.6 hours
[2025-08-07 18:47:58]       Timing samples analyzed: 154
[2025-08-07 18:47:58]  Conversion pattern analysis completed!
[2025-08-07 18:47:59]    Sample data cleaned up
[2025-08-07 18:48:02]  prepared_data cleared from memory after conversion analysis


## SAVE OUTPUTS

In [11]:
def convert_to_json_serializable(obj):
    """
    Convert non-JSON serializable objects to JSON-compatible types

    Args:
        obj: Object to convert

    Returns:
        JSON-serializable version of the object
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.bool_):
        return bool(obj)
    elif isinstance(obj, dict):
        return {key: convert_to_json_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_json_serializable(item) for item in obj]
    elif isinstance(obj, tuple):
        return [convert_to_json_serializable(item) for item in obj]
    elif hasattr(obj, 'item'):  # Handle numpy scalars
        return obj.item()
    else:
        return obj

def save_buy_matrices_outputs_enhanced(click_to_buy_matrix: Dict,
                                     buy_to_buy_matrix: Dict,
                                     matrix_analysis: Dict,
                                     conversion_analysis: Dict,
                                     validation_results: Dict,
                                     memory_log: List,
                                     click_to_buy_success: bool,
                                     buy_to_buy_success: bool,
                                     ctb_generation_time: float,
                                     btb_generation_time: float):
    """
    Save all outputs from buy matrices generation with proper JSON handling and error recovery

    Args:
        click_to_buy_matrix: Generated click-to-buy matrix
        buy_to_buy_matrix: Generated buy-to-buy matrix
        matrix_analysis: Matrix analysis results
        conversion_analysis: Conversion pattern analysis
        validation_results: Input validation results
        memory_log: Memory usage log
        click_to_buy_success: Success status of CTB generation
        buy_to_buy_success: Success status of BTB generation
        ctb_generation_time: Time taken for CTB generation
        btb_generation_time: Time taken for BTB generation
    """
    log(" Saving buy matrices outputs with enhanced error handling...")

    try:
        saved_files = {}

        # 1. Save click-to-buy matrix
        ctb_path = f"{config.OUTPUT_PATH}/click_to_buy_matrix.pkl"
        with open(ctb_path, "wb") as f:
            pickle.dump(click_to_buy_matrix, f)

        ctb_size = os.path.getsize(ctb_path) / (1024*1024)
        log(f"    click_to_buy_matrix.pkl saved ({ctb_size:.1f} MB)")
        saved_files["ctb_path"] = ctb_path

        # 2. Save buy-to-buy matrix
        btb_path = f"{config.OUTPUT_PATH}/buy_to_buy_matrix.pkl"
        with open(btb_path, "wb") as f:
            pickle.dump(buy_to_buy_matrix, f)

        btb_size = os.path.getsize(btb_path) / (1024*1024)
        log(f"    buy_to_buy_matrix.pkl saved ({btb_size:.1f} MB)")
        saved_files["btb_path"] = btb_path

        # 3. Save matrix statistics (with JSON serialization fix)
        stats_path = f"{config.OUTPUT_PATH}/buy_matrices_statistics.json"

        # Convert analysis to JSON-serializable format
        json_safe_analysis = convert_to_json_serializable(matrix_analysis)

        with open(stats_path, "w") as f:
            json.dump(json_safe_analysis, f, indent=2, default=str)
        log(f"    buy_matrices_statistics.json saved")
        saved_files["stats_path"] = stats_path

        # 4. Save temporal validation results
        temporal_path = f"{config.OUTPUT_PATH}/temporal_validation.json"
        temporal_data = {
            "generation_timestamp": datetime.now().isoformat(),
            "time_windows": convert_to_json_serializable(validation_results.get("time_windows", {})),
            "temporal_analysis": convert_to_json_serializable(matrix_analysis.get("temporal_analysis", {})),
            "validation_notes": [
                f"Click-to-buy window: {config.CLICK_TO_BUY_TIME_WINDOW_DAYS} days",
                f"Buy-to-buy window: {config.BUY_TO_BUY_TIME_WINDOW_DAYS} days",
                "Temporal constraints enforced for all relationships"
            ]
        }

        with open(temporal_path, "w") as f:
            json.dump(temporal_data, f, indent=2, default=str)
        log(f"    temporal_validation.json saved")
        saved_files["temporal_path"] = temporal_path

        # 5. Save conversion analysis (with JSON conversion and error handling)
        conversion_path = f"{config.OUTPUT_PATH}/conversion_analysis.json"

        try:
            # Validate conversion_analysis before saving
            if conversion_analysis and isinstance(conversion_analysis, dict):
                json_safe_conversion = convert_to_json_serializable(conversion_analysis)
            else:
                log("    Invalid conversion analysis - creating fallback")
                json_safe_conversion = {
                    "analysis_timestamp": datetime.now().isoformat(),
                    "error": "Conversion analysis data was invalid or missing",
                    "sample_size": 0,
                    "conversion_rates": {"click_to_cart_percent": 0.0, "click_to_order_percent": 0.0, "cart_to_order_percent": 0.0},
                    "session_counts": {"sessions_with_clicks": 0, "sessions_with_carts": 0, "sessions_with_orders": 0},
                    "conversion_timing": {"avg_hours_to_conversion": 0.0, "median_hours_to_conversion": 0.0, "conversion_samples": 0}
                }
        except Exception as conv_prep_error:
            log(f"    Error preparing conversion analysis for save: {conv_prep_error}")
            json_safe_conversion = {
                "analysis_timestamp": datetime.now().isoformat(),
                "error": f"Error preparing conversion data: {str(conv_prep_error)}",
                "sample_size": 0,
                "conversion_rates": {"click_to_cart_percent": 0.0, "click_to_order_percent": 0.0, "cart_to_order_percent": 0.0},
                "session_counts": {"sessions_with_clicks": 0, "sessions_with_carts": 0, "sessions_with_orders": 0},
                "conversion_timing": {"avg_hours_to_conversion": 0.0, "median_hours_to_conversion": 0.0, "conversion_samples": 0}
            }

        with open(conversion_path, "w") as f:
            json.dump(json_safe_conversion, f, indent=2, default=str)
        log(f"    conversion_analysis.json saved")
        saved_files["conversion_path"] = conversion_path

        # 6. Save memory usage log (with JSON conversion)
        memory_path = f"{config.OUTPUT_PATH}/buy_matrices_memory_log.json"

        try:
            json_safe_memory_log = convert_to_json_serializable(memory_log)
        except Exception as memory_convert_error:
            log(f"    Error converting memory log: {memory_convert_error}")
            json_safe_memory_log = []

        memory_data = {
            "generation_timestamp": datetime.now().isoformat(),
            "memory_log": json_safe_memory_log,
            "generation_successful": {
                "click_to_buy": click_to_buy_success,
                "buy_to_buy": buy_to_buy_success,
                "both_matrices": click_to_buy_success and buy_to_buy_success
            },
            "generation_times": {
                "click_to_buy_seconds": ctb_generation_time,
                "buy_to_buy_seconds": btb_generation_time
            }
        }

        with open(memory_path, "w") as f:
            json.dump(memory_data, f, indent=2, default=str)
        log(f"    buy_matrices_memory_log.json saved")
        saved_files["memory_path"] = memory_path

        # 7. Save comprehensive summary
        try:
            summary = {
                "notebook": "Part 2A3: Click-to-Buy & Buy-to-Buy Matrix Generation",
                "completion_timestamp": datetime.now().isoformat(),
                "generation_results": {
                    "click_to_buy_successful": click_to_buy_success,
                    "buy_to_buy_successful": buy_to_buy_success,
                    "both_matrices_generated": click_to_buy_success and buy_to_buy_success
                },
                "inputs_used": {
                    "covisit_data_prepared.parquet": "Optimized training data from Part 2A1",
                    "chunking_strategy.json": f"Chunk sizes: CTB={validation_results.get('chunk_sizes', {}).get('click_to_buy', 0):,}, BTB={validation_results.get('chunk_sizes', {}).get('buy_to_buy', 0):,}",
                    "session_analysis.json": "Session insights for optimization"
                },
                "outputs_generated": {
                    "click_to_buy_matrix.pkl": f"{matrix_analysis.get('click_to_buy_analysis', {}).get('source_items', 0):,} source items",
                    "buy_to_buy_matrix.pkl": f"{matrix_analysis.get('buy_to_buy_analysis', {}).get('source_items', 0):,} source items",
                    "buy_matrices_statistics.json": "Comprehensive matrix analysis",
                    "temporal_validation.json": "Temporal constraint validation",
                    "conversion_analysis.json": "Click-to-buy conversion patterns (with error handling)",
                    "buy_matrices_memory_log.json": "Memory usage tracking"
                },
                "key_metrics": convert_to_json_serializable({
                    "ctb_source_items": matrix_analysis.get("click_to_buy_analysis", {}).get("source_items", 0),
                    "btb_source_items": matrix_analysis.get("buy_to_buy_analysis", {}).get("source_items", 0),
                    "total_file_size_mb": float(ctb_size + btb_size),
                    "valid_temporal_transitions": matrix_analysis.get("temporal_analysis", {}).get("valid_transitions", 0),
                    "peak_memory_percent": max([entry.get("memory_percent", 0) for entry in memory_log]) if memory_log else 0,
                    "conversion_analysis_status": "completed" if not conversion_analysis.get("error") else "completed_with_errors"
                }),
                "memory_management": {
                    "intermediate_saves_used": True,
                    "emergency_saves_triggered": any(entry.get("memory_percent", 0) > 90 for entry in memory_log) if memory_log else False,
                    "memory_efficient_processing": True
                },
                "data_quality": {
                    "conversion_analysis_available": "error" not in conversion_analysis if conversion_analysis else False,
                    "temporal_validation_passed": matrix_analysis.get("temporal_analysis", {}).get("valid_percentage", 0) > 50
                },
                "next_step": "Run Part 2A4: Matrix Consolidation & Validation" if (click_to_buy_success and buy_to_buy_success) else "Review errors and retry with adjusted parameters"
            }
        except Exception as summary_error:
            log(f"    Error creating comprehensive summary: {summary_error}")
            # Create minimal summary
            summary = {
                "notebook": "Part 2A3: Click-to-Buy & Buy-to-Buy Matrix Generation",
                "completion_timestamp": datetime.now().isoformat(),
                "error": f"Summary creation error: {str(summary_error)}",
                "basic_status": "Matrices saved but summary incomplete"
            }

        summary_path = f"{config.OUTPUT_PATH}/part_2a3_summary.json"
        with open(summary_path, "w") as f:
            json.dump(summary, f, indent=2, default=str)
        log(f"    part_2a3_summary.json saved")
        saved_files["summary_path"] = summary_path

        log(" All buy matrices outputs saved successfully!")
        return saved_files

    except Exception as e:
        log(f" Error saving outputs: {e}")
        log(f"   Error type: {type(e).__name__}")
        log("   Attempting fallback saves...")

        # Fallback: save essential files only
        try:
            fallback_files = {}

            # Save matrices (most important)
            if click_to_buy_matrix:
                ctb_fallback_path = f"{config.OUTPUT_PATH}/click_to_buy_matrix.pkl"
                with open(ctb_fallback_path, "wb") as f:
                    pickle.dump(click_to_buy_matrix, f)
                fallback_files["ctb_path"] = ctb_fallback_path
                ctb_size = os.path.getsize(ctb_fallback_path) / (1024*1024)
                log(f"    Fallback: click_to_buy_matrix.pkl saved ({ctb_size:.1f} MB)")

            if buy_to_buy_matrix:
                btb_fallback_path = f"{config.OUTPUT_PATH}/buy_to_buy_matrix.pkl"
                with open(btb_fallback_path, "wb") as f:
                    pickle.dump(buy_to_buy_matrix, f)
                fallback_files["btb_path"] = btb_fallback_path
                btb_size = os.path.getsize(btb_fallback_path) / (1024*1024)
                log(f"    Fallback: buy_to_buy_matrix.pkl saved ({btb_size:.1f} MB)")

            # Save basic summary as text (avoiding JSON issues)
            summary_txt_path = f"{config.OUTPUT_PATH}/part_2a3_summary.txt"
            with open(summary_txt_path, "w") as f:
                f.write(f"Part 2A3 Summary - Emergency Save\n")
                f.write(f"Generated: {datetime.now().isoformat()}\n")
                f.write(f"Click-to-buy Success: {click_to_buy_success}\n")
                f.write(f"Buy-to-buy Success: {buy_to_buy_success}\n")
                f.write(f"CTB Source Items: {len(click_to_buy_matrix):,}\n")
                f.write(f"BTB Source Items: {len(buy_to_buy_matrix):,}\n")
                f.write(f"CTB File Size: {ctb_size:.1f} MB\n")
                f.write(f"BTB File Size: {btb_size:.1f} MB\n")
                f.write(f"Total File Size: {(ctb_size + btb_size):.1f} MB\n")
                f.write(f"Note: This is an emergency save due to JSON serialization issues\n")
                f.write(f"Main matrices were saved successfully\n")
            fallback_files["summary_txt_path"] = summary_txt_path
            log(f"    Fallback: part_2a3_summary.txt saved")

            return fallback_files

        except Exception as fallback_error:
            log(f" Fallback save also failed: {fallback_error}")
            raise e

# Save all outputs with enhanced error handling
try:
    output_paths = save_buy_matrices_outputs_enhanced(
        click_to_buy_matrix,
        buy_to_buy_matrix,
        matrix_analysis,
        conversion_analysis,
        validation_results,
        memory_log,
        click_to_buy_success,
        buy_to_buy_success,
        ctb_generation_time,
        btb_generation_time
    )
    log(" Output saving completed successfully!")
except Exception as save_error:
    log(f" Critical error saving outputs: {save_error}")
    # Create minimal output paths for summary section
    output_paths = {
        "ctb_path": f"{config.OUTPUT_PATH}/click_to_buy_matrix.pkl",
        "btb_path": f"{config.OUTPUT_PATH}/buy_to_buy_matrix.pkl",
        "stats_path": f"{config.OUTPUT_PATH}/buy_matrices_statistics.json",
        "temporal_path": f"{config.OUTPUT_PATH}/temporal_validation.json",
        "conversion_path": f"{config.OUTPUT_PATH}/conversion_analysis.json",
        "summary_path": f"{config.OUTPUT_PATH}/part_2a3_summary.json"
    }

[2025-08-07 18:48:02]  Saving buy matrices outputs with enhanced error handling...
[2025-08-07 18:48:07]     click_to_buy_matrix.pkl saved (65.3 MB)
[2025-08-07 18:48:10]     buy_to_buy_matrix.pkl saved (32.3 MB)
[2025-08-07 18:48:10]     buy_matrices_statistics.json saved
[2025-08-07 18:48:10]     temporal_validation.json saved
[2025-08-07 18:48:10]     conversion_analysis.json saved
[2025-08-07 18:48:10]     buy_matrices_memory_log.json saved
[2025-08-07 18:48:10]     part_2a3_summary.json saved
[2025-08-07 18:48:10]  All buy matrices outputs saved successfully!
[2025-08-07 18:48:10]  Output saving completed successfully!


## FINAL SUMMARY AND NEXT STEPS

In [12]:
log("\n" + "="*80)
log("PART 2A3 COMPLETED: CLICK-TO-BUY & BUY-TO-BUY MATRIX GENERATION")
log("="*80)

log(f"\n MATRIX GENERATION RESULTS:")
if matrix_analysis.get("click_to_buy_analysis"):
    ctb_analysis = matrix_analysis["click_to_buy_analysis"]
    log(f" Click-to-buy matrix:")
    log(f"    Source items: {ctb_analysis['source_items']:,}")
    log(f"    Total pairs: {ctb_analysis['total_pairs']:,}")
    log(f"    Avg candidates: {ctb_analysis['avg_candidates_per_item']:.1f}")

if matrix_analysis.get("buy_to_buy_analysis"):
    btb_analysis = matrix_analysis["buy_to_buy_analysis"]
    log(f" Buy-to-buy matrix:")
    log(f"    Source items: {btb_analysis['source_items']:,}")
    log(f"    Total pairs: {btb_analysis['total_pairs']:,}")
    log(f"    Avg candidates: {btb_analysis['avg_candidates_per_item']:.1f}")

if matrix_analysis.get("temporal_analysis"):
    temporal = matrix_analysis["temporal_analysis"]
    log(f" Temporal validation:")
    log(f"     Valid transitions: {temporal.get('valid_transitions', 0):,}")
    log(f"     Invalid transitions: {temporal.get('invalid_transitions', 0):,}")
    log(f"     Valid percentage: {temporal.get('valid_percentage', 0):.1f}%")

log(f"\n CONVERSION INSIGHTS:")
# Handle cases where conversion analysis might have errors
try:
    conv_rates = conversion_analysis.get("conversion_rates", {})
    conv_timing = conversion_analysis.get("conversion_timing", {})

    # Check if there was an error in conversion analysis
    if "error" in conversion_analysis:
        log(f"  Conversion analysis had issues: {conversion_analysis['error']}")
        log(f" Using fallback conversion metrics:")
    else:
        log(f" Conversion analysis results:")

    log(f"   Click-to-cart rate: {conv_rates.get('click_to_cart_percent', 0):.1f}%")
    log(f"   Click-to-order rate: {conv_rates.get('click_to_order_percent', 0):.1f}%")
    log(f"   Cart-to-order rate: {conv_rates.get('cart_to_order_percent', 0):.1f}%")
    log(f"   Avg time to conversion: {conv_timing.get('avg_hours_to_conversion', 0):.1f} hours")
    log(f"   Sample size: {conversion_analysis.get('sample_size', 0):,} sessions")

except Exception as conv_error:
    log(f"  Error displaying conversion insights: {conv_error}")
    # Provide minimal fallback display
    log(f" Conversion analysis: Not available due to errors")

log(f"\n OUTPUT FILES GENERATED:")
try:
    if 'output_paths' in globals() and output_paths:
        for description, path in output_paths.items():
            filename = os.path.basename(path)
            try:
                if path.endswith('.pkl') and os.path.exists(path):
                    file_size = os.path.getsize(path) / (1024*1024)
                    log(f"    {filename} ({file_size:.1f} MB)")
                elif os.path.exists(path):
                    log(f"    {filename}")
                else:
                    log(f"    {filename} (not found)")
            except Exception as file_error:
                log(f"    {filename} (error checking: {file_error})")
        log(f" Files location: {config.OUTPUT_PATH}")
    else:
        log("    Output paths not available - files may not have been saved properly")

except Exception as file_list_error:
    log(f"  Error listing output files: {file_list_error}")

# Quality assessment with robust error handling
log(f"\n QUALITY ASSESSMENT:")
try:
    matrices_generated = click_to_buy_success and buy_to_buy_success
    temporal_valid = matrix_analysis.get("temporal_analysis", {}).get("valid_percentage", 0) > 50

    # Handle conversion rate check safely
    try:
        conversion_reasonable = conv_rates.get("click_to_cart_percent", 0) > 1
        conv_rate_display = conv_rates.get("click_to_cart_percent", 0)
    except:
        conversion_reasonable = False
        conv_rate_display = 0

    log(f"   Both matrices generated: {' yes' if matrices_generated else ' no'}")
    log(f"   Temporal validation (>50%): {' yes' if temporal_valid else ' no'} ({matrix_analysis.get('temporal_analysis', {}).get('valid_percentage', 0):.1f}%)")
    log(f"   Conversion rates reasonable: {' yes' if conversion_reasonable else ' no'} ({conv_rate_display:.1f}%)")

    # File size check with error handling
    try:
        if 'output_paths' in globals() and output_paths and 'ctb_path' in output_paths and 'btb_path' in output_paths:
            total_size = sum(os.path.getsize(output_paths[key]) for key in ['ctb_path', 'btb_path'] if key in output_paths and os.path.exists(output_paths[key]))
            file_sizes_ok = total_size < 500 * 1024 * 1024  # 500MB limit
            total_size_mb = total_size / (1024 * 1024)
            log(f"   File sizes appropriate: {' yes' if file_sizes_ok else ' large'} ({total_size_mb:.1f} MB total)")
        else:
            log(f"   File sizes appropriate:  unknown")
            file_sizes_ok = True  # Assume OK if we can't check
    except Exception as size_error:
        log(f"   File sizes appropriate:  error checking ({size_error})")
        file_sizes_ok = True

    # Overall quality assessment
    quality_checks = [matrices_generated, temporal_valid, conversion_reasonable, file_sizes_ok]
    quality_score = sum(quality_checks)

    if quality_score >= 3:
        overall_quality = " EXCELLENT"
    elif quality_score >= 2:
        overall_quality = " GOOD"
    elif quality_score >= 1:
        overall_quality = "  ACCEPTABLE"
    else:
        overall_quality = " NEEDS REVIEW"

    log(f"\n Overall Quality: {overall_quality} ({quality_score}/4 criteria met)")

except Exception as quality_error:
    log(f"  Error in quality assessment: {quality_error}")
    log(f" Overall Quality:  Unable to assess due to errors")

# Performance metrics
try:
    total_generation_time = ctb_generation_time + btb_generation_time
    if total_generation_time > 0:
        log(f"\n  PERFORMANCE METRICS:")
        log(f"   Total generation time: {total_generation_time:.1f} seconds ({total_generation_time/60:.1f} minutes)")
        log(f"   Click-to-buy time: {ctb_generation_time:.1f} seconds")
        log(f"   Buy-to-buy time: {btb_generation_time:.1f} seconds")

        # Processing speed estimate
        try:
            total_sessions_processed = validation_results.get("mixed_sessions", 0)
            if total_sessions_processed > 0:
                sessions_per_second = total_sessions_processed / total_generation_time
                log(f"   Processing speed: ~{sessions_per_second:.1f} sessions/second")
        except Exception as speed_error:
            log(f"   Processing speed: Unable to calculate")

except Exception as perf_error:
    log(f"  Error calculating performance metrics: {perf_error}")

# Next steps recommendations
log(f"\n RECOMMENDATIONS:")
try:
    if both_matrices_successful:
        log(f"    Matrix generation completed successfully!")
        log(f"    Ready to proceed to Part 2A4: Matrix Consolidation & Validation")
        log(f"    Both matrices have good quality and are suitable for recommendations")
    else:
        log(f"     Matrix generation incomplete:")
        if click_to_buy_success:
            log(f"       Click-to-buy matrix: Success")
        else:
            log(f"       Click-to-buy matrix: Failed")
        if buy_to_buy_success:
            log(f"       Buy-to-buy matrix: Success")
        else:
            log(f"       Buy-to-buy matrix: Failed")
        log(f"    Review logs for error details and retry if needed")

    # Memory optimization suggestions
    try:
        if 'memory_log' in globals() and memory_log:
            peak_memory = max([entry.get("memory_percent", 0) for entry in memory_log])
            if peak_memory > 85:
                log(f"    Memory usage was high ({peak_memory:.1f}% peak)")
                log(f"      Consider reducing chunk sizes for future runs")
            elif peak_memory > 0:
                log(f"    Memory usage was acceptable ({peak_memory:.1f}% peak)")
        else:
            log(f"    Memory usage: No data available")
    except Exception as memory_rec_error:
        log(f"    Memory usage: Unable to analyze")

except Exception as rec_error:
    log(f"  Error generating recommendations: {rec_error}")

# Final cleanup
log(f"\n PERFORMING FINAL CLEANUP...")
cleanup_items = []

try:
    # Clean up any remaining large objects
    items_to_cleanup = ['click_to_buy_matrix', 'buy_to_buy_matrix', 'click_to_buy_generator', 'buy_to_buy_generator']

    for item in items_to_cleanup:
        if item in globals():
            try:
                item_obj = globals()[item]
                item_size = len(item_obj) if hasattr(item_obj, '__len__') else 'unknown'
                del globals()[item]
                cleanup_items.append(f"{item} ({item_size} items)" if item_size != 'unknown' else item)
            except Exception as item_cleanup_error:
                cleanup_items.append(f"{item} (cleanup error)")

    # Force garbage collection
    for i in range(3):
        gc.collect()

    if cleanup_items:
        log(f"     Cleaned up: {', '.join(cleanup_items)}")

    final_memory_status = check_memory()
    log(f"    Final memory status: {final_memory_status}")

except Exception as final_cleanup_error:
    log(f"    Cleanup error: {final_cleanup_error}")

log(f"\n Part 2A3 processing completed!")
log(f" Check all output files in: {config.OUTPUT_PATH}")

# Save final execution summary
try:
    execution_summary = {
        "notebook": "Part 2A3: Click-to-Buy & Buy-to-Buy Matrix Generation",
        "completion_timestamp": datetime.now().isoformat(),
        "success_status": {
            "click_to_buy_successful": click_to_buy_success,
            "buy_to_buy_successful": buy_to_buy_success,
            "both_successful": click_to_buy_success and buy_to_buy_success
        },
        "performance": {
            "total_generation_time_seconds": total_generation_time if 'total_generation_time' in locals() else 0,
            "peak_memory_percent": peak_memory if 'peak_memory' in locals() else 0
        },
        "quality_score": quality_score if 'quality_score' in locals() else 0,
        "next_step": "Part 2A4: Matrix Consolidation & Validation" if both_matrices_successful else "Review errors and retry"
    }

    with open(f"{config.OUTPUT_PATH}/part_2a3_execution_summary.json", "w") as f:
        json.dump(execution_summary, f, indent=2, default=str)

    log(f" Execution summary saved: part_2a3_execution_summary.json")

except Exception as summary_save_error:
    log(f"  Could not save execution summary: {summary_save_error}")

log(f" Part 2A3 finished!")

[2025-08-07 18:48:10] 
[2025-08-07 18:48:10] PART 2A3 COMPLETED: CLICK-TO-BUY & BUY-TO-BUY MATRIX GENERATION
[2025-08-07 18:48:10] 
 MATRIX GENERATION RESULTS:
[2025-08-07 18:48:10]  Click-to-buy matrix:
[2025-08-07 18:48:10]     Source items: 841,226
[2025-08-07 18:48:10]     Total pairs: 6,851,523
[2025-08-07 18:48:10]     Avg candidates: 8.1
[2025-08-07 18:48:10]  Buy-to-buy matrix:
[2025-08-07 18:48:10]     Source items: 311,156
[2025-08-07 18:48:10]     Total pairs: 3,483,335
[2025-08-07 18:48:10]     Avg candidates: 11.2
[2025-08-07 18:48:10]  Temporal validation:
[2025-08-07 18:48:10]      Valid transitions: 49,261,768
[2025-08-07 18:48:10]      Invalid transitions: 3,378,105
[2025-08-07 18:48:10]      Valid percentage: 93.6%
[2025-08-07 18:48:10] 
 CONVERSION INSIGHTS:
[2025-08-07 18:48:10]  Conversion analysis results:
[2025-08-07 18:48:10]    Click-to-cart rate: 29.6%
[2025-08-07 18:48:10]    Click-to-order rate: 13.0%
[2025-08-07 18:48:10]    Cart-to-order rate: 43.8%
[2025-