# Part 2A2: Click-to-Click Matrix Generation

In [1]:
# Install required packages
!pip install polars==0.20.31
!pip install psutil

import polars as pl
import pandas as pd
import numpy as np
import gc
import os
import pickle
import json
import time
import psutil
from typing import Dict, List, Tuple, Optional
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Mount Google Drive (Colab only) so the /content/drive/... paths used by Config resolve
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Configuration
class Config:
    """Central settings for this notebook: storage locations, matrix limits and safety caps."""

    # Shared parent folder of DATA_PATH and OUTPUT_PATH on the mounted drive
    _CONTENT_ROOT = '/content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content'
    DATA_PATH = _CONTENT_ROOT + '/otto-data'
    OUTPUT_PATH = _CONTENT_ROOT + '/otto-output'

    # --- Matrix generation parameters ---
    # Upper bound on stored candidates for each source item
    MAX_CANDIDATES_PER_ITEM = 40
    # Number of chunks between two memory probes
    MEMORY_CHECK_INTERVAL = 50
    # Memory usage (percent) above which processing stops as an emergency
    EMERGENCY_MEMORY_THRESHOLD = 90
    # Width of the click co-visitation time window, in hours
    TIME_WINDOW_HOURS = 24

    # --- Safety parameters ---
    # Processing is abandoned once this many hours have elapsed
    MAX_PROCESSING_TIME_HOURS = 3
    # Intended auto-save cadence in chunks
    # NOTE(review): not referenced by the code visible in this notebook section — confirm it is still used
    AUTO_SAVE_INTERVAL = 1000


config = Config()

## LOGGING AND MONITORING SETUP

In [4]:
def setup_logging_and_monitoring():
    """
    Build the logging and memory-monitoring helpers used throughout the notebook.

    Returns:
        tuple: (log_message, check_memory_usage, memory_log)
            - log_message(message): prints a timestamped line and appends it to
              ``click_matrix_generation_log.txt`` under ``config.OUTPUT_PATH``.
            - check_memory_usage(): samples system memory via psutil, records the
              sample in ``memory_log`` and returns one of
              "CRITICAL", "HIGH", "NORMAL" or "LOW".
            - memory_log: the list that accumulates every memory sample (dicts).
    """
    # Make sure the log directory exists; otherwise the very first log call
    # would fail with FileNotFoundError when opening the log file.
    os.makedirs(config.OUTPUT_PATH, exist_ok=True)
    log_file = f"{config.OUTPUT_PATH}/click_matrix_generation_log.txt"
    memory_log = []

    def log_message(message):
        """Print a timestamped message and append it to the log file."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_entry = f"[{timestamp}] {message}"
        print(log_entry)

        # Also write to file. UTF-8 is forced because some messages in this
        # notebook contain non-ASCII characters (e.g. an arrow), which would
        # fail under a non-UTF-8 platform default encoding.
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(log_entry + "\n")

    def check_memory_usage():
        """Sample current memory usage, record it, and classify the pressure level."""
        memory = psutil.virtual_memory()
        memory_pct = memory.percent
        available_gb = memory.available / (1024**3)

        memory_entry = {
            "timestamp": datetime.now().isoformat(),
            "memory_percent": memory_pct,
            "available_gb": available_gb,
            "used_gb": memory.used / (1024**3)
        }
        memory_log.append(memory_entry)

        # Levels are checked from most to least severe; only levels above 60%
        # produce a log line.
        if memory_pct > config.EMERGENCY_MEMORY_THRESHOLD:
            log_message(f"CRITICAL MEMORY WARNING: {memory_pct:.1f}% used, {available_gb:.1f} GB available")
            return "CRITICAL"
        elif memory_pct > 80:
            log_message(f"HIGH MEMORY USAGE: {memory_pct:.1f}% used, {available_gb:.1f} GB available")
            return "HIGH"
        elif memory_pct > 60:
            log_message(f"Memory usage: {memory_pct:.1f}% used, {available_gb:.1f} GB available")
            return "NORMAL"
        else:
            return "LOW"

    return log_message, check_memory_usage, memory_log

# Obtain the logger, the memory probe and the shared memory-sample list
log, check_memory, memory_log = setup_logging_and_monitoring()

# Start-of-run banner (rule, title, rule)
for banner_line in ("="*80, "OTTO PART 2A2: CLICK-TO-CLICK MATRIX GENERATION STARTED", "="*80):
    log(banner_line)

# Baseline memory sample taken before any data is loaded
initial_memory_status = check_memory()
log(f"Initial memory status: {initial_memory_status}")

[2025-08-07 16:13:19] OTTO PART 2A2: CLICK-TO-CLICK MATRIX GENERATION STARTED
[2025-08-07 16:13:19] Initial memory status: LOW


## INPUT VALIDATION AND LOADING

In [5]:
def validate_and_load_inputs():
    """
    Validate and load all required inputs from Part 2A1.

    Checks that the three required files exist under ``config.OUTPUT_PATH``,
    loads them, verifies that the prepared data contains click events and
    resolves the session chunk size to use for click-to-click processing.

    Returns:
        tuple: (prepared_data, chunking_strategy, session_analysis, validation_results)

    Raises:
        FileNotFoundError: if any required input file is missing.
        ValueError: if the prepared data contains no click events.
        Exception: any error raised while reading the inputs is logged and re-raised.
    """
    default_chunk_size = 10000

    log("Validating and loading inputs from Part 2A1...")

    # Required input files
    required_files = {
        "covisit_data_prepared.parquet": "Optimized training data from Part 2A1",
        "chunking_strategy.json": "Memory management configuration from Part 2A1",
        "session_analysis.json": "Session analysis results from Part 2A1"
    }

    # Check if files exist
    missing_files = []
    for filename, description in required_files.items():
        filepath = f"{config.OUTPUT_PATH}/{filename}"
        if not os.path.exists(filepath):
            missing_files.append(f"{filename} - {description}")
        else:
            file_size = os.path.getsize(filepath) / (1024*1024)  # MB
            log(f"{filename} - {file_size:.1f} MB")

    if missing_files:
        log("MISSING REQUIRED INPUT FILES:")
        for missing in missing_files:
            log(f"   {missing}")
        log("\nTO FIX THIS:")
        log("   Run Part 2A1 (Data Preparation & Session Analysis) to generate required files")
        raise FileNotFoundError("Required input files are missing!")

    log("All required input files found!")

    # Load data
    log("\nLoading input data...")

    try:
        # Load prepared data
        log("   Loading optimized training data...")
        prepared_data = pl.read_parquet(f"{config.OUTPUT_PATH}/covisit_data_prepared.parquet")
        log(f"   Prepared data: {prepared_data.shape} ({prepared_data.estimated_size('mb'):.1f} MB)")

        # Load chunking strategy
        log("   Loading chunking strategy...")
        with open(f"{config.OUTPUT_PATH}/chunking_strategy.json", "r") as f:
            chunking_strategy = json.load(f)
        log("   Chunking strategy loaded")

        # Load session analysis
        log("   Loading session analysis...")
        with open(f"{config.OUTPUT_PATH}/session_analysis.json", "r") as f:
            session_analysis = json.load(f)
        log("   Session analysis loaded")

        # Validate data for click matrix generation
        log("   Validating data for click matrix generation...")

        # Check for click events
        click_events = prepared_data.filter(pl.col("type") == "clicks")
        click_sessions = click_events.select("session").n_unique()
        total_clicks = len(click_events)

        log("      Click events validation:")
        log(f"      Total click events: {total_clicks:,}")
        log(f"      Sessions with clicks: {click_sessions:,}")

        # Guard BEFORE computing the average: with no click events there are
        # no click sessions either, and the division would raise
        # ZeroDivisionError instead of this explicit error.
        if total_clicks == 0:
            raise ValueError("No click events found in prepared data!")

        avg_clicks_per_session = total_clicks / click_sessions
        log(f"      Avg clicks per session: {avg_clicks_per_session:.1f}")

        # Check chunking strategy for click-to-click
        configured_chunk_size = (chunking_strategy.get("chunk_sizes") or {}).get("click_to_click")
        if configured_chunk_size is None:
            log("   No click-to-click chunk size in strategy, using default")
            chunk_size = default_chunk_size
        else:
            # JSON may deliver the value as a float or string; the generator
            # needs a positive integer (it is used for slicing and range()).
            try:
                chunk_size = int(configured_chunk_size)
            except (TypeError, ValueError):
                chunk_size = 0
            if chunk_size <= 0:
                log(f"   Invalid click-to-click chunk size {configured_chunk_size!r} in strategy, using default")
                chunk_size = default_chunk_size

        log(f"   Using chunk size: {chunk_size:,} sessions")

        validation_results = {
            "timestamp": datetime.now().isoformat(),
            "total_click_events": total_clicks,
            "click_sessions": click_sessions,
            "avg_clicks_per_session": avg_clicks_per_session,
            "chunk_size": chunk_size,
            "memory_pressure": chunking_strategy.get("memory_pressure", "UNKNOWN")
        }

        log("Input validation completed successfully!")
        return prepared_data, chunking_strategy, session_analysis, validation_results

    except Exception as e:
        log(f"Error loading input data: {e}")
        # Bare raise keeps the original traceback intact
        raise

# Load and validate inputs; prepared_data and validation_results are consumed by the matrix generation cell below
prepared_data, chunking_strategy, session_analysis, validation_results = validate_and_load_inputs()

[2025-08-07 16:13:19] Validating and loading inputs from Part 2A1...
[2025-08-07 16:13:19] covisit_data_prepared.parquet - 1605.4 MB
[2025-08-07 16:13:19] chunking_strategy.json - 0.0 MB
[2025-08-07 16:13:19] session_analysis.json - 0.0 MB
[2025-08-07 16:13:19] All required input files found!
[2025-08-07 16:13:19] 
Loading input data...
[2025-08-07 16:13:19]    Loading optimized training data...
[2025-08-07 16:13:32]    Prepared data: (216384937, 7) (5159.0 MB)
[2025-08-07 16:13:32]    Loading chunking strategy...
[2025-08-07 16:13:32]    Chunking strategy loaded
[2025-08-07 16:13:32]    Loading session analysis...
[2025-08-07 16:13:32]    Session analysis loaded
[2025-08-07 16:13:32]    Validating data for click matrix generation...
[2025-08-07 16:13:39]       Click events validation:
[2025-08-07 16:13:39]       Total click events: 194,625,054
[2025-08-07 16:13:39]       Sessions with clicks: 12,899,779
[2025-08-07 16:13:39]       Avg clicks per session: 15.1
[2025-08-07 16:13:39]    

## CLICK-TO-CLICK MATRIX GENERATOR

In [6]:
class ClickToClickMatrixGenerator:
    """
    Memory-optimized click-to-click co-visitation matrix generator.

    Sessions are processed in fixed-size chunks. Directed pair counts are
    accumulated in ``covisitation_counts`` (``{source_aid: {target_aid: count}}``)
    and periodically flushed to pickle files under ``config.OUTPUT_PATH``; the
    flushed partial matrices are merged back together at the end of
    ``generate_matrix``.

    Caveat: every flushed partial matrix keeps only the top
    ``config.MAX_CANDIDATES_PER_ITEM`` targets per source item, so counts that
    fall outside that cut in a given partial are not carried into the final
    merge. The final ranking is therefore an approximation.

    Relies on the notebook-level ``log``, ``check_memory``, ``config`` and ``pl`` names.
    """

    # Sessions with more clicks than this are skipped entirely to bound pair generation
    MAX_SESSION_LENGTH = 100
    # Upper bound on directed pairs counted for one session
    MAX_PAIRS_PER_SESSION = 1000
    # Per-item candidate cap used by cleanup_memory(force_aggressive=True)
    AGGRESSIVE_MAX_CANDIDATES = 20

    def __init__(self, chunk_size: int):
        """
        Args:
            chunk_size: number of sessions processed per chunk (must be positive).

        Raises:
            ValueError: if ``chunk_size`` is not positive (it is used as a divisor
                and slice step when chunking the session list).
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

        self.chunk_size = chunk_size
        self.covisitation_counts = {}
        self.processed_sessions = 0
        self.processed_chunks = 0
        self.start_time = time.time()
        self.last_save_time = time.time()
        self.last_memory_cleanup = time.time()

        # Paths of the intermediate files written by THIS instance; only these
        # are merged at the end, so leftovers of earlier runs are never mixed in.
        self.intermediate_files = []

        # Memory management settings
        self.max_pairs_in_memory = 5000000  # Max 5M pairs before cleanup
        self.cleanup_threshold_mb = 8000    # Flush to disk if estimated size > 8GB
        self.save_interval_chunks = 25      # Save every 25 chunks

        log("    Initializing memory-efficient click-to-click matrix generator...")
        log(f"   Chunk size: {self.chunk_size:,} sessions")
        log(f"   Max candidates per item: {config.MAX_CANDIDATES_PER_ITEM}")
        log(f"   Time window: {config.TIME_WINDOW_HOURS} hours")
        log(f"   Memory management: {self.max_pairs_in_memory:,} max pairs, cleanup every {self.save_interval_chunks} chunks")

    @staticmethod
    def _top_candidates(targets: Dict, limit: int) -> List:
        """Return the ``limit`` highest-count ``(target_aid, count)`` pairs, best first."""
        return sorted(targets.items(), key=lambda item: item[1], reverse=True)[:limit]

    def process_session_chunk(self, session_chunk: List[int], click_data: "pl.DataFrame") -> Dict:
        """
        Count co-visited item pairs for one chunk of sessions.

        Args:
            session_chunk: session ids belonging to this chunk.
            click_data: frame with at least ``session``, ``aid`` and ``ts`` columns.

        Returns:
            dict mapping directed ``(aid1, aid2)`` tuples to their count within
            the chunk. Every qualifying pair is counted in both directions.
        """
        chunk_covisitations = {}

        # Filter data for this chunk
        chunk_data = click_data.filter(pl.col("session").is_in(session_chunk))

        if len(chunk_data) == 0:
            return chunk_covisitations

        # One row per session holding the list of items and the list of timestamps
        session_groups = chunk_data.group_by("session").agg([
            pl.col("aid").alias("aids"),
            pl.col("ts").alias("timestamps")
        ])

        # NOTE(review): the window is expressed in milliseconds, which assumes
        # `ts` holds epoch milliseconds — confirm against the Part 2A1 output.
        time_window_ms = config.TIME_WINDOW_HOURS * 60 * 60 * 1000
        max_pairs_per_session = self.MAX_PAIRS_PER_SESSION

        for _session_id, aids, timestamps in session_groups.iter_rows():
            n_events = len(aids)

            # Need at least 2 items; very long sessions are skipped to save memory
            if n_events < 2 or n_events > self.MAX_SESSION_LENGTH:
                continue

            pairs_in_session = 0

            for i in range(n_events - 1):
                aid1, ts1 = aids[i], timestamps[i]

                for j in range(i + 1, n_events):
                    if pairs_in_session >= max_pairs_per_session:
                        break

                    aid2, ts2 = aids[j], timestamps[j]

                    # Count the pair when both clicks fall within the time window
                    if abs(ts2 - ts1) <= time_window_ms and aid1 != aid2:
                        chunk_covisitations[(aid1, aid2)] = chunk_covisitations.get((aid1, aid2), 0) + 1
                        chunk_covisitations[(aid2, aid1)] = chunk_covisitations.get((aid2, aid1), 0) + 1
                        pairs_in_session += 2

                # Propagate the per-session cap out of the outer loop as well
                if pairs_in_session >= max_pairs_per_session:
                    break

        return chunk_covisitations

    def merge_chunk_results(self, chunk_covisitations: Dict):
        """
        Add the pair counts of one chunk to the in-memory accumulator.

        Args:
            chunk_covisitations: ``{(aid1, aid2): count}`` as returned by
                ``process_session_chunk``.
        """
        for (aid1, aid2), count in chunk_covisitations.items():
            targets = self.covisitation_counts.setdefault(aid1, {})
            targets[aid2] = targets.get(aid2, 0) + count

    def estimate_memory_usage(self) -> float:
        """
        Estimate the size of the in-memory accumulator in MB.

        Uses a flat ~50 bytes per stored pair; this is a rough heuristic, not a
        measurement of the real Python object footprint.
        """
        try:
            total_pairs = sum(len(targets) for targets in self.covisitation_counts.values())
        except (AttributeError, TypeError):
            # Accumulator is not in the expected dict-of-dicts shape
            return 0.0
        return total_pairs * 50 / (1024 * 1024)

    def cleanup_memory(self, force_aggressive: bool = False):
        """
        Shrink the accumulator by keeping only the top candidates per source item.

        Args:
            force_aggressive: when True, keep ``AGGRESSIVE_MAX_CANDIDATES`` targets
                per item instead of ``config.MAX_CANDIDATES_PER_ITEM``.
        """
        log(f"   Performing memory cleanup (aggressive={force_aggressive})...")

        before_memory = self.estimate_memory_usage()

        # Keep only top candidates per source item
        max_candidates = self.AGGRESSIVE_MAX_CANDIDATES if force_aggressive else config.MAX_CANDIDATES_PER_ITEM

        cleaned_counts = {}
        for source_aid, targets in self.covisitation_counts.items():
            if len(targets) > max_candidates:
                cleaned_counts[source_aid] = dict(self._top_candidates(targets, max_candidates))
            else:
                cleaned_counts[source_aid] = targets

        self.covisitation_counts = cleaned_counts

        # Force garbage collection
        gc.collect()

        after_memory = self.estimate_memory_usage()
        log(f"   Memory cleanup completed: {before_memory:.1f}MB → {after_memory:.1f}MB")
        self.last_memory_cleanup = time.time()

    def save_intermediate_results(self, force_save: bool = False):
        """
        Flush the accumulator to a pickle file and clear it from memory.

        A save happens when ``force_save`` is True or when ``processed_chunks``
        is a positive multiple of ``save_interval_chunks``. Nothing is written
        when the accumulator is empty: the file name depends only on
        ``processed_chunks``, so a forced save right after a regular one would
        otherwise overwrite the real data with an empty matrix.

        Failures are logged and swallowed; in that case the accumulator is kept
        in memory so no counts are lost.

        Args:
            force_save: save regardless of the chunk interval.
        """
        current_time = time.time()

        on_interval = self.processed_chunks > 0 and self.processed_chunks % self.save_interval_chunks == 0
        if not (force_save or on_interval):
            return

        if not self.covisitation_counts:
            log(f"   No in-memory pairs to save (chunk {self.processed_chunks}) - skipping intermediate save")
            return

        log(f"   Saving intermediate results (chunk {self.processed_chunks})...")

        try:
            temp_path = f"{config.OUTPUT_PATH}/click_matrix_temp_chunk_{self.processed_chunks}.pkl"

            # Convert to final format before saving
            temp_matrix = {
                source_aid: self._top_candidates(targets, config.MAX_CANDIDATES_PER_ITEM)
                for source_aid, targets in self.covisitation_counts.items()
                if targets
            }

            with open(temp_path, "wb") as f:
                pickle.dump({
                    "partial_matrix": temp_matrix,
                    "processed_sessions": self.processed_sessions,
                    "processed_chunks": self.processed_chunks,
                    "timestamp": datetime.now().isoformat()
                }, f)

            # Remember the file only once it has been written completely
            if temp_path not in self.intermediate_files:
                self.intermediate_files.append(temp_path)

            file_size = os.path.getsize(temp_path) / (1024*1024)
            log(f"   Intermediate results saved: {temp_path} ({file_size:.1f} MB)")
            self.last_save_time = current_time

            # Clear memory after saving
            self.covisitation_counts.clear()
            gc.collect()
            log("   Memory cleared after save")

        except Exception as e:
            log(f"   Failed to save intermediate results: {e}")

    def generate_matrix(self, click_data: "pl.DataFrame") -> Dict:
        """
        Generate the complete click-to-click co-visitation matrix.

        Processes all sessions of ``click_data`` chunk by chunk, flushing partial
        results to disk on a fixed chunk interval or when the estimated
        accumulator size exceeds ``cleanup_threshold_mb``. Processing stops early
        when the time limit is reached or memory becomes critical; whatever has
        been processed so far is still merged and returned. A chunk that raises
        is logged and skipped.

        Args:
            click_data: frame with ``session``, ``aid`` and ``ts`` columns.

        Returns:
            dict mapping each source item to a list of ``(target_aid, count)``
            tuples, best first, at most ``config.MAX_CANDIDATES_PER_ITEM`` long.
        """
        log("Starting memory-efficient click-to-click matrix generation...")

        # Get unique sessions with clicks
        click_sessions = click_data.select("session").unique().sort("session")
        total_sessions = len(click_sessions)

        log(f"   Processing {total_sessions:,} sessions with clicks")
        log(f"   Using {self.chunk_size:,} sessions per chunk")

        num_chunks = (total_sessions + self.chunk_size - 1) // self.chunk_size
        log(f"   Total chunks to process: {num_chunks}")

        # Process sessions in chunks
        session_list = click_sessions["session"].to_list()

        for chunk_idx in range(num_chunks):
            chunk_start_time = time.time()

            # Check processing time limit (measured from construction of the generator)
            total_elapsed = time.time() - self.start_time
            if total_elapsed > (config.MAX_PROCESSING_TIME_HOURS * 3600):
                log(f"PROCESSING TIME LIMIT REACHED ({config.MAX_PROCESSING_TIME_HOURS} hours)")
                break

            # Memory check
            if chunk_idx % config.MEMORY_CHECK_INTERVAL == 0:
                memory_status = check_memory()
                if memory_status == "CRITICAL":
                    log("CRITICAL MEMORY USAGE - EMERGENCY SAVE AND CLEANUP")
                    self.save_intermediate_results(force_save=True)
                    break

            # Get session chunk
            start_idx = chunk_idx * self.chunk_size
            end_idx = min(start_idx + self.chunk_size, total_sessions)
            session_chunk = session_list[start_idx:end_idx]

            log(f"   Processing chunk {chunk_idx + 1}/{num_chunks} ({end_idx/total_sessions*100:.1f}%)")

            try:
                # Process chunk
                chunk_covisitations = self.process_session_chunk(session_chunk, click_data)
                self.merge_chunk_results(chunk_covisitations)

                # Update counters
                self.processed_sessions += len(session_chunk)
                self.processed_chunks += 1

                chunk_time = time.time() - chunk_start_time
                pairs_found = len(chunk_covisitations)
                current_memory = self.estimate_memory_usage()

                log(f"      Chunk completed: {pairs_found:,} pairs, {chunk_time:.1f}s, ~{current_memory:.0f}MB")

                # Memory management. A threshold-triggered flush must be forced:
                # without force_save the save is a no-op unless the chunk counter
                # happens to sit on the save interval.
                over_budget = current_memory > self.cleanup_threshold_mb
                on_interval = self.processed_chunks % self.save_interval_chunks == 0
                if over_budget or on_interval:
                    self.save_intermediate_results(force_save=over_budget)

                # Cleanup
                del chunk_covisitations
                gc.collect()

            except Exception as e:
                log(f"      Error processing chunk {chunk_idx + 1}: {e}")
                continue

        # Final processing - collect all intermediate results
        log("   Collecting and merging intermediate results...")
        final_matrix = self._merge_intermediate_results()

        total_time = time.time() - self.start_time
        log("Click-to-click matrix generation completed!")
        log(f"   Total time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
        log(f"   Processed sessions: {self.processed_sessions:,}")
        log(f"   Final matrix size: {len(final_matrix):,} source items")

        return final_matrix

    def _merge_intermediate_results(self) -> Dict:
        """
        Merge the in-memory accumulator and this run's intermediate files.

        Only files recorded in ``intermediate_files`` are merged (and deleted
        afterwards). Other files matching the temp-file pattern are reported and
        left untouched, because merging leftovers of an earlier, interrupted run
        would double-count their sessions.

        Returns:
            dict mapping each source item to its top ``(target_aid, count)`` list.
        """
        import glob

        log("   Merging intermediate results...")

        final_matrix = {}

        # Split the files on disk into those written by this run and leftovers
        temp_pattern = f"{config.OUTPUT_PATH}/click_matrix_temp_chunk_*.pkl"
        tracked = {os.path.abspath(path) for path in self.intermediate_files}
        temp_files = []
        stale_files = []
        for path in sorted(glob.glob(temp_pattern)):
            if os.path.abspath(path) in tracked:
                temp_files.append(path)
            else:
                stale_files.append(path)

        log(f"   Found {len(temp_files)} intermediate files to merge")
        if stale_files:
            stale_names = [os.path.basename(path) for path in stale_files]
            log(f"   WARNING: ignoring {len(stale_files)} intermediate file(s) not written by this run: {stale_names}")

        # Merge current memory state
        if self.covisitation_counts:
            log("   Adding current memory state to final matrix...")
            for source_aid, targets in self.covisitation_counts.items():
                if targets:
                    final_matrix[source_aid] = self._top_candidates(targets, config.MAX_CANDIDATES_PER_ITEM)

        # Merge intermediate files
        merged_paths = set()
        for temp_file in temp_files:
            try:
                log(f"   Merging {os.path.basename(temp_file)}...")
                # pickle is only safe here because the file was written by this
                # instance; never point this at files from an untrusted source.
                with open(temp_file, "rb") as f:
                    temp_data = pickle.load(f)
                    temp_matrix = temp_data.get("partial_matrix", {})

                # Merge into final matrix
                for source_aid, candidates in temp_matrix.items():
                    if source_aid in final_matrix:
                        # Sum counts of targets present in both, then re-rank
                        existing_dict = dict(final_matrix[source_aid])
                        for target_aid, score in candidates:
                            existing_dict[target_aid] = existing_dict.get(target_aid, 0) + score

                        final_matrix[source_aid] = self._top_candidates(existing_dict, config.MAX_CANDIDATES_PER_ITEM)
                    else:
                        final_matrix[source_aid] = candidates[:config.MAX_CANDIDATES_PER_ITEM]

                # Clean up temp file
                os.remove(temp_file)
                merged_paths.add(os.path.abspath(temp_file))
                log(f"   Cleaned up {os.path.basename(temp_file)}")

            except Exception as e:
                log(f"   Error processing {temp_file}: {e}")

        # Forget the files that were merged and deleted
        self.intermediate_files = [
            path for path in self.intermediate_files
            if os.path.abspath(path) not in merged_paths
        ]

        log(f"   Final matrix merged: {len(final_matrix):,} source items")
        return final_matrix

## MATRIX GENERATION EXECUTION

In [7]:
log("\nPreparing click data for matrix generation...")

# Baseline memory sample before the click subset is built
initial_memory = check_memory()
log(f"Initial memory status: {initial_memory}")

# Keep only click rows and the three columns the generator reads,
# ordered by session and then by timestamp
log("   Filtering and optimizing click data...")
click_data = (
    prepared_data
    .filter(pl.col("type") == "clicks")
    .select(["session", "aid", "ts"])
    .sort(["session", "ts"])
)
log(f"Click data prepared: {click_data.shape} ({click_data.estimated_size('mb'):.1f} MB)")

# The full frame is no longer needed: release it right away
del prepared_data
gc.collect()
log("   Original prepared data cleared from memory")

# Memory sample after releasing the full frame
post_cleanup_memory = check_memory()
log(f"Memory after cleanup: {post_cleanup_memory}")

# Build the generator with the chunk size resolved during input validation
chunk_size = validation_results["chunk_size"]
log("\nInitializing memory-efficient matrix generator...")
generator = ClickToClickMatrixGenerator(chunk_size)

# Last memory sample before the long-running generation starts
pre_generation_memory = check_memory()
log(f"Pre-generation memory status: {pre_generation_memory}")

# Set up emergency memory monitoring
def emergency_memory_check():
    """Return True (after logging an EMERGENCY line) when system memory usage exceeds 95%."""
    usage_pct = psutil.virtual_memory().percent
    if not usage_pct > 95:
        return False
    log(f"EMERGENCY: Memory usage at {usage_pct:.1f}%")
    return True

# Generate matrix with comprehensive error handling
generation_successful = False
click_to_click_matrix = {}

# Started before the try block so the summary below can always report elapsed time
generation_start = time.time()

try:
    log("Starting matrix generation with memory management...")

    # Generate matrix with automatic memory management
    click_to_click_matrix = generator.generate_matrix(click_data)

    if click_to_click_matrix:
        generation_successful = True
        log("Matrix generation completed successfully!")
        log(f"Generated matrix with {len(click_to_click_matrix):,} source items")
    else:
        log("Matrix generation completed but resulted in empty matrix")
        generation_successful = False

except MemoryError as e:
    log(f"MEMORY ERROR during matrix generation: {e}")
    log("Attempting to recover partial results...")
    generation_successful = False

    # Try to get partial results from intermediate saves.
    # Only the most recently created intermediate file is loaded, so the
    # recovered matrix covers just the chunks contained in that one file.
    try:
        import glob
        temp_files = glob.glob(f"{config.OUTPUT_PATH}/click_matrix_temp_chunk_*.pkl")
        if temp_files:
            log(f"Found {len(temp_files)} intermediate files - attempting partial recovery...")
            latest_file = max(temp_files, key=os.path.getctime)
            # pickle is acceptable here only because these files are written by this notebook
            with open(latest_file, "rb") as f:
                temp_data = pickle.load(f)
                click_to_click_matrix = temp_data.get("partial_matrix", {})
            log(f"Recovered partial matrix with {len(click_to_click_matrix):,} items from {os.path.basename(latest_file)}")
        else:
            click_to_click_matrix = {}
    except Exception as recovery_error:
        log(f"Recovery failed: {recovery_error}")
        click_to_click_matrix = {}

except Exception as e:
    log(f"Unexpected error during matrix generation: {e}")
    log("Attempting to save partial results...")
    generation_successful = False

    # Try to save current state
    try:
        if hasattr(generator, 'covisitation_counts') and generator.covisitation_counts:
            # Snapshot the in-memory state BEFORE saving: a successful
            # save_intermediate_results() clears covisitation_counts, so building
            # the matrix afterwards would always yield an empty result.
            current_matrix = {}
            for source_aid, targets in generator.covisitation_counts.items():
                if targets:
                    sorted_targets = sorted(targets.items(), key=lambda x: x[1], reverse=True)
                    current_matrix[source_aid] = sorted_targets[:config.MAX_CANDIDATES_PER_ITEM]

            generator.save_intermediate_results(force_save=True)
            click_to_click_matrix = current_matrix
            log(f"Saved partial results with {len(click_to_click_matrix):,} items")
        else:
            click_to_click_matrix = {}
    except Exception as save_error:
        log(f"Failed to save partial results: {save_error}")
        click_to_click_matrix = {}

finally:
    # Final cleanup: release the click frame and the generator's accumulator
    try:
        if 'click_data' in locals():
            del click_data
        if 'generator' in locals() and hasattr(generator, 'covisitation_counts'):
            generator.covisitation_counts.clear()
        gc.collect()
        log("Final cleanup completed")
    except Exception as cleanup_error:
        log(f"Cleanup error: {cleanup_error}")

# Post-generation memory check
post_generation_memory = check_memory()
log(f"Post-generation memory status: {post_generation_memory}")

# Generation results summary
generation_time = time.time() - generation_start
log("\nGeneration Results:")
log(f"  Success: {generation_successful}")
log(f"  Matrix size: {len(click_to_click_matrix):,} source items")
log(f"  Generation time: {generation_time:.1f} seconds ({generation_time/60:.1f} minutes)")
log(f"  Memory status: {post_generation_memory}")

# Final validation
if click_to_click_matrix:
    total_pairs = sum(len(candidates) for candidates in click_to_click_matrix.values())
    log(f"  Total pairs: {total_pairs:,}")
    log(f"  Avg candidates per item: {total_pairs/len(click_to_click_matrix):.1f}")
else:
    log("  WARNING: No matrix generated - check logs for errors")
    total_pairs = 0

[2025-08-07 16:13:39] 
Preparing click data for matrix generation...
[2025-08-07 16:13:39] Initial memory status: LOW
[2025-08-07 16:13:39]    Filtering and optimizing click data...
[2025-08-07 16:13:50] Click data prepared: (194625054, 3) (2969.7 MB)
[2025-08-07 16:13:50]    Original prepared data cleared from memory
[2025-08-07 16:13:50] Memory after cleanup: LOW
[2025-08-07 16:13:50] 
Initializing memory-efficient matrix generator...
[2025-08-07 16:13:50]     Initializing memory-efficient click-to-click matrix generator...
[2025-08-07 16:13:50]    Chunk size: 50,000 sessions
[2025-08-07 16:13:50]    Max candidates per item: 40
[2025-08-07 16:13:50]    Time window: 24 hours
[2025-08-07 16:13:50]    Memory management: 5,000,000 max pairs, cleanup every 25 chunks
[2025-08-07 16:13:50] Pre-generation memory status: LOW
[2025-08-07 16:13:50] Starting matrix generation with memory management...
[2025-08-07 16:13:50] Starting memory-efficient click-to-click matrix generation...
[2025-08-07

## MATRIX ANALYSIS AND VALIDATION

In [8]:
def analyze_click_matrix(matrix: Dict, memory_log: List, generation_successful: bool = True) -> Dict:
    """
    Analyze the generated click-to-click matrix with robust error handling.

    Every statistic in the returned dict is converted to a native Python
    number/bool so the result can be passed to ``json.dump`` directly.

    Args:
        matrix: Generated co-visitation matrix. Each value must support ``len()``;
            score statistics and sample relationships are only computed for
            values that are lists of ``(aid, score)`` pairs.
        memory_log: Memory usage log during generation. Only entries containing
            a ``"memory_percent"`` key contribute to the memory statistics.
        generation_successful: Whether generation completed successfully

    Returns:
        dict: Matrix analysis results. On success it contains
        ``analysis_timestamp``, ``generation_successful``, ``matrix_size``,
        ``candidate_count_stats``, ``score_distribution``, ``memory_usage``,
        ``sample_relationships``, ``quality_metrics`` and ``overall_quality``.
        For an empty matrix, or when the analysis itself fails, it contains only
        ``error``, ``generation_successful`` and ``analysis_timestamp``.
    """
    # Accept NumPy scalar scores too: np.int64 is NOT a subclass of int, so a
    # plain (int, float) check would silently drop every score of such a matrix.
    numeric_types = (int, float, np.integer, np.floating)

    def _to_native(value):
        """Return the plain Python equivalent of a NumPy scalar (other values unchanged)."""
        return value.item() if isinstance(value, np.generic) else value

    log("Analyzing generated click-to-click matrix...")

    if not matrix:
        log("Empty or no matrix - providing basic analysis")
        return {
            "error": "Empty matrix",
            "generation_successful": generation_successful,
            "analysis_timestamp": datetime.now().isoformat()
        }

    # Basic statistics with error handling
    try:
        source_items = len(matrix)
        candidate_counts = [len(candidates) for candidates in matrix.values()]
        total_pairs = sum(candidate_counts)
        avg_candidates = total_pairs / source_items if source_items > 0 else 0

        log(f"    Basic statistics:")
        log(f"    Source items: {source_items:,}")
        log(f"    Total pairs: {total_pairs:,}")
        log(f"    Avg candidates per item: {avg_candidates:.1f}")

        # Candidate count distribution
        count_stats = {}
        if candidate_counts:
            count_stats = {
                "min": min(candidate_counts),
                "max": max(candidate_counts),
                "mean": float(np.mean(candidate_counts)),
                "median": float(np.median(candidate_counts)),
                "std": float(np.std(candidate_counts))
            }

            log(f"      Candidate count distribution:")
            log(f"      Min: {count_stats['min']}, Max: {count_stats['max']}")
            log(f"      Mean: {count_stats['mean']:.1f}, Median: {count_stats['median']:.1f}")

        # Score distribution (sample to avoid memory issues).
        # The sample is the first `sample_size` keys in iteration order, not a random draw.
        all_scores = []
        sample_size = min(100, len(matrix))  # Smaller sample to save memory
        sample_items = list(matrix.keys())[:sample_size]

        score_stats = {}
        try:
            for item in sample_items:
                if isinstance(matrix[item], list):
                    # Handle list of tuples format [(aid, score), ...]
                    scores = [score for _, score in matrix[item] if isinstance(score, numeric_types)]
                    all_scores.extend(scores)

            if all_scores:
                score_stats = {
                    "min": _to_native(min(all_scores)),
                    "max": _to_native(max(all_scores)),
                    "mean": float(np.mean(all_scores)),
                    "median": float(np.median(all_scores))
                }

                log(f"      Co-visitation score distribution (sample of {sample_size} items):")
                log(f"      Min: {score_stats['min']}, Max: {score_stats['max']}")
                log(f"      Mean: {score_stats['mean']:.1f}, Median: {score_stats['median']:.1f}")

        except Exception as score_error:
            log(f"      Error analyzing scores: {score_error}")
            score_stats = {"error": str(score_error)}

        # Memory usage analysis
        memory_analysis = {}
        if memory_log:
            try:
                memory_usage = [entry["memory_percent"] for entry in memory_log if "memory_percent" in entry]
                if memory_usage:
                    memory_analysis = {
                        "peak_memory_percent": _to_native(max(memory_usage)),
                        "avg_memory_percent": float(np.mean(memory_usage)),
                        "memory_checks": len(memory_log),
                        "critical_memory_events": sum(1 for usage in memory_usage if usage > 90)
                    }

                    log(f"      Memory usage during generation:")
                    log(f"      Peak usage: {memory_analysis['peak_memory_percent']:.1f}%")
                    log(f"      Average usage: {memory_analysis['avg_memory_percent']:.1f}%")
                    log(f"      Critical events: {memory_analysis['critical_memory_events']}")
            except Exception as memory_error:
                log(f"      Error analyzing memory usage: {memory_error}")
                memory_analysis = {"error": str(memory_error)}

        # Sample relationships for validation (smaller sample)
        sample_relationships = {}
        sample_count = min(5, len(matrix))  # Even smaller sample

        if sample_count > 0:
            try:
                sample_items = list(matrix.keys())[:sample_count]
                for item in sample_items:
                    if isinstance(matrix[item], list):
                        # Take top 3 candidates only for display
                        top_candidates = matrix[item][:3]
                        # Keys are stringified so the mapping is JSON-friendly
                        sample_relationships[str(item)] = top_candidates

                log(f"   Sample relationships:")
                for item, candidates in list(sample_relationships.items())[:3]:
                    if candidates:
                        candidate_str = ", ".join([f"{aid}({score})" for aid, score in candidates])
                        log(f"      Item {item} → {candidate_str}")
            except Exception as sample_error:
                log(f"      Error creating sample relationships: {sample_error}")
                sample_relationships = {"error": str(sample_error)}

        # Quality assessment: four boolean criteria, one point each
        quality_metrics = {
            "has_data": source_items > 0,
            "reasonable_size": source_items > 100,
            "good_coverage": total_pairs > 1000,
            # bool() keeps this a native bool rather than a NumPy comparison result
            "balanced_candidates": bool(count_stats.get("mean", 0) > 1)
        }

        overall_quality = sum(quality_metrics.values())
        quality_level = "EXCELLENT" if overall_quality >= 4 else "GOOD" if overall_quality >= 3 else "FAIR" if overall_quality >= 2 else "POOR"

        log(f"   Quality Assessment: {quality_level} ({overall_quality}/4 criteria met)")

        analysis_results = {
            "analysis_timestamp": datetime.now().isoformat(),
            "generation_successful": generation_successful,
            "matrix_size": {
                "source_items": source_items,
                "total_pairs": total_pairs,
                "avg_candidates_per_item": avg_candidates
            },
            "candidate_count_stats": count_stats,
            "score_distribution": score_stats,
            "memory_usage": memory_analysis,
            "sample_relationships": sample_relationships,
            "quality_metrics": quality_metrics,
            "overall_quality": quality_level
        }

        log("Matrix analysis completed!")
        return analysis_results

    except Exception as e:
        # Any unexpected failure degrades to a minimal result instead of aborting the notebook
        log(f"Error during matrix analysis: {e}")
        return {
            "error": str(e),
            "generation_successful": generation_successful,
            "analysis_timestamp": datetime.now().isoformat()
        }

# Run the matrix analysis; if it fails outright, substitute a minimal result
# so the later cells still find the keys they read.
try:
    matrix_analysis = analyze_click_matrix(
        matrix=click_to_click_matrix,
        memory_log=memory_log,
        generation_successful=generation_successful,
    )
except Exception as fatal_error:
    log(f"Critical error in matrix analysis: {fatal_error}")
    matrix_analysis = dict(
        error=str(fatal_error),
        generation_successful=generation_successful,
        analysis_timestamp=datetime.now().isoformat(),
        matrix_size=dict(
            # A missing/empty matrix counts as zero source items
            source_items=len(click_to_click_matrix) if click_to_click_matrix else 0,
            total_pairs=0,
            avg_candidates_per_item=0,
        ),
    )

[2025-08-07 17:20:11] Analyzing generated click-to-click matrix...
[2025-08-07 17:20:11]     Basic statistics:
[2025-08-07 17:20:11]     Source items: 1,839,483
[2025-08-07 17:20:11]     Total pairs: 63,503,324
[2025-08-07 17:20:11]     Avg candidates per item: 34.5
[2025-08-07 17:20:12]       Candidate count distribution:
[2025-08-07 17:20:12]       Min: 1, Max: 40
[2025-08-07 17:20:12]       Mean: 34.5, Median: 40.0
[2025-08-07 17:20:13]       Co-visitation score distribution (sample of 100 items):
[2025-08-07 17:20:13]       Min: 1, Max: 22925
[2025-08-07 17:20:13]       Mean: 245.2, Median: 24.0
[2025-08-07 17:20:13]       Memory usage during generation:
[2025-08-07 17:20:13]       Peak usage: 31.4%
[2025-08-07 17:20:13]       Average usage: 18.1%
[2025-08-07 17:20:13]       Critical events: 0
[2025-08-07 17:20:13]    Sample relationships:
[2025-08-07 17:20:13]       Item 1506112 → 369364(52), 434110(40), 1205660(27)
[2025-08-07 17:20:13]       Item 111945 → 469872(179), 324152(88)

## SAVE OUTPUTS

In [10]:
## SAVE OUTPUTS (Fixed JSON Serialization)

def convert_to_json_serializable(obj):
    """
    Convert non-JSON serializable objects to JSON-compatible types.

    Containers are walked recursively. NumPy arrays become nested lists, NumPy
    scalars become native Python numbers/bools, and tuples become lists.
    Dictionary keys that are NumPy scalars are converted as well, because
    ``json.dump`` rejects such keys and its ``default=`` hook is never applied
    to keys.

    Args:
        obj: Object to convert

    Returns:
        JSON-serializable version of the object. Objects of unknown type are
        returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.bool_):
        return bool(obj)
    elif isinstance(obj, dict):
        # np.int64 / np.bool_ keys would make json.dump raise TypeError, so unwrap them.
        return {
            (key.item() if isinstance(key, np.generic) else key): convert_to_json_serializable(value)
            for key, value in obj.items()
        }
    elif isinstance(obj, (list, tuple)):
        return [convert_to_json_serializable(item) for item in obj]
    elif hasattr(obj, 'item'):  # Handle remaining numpy-like scalars
        try:
            return obj.item()
        except (TypeError, ValueError):
            # .item() is only defined for single-element objects; leave anything
            # else untouched so the caller's json `default=` hook can handle it.
            return obj
    else:
        return obj

def save_click_matrix_outputs(matrix: Dict,
                             analysis: Dict,
                             memory_log: List,
                             validation_results: Dict):
    """
    Save all outputs from click-to-click matrix generation with proper JSON handling.

    Writes five files under ``config.OUTPUT_PATH``: the pickled matrix, the
    analysis statistics, a sample of relationships, the memory log and a
    summary. If any step fails, a fallback makes sure the matrix pickle exists
    (when the matrix is non-empty) and writes a plain-text summary instead.

    Args:
        matrix: Generated co-visitation matrix
        analysis: Matrix analysis results
        memory_log: Memory usage log
        validation_results: Input validation results; only ``chunk_size`` is read.
            ``None`` or a missing ``chunk_size`` is reported as "unknown".

    Returns:
        dict: On success, the keys ``matrix_path``, ``stats_path``,
        ``samples_path``, ``memory_path`` and ``summary_path``. On fallback,
        ``summary_txt_path`` plus ``matrix_path`` when the matrix is non-empty.

    Raises:
        Exception: The original error, if the fallback save fails as well.
    """
    log("Saving click-to-click matrix outputs...")

    # Bound before the try block so the fallback path can always read them.
    matrix_path = f"{config.OUTPUT_PATH}/click_to_click_matrix.pkl"
    file_size = None
    matrix_saved = False

    # The thousands separator is only valid for numbers; formatting the
    # 'unknown' placeholder with ':,' would raise ValueError.
    chunk_size = (validation_results or {}).get('chunk_size', 'unknown')
    if isinstance(chunk_size, (int, float, np.integer, np.floating)):
        chunk_size_label = f"{chunk_size:,}"
    else:
        chunk_size_label = str(chunk_size)

    try:
        # 1. Save click-to-click matrix (main output)
        with open(matrix_path, "wb") as f:
            pickle.dump(matrix, f)
        matrix_saved = True

        file_size = os.path.getsize(matrix_path) / (1024*1024)
        log(f"   click_to_click_matrix.pkl saved ({file_size:.1f} MB)")

        # 2. Save matrix statistics (with JSON serialization fix)
        stats_path = f"{config.OUTPUT_PATH}/click_matrix_statistics.json"

        # Convert analysis to JSON-serializable format
        json_safe_analysis = convert_to_json_serializable(analysis)

        with open(stats_path, "w") as f:
            json.dump(json_safe_analysis, f, indent=2, default=str)
        log(f"   click_matrix_statistics.json saved")

        # 3. Save sample relationships for validation
        samples_path = f"{config.OUTPUT_PATH}/click_matrix_samples.json"

        # Prepare sample data with JSON conversion
        sample_data = {
            "generation_timestamp": datetime.now().isoformat(),
            "sample_relationships": convert_to_json_serializable(analysis.get("sample_relationships", {})),
            "matrix_size": convert_to_json_serializable(analysis.get("matrix_size", {})),
            # First 20 keys in iteration order
            "top_source_items": [int(item) for item in list(matrix.keys())[:20]] if matrix else []
        }

        with open(samples_path, "w") as f:
            json.dump(sample_data, f, indent=2, default=str)
        log(f"   click_matrix_samples.json saved")

        # 4. Save memory usage log (with JSON conversion)
        memory_path = f"{config.OUTPUT_PATH}/memory_usage_log.json"

        # Convert memory log and analysis to JSON-safe format
        json_safe_memory_log = convert_to_json_serializable(memory_log)
        json_safe_memory_analysis = convert_to_json_serializable(analysis.get("memory_usage", {}))

        memory_data = {
            "generation_timestamp": datetime.now().isoformat(),
            "memory_log": json_safe_memory_log,
            "memory_analysis": json_safe_memory_analysis,
            "generation_successful": bool(analysis.get("generation_successful", False))
        }

        with open(memory_path, "w") as f:
            json.dump(memory_data, f, indent=2, default=str)
        log(f"   memory_usage_log.json saved")

        # 5. Save comprehensive summary (with all JSON conversions)
        summary = {
            "notebook": "Part 2A2: Click-to-Click Matrix Generation",
            "completion_timestamp": datetime.now().isoformat(),
            "generation_successful": bool(analysis.get("generation_successful", False)),
            "inputs_used": {
                "covisit_data_prepared.parquet": "Click events from prepared data",
                "chunking_strategy.json": f"Chunk size: {chunk_size_label}",
                "session_analysis.json": "Session insights for optimization"
            },
            "outputs_generated": {
                "click_to_click_matrix.pkl": f"{analysis.get('matrix_size', {}).get('source_items', 0):,} source items",
                "click_matrix_statistics.json": "Comprehensive matrix analysis",
                "click_matrix_samples.json": "Sample relationships for validation",
                "memory_usage_log.json": "Memory usage tracking during generation"
            },
            "key_metrics": convert_to_json_serializable({
                "source_items": analysis.get("matrix_size", {}).get("source_items", 0),
                "total_pairs": analysis.get("matrix_size", {}).get("total_pairs", 0),
                "peak_memory_percent": analysis.get("memory_usage", {}).get("peak_memory_percent", 0),
                "matrix_file_size_mb": float(file_size),
                "overall_quality": analysis.get("overall_quality", "UNKNOWN")
            }),
            "quality_assessment": convert_to_json_serializable(analysis.get("quality_metrics", {})),
            "next_step": "Run Part 2A3: Click-to-Buy & Buy-to-Buy Matrix Generation" if analysis.get("generation_successful", False) else "Review errors and retry"
        }

        summary_path = f"{config.OUTPUT_PATH}/part_2a2_summary.json"
        with open(summary_path, "w") as f:
            json.dump(summary, f, indent=2, default=str)
        log(f"   part_2a2_summary.json saved")

        log("All click matrix outputs saved successfully!")

        return {
            "matrix_path": matrix_path,
            "stats_path": stats_path,
            "samples_path": samples_path,
            "memory_path": memory_path,
            "summary_path": summary_path
        }

    except Exception as e:
        log(f"Error saving outputs: {e}")
        log(f"Error type: {type(e).__name__}")
        log(f"Attempting to save with fallback methods...")

        # Fallback: try to save essential files only
        try:
            fallback_paths = {}

            # Save matrix (most important)
            if matrix:
                if matrix_saved:
                    # The pickle was already written in full above; rewriting a
                    # large file again would only cost time.
                    log(f"   Fallback: click_to_click_matrix.pkl already saved")
                else:
                    with open(matrix_path, "wb") as f:
                        pickle.dump(matrix, f)
                    log(f"   Fallback: click_to_click_matrix.pkl saved")
                fallback_paths["matrix_path"] = matrix_path

            # The size is unknown when the failure happened before it was measured
            if file_size is None and os.path.exists(matrix_path):
                file_size = os.path.getsize(matrix_path) / (1024*1024)
            file_size_text = f"{file_size:.1f} MB" if file_size is not None else "unknown"

            # Save basic summary as text (avoiding JSON issues)
            summary_txt_path = f"{config.OUTPUT_PATH}/part_2a2_summary.txt"
            with open(summary_txt_path, "w") as f:
                f.write(f"Part 2A2 Summary\n")
                f.write(f"Generated: {datetime.now().isoformat()}\n")
                f.write(f"Generation Successful: {analysis.get('generation_successful', False)}\n")
                f.write(f"Source Items: {analysis.get('matrix_size', {}).get('source_items', 0)}\n")
                f.write(f"Total Pairs: {analysis.get('matrix_size', {}).get('total_pairs', 0)}\n")
                f.write(f"Matrix File Size: {file_size_text}\n")
            fallback_paths["summary_txt_path"] = summary_txt_path
            log(f"   Fallback: part_2a2_summary.txt saved")

            return fallback_paths

        except Exception as fallback_error:
            log(f"Fallback save also failed: {fallback_error}")
            # Surface the original failure, not the secondary one
            raise e

# Persist every artifact; if saving fails outright, fall back to the default
# file locations so the summary cell can still look them up.
try:
    output_paths = save_click_matrix_outputs(
        click_to_click_matrix,
        matrix_analysis,
        memory_log,
        validation_results,
    )
    log("Output saving completed successfully!")
except Exception as save_error:
    log(f"Critical error saving outputs: {save_error}")
    # Create minimal output paths for summary section
    output_paths = {
        path_key: f"{config.OUTPUT_PATH}/{file_name}"
        for path_key, file_name in (
            ("matrix_path", "click_to_click_matrix.pkl"),
            ("stats_path", "click_matrix_statistics.json"),
            ("samples_path", "click_matrix_samples.json"),
            ("memory_path", "memory_usage_log.json"),
            ("summary_path", "part_2a2_summary.json"),
        )
    }

[2025-08-07 17:24:36] Saving click-to-click matrix outputs...
[2025-08-07 17:25:34]    click_to_click_matrix.pkl saved (556.6 MB)
[2025-08-07 17:25:34]    click_matrix_statistics.json saved
[2025-08-07 17:25:35]    click_matrix_samples.json saved
[2025-08-07 17:25:35]    memory_usage_log.json saved
[2025-08-07 17:25:35]    part_2a2_summary.json saved
[2025-08-07 17:25:35] All click matrix outputs saved successfully!
[2025-08-07 17:25:35] Output saving completed successfully!


## FINAL SUMMARY AND NEXT STEPS

In [11]:
## FINAL SUMMARY AND NEXT STEPS (Robust Version)

log("\n" + "="*80)
log("PART 2A2 COMPLETED: CLICK-TO-CLICK MATRIX GENERATION")
log("="*80)

# Generation status
if generation_successful and matrix_analysis.get("matrix_size"):
    log(f"\n MATRIX GENERATION SUCCESSFUL")
    matrix_size = matrix_analysis["matrix_size"]
    log(f"Source items: {matrix_size['source_items']:,}")
    log(f"Total pairs: {matrix_size['total_pairs']:,}")
    log(f"Avg candidates per item: {matrix_size['avg_candidates_per_item']:.1f}")

    # output_paths may hold default locations that were never written (save
    # failure), so confirm the file exists before asking for its size.
    if ('output_paths' in locals() and 'matrix_path' in output_paths
            and os.path.exists(output_paths['matrix_path'])):
        matrix_file_size = os.path.getsize(output_paths['matrix_path']) / (1024*1024)
        log(f"Matrix file size: {matrix_file_size:.1f} MB")
else:
    log(f"\n  MATRIX GENERATION INCOMPLETE")
    if click_to_click_matrix:
        log(f"Partial results: {len(click_to_click_matrix):,} source items")
        total_partial_pairs = sum(len(candidates) for candidates in click_to_click_matrix.values())
        log(f"Partial pairs: {total_partial_pairs:,}")
    else:
        log(f"No matrix data generated")

# Memory usage summary
if matrix_analysis.get("memory_usage"):
    memory_stats = matrix_analysis["memory_usage"]
    log(f"\n MEMORY USAGE SUMMARY:")
    if "error" not in memory_stats:
        log(f"Peak memory usage: {memory_stats.get('peak_memory_percent', 0):.1f}%")
        log(f"Average memory usage: {memory_stats.get('avg_memory_percent', 0):.1f}%")
        log(f"Critical memory events: {memory_stats.get('critical_memory_events', 0)}")

        if memory_stats.get('critical_memory_events', 0) > 0:
            log(f"  Memory constraints detected - consider using smaller chunks or more selective filtering")
    else:
        log(f"Memory analysis error: {memory_stats['error']}")

# Quality assessment
if matrix_analysis.get("quality_metrics"):
    quality = matrix_analysis.get("overall_quality", "UNKNOWN")
    log(f"\n QUALITY ASSESSMENT: {quality}")

    metrics = matrix_analysis["quality_metrics"]
    log(f"Has data: {'yes' if metrics.get('has_data') else 'no'}")
    log(f"Reasonable size (>100 items): {'yes' if metrics.get('reasonable_size') else 'no'}")
    log(f"Good coverage (>1k pairs): {'yes' if metrics.get('good_coverage') else 'no'}")
    log(f"Balanced candidates: {'yes' if metrics.get('balanced_candidates') else 'no'}")

# Output files
log(f"\n OUTPUT FILES GENERATED:")
try:
    if 'output_paths' in locals():
        for description, path in output_paths.items():
            filename = os.path.basename(path)
            if os.path.exists(path):
                # Only the pickle is large enough for its size to be worth reporting
                if path.endswith('.pkl'):
                    file_size = os.path.getsize(path) / (1024*1024)
                    log(f"    {filename} ({file_size:.1f} MB)")
                else:
                    log(f"    {filename}")
            else:
                log(f"    {filename} (not found)")
        log(f"Files location: {config.OUTPUT_PATH}")
    else:
        log("   No output paths available - files may not have been saved")
except Exception as file_error:
    log(f"   Error checking output files: {file_error}")

# Recommendations based on results
log(f"\n🎯 RECOMMENDATIONS:")

if generation_successful:
    log(f"    Matrix generation completed successfully")
    log(f"     Proceed to Part 2A3: Click-to-Buy & Buy-to-Buy Matrix Generation")
    log(f"    Matrix quality is {matrix_analysis.get('overall_quality', 'UNKNOWN')} - suitable for recommendations")
else:
    # The ',' format option is only valid for numbers; applying it to the
    # 'unknown' placeholder would raise ValueError on exactly this failure path.
    chunk_size_value = validation_results.get('chunk_size', 'unknown')
    if isinstance(chunk_size_value, (int, float, np.integer, np.floating)):
        chunk_size_label = f"{chunk_size_value:,}"
    else:
        chunk_size_label = str(chunk_size_value)

    log(f"     Matrix generation incomplete due to memory constraints")
    log(f"    TROUBLESHOOTING OPTIONS:")
    log(f"      1. Reduce chunk size (currently {chunk_size_label})")
    # Fewer stored candidates per item means less memory, so the remedy is to reduce the cap
    log(f"      2. Reduce MAX_CANDIDATES_PER_ITEM limit (currently {config.MAX_CANDIDATES_PER_ITEM})")
    log(f"      3. Use more aggressive session filtering (skip sessions >50 items)")
    log(f"      4. Process in smaller time windows")

    if click_to_click_matrix:
        log(f"     Partial results available - consider using for testing")
        log(f"     Can proceed with caution to Part 2A3 using partial data")

# Memory optimization suggestions
memory_peak = matrix_analysis.get("memory_usage", {}).get("peak_memory_percent", 0)
if memory_peak > 90:
    log(f"    MEMORY OPTIMIZATION SUGGESTIONS:")
    log(f"      - Peak memory usage was {memory_peak:.1f}% - very close to limit")
    log(f"      - Consider reducing chunk_size from {validation_results.get('chunk_size', 50000):,} to 25,000")
    log(f"      - Implement more aggressive session filtering")
    log(f"      - Use shorter time windows for co-visitation")

# Performance metrics
try:
    if 'generation_time' in locals() and generation_time > 0:
        sessions_per_second = validation_results.get("processed_sessions", 0) / generation_time
        log(f"\n PERFORMANCE METRICS:")
        log(f"   Processing speed: {sessions_per_second:.1f} sessions/second")
        log(f"   Total runtime: {generation_time/60:.1f} minutes")

        if sessions_per_second < 100:
            log(f"    Processing speed is slow - consider optimizing session filtering")
except Exception as performance_error:
    # Metrics are best-effort, but a failure is reported rather than silently dropped
    log(f"   Could not compute performance metrics: {performance_error}")

# Emergency cleanup
log(f"\n PERFORMING FINAL CLEANUP...")
try:
    # Clean up any remaining variables
    if 'click_to_click_matrix' in locals():
        matrix_items = len(click_to_click_matrix)
        del click_to_click_matrix
        log(f"   Cleared matrix from memory ({matrix_items:,} items)")

    if 'generator' in locals():
        if hasattr(generator, 'covisitation_counts'):
            generator.covisitation_counts.clear()
        del generator
        log(f"   Cleared generator from memory")

    # Force aggressive garbage collection
    for i in range(3):
        gc.collect()

    final_memory_status = check_memory()
    log(f"   Final memory status: {final_memory_status}")

except Exception as cleanup_error:
    log(f"   Cleanup error: {cleanup_error}")

log(f"\n Part 2A2 processing completed!")
log(f"Check all output files in: {config.OUTPUT_PATH}")

# Save execution summary
try:
    execution_summary = {
        "notebook": "Part 2A2: Click-to-Click Matrix Generation",
        "completion_timestamp": datetime.now().isoformat(),
        "generation_successful": generation_successful,
        "matrix_quality": matrix_analysis.get("overall_quality", "UNKNOWN"),
        "peak_memory_percent": matrix_analysis.get("memory_usage", {}).get("peak_memory_percent", 0),
        "source_items_generated": matrix_analysis.get("matrix_size", {}).get("source_items", 0),
        "total_pairs_generated": matrix_analysis.get("matrix_size", {}).get("total_pairs", 0),
        "processing_time_minutes": locals().get('generation_time', 0) / 60,
        "recommendations": "Proceed to Part 2A3" if generation_successful else "Review and optimize parameters"
    }

    # default=str matches the other JSON dumps in this notebook, so a stray
    # non-native value does not cost us the whole summary file.
    with open(f"{config.OUTPUT_PATH}/part_2a2_execution_summary.json", "w") as f:
        json.dump(execution_summary, f, indent=2, default=str)

    log(f"Execution summary saved: part_2a2_execution_summary.json")

except Exception as summary_error:
    log(f"Could not save execution summary: {summary_error}")

[2025-08-07 17:25:40] 
[2025-08-07 17:25:40] PART 2A2 COMPLETED: CLICK-TO-CLICK MATRIX GENERATION
[2025-08-07 17:25:40] 
 MATRIX GENERATION SUCCESSFUL
[2025-08-07 17:25:40] Source items: 1,839,483
[2025-08-07 17:25:40] Total pairs: 63,503,324
[2025-08-07 17:25:40] Avg candidates per item: 34.5
[2025-08-07 17:25:40] Matrix file size: 556.6 MB
[2025-08-07 17:25:40] 
 MEMORY USAGE SUMMARY:
[2025-08-07 17:25:40] Peak memory usage: 31.4%
[2025-08-07 17:25:40] Average memory usage: 18.1%
[2025-08-07 17:25:40] Critical memory events: 0
[2025-08-07 17:25:40] 
 QUALITY ASSESSMENT: EXCELLENT
[2025-08-07 17:25:40] Has data: yes
[2025-08-07 17:25:40] Reasonable size (>100 items): yes
[2025-08-07 17:25:40] Good coverage (>1k pairs): yes
[2025-08-07 17:25:40] Balanced candidates: yes
[2025-08-07 17:25:40] 
 OUTPUT FILES GENERATED:
[2025-08-07 17:25:40]     click_to_click_matrix.pkl (556.6 MB)
[2025-08-07 17:25:40]     click_matrix_statistics.json
[2025-08-07 17:25:40]     click_matrix_samples.json
[