# Part 2A4 Matrix Consolidation & Validation

In [1]:
# Install required packages
!pip install polars==0.20.31
!pip install psutil

import polars as pl
import pandas as pd
import numpy as np
import gc
import os
import pickle
import json
import time
import psutil
from typing import Dict, List, Tuple, Optional, Any
from datetime import datetime
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

Collecting polars==0.20.31
  Downloading polars-0.20.31-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Downloading polars-0.20.31-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (28.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.8/28.8 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: polars
  Attempting uninstall: polars
    Found existing installation: polars 1.25.2
    Uninstalling polars-1.25.2:
      Successfully uninstalled polars-1.25.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-polars-cu12 25.6.0 requires polars<1.29,>=1.25, but you have polars 0.20.31 which is incompatible.[0m[31m
[0mSuccessfully installed polars-0.20.31


In [2]:
# Mount Google Drive
# The DATA_PATH / OUTPUT_PATH values in the Config class below are located under
# this mount point ('/content/drive'), so this cell has to run before any file access.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Configuration
class Config:
    """Central place for the paths and tunable thresholds used in this notebook."""

    # --- Locations (both under the mounted Google Drive) ---
    DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-data'
    OUTPUT_PATH = '/content/drive/MyDrive/Colab Notebooks/CML/Assignment 1/content/otto-output'

    # --- Consolidation parameters ---
    # Minimum candidates required per matrix type
    MIN_CANDIDATES_PER_MATRIX = 5
    # Maximum candidates to keep per source item
    MAX_CANDIDATES_PER_ITEM = 40
    # Minimum coverage threshold for quality assessment
    QUALITY_THRESHOLD_COVERAGE = 0.1

    # --- Validation parameters ---
    # Number of items to sample for detailed validation
    SAMPLE_SIZE_FOR_VALIDATION = 1_000
    # Sample size for cross-matrix validation
    CROSS_VALIDATION_SAMPLE = 500


config = Config()

## LOGGING SETUP

In [4]:
def setup_logging():
    """
    Build the two logging helpers used throughout the notebook.

    Returns:
        tuple: (log_message, check_memory_usage)
            log_message(message) prints a timestamped line and appends it to
            matrix_consolidation_log.txt under config.OUTPUT_PATH.
            check_memory_usage() returns the current RAM usage percentage and
            logs a line only when usage is above 75%.
    """
    log_path = f"{config.OUTPUT_PATH}/matrix_consolidation_log.txt"

    def log_message(message):
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        line = f"[{stamp}] {message}"

        # Echo to the cell output ...
        print(line)

        # ... and persist to the log file (append mode, reopened on every call)
        with open(log_path, "a") as handle:
            handle.write(line + "\n")

    def check_memory_usage():
        """Return the RAM usage percentage, logging it when it exceeds 75%."""
        vm = psutil.virtual_memory()
        used_pct = vm.percent

        if used_pct > 75:
            free_gb = vm.available / (1024**3)
            log_message(f"Memory usage: {used_pct:.1f}% used, {free_gb:.1f} GB available")

        return used_pct

    return log_message, check_memory_usage

# Create the logger pair once; `log` and `check_memory` are the module-level
# helpers that the functions and classes in the following cells call directly.
log, check_memory = setup_logging()

# Start-of-run banner (each call prints a line and appends it to the log file)
log("="*80)
log("OTTO PART 2A4: MATRIX CONSOLIDATION & VALIDATION STARTED")
log("="*80)

[2025-08-07 18:53:48] OTTO PART 2A4: MATRIX CONSOLIDATION & VALIDATION STARTED


## INPUT VALIDATION AND LOADING

In [5]:
def validate_input_files():
    """
    Check that the artefacts produced by the earlier notebooks are present.

    Every expected file is looked up under config.OUTPUT_PATH. Required and
    optional files are reported separately; sizes are recorded in MB.

    Returns:
        dict: keys "required_files", "optional_files" (per-file dicts with
        "exists", "size_mb", "description"), "missing_required",
        "missing_optional" (lists of filenames) and "total_input_size_mb".

    Raises:
        FileNotFoundError: if at least one required file is missing.
    """
    log("Validating input files from previous notebooks...")

    # Required input files with their sources
    required_files = {
        "click_to_click_matrix.pkl": "Click-to-click co-visitation matrix from Part 2A2",
        "click_to_buy_matrix.pkl": "Click-to-buy co-visitation matrix from Part 2A3",
        "buy_to_buy_matrix.pkl": "Buy-to-buy co-visitation matrix from Part 2A3",
        "covisit_data_prepared.parquet": "Optimized training data from Part 2A1",
        "session_analysis.json": "Session analysis results from Part 2A1"
    }

    # Optional files (may not exist depending on previous runs)
    optional_files = {
        "click_matrix_statistics.json": "Click matrix statistics from Part 2A2",
        "buy_matrices_statistics.json": "Buy matrices statistics from Part 2A3",
        "item_stats.parquet": "Item statistics from Part 1"
    }

    validation_results = {
        "required_files": {},
        "optional_files": {},
        "missing_required": [],
        "missing_optional": [],
        "total_input_size_mb": 0
    }

    def scan(catalog, found_key, missing_key, missing_note):
        """Record presence and size of every file in `catalog` into validation_results."""
        for name, about in catalog.items():
            path = f"{config.OUTPUT_PATH}/{name}"
            present = os.path.exists(path)
            size_mb = os.path.getsize(path) / (1024*1024) if present else 0

            validation_results[found_key][name] = {
                "exists": present,
                "size_mb": size_mb,
                "description": about
            }

            if present:
                validation_results["total_input_size_mb"] += size_mb
                log(f"       {name} - {size_mb:.1f} MB")
            else:
                validation_results[missing_key].append(name)
                log(f"       {name} - {missing_note}")

    log("   Checking required files:")
    scan(required_files, "required_files", "missing_required", "MISSING")

    log("   Checking optional files:")
    scan(optional_files, "optional_files", "missing_optional", "optional, not found")

    # Validation summary: any missing required file aborts the run
    absent = validation_results["missing_required"]
    if len(absent) > 0:
        log(f"   ERROR: {len(absent)} required files are missing!")
        for missing_file in absent:
            log(f"       {missing_file}")
        log("   Please run the previous notebooks (Part 2A1, 2A2, 2A3) to generate required files.")
        raise FileNotFoundError("Required input files are missing!")

    log(f"   SUCCESS: All required files found (Total size: {validation_results['total_input_size_mb']:.1f} MB)")

    return validation_results

def _load_single_matrix(filename, label):
    """
    Load one pickled co-visitation matrix from config.OUTPUT_PATH.

    Args:
        filename: Pickle file name inside config.OUTPUT_PATH.
        label: Human-readable matrix name used in log lines (e.g. "click-to-click").

    Returns:
        tuple: (matrix, stats). On any failure the matrix is an empty dict and
        stats carries "loaded_successfully": False plus the error text.
    """
    try:
        log(f"   Loading {label} matrix...")
        # NOTE(security): pickle.load can execute arbitrary code. This is only
        # acceptable because the files are produced by the earlier notebooks of
        # this same project; never point this at untrusted pickles.
        with open(f"{config.OUTPUT_PATH}/{filename}", "rb") as f:
            matrix = pickle.load(f)

        source_items = len(matrix) if matrix else 0
        total_pairs = sum(len(candidates) for candidates in matrix.values()) if matrix else 0

        stats = {
            "source_items": source_items,
            "total_pairs": total_pairs,
            "loaded_successfully": True
        }

        log(f"       {label.capitalize()}: {source_items:,} source items, {total_pairs:,} pairs")
        return matrix, stats

    except Exception as e:
        # Best effort: a single unreadable matrix must not abort the notebook;
        # the caller decides what to do when nothing at all could be loaded.
        log(f"       Failed to load {label} matrix: {e}")
        return {}, {"source_items": 0, "total_pairs": 0, "loaded_successfully": False, "error": str(e)}


def load_co_visitation_matrices():
    """
    Load all co-visitation matrices from previous notebooks

    Each matrix is loaded independently; a matrix that cannot be read is
    replaced by an empty dict and flagged in the returned statistics.

    Returns:
        tuple: (click_to_click_matrix, click_to_buy_matrix, buy_to_buy_matrix, load_stats)

    Raises:
        ValueError: if none of the three matrices could be loaded.
    """
    log("Loading co-visitation matrices...")

    # (stats key, pickle file, label used in log output) - order defines the return order
    matrix_sources = [
        ("click_to_click", "click_to_click_matrix.pkl", "click-to-click"),
        ("click_to_buy", "click_to_buy_matrix.pkl", "click-to-buy"),
        ("buy_to_buy", "buy_to_buy_matrix.pkl", "buy-to-buy"),
    ]

    loaded = {}
    load_stats = {}
    for stats_key, filename, label in matrix_sources:
        loaded[stats_key], load_stats[stats_key] = _load_single_matrix(filename, label)

    # Summary
    successful_loads = sum(1 for stats in load_stats.values() if stats["loaded_successfully"])
    total_source_items = sum(stats["source_items"] for stats in load_stats.values())
    total_pairs = sum(stats["total_pairs"] for stats in load_stats.values())

    log(f"   Matrix loading summary:")
    log(f"      Successfully loaded: {successful_loads}/{len(load_stats)} matrices")
    log(f"      Total source items: {total_source_items:,}")
    log(f"      Total pairs: {total_pairs:,}")

    if successful_loads == 0:
        raise ValueError("No co-visitation matrices could be loaded!")

    return loaded["click_to_click"], loaded["click_to_buy"], loaded["buy_to_buy"], load_stats

def _load_optional_json(filename, label):
    """
    Read an optional JSON statistics file from config.OUTPUT_PATH.

    Args:
        filename: JSON file name inside config.OUTPUT_PATH.
        label: Name used in the log line (e.g. "Click matrix statistics").

    Returns:
        The parsed JSON content, or an empty dict when the file is missing or
        unreadable (the reason is logged instead of being silently dropped).
    """
    try:
        with open(f"{config.OUTPUT_PATH}/{filename}", "r") as f:
            payload = json.load(f)
    except Exception as e:
        # `except Exception` (not a bare `except:`) so that KeyboardInterrupt /
        # SystemExit still propagate while any file or parse problem falls back to {}.
        log(f"       {label} not available: {e}")
        return {}

    log(f"       {label} loaded")
    return payload


def load_supporting_data():
    """
    Load supporting data and statistics from previous notebooks

    Every part is loaded on a best-effort basis: a failure is logged and
    replaced by an empty/placeholder value so the remaining parts still load.

    Returns:
        dict: with keys
            "session_analysis"   - parsed session_analysis.json ({} on failure)
            "data_statistics"    - event/item/session counts of the prepared
                                   training data, or {"data_loaded": False, "error": ...}
            "validation_sample"  - up to 10,000 sampled rows (seed=42), or None on failure
            "click_matrix_stats" - parsed click_matrix_statistics.json ({} if unavailable)
            "buy_matrices_stats" - parsed buy_matrices_statistics.json ({} if unavailable)
    """
    log("Loading supporting data and statistics...")

    supporting_data = {}

    try:
        # Load session analysis
        log("   Loading session analysis...")
        with open(f"{config.OUTPUT_PATH}/session_analysis.json", "r") as f:
            supporting_data["session_analysis"] = json.load(f)
        log(f"       Session analysis loaded")
    except Exception as e:
        log(f"       Failed to load session analysis: {e}")
        supporting_data["session_analysis"] = {}

    try:
        # Load prepared data for validation
        log("   Loading prepared training data...")
        prepared_data = pl.read_parquet(f"{config.OUTPUT_PATH}/covisit_data_prepared.parquet")

        # Extract basic statistics
        total_events = len(prepared_data)
        unique_items = prepared_data.select("aid").n_unique()
        unique_sessions = prepared_data.select("session").n_unique()

        supporting_data["data_statistics"] = {
            "total_events": total_events,
            "unique_items": unique_items,
            "unique_sessions": unique_sessions,
            "data_loaded": True
        }

        log(f"       Training data: {total_events:,} events, {unique_items:,} items, {unique_sessions:,} sessions")

        # Keep a fixed-seed sample for validation instead of the full frame
        supporting_data["validation_sample"] = prepared_data.sample(min(10000, len(prepared_data)), seed=42)

    except Exception as e:
        log(f"       Failed to load training data: {e}")
        supporting_data["data_statistics"] = {"data_loaded": False, "error": str(e)}
        supporting_data["validation_sample"] = None

    # Matrix statistics are optional outputs of the earlier notebooks
    log("   Loading matrix statistics...")
    supporting_data["click_matrix_stats"] = _load_optional_json(
        "click_matrix_statistics.json", "Click matrix statistics"
    )
    supporting_data["buy_matrices_stats"] = _load_optional_json(
        "buy_matrices_statistics.json", "Buy matrices statistics"
    )

    log("   Supporting data loading completed")
    return supporting_data

## MATRIX CONSOLIDATION AND STANDARDIZATION

In [6]:
class MatrixConsolidator:
    """
    Consolidates and standardizes co-visitation matrices from different generation processes

    Per-matrix standardization statistics are accumulated in
    `self.consolidation_stats`, keyed by matrix type.
    """

    def __init__(self):
        self.matrix_types = ["click_to_click", "click_to_buy", "buy_to_buy"]
        # Filled by standardize_matrix_format(); one entry per non-empty matrix type
        self.consolidation_stats = {}

        log("Initializing matrix consolidator...")
        log(f"   Matrix types to consolidate: {', '.join(self.matrix_types)}")
        log(f"   Max candidates per item: {config.MAX_CANDIDATES_PER_ITEM}")

    def standardize_matrix_format(self, matrix: Dict, matrix_type: str) -> Dict:
        """
        Standardize matrix format and ensure consistent structure

        Each source item keeps at most config.MAX_CANDIDATES_PER_ITEM candidates
        as (int aid, float score) tuples sorted by descending score. Source
        items with fewer than config.MIN_CANDIDATES_PER_MATRIX valid candidates,
        or whose key cannot be converted to int, are dropped.

        Args:
            matrix: Raw matrix from generation process
            matrix_type: Type of matrix (click_to_click, click_to_buy, buy_to_buy)

        Returns:
            dict: Standardized matrix ({} when the input is empty; no statistics
            are recorded in that case)
        """
        log(f"   Standardizing {matrix_type} matrix format...")

        if not matrix:
            log(f"      Warning: {matrix_type} matrix is empty")
            return {}

        standardized_matrix = {}
        original_source_items = len(matrix)
        original_total_pairs = sum(len(candidates) for candidates in matrix.values())

        removed_items = 0
        removed_pairs = 0

        for source_item, candidates in matrix.items():
            if not candidates:
                removed_items += 1
                continue

            # A key that is not convertible to int is treated like any other
            # malformed entry: skip it instead of aborting the whole run.
            try:
                source_key = int(source_item)
            except (ValueError, TypeError):
                removed_items += 1
                removed_pairs += len(candidates)
                continue

            # Ensure candidates are in the correct format (aid, score)
            standardized_candidates = []

            for candidate in candidates:
                if isinstance(candidate, (list, tuple)) and len(candidate) == 2:
                    aid, score = candidate
                    # Ensure numeric types
                    try:
                        standardized_candidates.append((int(aid), float(score)))
                    except (ValueError, TypeError):
                        removed_pairs += 1
                else:
                    removed_pairs += 1

            if not standardized_candidates:
                # Every candidate was malformed (already counted in removed_pairs)
                removed_items += 1
                continue

            # Sort by score and limit candidates
            standardized_candidates.sort(key=lambda pair: pair[1], reverse=True)
            limited_candidates = standardized_candidates[:config.MAX_CANDIDATES_PER_ITEM]
            removed_pairs += len(standardized_candidates) - len(limited_candidates)

            if len(limited_candidates) >= config.MIN_CANDIDATES_PER_MATRIX:
                standardized_matrix[source_key] = limited_candidates
            else:
                removed_items += 1
                # The pairs of a dropped item disappear from the output as well;
                # count them so that original - final == removed in the stats.
                removed_pairs += len(limited_candidates)

        final_source_items = len(standardized_matrix)
        final_total_pairs = sum(len(candidates) for candidates in standardized_matrix.values())

        standardization_stats = {
            "original_source_items": original_source_items,
            "final_source_items": final_source_items,
            "original_total_pairs": original_total_pairs,
            "final_total_pairs": final_total_pairs,
            "removed_items": removed_items,
            "removed_pairs": removed_pairs,
            "standardization_successful": True
        }

        self.consolidation_stats[matrix_type] = standardization_stats

        log(f"       {matrix_type} standardization completed:")
        log(f"         Source items: {original_source_items:,} -> {final_source_items:,} ({removed_items:,} removed)")
        log(f"         Total pairs: {original_total_pairs:,} -> {final_total_pairs:,} ({removed_pairs:,} removed)")

        return standardized_matrix

    def consolidate_matrices(self, click_to_click: Dict, click_to_buy: Dict, buy_to_buy: Dict) -> Dict:
        """
        Consolidate all matrices into a unified structure

        Args:
            click_to_click: Click-to-click matrix
            click_to_buy: Click-to-buy matrix
            buy_to_buy: Buy-to-buy matrix

        Returns:
            dict: Consolidated matrices structure with keys
                "matrices" - standardized matrix per matrix type
                "metadata" - timestamp, limits and per-matrix standardization stats
                "summary"  - totals across all matrices
        """
        log("Consolidating co-visitation matrices...")

        # Standardize each matrix
        standardized_matrices = {
            "click_to_click": self.standardize_matrix_format(click_to_click, "click_to_click"),
            "click_to_buy": self.standardize_matrix_format(click_to_buy, "click_to_buy"),
            "buy_to_buy": self.standardize_matrix_format(buy_to_buy, "buy_to_buy"),
        }

        # Create consolidated structure
        consolidated_matrices = {
            "matrices": standardized_matrices,
            "metadata": {
                "consolidation_timestamp": datetime.now().isoformat(),
                "matrix_types": self.matrix_types,
                "max_candidates_per_item": config.MAX_CANDIDATES_PER_ITEM,
                "min_candidates_per_matrix": config.MIN_CANDIDATES_PER_MATRIX,
                "consolidation_stats": self.consolidation_stats
            }
        }

        # Summary statistics
        total_source_items = sum(len(matrix) for matrix in standardized_matrices.values())
        total_pairs = sum(
            len(candidates)
            for matrix in standardized_matrices.values()
            for candidates in matrix.values()
        )
        matrices_with_data = sum(1 for matrix in standardized_matrices.values() if matrix)

        consolidated_matrices["summary"] = {
            "total_source_items": total_source_items,
            "total_pairs": total_pairs,
            "matrices_with_data": matrices_with_data,
            "consolidation_successful": True
        }

        log(f"   Consolidation summary:")
        log(f"      Total source items across all matrices: {total_source_items:,}")
        log(f"      Total pairs across all matrices: {total_pairs:,}")
        log(f"      Matrices with data: {matrices_with_data}/{len(standardized_matrices)}")

        return consolidated_matrices

## MATRIX VALIDATION AND QUALITY ASSESSMENT

In [7]:
class MatrixValidator:
    """
    Performs comprehensive validation and quality assessment of consolidated matrices

    All methods expect the structure produced by the consolidation step: a dict
    whose "matrices" entry maps matrix type -> {source item -> [(aid, score), ...]}.
    """

    def __init__(self, supporting_data: Dict):
        # Only supporting_data["data_statistics"]["unique_items"] is read (coverage analysis)
        self.supporting_data = supporting_data
        self.validation_results = {}

        log("Initializing matrix validator...")

    def validate_matrix_coverage(self, consolidated_matrices: Dict) -> Dict:
        """
        Analyze coverage of items across different matrix types

        Args:
            consolidated_matrices: Consolidated matrices structure

        Returns:
            dict: Coverage analysis results (item counts per matrix, pairwise
            source/target overlaps and the share of dataset items that appear in
            any matrix; the share is 0 when the dataset item count is unknown)
        """
        log("Analyzing matrix coverage...")

        matrices = consolidated_matrices["matrices"]

        # Source items (keys) and target items (candidate aids) per matrix type
        all_source_items = {}
        all_target_items = {}
        for matrix_type, matrix in matrices.items():
            all_source_items[matrix_type] = set(matrix.keys()) if matrix else set()

            targets = set()
            if matrix:
                for candidates in matrix.values():
                    for aid, score in candidates:
                        targets.add(aid)
            all_target_items[matrix_type] = targets

        # Every item that appears anywhere, as source or as candidate
        all_items_union = set()
        for items in all_source_items.values():
            all_items_union.update(items)
        for items in all_target_items.values():
            all_items_union.update(items)

        total_items_in_matrices = len(all_items_union)

        # Compare with dataset statistics if available
        dataset_items = self.supporting_data.get("data_statistics", {}).get("unique_items", 0)
        coverage_percentage = (total_items_in_matrices / dataset_items * 100) if dataset_items > 0 else 0

        # Matrix overlap analysis: every unordered pair of matrix types once
        matrix_overlaps = {}
        matrix_types = list(all_source_items.keys())

        for i, type1 in enumerate(matrix_types):
            for type2 in matrix_types[i + 1:]:
                matrix_overlaps[f"{type1}_vs_{type2}"] = {
                    "source_item_overlap": len(all_source_items[type1].intersection(all_source_items[type2])),
                    "target_item_overlap": len(all_target_items[type1].intersection(all_target_items[type2]))
                }

        coverage_analysis = {
            "total_items_in_matrices": total_items_in_matrices,
            "dataset_total_items": dataset_items,
            "coverage_percentage": coverage_percentage,
            "source_item_counts": {matrix_type: len(items) for matrix_type, items in all_source_items.items()},
            "target_item_counts": {matrix_type: len(items) for matrix_type, items in all_target_items.items()},
            "matrix_overlaps": matrix_overlaps
        }

        log(f"   Coverage analysis results:")
        log(f"      Total items covered: {total_items_in_matrices:,}")
        log(f"      Dataset coverage: {coverage_percentage:.1f}% ({total_items_in_matrices:,}/{dataset_items:,})")

        for matrix_type, count in coverage_analysis["source_item_counts"].items():
            log(f"      {matrix_type} source items: {count:,}")

        return coverage_analysis

    def validate_matrix_quality(self, consolidated_matrices: Dict) -> Dict:
        """
        Assess the quality of consolidated matrices

        Each matrix starts at a score of 1.0 that is multiplied by a penalty for
        every issue found (too few items/pairs, few or very uneven candidates,
        negative or constant scores). Empty matrices score 0.0. The overall
        rating is derived from the mean score over all matrices.

        Args:
            consolidated_matrices: Consolidated matrices structure

        Returns:
            dict: Quality assessment results
        """
        log("Assessing matrix quality...")

        matrices = consolidated_matrices["matrices"]
        quality_metrics = {}

        for matrix_type, matrix in matrices.items():
            if not matrix:
                quality_metrics[matrix_type] = {
                    "quality_score": 0.0,
                    "has_data": False,
                    "quality_issues": ["Matrix is empty"]
                }
                continue

            log(f"   Analyzing {matrix_type} matrix quality...")

            # Basic quality metrics
            source_items = len(matrix)
            candidate_counts = [len(candidates) for candidates in matrix.values()]
            total_pairs = sum(candidate_counts)
            avg_candidates = np.mean(candidate_counts) if candidate_counts else 0

            # Score distribution analysis
            all_scores = [score for candidates in matrix.values() for aid, score in candidates]

            # Quality indicators
            quality_issues = []
            quality_score = 1.0

            # Check for minimum data requirements
            if source_items < 100:
                quality_issues.append(f"Low source item count: {source_items}")
                quality_score *= 0.7

            if total_pairs < 1000:
                quality_issues.append(f"Low total pairs count: {total_pairs}")
                quality_score *= 0.8

            # Check candidate distribution
            if candidate_counts:
                std_candidates = np.std(candidate_counts)

                if avg_candidates < config.MIN_CANDIDATES_PER_MATRIX:
                    quality_issues.append(f"Low average candidates per item: {avg_candidates:.1f}")
                    quality_score *= 0.8

                # Check for extremely uneven distribution
                if std_candidates > avg_candidates * 2:
                    quality_issues.append("Highly uneven candidate distribution")
                    quality_score *= 0.9

            # Check score distribution
            if all_scores:
                score_stats = {
                    "min": min(all_scores),
                    "max": max(all_scores),
                    "mean": np.mean(all_scores),
                    "std": np.std(all_scores)
                }

                # Check for score validity
                if score_stats["min"] < 0:
                    quality_issues.append("Negative scores found")
                    quality_score *= 0.9

                if score_stats["std"] == 0:
                    quality_issues.append("All scores are identical")
                    quality_score *= 0.8
            else:
                score_stats = {}
                quality_issues.append("No valid scores found")
                quality_score *= 0.5

            quality_metrics[matrix_type] = {
                "quality_score": quality_score,
                "has_data": True,
                "source_items": source_items,
                "total_pairs": total_pairs,
                "avg_candidates_per_item": avg_candidates,
                "score_statistics": score_stats,
                "quality_issues": quality_issues
            }

            log(f"      Quality score: {quality_score:.2f}")
            if quality_issues:
                log(f"      Issues found: {', '.join(quality_issues)}")
            else:
                log(f"      No quality issues detected")

        # Overall quality assessment
        valid_matrices = sum(1 for metrics in quality_metrics.values() if metrics["has_data"])
        # np.mean([]) would return NaN (plus a RuntimeWarning) when no matrices were supplied
        if quality_metrics:
            avg_quality_score = np.mean([metrics["quality_score"] for metrics in quality_metrics.values()])
        else:
            avg_quality_score = 0.0

        if avg_quality_score >= 0.9:
            overall_quality = "EXCELLENT"
        elif avg_quality_score >= 0.7:
            overall_quality = "GOOD"
        elif avg_quality_score >= 0.5:
            overall_quality = "ACCEPTABLE"
        else:
            overall_quality = "POOR"

        overall_assessment = {
            "valid_matrices": valid_matrices,
            "total_matrices": len(matrices),
            "average_quality_score": avg_quality_score,
            "overall_quality": overall_quality
        }

        quality_assessment = {
            "matrix_quality_metrics": quality_metrics,
            "overall_assessment": overall_assessment,
            "quality_timestamp": datetime.now().isoformat()
        }

        log(f"   Overall quality assessment:")
        log(f"      Valid matrices: {valid_matrices}/{len(matrices)}")
        log(f"      Average quality score: {avg_quality_score:.2f}")
        log(f"      Overall quality rating: {overall_assessment['overall_quality']}")

        return quality_assessment

    def cross_validate_matrices(self, consolidated_matrices: Dict) -> Dict:
        """
        Perform cross-validation between different matrix types

        A sample of up to config.CROSS_VALIDATION_SAMPLE source items is drawn
        from the union of all matrices; for every matrix that contains a sampled
        item, its candidate list is checked for duplicate aids and for
        descending score order.

        Args:
            consolidated_matrices: Consolidated matrices structure

        Returns:
            dict: Cross-validation results, or {"error": ...} when no matrix
            has any source item
        """
        # Local import: the notebook's import cell does not bring in `random`
        import random

        log("Performing cross-matrix validation...")

        matrices = consolidated_matrices["matrices"]

        # Sample items for cross-validation
        all_source_items = set()
        for matrix in matrices.values():
            if matrix:
                all_source_items.update(matrix.keys())

        if len(all_source_items) == 0:
            log("   No source items available for cross-validation")
            return {"error": "No source items available"}

        # Taking the first N elements of a set is not a random sample (set order
        # follows the hash table layout), so draw a seeded random sample instead.
        population = list(all_source_items)
        if len(population) > config.CROSS_VALIDATION_SAMPLE:
            sample_items = random.Random(42).sample(population, config.CROSS_VALIDATION_SAMPLE)
        else:
            sample_items = population
        log(f"   Cross-validating {len(sample_items)} sample items...")

        # Check consistency across matrices
        consistency_metrics = {}

        for matrix_type, matrix in matrices.items():
            if not matrix:
                continue

            item_consistency = []

            for item in sample_items:
                if item not in matrix:
                    continue

                candidates = matrix[item]

                # Check for duplicate candidates
                candidate_aids = [aid for aid, score in candidates]
                has_duplicates = len(candidate_aids) != len(set(candidate_aids))

                # Check score ordering (non-increasing)
                scores = [score for aid, score in candidates]
                is_sorted = all(scores[i] >= scores[i+1] for i in range(len(scores)-1))

                item_consistency.append({
                    "item": item,
                    "has_duplicates": has_duplicates,
                    "is_sorted": is_sorted,
                    "candidate_count": len(candidates)
                })

            # Calculate consistency metrics (skipped when no sampled item is in this matrix)
            if item_consistency:
                checked = len(item_consistency)
                duplicate_rate = sum(1 for entry in item_consistency if entry["has_duplicates"]) / checked
                sorting_compliance = sum(1 for entry in item_consistency if entry["is_sorted"]) / checked
                avg_candidates = np.mean([entry["candidate_count"] for entry in item_consistency])

                consistency_metrics[matrix_type] = {
                    "items_validated": checked,
                    "duplicate_rate": duplicate_rate,
                    "sorting_compliance": sorting_compliance,
                    "avg_candidates": avg_candidates
                }

                log(f"      {matrix_type}:")
                log(f"         Items validated: {checked}")
                log(f"         Duplicate rate: {duplicate_rate:.1%}")
                log(f"         Sorting compliance: {sorting_compliance:.1%}")

        cross_validation_results = {
            "sample_size": len(sample_items),
            "consistency_metrics": consistency_metrics,
            "validation_timestamp": datetime.now().isoformat()
        }

        return cross_validation_results

## SAMPLE GENERATION FOR MANUAL VALIDATION

In [8]:
def generate_validation_samples(consolidated_matrices: Dict,
                                seed: Optional[int] = None) -> Dict:
    """
    Generate sample relationships for manual validation and inspection

    For every non-empty matrix, up to config.SAMPLE_SIZE_FOR_VALIDATION source
    items are drawn at random (without replacement). For each sampled item the
    total candidate count, the first 10 candidates and the score range are
    recorded.

    Args:
        consolidated_matrices: Consolidated matrices structure; its "matrices"
            entry maps matrix type -> {source item: [(aid, score), ...]}
        seed: Optional seed for reproducible sampling. When None (default) the
            global NumPy random state is used.

    Returns:
        dict: One entry per matrix type (either {"sample_count", "samples"} or
            {"error": ...} for an empty matrix), plus the metadata entries
            "generation_timestamp" and "sample_purpose"
    """
    log("Generating validation samples...")

    matrices = consolidated_matrices["matrices"]
    validation_samples = {}

    # A dedicated generator makes seeded runs reproducible without touching
    # NumPy's global random state; unseeded runs keep using np.random.choice.
    choose = np.random.choice if seed is None else np.random.default_rng(seed).choice

    for matrix_type, matrix in matrices.items():
        if not matrix:
            validation_samples[matrix_type] = {"error": "Matrix is empty"}
            continue

        log(f"   Generating samples for {matrix_type}...")

        # Get sample of source items
        source_items = list(matrix.keys())
        sample_size = min(config.SAMPLE_SIZE_FOR_VALIDATION, len(source_items))
        sample_items = choose(source_items, sample_size, replace=False)

        matrix_samples = {}

        for item in sample_items:
            candidates = matrix[item]
            scores = [score for _, score in candidates]

            # Include top candidates with their scores
            top_candidates = candidates[:10]  # Top 10 for manual inspection

            matrix_samples[str(item)] = {
                "total_candidates": len(candidates),
                "top_candidates": [{"aid": aid, "score": score} for aid, score in top_candidates],
                # default=None: an item with an empty candidate list must not
                # abort the whole sample generation with a ValueError
                "score_range": {
                    "max": max(scores, default=None),
                    "min": min(scores, default=None)
                }
            }

        validation_samples[matrix_type] = {
            "sample_count": len(matrix_samples),
            "samples": matrix_samples
        }

        log(f"      Generated {len(matrix_samples)} samples for {matrix_type}")

    validation_samples["generation_timestamp"] = datetime.now().isoformat()
    validation_samples["sample_purpose"] = "Manual validation and quality inspection"

    return validation_samples

## CONFIGURATION GENERATION FOR PART 2B

In [9]:
def generate_candidate_generation_config(consolidated_matrices: Dict,
                                        quality_assessment: Dict,
                                        coverage_analysis: Dict) -> Dict:
    """
    Generate configuration for Part 2B candidate generation

    Each matrix gets a strategy derived from its quality score:
      * score >= 0.7  -> enabled, top_k over MAX_CANDIDATES_PER_ITEM
                         (priority HIGH at >= 0.9, otherwise MEDIUM)
      * score >= 0.5  -> enabled with LOW priority, at most 20 candidates and
                         a minimum score threshold of 1.0
      * otherwise     -> disabled, with the score recorded as the reason
    Empty matrices are disabled without consulting the quality assessment.

    Args:
        consolidated_matrices: Consolidated matrices structure (reads "matrices")
        quality_assessment: Quality assessment results (reads
            "matrix_quality_metrics"[matrix_type]["quality_score"])
        coverage_analysis: Coverage analysis results (reads "coverage_percentage")

    Returns:
        dict: Configuration for candidate generation in Part 2B
    """
    log("Generating configuration for Part 2B candidate generation...")

    matrices = consolidated_matrices["matrices"]

    def _quality_score(matrix_type):
        # Looked up lazily so the assessment is only touched when a score is
        # actually needed; a missing entry or score counts as 0.
        return quality_assessment["matrix_quality_metrics"].get(matrix_type, {}).get("quality_score", 0)

    # Determine optimal candidate generation strategy based on matrix quality
    matrix_strategies = {}

    for matrix_type, matrix in matrices.items():
        if not matrix:
            matrix_strategies[matrix_type] = {
                "enabled": False,
                "reason": "Matrix is empty or unavailable"
            }
            continue

        quality_score = _quality_score(matrix_type)

        if quality_score >= 0.7:
            strategy = {
                "enabled": True,
                "priority": "HIGH" if quality_score >= 0.9 else "MEDIUM",
                "max_candidates": config.MAX_CANDIDATES_PER_ITEM,
                "min_score_threshold": None,  # Use all candidates
                "sampling_method": "top_k"
            }
        elif quality_score >= 0.5:
            strategy = {
                "enabled": True,
                "priority": "LOW",
                "max_candidates": min(20, config.MAX_CANDIDATES_PER_ITEM),
                "min_score_threshold": 1.0,  # Filter low-quality candidates
                "sampling_method": "filtered_top_k"
            }
        else:
            strategy = {
                "enabled": False,
                "reason": f"Quality score too low: {quality_score:.2f}"
            }

        matrix_strategies[matrix_type] = strategy

    # Overall candidate generation parameters
    enabled_matrices = [name for name, strategy in matrix_strategies.items() if strategy.get("enabled", False)]
    high_priority_matrices = [name for name, strategy in matrix_strategies.items() if strategy.get("priority") == "HIGH"]
    coverage_percentage = coverage_analysis.get("coverage_percentage", 0)

    candidate_config = {
        "generation_timestamp": datetime.now().isoformat(),
        "source_notebook": "Part 2A4: Matrix Consolidation & Validation",
        "target_notebook": "Part 2B: Model Training",

        "matrix_strategies": matrix_strategies,
        "enabled_matrices": enabled_matrices,
        "total_enabled_matrices": len(enabled_matrices),

        "global_parameters": {
            "max_candidates_per_item": config.MAX_CANDIDATES_PER_ITEM,
            "candidate_combination_method": "weighted_union",  # How to combine candidates from different matrices
            "default_weights": {
                "click_to_click": 0.4,
                "click_to_buy": 0.35,
                "buy_to_buy": 0.25
            }
        },

        "quality_based_adjustments": {
            "use_quality_weighting": True,
            "quality_scores": {matrix_type: _quality_score(matrix_type) for matrix_type in matrices.keys()},
            "coverage_percentage": coverage_percentage
        },

        "fallback_strategy": {
            "min_matrices_required": 1,
            "use_popularity_fallback": True,
            "popularity_candidates": 20
        }
    }

    log(f"   Configuration generated:")
    # Denominator comes from the matrices actually present, not a fixed 3
    log(f"      Enabled matrices: {len(enabled_matrices)}/{len(matrices)}")
    log(f"      Primary matrices: {high_priority_matrices}")
    log(f"      Coverage percentage: {coverage_percentage:.1f}%")

    return candidate_config

## MAIN EXECUTION

In [10]:
# Main pipeline driver: runs the helpers defined in earlier cells in order and
# leaves their results in notebook-level variables for the reporting and
# saving cells that follow.

# Step 1: Validate input files
# (result is only passed on to the comprehensive report later)
file_validation = validate_input_files()

# Step 2: Load co-visitation matrices
# Returns the three matrices plus loading statistics.
click_to_click_matrix, click_to_buy_matrix, buy_to_buy_matrix, load_stats = load_co_visitation_matrices()

# Step 3: Load supporting data
# Handed to MatrixValidator in step 5.
supporting_data = load_supporting_data()

# Step 4: Consolidate matrices
log("\n" + "="*60)
log("MATRIX CONSOLIDATION")
log("="*60)

consolidator = MatrixConsolidator()
consolidated_matrices = consolidator.consolidate_matrices(
    click_to_click_matrix,
    click_to_buy_matrix,
    buy_to_buy_matrix
)

# Clean up individual matrices to free memory
# NOTE: after this `del` the three names no longer exist, so re-running the
# consolidation call above requires re-running step 2 first.
del click_to_click_matrix, click_to_buy_matrix, buy_to_buy_matrix
gc.collect()

# Step 5: Validate and assess quality
log("\n" + "="*60)
log("MATRIX VALIDATION AND QUALITY ASSESSMENT")
log("="*60)

validator = MatrixValidator(supporting_data)

# Coverage analysis
coverage_analysis = validator.validate_matrix_coverage(consolidated_matrices)

# Quality assessment
quality_assessment = validator.validate_matrix_quality(consolidated_matrices)

# Cross-validation
cross_validation_results = validator.cross_validate_matrices(consolidated_matrices)

# Step 6: Generate validation samples
log("\n" + "="*60)
log("VALIDATION SAMPLE GENERATION")
log("="*60)

# Sampling is random, so the selected items differ between runs.
validation_samples = generate_validation_samples(consolidated_matrices)

# Step 7: Generate configuration for Part 2B
log("\n" + "="*60)
log("CANDIDATE GENERATION CONFIGURATION")
log("="*60)

candidate_config = generate_candidate_generation_config(
    consolidated_matrices,
    quality_assessment,
    coverage_analysis
)

[2025-08-07 18:53:48] Validating input files from previous notebooks...
[2025-08-07 18:53:48]    Checking required files:
[2025-08-07 18:53:48]        click_to_click_matrix.pkl - 556.6 MB
[2025-08-07 18:53:48]        click_to_buy_matrix.pkl - 65.3 MB
[2025-08-07 18:53:48]        buy_to_buy_matrix.pkl - 32.3 MB
[2025-08-07 18:53:48]        covisit_data_prepared.parquet - 1605.4 MB
[2025-08-07 18:53:48]        session_analysis.json - 0.0 MB
[2025-08-07 18:53:48]    Checking optional files:
[2025-08-07 18:53:48]        click_matrix_statistics.json - 0.0 MB
[2025-08-07 18:53:48]        buy_matrices_statistics.json - 0.0 MB
[2025-08-07 18:53:48]        item_stats.parquet - 0.0 MB
[2025-08-07 18:53:48]    SUCCESS: All required files found (Total size: 2259.6 MB)
[2025-08-07 18:53:48] Loading co-visitation matrices...
[2025-08-07 18:53:48]    Loading click-to-click matrix...
[2025-08-07 18:54:16]        Click-to-click: 1,839,483 source items, 63,503,324 pairs
[2025-08-07 18:54:16]    Loading 

## COMPREHENSIVE REPORTING

In [11]:
def generate_comprehensive_report(consolidated_matrices: Dict,
                                coverage_analysis: Dict,
                                quality_assessment: Dict,
                                cross_validation_results: Dict,
                                load_stats: Dict,
                                file_validation: Dict) -> Dict:
    """
    Assemble the final consolidation report from the results of earlier steps

    Args:
        consolidated_matrices: Consolidated matrices structure (reads
            "matrices", "summary" and "metadata")
        coverage_analysis: Coverage analysis results
        quality_assessment: Quality assessment results
        cross_validation_results: Cross-validation results
        load_stats: Matrix loading statistics
        file_validation: Input file validation results

    Returns:
        dict: Report with executive summary, consolidation metrics,
            validation metrics, recommendations and next steps
    """
    log("Generating comprehensive consolidation report...")

    # Headline figures for the executive summary
    n_ok = len([m for m in consolidated_matrices["matrices"].values() if m])
    totals = consolidated_matrices["summary"]
    source_item_total = totals["total_source_items"]
    pair_total = totals["total_pairs"]
    quality_label = quality_assessment["overall_assessment"]["overall_quality"]
    coverage_pct = coverage_analysis.get("coverage_percentage", 0)

    executive_summary = dict(
        consolidation_successful=n_ok > 0,
        matrices_consolidated=f"{n_ok}/3",
        total_source_items=source_item_total,
        total_pairs=pair_total,
        overall_quality_rating=quality_label,
        dataset_coverage_percentage=coverage_pct,
        ready_for_part_2b=n_ok >= 1 and coverage_pct > config.QUALITY_THRESHOLD_COVERAGE * 100,
    )

    # Inputs, loading and consolidation statistics
    consolidation_metrics = dict(
        input_validation=file_validation,
        matrix_loading=load_stats,
        consolidation_stats=consolidated_matrices["metadata"]["consolidation_stats"],
        matrix_summary=consolidated_matrices["summary"],
    )

    # Results of the validation passes
    validation_metrics = dict(
        coverage_analysis=coverage_analysis,
        quality_assessment=quality_assessment,
        cross_validation=cross_validation_results,
    )

    # First recommendation depends on how many matrices are available;
    # the first rule that applies wins.
    availability_advice = (
        (n_ok == 3, "All matrices successfully consolidated - proceed with confidence to Part 2B"),
        (n_ok >= 2, "Most matrices consolidated successfully - good for Part 2B with minor limitations"),
        (n_ok == 1, "Limited matrix availability - Part 2B possible but with reduced performance"),
        (True, "No matrices available - re-run previous notebooks to generate matrices"),
    )
    recommendations = [next(text for applies, text in availability_advice if applies)]

    if coverage_pct < 50:
        recommendations.append("Low dataset coverage - consider reviewing matrix generation parameters")

    if quality_label in ("POOR", "ACCEPTABLE"):
        recommendations.append("Quality concerns detected - review quality assessment details")

    next_steps = [
        "Review quality assessment and recommendations",
        "Proceed to Part 2B: Model Training",
        "Use candidate_generation_config.json for optimal candidate generation",
    ]

    comprehensive_report = {
        "report_timestamp": datetime.now().isoformat(),
        "consolidation_version": "Part 2A4 v1.0",
        "executive_summary": executive_summary,
        "consolidation_metrics": consolidation_metrics,
        "validation_metrics": validation_metrics,
        "recommendations": recommendations,
        "next_steps": next_steps,
    }

    log("   Report generation completed")
    for line in (
        f"      Overall success: {executive_summary['consolidation_successful']}",
        f"      Matrices: {executive_summary['matrices_consolidated']}",
        f"      Quality: {executive_summary['overall_quality_rating']}",
        f"      Coverage: {executive_summary['dataset_coverage_percentage']:.1f}%",
    ):
        log(line)

    return comprehensive_report

# Build the report from the results produced by the main execution cell
comprehensive_report = generate_comprehensive_report(
    consolidated_matrices=consolidated_matrices,
    coverage_analysis=coverage_analysis,
    quality_assessment=quality_assessment,
    cross_validation_results=cross_validation_results,
    load_stats=load_stats,
    file_validation=file_validation,
)

[2025-08-07 18:56:19] Generating comprehensive consolidation report...
[2025-08-07 18:56:19]    Report generation completed
[2025-08-07 18:56:19]       Overall success: True
[2025-08-07 18:56:19]       Matrices: 3/3
[2025-08-07 18:56:19]       Quality: EXCELLENT
[2025-08-07 18:56:19]       Coverage: 98.5%


## SAVE ALL OUTPUTS

In [12]:
def save_consolidation_outputs(consolidated_matrices: Dict,
                              comprehensive_report: Dict,
                              coverage_analysis: Dict,
                              quality_assessment: Dict,
                              candidate_config: Dict,
                              validation_samples: Dict):
    """
    Save all consolidation outputs

    Writes one pickle (the consolidated matrices) and six JSON files into
    config.OUTPUT_PATH, creating the directory if it does not exist yet.

    Args:
        consolidated_matrices: Final consolidated matrices
        comprehensive_report: Comprehensive consolidation report
        coverage_analysis: Coverage analysis results
        quality_assessment: Quality assessment results
        candidate_config: Configuration for Part 2B
        validation_samples: Sample relationships for validation

    Returns:
        dict: output key -> {"path": ..., "size_mb": ...} for every file written

    Raises:
        Exception: any error raised while writing is logged and re-raised
    """
    log("Saving consolidation outputs...")

    output_files = {}

    def _register(key, path):
        """Record path and on-disk size (MB) of a written file; return the size."""
        size_mb = os.path.getsize(path) / (1024*1024)
        output_files[key] = {"path": path, "size_mb": size_mb}
        return size_mb

    def _save_json(key, filename, payload):
        """Dump payload as indented JSON into OUTPUT_PATH and register it."""
        path = f"{config.OUTPUT_PATH}/{filename}"
        with open(path, "w") as f:
            json.dump(payload, f, indent=2)
        _register(key, path)
        log(f"    {filename} saved")

    try:
        # A fresh environment may not have the output directory yet
        os.makedirs(config.OUTPUT_PATH, exist_ok=True)

        # 1. Save consolidated matrices (main output)
        matrices_path = f"{config.OUTPUT_PATH}/consolidated_covisitation_matrices.pkl"
        with open(matrices_path, "wb") as f:
            pickle.dump(consolidated_matrices, f)
        file_size = _register("matrices", matrices_path)
        log(f"    consolidated_covisitation_matrices.pkl saved ({file_size:.1f} MB)")

        # 2.-6. Save report, coverage, quality metrics, Part 2B config, samples
        _save_json("report", "matrix_consolidation_report.json", comprehensive_report)
        _save_json("coverage", "matrix_coverage_analysis.json", coverage_analysis)
        _save_json("quality", "matrix_quality_metrics.json", quality_assessment)
        _save_json("config", "candidate_generation_config.json", candidate_config)
        _save_json("samples", "matrix_samples_validation.json", validation_samples)

        # 7. Save executive summary
        executive = comprehensive_report["executive_summary"]
        matrices = consolidated_matrices["matrices"]
        summary = {
            "notebook": "Part 2A4: Matrix Consolidation & Validation",
            "completion_timestamp": datetime.now().isoformat(),
            "consolidation_successful": executive["consolidation_successful"],
            # Truthiness instead of len(): a missing or None matrix counts as
            # "not processed" rather than raising
            "matrices_processed": {
                name: bool(matrices.get(name))
                for name in ("click_to_click", "click_to_buy", "buy_to_buy")
            },
            "key_metrics": {
                "total_source_items": executive["total_source_items"],
                "total_pairs": executive["total_pairs"],
                "quality_rating": executive["overall_quality_rating"],
                "coverage_percentage": executive["dataset_coverage_percentage"]
            },
            # Captured before the summary itself is written, so it lists the
            # six outputs above and not "summary"
            "outputs_generated": list(output_files.keys()),
            "ready_for_part_2b": executive["ready_for_part_2b"],
            "next_step": "Part 2B: Model Training"
        }
        _save_json("summary", "consolidation_summary.json", summary)

        log(" All consolidation outputs saved successfully!")
        return output_files

    except Exception as e:
        log(f" Error saving consolidation outputs: {e}")
        raise  # bare raise keeps the original traceback

# Persist every artefact of this notebook and keep the written-file registry
output_files = save_consolidation_outputs(
    consolidated_matrices=consolidated_matrices,
    comprehensive_report=comprehensive_report,
    coverage_analysis=coverage_analysis,
    quality_assessment=quality_assessment,
    candidate_config=candidate_config,
    validation_samples=validation_samples,
)

[2025-08-07 18:56:19] Saving consolidation outputs...
[2025-08-07 18:56:55]     consolidated_covisitation_matrices.pkl saved (1122.3 MB)
[2025-08-07 18:56:55]     matrix_consolidation_report.json saved
[2025-08-07 18:56:55]     matrix_coverage_analysis.json saved
[2025-08-07 18:56:55]     matrix_quality_metrics.json saved
[2025-08-07 18:56:55]     candidate_generation_config.json saved
[2025-08-07 18:56:56]     matrix_samples_validation.json saved
[2025-08-07 18:56:56]     consolidation_summary.json saved
[2025-08-07 18:56:56]  All consolidation outputs saved successfully!


## FINAL SUMMARY AND NEXT STEPS

In [13]:
# Final cell: prints a human-readable recap of the results computed above and
# then releases the large in-memory objects. Nothing new is computed here
# apart from the running total of output sizes.
log("\n" + "="*80)
log("PART 2A4 COMPLETED: MATRIX CONSOLIDATION & VALIDATION")
log("="*80)

# Final results summary
exec_summary = comprehensive_report["executive_summary"]
log(f"\nCONSOLIDATION RESULTS:")
log(f" Matrices consolidated: {exec_summary['matrices_consolidated']}")
log(f" Total source items: {exec_summary['total_source_items']:,}")
log(f" Total pairs: {exec_summary['total_pairs']:,}")
log(f" Quality rating: {exec_summary['overall_quality_rating']}")
log(f" Dataset coverage: {exec_summary['dataset_coverage_percentage']:.1f}%")
log(f" Ready for Part 2B: {'yes' if exec_summary['ready_for_part_2b'] else 'no'}")

# Matrix-specific status
# A matrix counts as available when it is truthy (i.e. not None and not empty).
matrices_status = consolidated_matrices["matrices"]
log(f"\nMATRIX STATUS:")
for matrix_type, matrix in matrices_status.items():
    status = "yes" if matrix else "no"
    count = len(matrix) if matrix else 0
    log(f" {matrix_type}: {status} ({count:,} source items)")

# Quality breakdown
# Entries without a truthy "has_data" flag are reported as "No data".
quality_metrics = quality_assessment["matrix_quality_metrics"]
log(f"\nQUALITY BREAKDOWN:")
for matrix_type, metrics in quality_metrics.items():
    if metrics.get("has_data", False):
        score = metrics["quality_score"]
        issues = len(metrics.get("quality_issues", []))
        log(f" {matrix_type}: {score:.2f} ({issues} issues)")
    else:
        log(f" {matrix_type}: No data")

# Output files summary
# Only entries that carry a "size_mb" value are shown with a size and added to
# the total; entries without it contribute nothing to "Total output size".
log(f"\nOUTPUT FILES GENERATED:")
total_size = 0
for file_type, file_info in output_files.items():
    size_info = f" ({file_info['size_mb']:.1f} MB)" if "size_mb" in file_info else ""
    filename = os.path.basename(file_info["path"])
    log(f" {filename}{size_info}")
    if "size_mb" in file_info:
        total_size += file_info["size_mb"]
log(f" Total output size: {total_size:.1f} MB")

# Recommendations
recommendations = comprehensive_report["recommendations"]
if recommendations:
    log(f"\nRECOMMENDATIONS:")
    for i, rec in enumerate(recommendations, 1):
        log(f"{i}. {rec}")

# The verdict is driven solely by the report's "ready_for_part_2b" flag.
log(f"\nFINAL STATUS:")
if exec_summary["ready_for_part_2b"]:
    log(f" READY FOR PART 2B - All requirements met")
    log(f"  Quality: {exec_summary['overall_quality_rating']}")
    log(f"  Coverage: {exec_summary['dataset_coverage_percentage']:.1f}%")
    log(f"  Matrices: {exec_summary['matrices_consolidated']}")
else:
    log(f" LIMITED READINESS - Part 2B possible with reduced performance")
    log(f"  Review recommendations for improvement strategies")

# Memory cleanup
# NOTE: these names are removed from the notebook namespace, so this cell (and
# any earlier cell that reads them) cannot be re-run without first re-running
# the main execution cell. `matrices_status` above still references the
# "matrices" dict, so that object stays alive until it is rebound or deleted.
del consolidated_matrices, supporting_data
gc.collect()

# check_memory() is defined earlier in the notebook; its result is formatted
# as a percentage here.
final_memory = check_memory()
log(f"\nMemory cleanup completed: {final_memory:.1f}% usage")
log(f" Part 2A4 finished successfully!")

[2025-08-07 18:56:56] 
[2025-08-07 18:56:56] PART 2A4 COMPLETED: MATRIX CONSOLIDATION & VALIDATION
[2025-08-07 18:56:56] 
CONSOLIDATION RESULTS:
[2025-08-07 18:56:56]  Matrices consolidated: 3/3
[2025-08-07 18:56:56]  Total source items: 2,389,818
[2025-08-07 18:56:56]  Total pairs: 72,533,420
[2025-08-07 18:56:56]  Quality rating: EXCELLENT
[2025-08-07 18:56:56]  Dataset coverage: 98.5%
[2025-08-07 18:56:56]  Ready for Part 2B: yes
[2025-08-07 18:56:56] 
MATRIX STATUS:
[2025-08-07 18:56:56]  click_to_click: yes (1,805,562 source items)
[2025-08-07 18:56:56]  click_to_buy: yes (379,726 source items)
[2025-08-07 18:56:56]  buy_to_buy: yes (204,530 source items)
[2025-08-07 18:56:56] 
QUALITY BREAKDOWN:
[2025-08-07 18:56:56]  click_to_click: 1.00 (0 issues)
[2025-08-07 18:56:56]  click_to_buy: 1.00 (0 issues)
[2025-08-07 18:56:56]  buy_to_buy: 1.00 (0 issues)
[2025-08-07 18:56:56] 
OUTPUT FILES GENERATED:
[2025-08-07 18:56:56]  consolidated_covisitation_matrices.pkl (1122.3 MB)
[2025-08-