In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import time
import sys
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error

class HistogramContainEstimator:
    """Estimate "contain" query result sizes from pre-built grid histograms.

    On construction, every ``<name>_histogram.npy`` file in ``histograms_dir``
    that has a matching ``<name>_metadata.npy`` file is loaded. The metadata
    dict is read for the keys ``'dimensions'`` (grid size along x and y),
    ``'universe'`` (xmin, ymin, xmax, ymax) and, optionally, ``'objects'``.
    Evaluation results and plots are written below
    ``<histograms_dir>/results/contain`` and
    ``<histograms_dir>/visualizations/contain``.
    """

    def __init__(self, histograms_dir="../../large_files/traditional_methods/histogram"):
        """
        Initialize the histogram-based contain estimator

        Parameters:
        -----------
        histograms_dir : str
            Directory holding the ``*_histogram.npy`` / ``*_metadata.npy``
            files. The output sub-directories are created if missing.
        """
        self.histograms_dir = histograms_dir
        self.histograms = {}  # dataset name -> histogram array
        self.metadata = {}    # dataset name -> metadata dict
        self.cache = {}       # "<dataset>_<query mbr>" -> cached estimate

        os.makedirs(f"{histograms_dir}/results/contain", exist_ok=True)
        os.makedirs(f"{histograms_dir}/visualizations/contain", exist_ok=True)

        self.load_histograms()

    @staticmethod
    def _sample_percent(sample_ratio):
        """Return ``sample_ratio`` as a whole percentage for file names and titles."""
        # Round away float noise before truncating: 0.29 * 100 evaluates to
        # 28.999999999999996, which a bare int() would turn into 28.
        return int(round(sample_ratio * 100, 9))

    def load_histograms(self):
        """Load all available histograms from the histograms directory."""
        files = os.listdir(self.histograms_dir)
        histogram_files = [f for f in files if f.endswith('_histogram.npy')]

        for hf in histogram_files:
            dataset_name = hf.replace('_histogram.npy', '')
            metadata_path = os.path.join(
                self.histograms_dir, f"{dataset_name}_metadata.npy"
            )

            # A histogram without its metadata file cannot be interpreted, so it is skipped.
            if not os.path.exists(metadata_path):
                continue

            print(f"Loading histogram for {dataset_name}...")
            sys.stdout.flush()
            self.histograms[dataset_name] = np.load(
                os.path.join(self.histograms_dir, hf)
            )
            # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
            # point histograms_dir at files from a trusted source.
            self.metadata[dataset_name] = np.load(
                metadata_path,
                allow_pickle=True
            ).item()

        print(f"Loaded {len(self.histograms)} histograms")
        sys.stdout.flush()

    def parse_mbr(self, mbr_str):
        """Parse an MBR string such as '(x1, y1, x2, y2)' into a list of floats.

        Surrounding whitespace, double quotes and parentheses are stripped.
        Coordinates are split on commas, so both '(1, 2, 3, 4)' and
        '(1,2,3,4)' are accepted. Non-string input is returned unchanged.
        """
        if isinstance(mbr_str, str):
            body = mbr_str.strip(' \t\r\n"()')
            # float() ignores surrounding whitespace, so a bare comma split is enough.
            return [float(coord) for coord in body.split(',')]
        return mbr_str

    def estimate_contain_count(self, dataset_name, query_mbr):
        """
        Estimate the number of objects contained within the query MBR

        The query is clipped to the dataset universe, mapped to the range of
        grid cells it touches, and the histogram mass of those cells is
        scaled by the dataset's object count (``metadata['objects']``, or the
        histogram total when that key is absent).

        Parameters:
        -----------
        dataset_name : str
            Name of the dataset to query
        query_mbr : tuple or list (x1, y1, x2, y2), or str
            MBR coordinates of the query region; strings are parsed with
            ``parse_mbr``

        Returns:
        --------
        float : Estimated number of objects contained within the query region
            (0.0 when the clipped query has no positive area)

        Raises:
        -------
        ValueError
            If no histogram was loaded for ``dataset_name``
        """
        # Convert query to tuple so equal list/tuple queries share one cache key
        if isinstance(query_mbr, list):
            query_mbr = tuple(query_mbr)

        cache_key = f"{dataset_name}_{query_mbr}"
        if cache_key in self.cache:
            return self.cache[cache_key]

        if dataset_name not in self.histograms:
            raise ValueError(f"No histogram found for {dataset_name}")

        if isinstance(query_mbr, str):
            query_mbr = tuple(self.parse_mbr(query_mbr))

        grid = self.histograms[dataset_name]
        metadata = self.metadata[dataset_name]
        grid_dim_x, grid_dim_y = metadata['dimensions']
        univ_xmin, univ_ymin, univ_xmax, univ_ymax = metadata['universe']

        q_xmin, q_ymin, q_xmax, q_ymax = query_mbr

        # Clip the query to the universe covered by the histogram.
        q_xmin = max(q_xmin, univ_xmin)
        q_ymin = max(q_ymin, univ_ymin)
        q_xmax = min(q_xmax, univ_xmax)
        q_ymax = min(q_ymax, univ_ymax)

        # Nothing left after clipping (this also covers a zero-extent universe,
        # so the cell-size divisions below never divide by zero).
        if q_xmin >= q_xmax or q_ymin >= q_ymax:
            return 0.0

        cell_width = (univ_xmax - univ_xmin) / grid_dim_x
        cell_height = (univ_ymax - univ_ymin) / grid_dim_y

        # Inclusive range of cells touched by the query; ceil(...) - 1 keeps a
        # query that ends exactly on a cell boundary out of the next cell.
        g_xmin = int((q_xmin - univ_xmin) / cell_width)
        g_ymin = int((q_ymin - univ_ymin) / cell_height)
        g_xmax = int(np.ceil((q_xmax - univ_xmin) / cell_width)) - 1
        g_ymax = int(np.ceil((q_ymax - univ_ymin) / cell_height)) - 1

        g_xmin = max(0, min(grid_dim_x - 1, g_xmin))
        g_ymin = max(0, min(grid_dim_y - 1, g_ymin))
        g_xmax = max(0, min(grid_dim_x - 1, g_xmax))
        g_ymax = max(0, min(grid_dim_y - 1, g_ymax))

        contained_sum = np.sum(grid[g_xmin:g_xmax + 1, g_ymin:g_ymax + 1])
        total_sum = np.sum(grid)

        total_objects = metadata.get('objects', total_sum)
        if total_sum > 0:
            estimated_count = total_objects * (float(contained_sum) / total_sum)
        else:
            estimated_count = 0.0

        self.cache[cache_key] = estimated_count
        return estimated_count

    @staticmethod
    def _compute_metrics(actual_counts, estimated_counts):
        """Return ``(mae, mape, q_score)`` for two equally long count arrays."""
        mae = mean_absolute_error(actual_counts, estimated_counts)

        # MAPE: relative error where the actual count is non-zero; where it is
        # zero the absolute error divided by 100 is used instead.
        non_zero_mask = (actual_counts != 0)
        zero_mask = ~non_zero_mask
        mape_sum = 0
        count = len(actual_counts)

        if np.any(non_zero_mask):
            mape_sum += np.sum(
                np.abs(
                    (actual_counts[non_zero_mask] - estimated_counts[non_zero_mask])
                    / actual_counts[non_zero_mask]
                )
            )

        if np.any(zero_mask):
            mape_sum += np.sum(
                np.abs(actual_counts[zero_mask] - estimated_counts[zero_mask]) / 100
            )

        mape = mape_sum / count if count > 0 else 0

        # Q-score: mean of max(est/actual, actual/est) over the queries where
        # both values are non-zero.
        valid_indices = (actual_counts != 0) & (estimated_counts != 0)
        if np.any(valid_indices):
            ratios = np.maximum(
                estimated_counts[valid_indices] / actual_counts[valid_indices],
                actual_counts[valid_indices] / estimated_counts[valid_indices]
            )
            q_score = np.mean(ratios)
        else:
            q_score = float('inf')

        return mae, mape, q_score

    def evaluate_on_dataset(self, dataset_name, results_file=None, sample_ratio=0.2):
        """
        Evaluate the estimator against recorded query results for one dataset

        A random sample (fixed seed 42) of the rows in ``results_file`` is
        estimated; the columns 'Query MBR' and 'Count MBR' are read. Metrics
        are written to a one-row CSV under ``results/contain`` and two plots
        are produced via ``visualize_results``.

        Parameters:
        -----------
        dataset_name : str
            Name of a loaded dataset
        results_file : str, optional
            CSV with the recorded queries; defaults to
            ``../../large_files/resultsContains/<dataset_name>_results.csv``
        sample_ratio : float
            Fraction of the queries to evaluate (at least one query is used)

        Returns:
        --------
        dict : 'Dataset', 'MAE', 'MAPE', 'Q_Score', 'Avg_Time_ms',
            'Num_Queries' and 'Sample_Ratio'

        Raises:
        -------
        ValueError
            If the results file does not exist or contains no queries
        """
        if not results_file:
            results_file = f"../../large_files/resultsContains/{dataset_name}_results.csv"

        if not os.path.exists(results_file):
            raise ValueError(f"Results file not found: {results_file}")

        print(f"Loading query results from {results_file}")
        sys.stdout.flush()

        results_df = pd.read_csv(results_file)
        if results_df.empty:
            # Fail with a clear message instead of an opaque sampling error.
            raise ValueError(f"No queries found in {results_file}")

        # At least one query, never more than the file contains.
        sample_size = min(len(results_df), max(1, int(len(results_df) * sample_ratio)))
        print(f"Using {sample_ratio*100}% sample: {sample_size} out of {len(results_df)} queries")
        sys.stdout.flush()

        sampled_results = results_df.sample(n=sample_size, random_state=42)

        actual_counts = []
        estimated_counts = []
        estimation_times = []

        # Use simple progress reporting instead of relying solely on tqdm
        print(f"Processing {dataset_name} queries: ", end="", flush=True)

        total_queries = len(sampled_results)
        progress_step = max(1, total_queries // 10)
        last_label = None

        # Iterate the two needed columns directly; iterrows() builds a Series
        # per row, which is very slow for samples with ~1M queries.
        query_rows = zip(sampled_results['Query MBR'], sampled_results['Count MBR'])
        for i, (raw_mbr, actual_count) in enumerate(query_rows):
            # Show simple progress every 10%
            if i % progress_step == 0 or i == total_queries - 1:
                progress = (i + 1) / total_queries * 100
                label = f"{progress:.1f}%... "
                # The last step multiple and the final query can round to the
                # same text; do not print it twice.
                if label != last_label:
                    print(label, end="", flush=True)
                    last_label = label

            query_mbr = self.parse_mbr(raw_mbr)

            start_time = time.time()
            estimated_count = self.estimate_contain_count(dataset_name, query_mbr)
            end_time = time.time()

            actual_counts.append(actual_count)
            estimated_counts.append(estimated_count)
            estimation_times.append((end_time - start_time) * 1000)

        print("Done!")
        sys.stdout.flush()

        # Convert to arrays for calculations
        actual_counts = np.array(actual_counts)
        estimated_counts = np.array(estimated_counts)
        estimation_times = np.array(estimation_times)

        # Ensure non-negative estimates
        estimated_counts = np.maximum(0, estimated_counts)

        mae, mape, q_score = self._compute_metrics(actual_counts, estimated_counts)

        avg_time_ms = np.mean(estimation_times) if len(estimation_times) > 0 else 0

        results = {
            'Dataset': dataset_name,
            'MAE': mae,
            'MAPE': mape,
            'Q_Score': q_score,
            'Avg_Time_ms': avg_time_ms,
            'Num_Queries': len(sampled_results),
            'Sample_Ratio': sample_ratio
        }

        sample_pct = self._sample_percent(sample_ratio)
        results_df_out = pd.DataFrame([results])
        results_df_out.to_csv(
            f"{self.histograms_dir}/results/contain/{dataset_name}_evaluation_sample{sample_pct}.csv",
            index=False
        )

        # Generate visualization
        self.visualize_results(dataset_name, actual_counts, estimated_counts, sample_ratio)

        print(f"Evaluation results for {dataset_name} ({sample_ratio*100}% sample):")
        print(f"  MAE: {mae:.2f}")
        print(f"  MAPE: {mape:.2%}")
        print(f"  Q-Score: {q_score:.2f}")
        print(f"  Avg. Estimation Time: {avg_time_ms:.4f} ms")
        sys.stdout.flush()

        return results

    def visualize_results(self, dataset_name, actual_counts, estimated_counts, sample_ratio=0.2):
        """
        Save two plots comparing actual and estimated counts

        The first is a scatter plot of estimate vs. actual with the y = x
        reference line; the second shows actual and estimated counts side by
        side for up to 100 queries picked with a fixed seed. Both PNG files
        are written under ``<histograms_dir>/visualizations/contain``.

        Parameters:
        -----------
        dataset_name : str
            Dataset name used in titles and file names
        actual_counts, estimated_counts : array-like
            Equally long sequences of counts
        sample_ratio : float
            Sample ratio shown in titles and file names
        """
        actual_counts = np.asarray(actual_counts)
        estimated_counts = np.asarray(estimated_counts)

        # np.max() raises on empty input; there is nothing to draw anyway.
        if len(actual_counts) == 0:
            print(f"No results to visualize for {dataset_name}")
            return

        sample_pct = self._sample_percent(sample_ratio)
        vis_dir = f"{self.histograms_dir}/visualizations/contain"

        plt.figure(figsize=(12, 10))
        plt.scatter(actual_counts, estimated_counts, alpha=0.5, s=8)

        # Perfect-estimate reference line.
        max_val = max(np.max(actual_counts), np.max(estimated_counts))
        plt.plot([0, max_val], [0, max_val], 'r--', alpha=0.7)

        plt.xlabel('Actual Count')
        plt.ylabel('Estimated Count')
        plt.title(f'Histogram-based Contain Estimation for {dataset_name} ({sample_pct}% sample)')
        plt.grid(True, alpha=0.3)

        out_path = f"{vis_dir}/{dataset_name}_estimation_sample{sample_pct}.png"
        plt.savefig(out_path, dpi=150)
        plt.close()

        sample_size = min(100, len(actual_counts))
        # Seeded generator so the comparison plot is reproducible, matching
        # the fixed random_state used when sampling the queries.
        rng = np.random.default_rng(42)
        indices = rng.choice(len(actual_counts), sample_size, replace=False)

        plt.figure(figsize=(20, 10))
        plt.scatter(
            range(sample_size),
            actual_counts[indices],
            label='Actual Count',
            s=100, alpha=0.7, marker='o', color='green'
        )
        plt.scatter(
            range(sample_size),
            estimated_counts[indices],
            label='Histogram Estimate',
            s=100, alpha=0.7, marker='x', color='blue'
        )

        plt.xlabel('Query Index')
        plt.ylabel('Object Count')
        plt.title(
            f'Histogram Estimation vs. Actual Count for {dataset_name} - '
            f'Sample of {sample_size} Queries ({sample_pct}% dataset)'
        )
        plt.legend()
        plt.grid(True, alpha=0.3)

        out_path_comp = f"{vis_dir}/{dataset_name}_comparison_sample{sample_pct}.png"
        plt.savefig(out_path_comp, dpi=150)
        plt.close()

def evaluate_all_datasets(histograms_dir="../../large_files/traditional_methods/histogram", sample_ratio=0.2):
    """
    Evaluate histogram-based contain estimation on all available datasets sequentially.
    Shows progress for each dataset independently.

    Parameters:
    -----------
    histograms_dir : str
        Directory passed to ``HistogramContainEstimator``; the combined
        results CSV is written to ``<histograms_dir>/results/contain``.
    sample_ratio : float
        Fraction of each dataset's recorded queries to evaluate.

    A dataset whose evaluation raises is reported on stdout and skipped, so
    one bad dataset does not abort the whole run. Nothing is returned.
    """
    print("Initializing estimator...")
    sys.stdout.flush()

    estimator = HistogramContainEstimator(histograms_dir)
    dataset_names = list(estimator.histograms.keys())
    total_datasets = len(dataset_names)

    print(f"Found {total_datasets} datasets to evaluate")
    sys.stdout.flush()

    all_results = []

    # Process each dataset with clear separation
    for idx, dataset_name in enumerate(dataset_names, start=1):
        print("\n" + "="*80)
        print(f"DATASET {idx}/{total_datasets}: {dataset_name}")
        print("="*80)
        sys.stdout.flush()

        # Deliberately broad: this is the top-level batch boundary, and a
        # failure in one dataset must not stop the remaining ones.
        try:
            results = estimator.evaluate_on_dataset(dataset_name, sample_ratio=sample_ratio)
        except Exception as e:
            print(f"Error evaluating {dataset_name}: {e}")
            sys.stdout.flush()
        else:
            all_results.append(results)
            print(f"Finished processing {dataset_name} ({idx}/{total_datasets})")
            sys.stdout.flush()

    # Combine and save results
    if all_results:
        all_results_df = pd.DataFrame(all_results)
        # Round away float noise before truncating: 0.29 * 100 evaluates to
        # 28.999999999999996, which a bare int() would label as 28.
        sample_pct = int(round(sample_ratio * 100, 9))
        out_file = f"{histograms_dir}/results/contain/all_datasets_evaluation_sample{sample_pct}.csv"
        all_results_df.to_csv(out_file, index=False)
        print("\nCombined results:")
        print(all_results_df)
    else:
        print("No results were generated")

    sys.stdout.flush()

if __name__ == "__main__":
    # Entry point: evaluate every loaded dataset on a 20% sample of its
    # recorded queries, then print a completion message.
    evaluate_all_datasets(sample_ratio=0.2)
    print("Histogram-based contain estimation evaluation complete!")

Initializing estimator...


Loading histogram for powerthingwaysorted...


Loading histogram for zcta5...


Loading histogram for emergencythingwaysorted...


Loading histogram for yago2...


Loading histogram for aerowaythingnodesorted...


Loading histogram for barrierthingwaysorted...


Loading histogram for leisurewaysorted...


Loading histogram for arealm...


Loading histogram for aerowaythingwaysorted...


Loading histogram for cyclewaythingwaysorted...


Loading histogram for powerthingnodesorted...


Loading histogram for historicthingwaysorted...


Loading histogram for areawater...


Loading histogram for craftwaysorted...


Loaded 14 histograms


Found 14 datasets to evaluate



DATASET 1/14: powerthingwaysorted


Loading query results from ../../large_files/resultsContains/powerthingwaysorted_results.csv


Using 20.0% sample: 543457 out of 2717289 queries


Processing powerthingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for powerthingwaysorted (20.0% sample):
  MAE: 4986.57
  MAPE: 38.66%
  Q-Score: 1.64
  Avg. Estimation Time: 0.1229 ms


Finished processing powerthingwaysorted (1/14)



DATASET 2/14: zcta5


Loading query results from ../../large_files/resultsContains/zcta5_results.csv


Using 20.0% sample: 1325 out of 6626 queries


Processing zcta5 queries: 

0.1%... 

10.0%... 

20.0%... 

30.0%... 

39.9%... 

49.9%... 

59.8%... 

69.8%... 

79.8%... 

89.7%... 

99.7%... 

100.0%... 

Done!


Evaluation results for zcta5 (20.0% sample):
  MAE: 113.55
  MAPE: 78.84%
  Q-Score: 6.90
  Avg. Estimation Time: 0.0270 ms


Finished processing zcta5 (2/14)



DATASET 3/14: emergencythingwaysorted


Loading query results from ../../large_files/resultsContains/emergencythingwaysorted_results.csv


Using 20.0% sample: 32302 out of 161514 queries


Processing emergencythingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for emergencythingwaysorted (20.0% sample):
  MAE: 473.94
  MAPE: 38.97%
  Q-Score: 1.68
  Avg. Estimation Time: 0.0535 ms


Finished processing emergencythingwaysorted (3/14)



DATASET 4/14: yago2


Loading query results from ../../large_files/resultsContains/yago2_results.csv


Using 20.0% sample: 179788 out of 898942 queries


Processing yago2 queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for yago2 (20.0% sample):
  MAE: 38414.61
  MAPE: 33984.49%
  Q-Score: 375.32
  Avg. Estimation Time: 0.1216 ms


Finished processing yago2 (4/14)



DATASET 5/14: aerowaythingnodesorted


Loading query results from ../../large_files/resultsContains/aerowaythingnodesorted_results.csv


Using 20.0% sample: 3168 out of 15843 queries


Processing aerowaythingnodesorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

39.9%... 

49.9%... 

59.9%... 

69.9%... 

79.8%... 

89.8%... 

99.8%... 

100.0%... 

Done!


Evaluation results for aerowaythingnodesorted (20.0% sample):
  MAE: 168.50
  MAPE: 172.07%
  Q-Score: 3.67
  Avg. Estimation Time: 0.0292 ms


Finished processing aerowaythingnodesorted (5/14)



DATASET 6/14: barrierthingwaysorted


Loading query results from ../../large_files/resultsContains/barrierthingwaysorted_results.csv


Using 20.0% sample: 916334 out of 4581670 queries


Processing barrierthingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for barrierthingwaysorted (20.0% sample):
  MAE: 8495.15
  MAPE: 67.26%
  Q-Score: 2.03
  Avg. Estimation Time: 0.1182 ms


Finished processing barrierthingwaysorted (6/14)



DATASET 7/14: leisurewaysorted


Loading query results from ../../large_files/resultsContains/leisurewaysorted_results.csv


Using 20.0% sample: 1000000 out of 5000000 queries


Processing leisurewaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

Done!


Evaluation results for leisurewaysorted (20.0% sample):
  MAE: 10653.40
  MAPE: 57.63%
  Q-Score: 1.87
  Avg. Estimation Time: 0.1181 ms


Finished processing leisurewaysorted (7/14)



DATASET 8/14: arealm


Loading query results from ../../large_files/resultsContains/arealm_results.csv


Using 20.0% sample: 5166 out of 25833 queries


Processing arealm queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

59.9%... 

69.9%... 

79.9%... 

89.9%... 

99.9%... 

100.0%... 

Done!


Evaluation results for arealm (20.0% sample):
  MAE: 217.68
  MAPE: 29.74%
  Q-Score: 2.89
  Avg. Estimation Time: 0.0297 ms


Finished processing arealm (8/14)



DATASET 9/14: aerowaythingwaysorted


Loading query results from ../../large_files/resultsContains/aerowaythingwaysorted_results.csv


Using 20.0% sample: 73673 out of 368365 queries


Processing aerowaythingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for aerowaythingwaysorted (20.0% sample):
  MAE: 801.23
  MAPE: 57.30%
  Q-Score: 1.86
  Avg. Estimation Time: 0.0916 ms


Finished processing aerowaythingwaysorted (9/14)



DATASET 10/14: cyclewaythingwaysorted


Loading query results from ../../large_files/resultsContains/cyclewaythingwaysorted_results.csv


Using 20.0% sample: 213412 out of 1067063 queries


Processing cyclewaythingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for cyclewaythingwaysorted (20.0% sample):
  MAE: 1841.80
  MAPE: 49.19%
  Q-Score: 1.96
  Avg. Estimation Time: 0.1267 ms


Finished processing cyclewaythingwaysorted (10/14)



DATASET 11/14: powerthingnodesorted


Loading query results from ../../large_files/resultsContains/powerthingnodesorted_results.csv


Using 20.0% sample: 420502 out of 2102514 queries


Processing powerthingnodesorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for powerthingnodesorted (20.0% sample):
  MAE: 3757.83
  MAPE: 74.79%
  Q-Score: 2.31
  Avg. Estimation Time: 0.1267 ms


Finished processing powerthingnodesorted (11/14)



DATASET 12/14: historicthingwaysorted


Loading query results from ../../large_files/resultsContains/historicthingwaysorted_results.csv


Using 20.0% sample: 71687 out of 358439 queries


Processing historicthingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for historicthingwaysorted (20.0% sample):
  MAE: 776.63
  MAPE: 24.57%
  Q-Score: 1.43
  Avg. Estimation Time: 0.0889 ms


Finished processing historicthingwaysorted (12/14)



DATASET 13/14: areawater


Loading query results from ../../large_files/resultsContains/areawater_results.csv


Using 20.0% sample: 91710 out of 458552 queries


Processing areawater queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

Done!


Evaluation results for areawater (20.0% sample):
  MAE: 881.03
  MAPE: 17.31%
  Q-Score: 2.12
  Avg. Estimation Time: 0.1050 ms


Finished processing areawater (13/14)



DATASET 14/14: craftwaysorted


Loading query results from ../../large_files/resultsContains/craftwaysorted_results.csv


Using 20.0% sample: 4364 out of 21822 queries


Processing craftwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

79.9%... 

89.9%... 

99.9%... 

100.0%... 

Done!


Evaluation results for craftwaysorted (20.0% sample):
  MAE: 203.85
  MAPE: 50.11%
  Q-Score: 2.08
  Avg. Estimation Time: 0.0293 ms


Finished processing craftwaysorted (14/14)



Combined results:
                    Dataset           MAE        MAPE     Q_Score  \
0       powerthingwaysorted   4986.565035    0.386573    1.642497   
1                     zcta5    113.548398    0.788386    6.898302   
2   emergencythingwaysorted    473.940183    0.389682    1.681774   
3                     yago2  38414.613394  339.844926  375.323149   
4    aerowaythingnodesorted    168.504470    1.720709    3.673449   
5     barrierthingwaysorted   8495.148895    0.672632    2.026001   
6          leisurewaysorted  10653.397787    0.576341    1.874430   
7                    arealm    217.679225    0.297392    2.894201   
8     aerowaythingwaysorted    801.225889    0.573029    1.860975   
9    cyclewaythingwaysorted   1841.802329    0.491892    1.962305   
10     powerthingnodesorted   3757.826079    0.747868    2.309405   
11   historicthingwaysorted    776.630745    0.245668    1.429788   
12                areawater    881.027606    0.173086    2.122390   
13           cr

Histogram-based contain estimation evaluation complete!
