In [1]:
import ast
import configparser
import json
import os
import pickle
import re
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm

class RTreeEstimator:
    """
    Spatial selectivity estimator using pre-built R-tree models.

    For every dataset listed in the spatial statistics CSV the estimator
    loads a pickled list of "level nodes".  Each node is a dict that provides
    at least the keys ``'mbr'`` (indexable as xmin, ymin, xmax, ymax),
    ``'objects'`` (object count under the node) and ``'area'``.  A query
    rectangle is answered by summing, over all intersecting nodes, the node's
    object count weighted by the fraction of the node covered by the query.
    """

    # Universe returned when a bounding-box string cannot be parsed.
    _DEFAULT_BBOX = (-180, -90, 180, 90)

    # A decimal number with optional sign, fraction and exponent.
    _NUMBER = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

    # Matches 'BOX(xmin ymin,xmax ymax)', tolerating extra whitespace.
    _BBOX_RE = re.compile(
        r"BOX\(\s*({n})\s+({n})\s*,\s*({n})\s+({n})\s*\)".format(n=_NUMBER)
    )

    def __init__(self, data_dir="../../large_files",
                 stats_file="../../spatial_statistics.csv"):
        """
        Initialize the R-tree based estimator using pre-built models

        Parameters:
        -----------
        data_dir : str
            Directory containing R-tree models
        stats_file : str
            CSV file with one row per dataset (table name, object count and
            universe bounding box)
        """
        self.data_dir = data_dir
        self.stats_file = stats_file
        self.rtree_dir = f"{data_dir}/traditional_methods/rtree/models"
        self.results_dir = f"{data_dir}/traditional_methods/rtree/results"
        self.viz_dir = f"{data_dir}/traditional_methods/rtree/visualizations"

        # Create output directories up front so later writes cannot fail on
        # a missing folder.
        os.makedirs(self.results_dir, exist_ok=True)
        os.makedirs(self.viz_dir, exist_ok=True)

        self.universe_boundaries = {}
        self.dataset_sizes = {}
        self.level_nodes = {}
        self.model_metadata = {}
        self.cache = {}  # (dataset, rectangle) -> estimated count

        # Dataset metadata must be loaded first: the model loader only looks
        # for models of datasets known from the statistics file.
        self.load_spatial_statistics()
        self.load_available_models()

    def load_spatial_statistics(self):
        """
        Load dataset information from the spatial statistics CSV.

        Fills ``universe_boundaries`` and ``dataset_sizes``.  Loading is
        best-effort: a missing/unreadable file or a malformed row is reported
        on stdout and does not raise.
        """
        try:
            stats_df = pd.read_csv(self.stats_file)
            rows = zip(
                stats_df['Table Name'],
                stats_df['Total Spatial Objects'],
                stats_df['Universe Limits (Bounding Box)'],
            )
            for table_name, total_objects, bbox_str in rows:
                try:
                    size = int(total_objects)
                except (TypeError, ValueError) as e:
                    # Skip only the broken row instead of aborting the load.
                    print(f"Skipping {table_name}: invalid object count ({e})")
                    continue
                self.universe_boundaries[table_name] = self.parse_bbox(bbox_str)
                self.dataset_sizes[table_name] = size

            print(f"Loaded metadata for {len(self.universe_boundaries)} datasets",
                  flush=True)
        except (OSError, KeyError, ValueError) as e:
            print(f"Error loading spatial statistics: {e}", flush=True)

    def parse_bbox(self, bbox_str):
        """
        Parse a 'BOX(xmin ymin,xmax ymax)' string into coordinates.

        Returns:
        --------
        tuple
            (xmin, ymin, xmax, ymax) as floats, or (-180, -90, 180, 90) when
            the value is not a string or does not contain a parsable box
        """
        if not isinstance(bbox_str, str):
            # e.g. NaN produced by pandas for an empty CSV cell
            return self._DEFAULT_BBOX
        match = self._BBOX_RE.search(bbox_str)
        if match is None:
            return self._DEFAULT_BBOX
        return tuple(float(value) for value in match.groups())

    def load_available_models(self):
        """
        Load all available pre-built R-tree models.

        Reads the optional combined metadata CSV and then, for every dataset
        known from the statistics file, the pickled level nodes (plus the
        per-dataset JSON metadata when the CSV had no entry for it).
        """
        if not os.path.exists(self.rtree_dir):
            print(f"Model directory not found: {self.rtree_dir}")
            return

        # Combined metadata file takes precedence over per-dataset JSON files.
        metadata_file = f"{self.rtree_dir}/all_rtree_metadata.csv"
        if os.path.exists(metadata_file):
            metadata_df = pd.read_csv(metadata_file)
            for record in metadata_df.to_dict(orient='records'):
                self.model_metadata[record['dataset']] = record

        loaded_count = 0
        for dataset_name in self.universe_boundaries:
            level_nodes_path = f"{self.rtree_dir}/{dataset_name}_level_nodes.pkl"
            metadata_path = f"{self.rtree_dir}/{dataset_name}_metadata.json"

            if not os.path.exists(level_nodes_path):
                continue

            try:
                # SECURITY NOTE: pickle executes arbitrary code while loading;
                # only point rtree_dir at model files you produced yourself.
                with open(level_nodes_path, 'rb') as f:
                    self.level_nodes[dataset_name] = pickle.load(f)

                if dataset_name not in self.model_metadata and os.path.exists(metadata_path):
                    self.model_metadata[dataset_name] = pd.read_json(
                        metadata_path, typ='series'
                    ).to_dict()

                loaded_count += 1
            except Exception as e:
                # Best-effort: one corrupt model must not block the others.
                print(f"Error loading model for {dataset_name}: {e}")

        print(f"Loaded {loaded_count} pre-built R-tree models", flush=True)

    def parse_mbr(self, mbr_str):
        """
        Parse an MBR string such as '(x1, y1, x2, y2)' into a list of floats.

        Surrounding quotes, parentheses and brackets are ignored and the
        separator may be ',' with or without following spaces.  Non-string
        input is assumed to be already parsed and is returned unchanged.
        """
        if not isinstance(mbr_str, str):
            return mbr_str
        body = mbr_str.strip().strip('"\'()[]')
        # float() ignores surrounding whitespace, so a plain ',' split is
        # enough for both '1, 2' and '1,2'.
        return [float(coord) for coord in body.split(',')]

    @staticmethod
    def _axis_ratio(overlap, extent):
        """Fraction of a node's extent covered along one axis (1.0 if the node is flat)."""
        return overlap / extent if extent > 0 else 1.0

    def estimate_intersection_count(self, dataset_name, query_mbr):
        """
        Estimate the number of objects that intersect with a query rectangle
        using the level-before-leaves approach

        Parameters:
        -----------
        dataset_name : str
            Name of the dataset to query against
        query_mbr : list or str
            Query rectangle as [xmin, ymin, xmax, ymax] or '(xmin, ymin, xmax, ymax)'

        Returns:
        --------
        int
            Estimated number of objects intersecting the query (rounded,
            never negative).  0 is returned for empty or inverted query
            rectangles and for datasets without a loaded model.
        """
        if isinstance(query_mbr, str):
            query_mbr = self.parse_mbr(query_mbr)
        q_xmin, q_ymin, q_xmax, q_ymax = query_mbr

        # Key on the parsed coordinates so the string, list and tuple forms
        # of the same rectangle share one cache entry.
        cache_key = (dataset_name, (q_xmin, q_ymin, q_xmax, q_ymax))
        if cache_key in self.cache:
            return self.cache[cache_key]

        # Reject zero-area rectangles.  Testing each axis separately also
        # rejects rectangles inverted on both axes, whose width * height
        # would otherwise look like a valid positive area.
        if q_xmax <= q_xmin or q_ymax <= q_ymin:
            return 0

        if dataset_name not in self.level_nodes:
            print(f"No R-tree model found for {dataset_name}")
            return 0

        nodes = self.level_nodes[dataset_name]
        total_objects = self.dataset_sizes[dataset_name]

        total_node_objects = sum(node['objects'] for node in nodes)
        if total_node_objects <= 0:
            return 0  # Avoid division by zero

        weighted_intersections = 0
        for node in nodes:
            node_mbr = node['mbr']

            # Closed-interval intersection test: touching edges count.
            if not (node_mbr[0] <= q_xmax and node_mbr[2] >= q_xmin and
                    node_mbr[1] <= q_ymax and node_mbr[3] >= q_ymin):
                continue

            overlap_w = min(q_xmax, node_mbr[2]) - max(q_xmin, node_mbr[0])
            overlap_h = min(q_ymax, node_mbr[3]) - max(q_ymin, node_mbr[1])

            node_area = node['area']
            if node_area > 0:
                coverage_ratio = (overlap_w * overlap_h) / node_area
            else:
                # Degenerate node (all objects on a point or an axis-parallel
                # line): fall back to per-axis coverage so its objects are
                # not silently dropped.
                coverage_ratio = (
                    self._axis_ratio(overlap_w, node_mbr[2] - node_mbr[0])
                    * self._axis_ratio(overlap_h, node_mbr[3] - node_mbr[1])
                )

            weighted_intersections += node['objects'] * coverage_ratio

        # Rescale because the node counts need not add up to the dataset size.
        estimated_count = weighted_intersections * (total_objects / total_node_objects)
        estimated_count = max(0, round(estimated_count))

        self.cache[cache_key] = estimated_count
        return estimated_count

    @staticmethod
    def _sample_tag(sample_ratio):
        """Return the sample ratio as a whole percentage (0.29 -> 29, not 28)."""
        return int(round(sample_ratio * 100))

    @staticmethod
    def _mape(actual_counts, estimated_counts):
        """
        Mean absolute percentage error with special handling of zero actuals.

        Queries whose actual count is 0 contribute |estimate| / 100 instead
        of a division by zero.
        """
        actual_counts = np.asarray(actual_counts)
        estimated_counts = np.asarray(estimated_counts)
        count = len(actual_counts)
        if count == 0:
            return float('nan')

        non_zero = actual_counts != 0
        zero = ~non_zero
        total = 0
        if np.any(non_zero):
            total += np.sum(np.abs(
                (actual_counts[non_zero] - estimated_counts[non_zero]) / actual_counts[non_zero]
            ))
        if np.any(zero):
            total += np.sum(np.abs(actual_counts[zero] - estimated_counts[zero]) / 100)
        return total / count

    @staticmethod
    def _q_score(actual_counts, estimated_counts):
        """
        Mean of max(est/actual, actual/est) over queries where both are non-zero.

        Returns infinity when no query has both values non-zero.
        """
        actual_counts = np.asarray(actual_counts)
        estimated_counts = np.asarray(estimated_counts)
        valid = (actual_counts != 0) & (estimated_counts != 0)
        if not np.any(valid):
            return float('inf')
        ratios = np.maximum(
            estimated_counts[valid] / actual_counts[valid],
            actual_counts[valid] / estimated_counts[valid],
        )
        return np.mean(ratios)

    @staticmethod
    def _parse_rtree_params(raw):
        """
        Turn the stored 'rtree_params' metadata value into a dict.

        Accepts a dict, a JSON object string or a Python dict literal string.
        Anything else (including NaN from an empty CSV cell) yields {}.
        The text is never evaluated as code.
        """
        if isinstance(raw, dict):
            return raw
        if not isinstance(raw, str):
            return {}
        for parser in (json.loads, ast.literal_eval):
            try:
                parsed = parser(raw)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(parsed, dict):
                return parsed
        return {}

    def _run_queries(self, dataset_name, queries):
        """
        Estimate every query in ``queries`` and time each estimation.

        Returns three numpy arrays: actual counts, estimated counts and
        per-query estimation times in milliseconds.
        """
        total_queries = len(queries)
        progress_step = max(1, total_queries // 10)

        actual_counts = []
        estimated_counts = []
        estimation_times = []

        print(f"Processing {dataset_name} queries: ", end="", flush=True)

        # Plain column lists are much cheaper to walk than iterrows().
        pairs = zip(queries['Query MBR'].tolist(), queries['Count MBR'].tolist())
        last_label = None
        for i, (raw_mbr, actual_count) in enumerate(pairs):
            # Show progress roughly every 10%
            if i % progress_step == 0 or i == total_queries - 1:
                label = f"{(i + 1) / total_queries * 100:.1f}%... "
                # The last step and the final row can round to the same
                # label; print it only once.
                if label != last_label:
                    print(label, end="", flush=True)
                    last_label = label

            query_mbr = self.parse_mbr(raw_mbr)

            # perf_counter is monotonic and high-resolution, which matters
            # for calls that take well under a millisecond.
            start_time = time.perf_counter()
            estimated_count = self.estimate_intersection_count(dataset_name, query_mbr)
            elapsed_ms = (time.perf_counter() - start_time) * 1000

            actual_counts.append(actual_count)
            estimated_counts.append(estimated_count)
            estimation_times.append(elapsed_ms)

        print("Done!", flush=True)
        return (np.array(actual_counts), np.array(estimated_counts),
                np.array(estimation_times))

    def evaluate_on_dataset(self, dataset_name, results_file=None, sample_ratio=0.2):
        """
        Evaluate the R-tree based estimation method on a dataset

        Parameters:
        -----------
        dataset_name : str
            Name of the dataset to evaluate
        results_file : str
            Path to the file containing actual query results.  Defaults to
            '<data_dir>/resultsIntersects/<dataset_name>_results.csv'.
        sample_ratio : float
            Fraction of dataset to use (0.2 = 20%)

        Returns:
        --------
        dict
            Evaluation results including MAE, MAPE, q-score, and model metadata

        Raises:
        -------
        ValueError
            If the results file does not exist or is empty, or no model is
            loaded for the dataset
        """
        if not results_file:
            # Derive from data_dir so a custom data directory is honoured.
            results_file = f"{self.data_dir}/resultsIntersects/{dataset_name}_results.csv"

        if not os.path.exists(results_file):
            raise ValueError(f"Results file not found: {results_file}")

        if dataset_name not in self.level_nodes:
            raise ValueError(f"No R-tree model found for {dataset_name}")

        print(f"Loading query results from {results_file}", flush=True)

        try:
            results_df = pd.read_csv(results_file)
            if results_df.empty:
                raise ValueError(f"Results file contains no queries: {results_file}")

            # At least one query, at most all of them.
            sample_size = min(len(results_df),
                              max(1, int(len(results_df) * sample_ratio)))
            print(f"Using {sample_ratio * 100:.1f}% sample: {sample_size} "
                  f"out of {len(results_df)} queries", flush=True)
            # Fixed seed keeps the evaluated query set reproducible.
            sampled_results = results_df.sample(n=sample_size, random_state=42)

            actual_counts, estimated_counts, estimation_times = self._run_queries(
                dataset_name, sampled_results
            )

            mae = mean_absolute_error(actual_counts, estimated_counts)
            mape = self._mape(actual_counts, estimated_counts)
            q_score = self._q_score(actual_counts, estimated_counts)
            avg_time_ms = np.mean(estimation_times)

            # Model metadata is optional; fall back to zeros / no parameters.
            meta = self.model_metadata.get(dataset_name, {})
            level_nodes_size_bytes = meta.get('level_nodes_size_bytes', 0)
            total_size_bytes = meta.get('total_size_bytes', 0)
            rtree_params = self._parse_rtree_params(meta.get('rtree_params', {}))
            num_level_nodes = len(self.level_nodes.get(dataset_name, []))

            results = {
                'Dataset': dataset_name,
                'Method': 'RTree-Level',
                'MAE': mae,
                'MAPE': mape,
                'Q_Score': q_score,
                'Avg_Time_ms': avg_time_ms,
                'Num_Queries': len(sampled_results),
                'Sample_Ratio': sample_ratio,
                'Model_Size_MB': total_size_bytes / (1024*1024),
                'Level_Nodes_Size_MB': level_nodes_size_bytes / (1024*1024),
                'Num_Level_Nodes': num_level_nodes
            }

            # Add R-tree parameters to results
            for key, value in rtree_params.items():
                results[f'rtree_{key}'] = value

            sample_tag = self._sample_tag(sample_ratio)
            results_file_out = f"{self.results_dir}/{dataset_name}_evaluation_sample{sample_tag}.csv"
            pd.DataFrame([results]).to_csv(results_file_out, index=False)

            self.visualize_results(dataset_name, actual_counts, estimated_counts, sample_ratio)

            print(f"Evaluation results for {dataset_name} ({sample_ratio * 100:.1f}% sample):")
            print(f"  MAE: {mae:.2f}")
            print(f"  MAPE: {mape:.2%}")
            print(f"  Q-Score: {q_score:.2f}")
            print(f"  Avg. Estimation Time: {avg_time_ms:.4f} ms")
            print(f"  Model Size: {results['Model_Size_MB']:.2f} MB")
            print(f"  Num Level Nodes: {num_level_nodes}", flush=True)

            return results

        except Exception as e:
            # Report in context, then let the caller decide how to proceed.
            print(f"Error evaluating {dataset_name}: {str(e)}", flush=True)
            raise

    def visualize_results(self, dataset_name, actual_counts, estimated_counts, sample_ratio=0.2):
        """
        Create visualizations of actual vs. estimated counts.

        Writes two PNG files into ``viz_dir``: a scatter plot of all queries
        against the ideal diagonal, and a side-by-side comparison for up to
        100 queries picked with a fixed seed.
        """
        actual_counts = np.asarray(actual_counts)
        estimated_counts = np.asarray(estimated_counts)
        if actual_counts.size == 0:
            print(f"No data to visualize for {dataset_name}")
            return

        os.makedirs(self.viz_dir, exist_ok=True)
        sample_tag = self._sample_tag(sample_ratio)

        fig = plt.figure(figsize=(12, 10))
        try:
            plt.scatter(actual_counts, estimated_counts, alpha=0.5, s=8)

            # Diagonal = perfect estimate.
            max_val = max(np.max(actual_counts), np.max(estimated_counts))
            plt.plot([0, max_val], [0, max_val], 'r--', alpha=0.7)

            plt.xlabel('Actual Count')
            plt.ylabel('Estimated Count')
            plt.title(f'R-tree Estimation for {dataset_name} ({sample_tag}% sample)')
            plt.grid(True, alpha=0.3)

            plt.savefig(
                f"{self.viz_dir}/{dataset_name}_estimation_sample{sample_tag}.png",
                dpi=150
            )
        finally:
            # Always release the figure, even if saving fails.
            plt.close(fig)

        # Compare a subset of queries; seeded so reruns give the same plot.
        sample_size = min(100, len(actual_counts))
        rng = np.random.default_rng(42)
        indices = rng.choice(len(actual_counts), size=sample_size, replace=False)

        fig = plt.figure(figsize=(20, 10))
        try:
            plt.scatter(
                range(sample_size),
                actual_counts[indices],
                label='Actual Count',
                s=100, alpha=0.7, marker='o', color='green'
            )
            plt.scatter(
                range(sample_size),
                estimated_counts[indices],
                label='R-tree Estimate',
                s=100, alpha=0.7, marker='x', color='blue'
            )

            plt.xlabel('Query Index')
            plt.ylabel('Object Count')
            plt.title(
                f'R-tree vs. Actual Count for {dataset_name} - '
                f'Sample of {sample_size} Queries ({sample_tag}% dataset)'
            )
            plt.legend()
            plt.grid(True, alpha=0.3)

            plt.savefig(
                f"{self.viz_dir}/{dataset_name}_comparison_sample{sample_tag}.png",
                dpi=150
            )
        finally:
            plt.close(fig)

def evaluate_all_datasets(sample_ratio=0.2):
    """
    Evaluate R-tree estimation on all available pre-built models.

    Builds one RTreeEstimator, evaluates every dataset that has a loaded
    model and writes the combined metrics to
    '<results_dir>/all_datasets_evaluation_sample<percent>.csv'.  A failure
    on one dataset is reported and does not stop the remaining ones.

    Parameters:
    -----------
    sample_ratio : float
        Fraction of each dataset's queries to evaluate (0.2 = 20%)
    """
    print("Initializing R-tree estimator...", flush=True)

    estimator = RTreeEstimator()
    available_datasets = list(estimator.level_nodes)

    if not available_datasets:
        print("No pre-built R-tree models found. Please run RTree-builder first.")
        return

    dataset_total = len(available_datasets)
    print(f"Found {dataset_total} pre-built R-tree models to evaluate", flush=True)

    all_results = []
    separator = "=" * 80

    for idx, dataset_name in enumerate(available_datasets, start=1):
        print("\n" + separator)
        print(f"DATASET {idx}/{dataset_total}: {dataset_name}")
        print(separator, flush=True)

        try:
            results = estimator.evaluate_on_dataset(dataset_name, sample_ratio=sample_ratio)
        except Exception as e:
            # Batch boundary: keep going so one bad dataset does not cost
            # the results of all the others.
            print(f"Error evaluating {dataset_name}: {e}")
            print("Moving to next dataset", flush=True)
            continue

        all_results.append(results)
        print(f"Finished processing {dataset_name} ({idx}/{dataset_total})", flush=True)

    if not all_results:
        print("No results were generated")
        return

    out_dir = estimator.results_dir
    os.makedirs(out_dir, exist_ok=True)
    all_results_df = pd.DataFrame(all_results)
    # round() instead of int(): 0.29 * 100 is 28.999..., which int() would
    # truncate to a misleading '28' in the file name.
    sample_tag = int(round(sample_ratio * 100))
    all_results_df.to_csv(
        f"{out_dir}/all_datasets_evaluation_sample{sample_tag}.csv",
        index=False
    )
    print("\nCombined results:")
    print(all_results_df[['Dataset', 'MAE', 'MAPE', 'Q_Score', 'Model_Size_MB', 'Num_Level_Nodes']])

# Script entry point: evaluate every available model on a 20% sample of its
# recorded queries, then print a completion message.
if __name__ == "__main__":
    evaluate_all_datasets(sample_ratio=0.2)
    print("R-tree estimation evaluation complete!")

Initializing R-tree estimator...


Loaded metadata for 14 datasets


Loaded 14 pre-built R-tree models


Found 14 pre-built R-tree models to evaluate



DATASET 1/14: yago2


Loading query results from ../../large_files/resultsIntersects/yago2_results.csv


Using 20.0% sample: 179788 out of 898942 queries


Processing yago2 queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for yago2 (20.0% sample):
  MAE: 19192.01
  MAPE: 73.86%
  Q-Score: 114.77
  Avg. Estimation Time: 0.0476 ms
  Model Size: 294.98 MB
  Num Level Nodes: 305


Finished processing yago2 (1/14)



DATASET 2/14: craftwaysorted


Loading query results from ../../large_files/resultsIntersects/craftwaysorted_results.csv


Using 20.0% sample: 4364 out of 21822 queries


Processing craftwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

79.9%... 

89.9%... 

99.9%... 

100.0%... 

Done!


Evaluation results for craftwaysorted (20.0% sample):
  MAE: 1244.86
  MAPE: 486.88%
  Q-Score: 11.69
  Avg. Estimation Time: 0.0094 ms
  Model Size: 6.90 MB
  Num Level Nodes: 9


Finished processing craftwaysorted (2/14)



DATASET 3/14: zcta5


Loading query results from ../../large_files/resultsIntersects/zcta5_results.csv


Using 20.0% sample: 1325 out of 6626 queries


Processing zcta5 queries: 

0.1%... 

10.0%... 

20.0%... 

30.0%... 

39.9%... 

49.9%... 

59.8%... 

69.8%... 

79.8%... 

89.7%... 

99.7%... 

100.0%... 

Done!


Evaluation results for zcta5 (20.0% sample):
  MAE: 445.12
  MAPE: 235.70%
  Q-Score: 13.49
  Avg. Estimation Time: 0.0082 ms
  Model Size: 2.20 MB
  Num Level Nodes: 3


Finished processing zcta5 (3/14)



DATASET 4/14: areawater


Loading query results from ../../large_files/resultsIntersects/areawater_results.csv


Using 20.0% sample: 91710 out of 458552 queries


Processing areawater queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

Done!


Evaluation results for areawater (20.0% sample):
  MAE: 5168.13
  MAPE: 1733.05%
  Q-Score: 98.04
  Avg. Estimation Time: 0.0106 ms
  Model Size: 146.79 MB
  Num Level Nodes: 26


Finished processing areawater (4/14)



DATASET 5/14: aerowaythingnodesorted


Loading query results from ../../large_files/resultsIntersects/aerowaythingnodesorted_results.csv


Using 20.0% sample: 3168 out of 15843 queries


Processing aerowaythingnodesorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

39.9%... 

49.9%... 

59.9%... 

69.9%... 

79.8%... 

89.8%... 

99.8%... 

100.0%... 

Done!


Evaluation results for aerowaythingnodesorted (20.0% sample):
  MAE: 722.70
  MAPE: 1377.19%
  Q-Score: 22.84
  Avg. Estimation Time: 0.0087 ms
  Model Size: 5.09 MB
  Num Level Nodes: 4


Finished processing aerowaythingnodesorted (5/14)



DATASET 6/14: emergencythingwaysorted


Loading query results from ../../large_files/resultsIntersects/emergencythingwaysorted_results.csv


Using 20.0% sample: 32302 out of 161514 queries


Processing emergencythingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for emergencythingwaysorted (20.0% sample):
  MAE: 4587.46
  MAPE: 1077.19%
  Q-Score: 19.12
  Avg. Estimation Time: 0.0166 ms
  Model Size: 51.57 MB
  Num Level Nodes: 58


Finished processing emergencythingwaysorted (6/14)



DATASET 7/14: historicthingwaysorted


Loading query results from ../../large_files/resultsIntersects/historicthingwaysorted_results.csv


Using 20.0% sample: 71687 out of 358439 queries


Processing historicthingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for historicthingwaysorted (20.0% sample):
  MAE: 5735.42
  MAPE: 352.10%
  Q-Score: 6.63
  Avg. Estimation Time: 0.0216 ms
  Model Size: 114.03 MB
  Num Level Nodes: 94


Finished processing historicthingwaysorted (7/14)



DATASET 8/14: aerowaythingwaysorted


Loading query results from ../../large_files/resultsIntersects/aerowaythingwaysorted_results.csv


Using 20.0% sample: 73673 out of 368365 queries


Processing aerowaythingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for aerowaythingwaysorted (20.0% sample):
  MAE: 3645.65
  MAPE: 485.98%
  Q-Score: 7.54
  Avg. Estimation Time: 0.0232 ms
  Model Size: 116.49 MB
  Num Level Nodes: 113


Finished processing aerowaythingwaysorted (8/14)



DATASET 9/14: cyclewaythingwaysorted


Loading query results from ../../large_files/resultsIntersects/cyclewaythingwaysorted_results.csv


Using 20.0% sample: 213412 out of 1067063 queries


Processing cyclewaythingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for cyclewaythingwaysorted (20.0% sample):
  MAE: 9123.38
  MAPE: 1060.91%
  Q-Score: 18.91
  Avg. Estimation Time: 0.0301 ms
  Model Size: 336.85 MB
  Num Level Nodes: 170


Finished processing cyclewaythingwaysorted (9/14)



DATASET 10/14: powerthingwaysorted


Loading query results from ../../large_files/resultsIntersects/powerthingwaysorted_results.csv


Using 20.0% sample: 543457 out of 2717289 queries


Processing powerthingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for powerthingwaysorted (20.0% sample):
  MAE: 8848.55
  MAPE: 843.06%
  Q-Score: 14.01
  Avg. Estimation Time: 0.0580 ms
  Model Size: 874.03 MB
  Num Level Nodes: 403


Finished processing powerthingwaysorted (10/14)



DATASET 11/14: leisurewaysorted


Loading query results from ../../large_files/resultsIntersects/leisurewaysorted_results.csv


Using 20.0% sample: 1000000 out of 5000000 queries


Processing leisurewaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

Done!


Evaluation results for leisurewaysorted (20.0% sample):
  MAE: 16165.65
  MAPE: 470.02%
  Q-Score: 7.75
  Avg. Estimation Time: 0.0928 ms
  Model Size: 1866.94 MB
  Num Level Nodes: 687


Finished processing leisurewaysorted (11/14)



DATASET 12/14: barrierthingwaysorted


Loading query results from ../../large_files/resultsIntersects/barrierthingwaysorted_results.csv


Using 20.0% sample: 916334 out of 4581670 queries


Processing barrierthingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for barrierthingwaysorted (20.0% sample):
  MAE: 16561.21
  MAPE: 376.86%
  Q-Score: 6.51
  Avg. Estimation Time: 0.0794 ms
  Model Size: 1456.06 MB
  Num Level Nodes: 569


Finished processing barrierthingwaysorted (12/14)



DATASET 13/14: powerthingnodesorted


Loading query results from ../../large_files/resultsIntersects/powerthingnodesorted_results.csv


Using 20.0% sample: 420502 out of 2102514 queries


Processing powerthingnodesorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for powerthingnodesorted (20.0% sample):
  MAE: 7986.18
  MAPE: 3149.50%
  Q-Score: 58.77
  Avg. Estimation Time: 0.0499 ms
  Model Size: 686.43 MB
  Num Level Nodes: 332


Finished processing powerthingnodesorted (13/14)



DATASET 14/14: arealm


Loading query results from ../../large_files/resultsIntersects/arealm_results.csv


Using 20.0% sample: 5166 out of 25833 queries


Processing arealm queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

59.9%... 

69.9%... 

79.9%... 

89.9%... 

99.9%... 

100.0%... 

Done!


Evaluation results for arealm (20.0% sample):
  MAE: 1642.61
  MAPE: 931.34%
  Q-Score: 52.56
  Avg. Estimation Time: 0.0086 ms
  Model Size: 8.34 MB
  Num Level Nodes: 6


Finished processing arealm (14/14)



Combined results:
                    Dataset           MAE       MAPE     Q_Score  \
0                     yago2  19192.006836   0.738640  114.768867   
1            craftwaysorted   1244.864803   4.868776   11.688287   
2                     zcta5    445.122264   2.356957   13.485428   
3                 areawater   5168.130738  17.330511   98.040151   
4    aerowaythingnodesorted    722.697601  13.771877   22.838991   
5   emergencythingwaysorted   4587.459043  10.771897   19.124291   
6    historicthingwaysorted   5735.416938   3.520972    6.631058   
7     aerowaythingwaysorted   3645.647727   4.859794    7.536050   
8    cyclewaythingwaysorted   9123.383666  10.609082   18.911203   
9       powerthingwaysorted   8848.545322   8.430554   14.011152   
10         leisurewaysorted  16165.645074   4.700202    7.753138   
11    barrierthingwaysorted  16561.214694   3.768560    6.506789   
12     powerthingnodesorted   7986.182665  31.495020   58.768564   
13                   arealm  