In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import configparser
import sys
import time
import pickle
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error

class RTreeDistanceEstimator:
    """
    Spatial selectivity estimator using pre-built R-tree models for distance filter operations
    """
    def __init__(self, data_dir="../../large_files"):
        """
        Initialize the R-tree based estimator using pre-built models
        
        Parameters:
        -----------
        data_dir : str
            Directory containing R-tree models
        """
        self.data_dir = data_dir
        self.rtree_dir = f"{data_dir}/traditional_methods/rtree/models"
        self.results_dir = f"{data_dir}/traditional_methods/rtree/results/distance"
        self.viz_dir = f"{data_dir}/traditional_methods/rtree/visualizations/distance"
        
        # Create directories
        os.makedirs(self.results_dir, exist_ok=True)
        os.makedirs(self.viz_dir, exist_ok=True)
        
        self.universe_boundaries = {}
        self.dataset_sizes = {}
        self.level_nodes = {}
        self.model_metadata = {}
        self.cache = {}  # Query result cache
        
        # Load dataset metadata
        self.load_spatial_statistics()
        
        # Load available models
        self.load_available_models()
    
    def load_spatial_statistics(self):
        """Load dataset information from spatial_statistics.csv"""
        try:
            stats_df = pd.read_csv("../../spatial_statistics.csv")
            for _, row in stats_df.iterrows():
                table_name = row['Table Name']
                total_objects = row['Total Spatial Objects']
                bbox_str = row['Universe Limits (Bounding Box)']
                
                # Parse bounding box
                bbox = self.parse_bbox(bbox_str)
                self.universe_boundaries[table_name] = bbox
                self.dataset_sizes[table_name] = int(total_objects)
                
            print(f"Loaded metadata for {len(self.universe_boundaries)} datasets")
            sys.stdout.flush()
        except Exception as e:
            print(f"Error loading spatial statistics: {e}")
            sys.stdout.flush()
    
    def parse_bbox(self, bbox_str):
        """Parse bounding box string into coordinates"""
        pattern = r"BOX\(([-\d\.]+) ([-\d\.]+),([-\d\.]+) ([-\d\.]+)\)"
        match = re.search(pattern, bbox_str)
        if match:
            xmin = float(match.group(1))
            ymin = float(match.group(2))
            xmax = float(match.group(3))
            ymax = float(match.group(4))
            return (xmin, ymin, xmax, ymax)
        return (-180, -90, 180, 90)  # Default if parsing fails
    
    def load_available_models(self):
        """Load all available pre-built R-tree models"""
        if not os.path.exists(self.rtree_dir):
            print(f"Model directory not found: {self.rtree_dir}")
            return
        
        # Check for metadata file first
        metadata_file = f"{self.rtree_dir}/all_rtree_metadata.csv"
        if os.path.exists(metadata_file):
            metadata_df = pd.read_csv(metadata_file)
            for _, row in metadata_df.iterrows():
                dataset_name = row['dataset']
                self.model_metadata[dataset_name] = row.to_dict()
        
        # Load level nodes for each dataset
        loaded_count = 0
        for dataset_name in self.universe_boundaries.keys():
            level_nodes_path = f"{self.rtree_dir}/{dataset_name}_level_nodes.pkl"
            metadata_path = f"{self.rtree_dir}/{dataset_name}_metadata.json"
            
            if os.path.exists(level_nodes_path):
                try:
                    with open(level_nodes_path, 'rb') as f:
                        self.level_nodes[dataset_name] = pickle.load(f)
                    
                    # Load metadata if not already loaded
                    if dataset_name not in self.model_metadata and os.path.exists(metadata_path):
                        self.model_metadata[dataset_name] = pd.read_json(metadata_path, typ='series').to_dict()
                    
                    loaded_count += 1
                except Exception as e:
                    print(f"Error loading model for {dataset_name}: {e}")
        
        print(f"Loaded {loaded_count} pre-built R-tree models")
        sys.stdout.flush()
    
    def parse_mbr(self, mbr_str):
        """Parse MBR string from format like '(x1, y1, x2, y2)'"""
        if isinstance(mbr_str, str):
            coords = mbr_str.strip('"()').split(', ')
            return [float(coord) for coord in coords]
        return mbr_str  # Already parsed
    
    def calculate_distance(self, point1, point2):
        """
        Calculate Euclidean distance between two points
        
        Parameters:
        -----------
        point1, point2 : tuple
            (x, y) coordinates of the two points
            
        Returns:
        --------
        float : Distance between the points
        """
        return np.sqrt((point1[0] - point2[0])**2 + (point1[1] - point2[1])**2)
    
    def calculate_min_distance(self, mbr1, mbr2):
        """
        Calculate minimum distance between two MBRs
        
        Parameters:
        -----------
        mbr1, mbr2 : tuple
            (xmin, ymin, xmax, ymax) coordinates of the two MBRs
            
        Returns:
        --------
        float : Minimum distance between the MBRs
        """
        # Unpack MBR coordinates
        mbr1_xmin, mbr1_ymin, mbr1_xmax, mbr1_ymax = mbr1
        mbr2_xmin, mbr2_ymin, mbr2_xmax, mbr2_ymax = mbr2
        
        # Calculate distance in x direction
        dx = 0
        if mbr1_xmax < mbr2_xmin:
            dx = mbr2_xmin - mbr1_xmax
        elif mbr2_xmax < mbr1_xmin:
            dx = mbr1_xmin - mbr2_xmax
        
        # Calculate distance in y direction
        dy = 0
        if mbr1_ymax < mbr2_ymin:
            dy = mbr2_ymin - mbr1_ymax
        elif mbr2_ymax < mbr1_ymin:
            dy = mbr1_ymin - mbr2_ymax
        
        # Return Euclidean distance
        return np.sqrt(dx**2 + dy**2)
    
    def calculate_max_distance(self, mbr1, mbr2):
        """
        Calculate maximum distance between two MBRs (farthest corners)
        
        Parameters:
        -----------
        mbr1, mbr2 : tuple
            (xmin, ymin, xmax, ymax) coordinates of the two MBRs
            
        Returns:
        --------
        float : Maximum distance between the MBRs
        """
        # Unpack MBR coordinates
        mbr1_xmin, mbr1_ymin, mbr1_xmax, mbr1_ymax = mbr1
        mbr2_xmin, mbr2_ymin, mbr2_xmax, mbr2_ymax = mbr2
        
        # Calculate distance for all corner combinations
        distances = []
        for x1, y1 in [(mbr1_xmin, mbr1_ymin), (mbr1_xmin, mbr1_ymax), 
                        (mbr1_xmax, mbr1_ymin), (mbr1_xmax, mbr1_ymax)]:
            for x2, y2 in [(mbr2_xmin, mbr2_ymin), (mbr2_xmin, mbr2_ymax), 
                           (mbr2_xmax, mbr2_ymin), (mbr2_xmax, mbr2_ymax)]:
                distances.append(np.sqrt((x1 - x2)**2 + (y1 - y2)**2))
        
        return max(distances)
    
    def get_mbr_center(self, mbr):
        """
        Calculate center point of an MBR
        
        Parameters:
        -----------
        mbr : tuple
            (xmin, ymin, xmax, ymax) coordinates
            
        Returns:
        --------
        tuple : (x_center, y_center) coordinates
        """
        xmin, ymin, xmax, ymax = mbr
        return ((xmin + xmax) / 2, (ymin + ymax) / 2)
    
    def estimate_distance_count(self, dataset_name, object_mbr, min_distance, max_distance):
        """
        Estimate the number of objects that are within the specified distance range
        of the object using the level-before-leaves approach
        
        Parameters:
        -----------
        dataset_name : str
            Name of the dataset to query against
        object_mbr : list or str
            Object MBR as [xmin, ymin, xmax, ymax] or '(xmin, ymin, xmax, ymax)'
        min_distance : float
            Minimum distance from the object
        max_distance : float
            Maximum distance from the object
            
        Returns:
        --------
        float
            Estimated number of objects within the distance range
        """
        # Check if result is in cache
        if isinstance(object_mbr, list):
            object_mbr = tuple(object_mbr)
        cache_key = f"{dataset_name}_{object_mbr}_{min_distance}_{max_distance}"
        if cache_key in self.cache:
            return self.cache[cache_key]
        
        # Parse object MBR if needed
        if isinstance(object_mbr, str):
            object_mbr = self.parse_mbr(object_mbr)
        
        # Check if model is available
        if dataset_name not in self.level_nodes:
            print(f"No R-tree model found for {dataset_name}")
            return 0
        
        # Get level nodes and total objects
        nodes = self.level_nodes[dataset_name]
        total_objects = self.dataset_sizes[dataset_name]
        
        # Calculate object center
        object_center = self.get_mbr_center(object_mbr)
        
        # Count objects within the distance range
        total_node_objects = sum(node['objects'] for node in nodes)
        if total_node_objects <= 0:
            return 0  # Avoid division by zero
            
        objects_in_range = 0
        
        for node in nodes:
            node_mbr = node['mbr']
            node_objects = node['objects']
            node_center = self.get_mbr_center(node_mbr)
            
            # Calculate min and max distances between object and node
            min_dist = self.calculate_min_distance(object_mbr, node_mbr)
            max_dist = self.calculate_max_distance(object_mbr, node_mbr)
            
            # Case 1: Node is completely within the distance range
            if min_dist >= min_distance and max_dist <= max_distance:
                objects_in_range += node_objects
            
            # Case 2: Node is partially within the distance range
            elif min_dist <= max_distance and max_dist >= min_distance:
                # Calculate center-to-center distance for weighting
                center_dist = self.calculate_distance(object_center, node_center)
                
                # Calculate overlap proportion based on distances
                if max_dist - min_dist > 0:
                    range_overlap = (min(max_distance, max_dist) - max(min_distance, min_dist)) / (max_dist - min_dist)
                    # Adjust with center distance as a factor
                    if center_dist >= min_distance and center_dist <= max_distance:
                        weight = range_overlap * 0.8 + 0.2  # Boost weight if center is in range
                    else:
                        weight = range_overlap * 0.5
                    
                    objects_in_range += node_objects * weight
                
        # Scale to match total objects
        estimated_count = objects_in_range * (total_objects / total_node_objects)
        
        # Cache and return result
        estimated_count = max(0, round(estimated_count))
        self.cache[cache_key] = estimated_count
        return estimated_count
    
    def evaluate_on_dataset(self, dataset_name, results_file=None, sample_ratio=0.2):
        """
        Evaluate the R-tree based distance estimation method on a dataset
        
        Parameters:
        -----------
        dataset_name : str
            Name of the dataset to evaluate
        results_file : str
            Path to the file containing actual query results
        sample_ratio : float
            Fraction of dataset to use (0.2 = 20%)
            
        Returns:
        --------
        dict
            Evaluation results including MAE, MAPE, q-score, and model metadata
        """
        if not results_file:
            results_file = f"../../large_files/resultsDistance/{dataset_name}_results.csv"
            
        if not os.path.exists(results_file):
            raise ValueError(f"Results file not found: {results_file}")
            
        # Check if model is available
        if dataset_name not in self.level_nodes:
            raise ValueError(f"No R-tree model found for {dataset_name}")
            
        # Load query results
        print(f"Loading query results from {results_file}")
        sys.stdout.flush()
        
        try:
            results_df = pd.read_csv(results_file)
            
            # Sample only a portion of the dataset
            sample_size = max(1, int(len(results_df) * sample_ratio))
            print(f"Using {sample_ratio*100}% sample: {sample_size} out of {len(results_df)} queries")
            sys.stdout.flush()
            sampled_results = results_df.sample(n=sample_size, random_state=42)
            
            # Prepare arrays for evaluation
            actual_counts = []
            estimated_counts = []
            estimation_times = []
            
            # Process each query with progress reporting
            print(f"Processing {dataset_name} queries: ", end="", flush=True)
            sys.stdout.flush()
            
            total_queries = len(sampled_results)
            progress_step = max(1, total_queries // 10)
            
            for i, (index, row) in enumerate(sampled_results.iterrows()):
                # Show progress every 10%
                if i % progress_step == 0 or i == total_queries - 1:
                    progress = (i+1) / total_queries * 100
                    print(f"{progress:.1f}%... ", end="", flush=True)
                    sys.stdout.flush()
                    
                object_mbr = self.parse_mbr(row['Object MBR'])
                min_distance = row['Distance Min']
                max_distance = row['Distance Max']
                actual_count = row['Count MBR']
                
                # Measure estimation time
                start_time = time.time()
                estimated_count = self.estimate_distance_count(dataset_name, object_mbr, min_distance, max_distance)
                end_time = time.time()
                estimation_time = (end_time - start_time) * 1000  # ms
                
                actual_counts.append(actual_count)
                estimated_counts.append(estimated_count)
                estimation_times.append(estimation_time)
            
            print("Done!")
            sys.stdout.flush()
            
            # Convert to arrays for calculations
            actual_counts = np.array(actual_counts)
            estimated_counts = np.array(estimated_counts)
            estimation_times = np.array(estimation_times)
            
            # Calculate MAE
            mae = mean_absolute_error(actual_counts, estimated_counts)
            
            # Calculate MAPE with handling for zeros
            non_zero_mask = actual_counts != 0
            zero_mask = ~non_zero_mask
            mape_sum = 0
            count = len(actual_counts)
            
            if np.any(non_zero_mask):
                mape_sum += np.sum(np.abs((actual_counts[non_zero_mask] - estimated_counts[non_zero_mask]) / 
                                        actual_counts[non_zero_mask]))
            
            if np.any(zero_mask):
                mape_sum += np.sum(np.abs(actual_counts[zero_mask] - estimated_counts[zero_mask]) / 100)
            
            mape = mape_sum / count
            
            # Calculate q-score
            valid_indices = (actual_counts != 0) & (estimated_counts != 0)
            if np.any(valid_indices):
                ratios = np.maximum(
                    estimated_counts[valid_indices] / actual_counts[valid_indices],
                    actual_counts[valid_indices] / estimated_counts[valid_indices]
                )
                q_score = np.mean(ratios)
            else:
                q_score = float('inf')
                
            avg_time_ms = np.mean(estimation_times)
            
            # Get model metadata
            model_size_bytes = 0
            level_nodes_size_bytes = 0
            total_size_bytes = 0
            rtree_params = {}
            num_level_nodes = len(self.level_nodes.get(dataset_name, []))
            
            if dataset_name in self.model_metadata:
                meta = self.model_metadata[dataset_name]
                model_size_bytes = meta.get('model_size_bytes', 0)
                level_nodes_size_bytes = meta.get('level_nodes_size_bytes', 0)
                total_size_bytes = meta.get('total_size_bytes', 0)
                rtree_params = meta.get('rtree_params', {})
                if isinstance(rtree_params, str):
                    # Handle JSON parsing if needed
                    try:
                        rtree_params = eval(rtree_params)
                    except:
                        rtree_params = {}
            
            # Combine results with model metadata
            results = {
                'Dataset': dataset_name,
                'Method': 'RTree-Distance',
                'MAE': mae,
                'MAPE': mape,
                'Q_Score': q_score,
                'Avg_Time_ms': avg_time_ms,
                'Num_Queries': len(sampled_results),
                'Sample_Ratio': sample_ratio,
                'Model_Size_MB': total_size_bytes / (1024*1024),
                'Level_Nodes_Size_MB': level_nodes_size_bytes / (1024*1024),
                'Num_Level_Nodes': num_level_nodes
            }
            
            # Add R-tree parameters to results
            for key, value in rtree_params.items():
                results[f'rtree_{key}'] = value
            
            # Save results
            results_file_out = f"{self.results_dir}/{dataset_name}_evaluation_sample{int(sample_ratio*100)}.csv"
            pd.DataFrame([results]).to_csv(results_file_out, index=False)
            
            # Generate visualization
            self.visualize_results(dataset_name, actual_counts, estimated_counts, sample_ratio)
            
            print(f"Evaluation results for {dataset_name} ({sample_ratio*100}% sample):")
            print(f"  MAE: {mae:.2f}")
            print(f"  MAPE: {mape:.2%}")
            print(f"  Q-Score: {q_score:.2f}")
            print(f"  Avg. Estimation Time: {avg_time_ms:.4f} ms")
            print(f"  Model Size: {results['Model_Size_MB']:.2f} MB")
            print(f"  Num Level Nodes: {num_level_nodes}")
            sys.stdout.flush()
            
            return results
            
        except Exception as e:
            print(f"Error evaluating {dataset_name}: {str(e)}")
            sys.stdout.flush()
            raise
    
    def visualize_results(self, dataset_name, actual_counts, estimated_counts, sample_ratio=0.2):
        """Create visualization of actual vs. predicted counts"""
        os.makedirs(self.viz_dir, exist_ok=True)
        
        plt.figure(figsize=(12, 10))
        plt.scatter(actual_counts, estimated_counts, alpha=0.5, s=8)
        
        max_val = max(np.max(actual_counts), np.max(estimated_counts))
        plt.plot([0, max_val], [0, max_val], 'r--', alpha=0.7)
        
        plt.xlabel('Actual Count')
        plt.ylabel('Estimated Count')
        plt.title(f'R-tree Distance Estimation for {dataset_name} ({int(sample_ratio*100)}% sample)')
        plt.grid(True, alpha=0.3)
        
        plt.savefig(
            f"{self.viz_dir}/{dataset_name}_estimation_sample{int(sample_ratio*100)}.png", 
            dpi=150
        )
        plt.close()
        
        # Create a comparison for a sample of queries
        sample_size = min(100, len(actual_counts))
        indices = np.random.choice(len(actual_counts), sample_size, replace=False)
        
        plt.figure(figsize=(20, 10))
        plt.scatter(
            range(sample_size), 
            actual_counts[indices],
            label='Actual Count', 
            s=100, alpha=0.7, marker='o', color='green'
        )
        plt.scatter(
            range(sample_size), 
            estimated_counts[indices],
            label='R-tree Distance Estimate', 
            s=100, alpha=0.7, marker='x', color='blue'
        )
        
        plt.xlabel('Query Index')
        plt.ylabel('Object Count')
        plt.title(
            f'R-tree Distance vs. Actual Count for {dataset_name} - '
            f'Sample of {sample_size} Queries ({int(sample_ratio*100)}% dataset)'
        )
        plt.legend()
        plt.grid(True, alpha=0.3)
        
        plt.savefig(
            f"{self.viz_dir}/{dataset_name}_comparison_sample{int(sample_ratio*100)}.png", 
            dpi=150
        )
        plt.close()

def evaluate_all_datasets(sample_ratio=0.2):
    """
    Run the distance-estimation evaluation for every dataset with a loaded model

    Each dataset is evaluated independently; a failure is reported and the
    loop moves on. The per-dataset results are combined into one CSV in the
    estimator's results directory and a summary table is printed.
    """
    print("Initializing R-tree distance estimator...")
    sys.stdout.flush()

    estimator = RTreeDistanceEstimator()
    dataset_names = list(estimator.level_nodes.keys())
    n_datasets = len(dataset_names)

    if n_datasets == 0:
        print("No pre-built R-tree models found. Please run RTree-builder first.")
        return

    print(f"Found {n_datasets} pre-built R-tree models to evaluate")
    sys.stdout.flush()

    banner = "=" * 80
    collected = []

    # One clearly separated section of output per dataset
    for position, name in enumerate(dataset_names, start=1):
        print("\n" + banner)
        print(f"DATASET {position}/{n_datasets}: {name}")
        print(banner)
        sys.stdout.flush()

        try:
            outcome = estimator.evaluate_on_dataset(name, sample_ratio=sample_ratio)
            collected.append(outcome)
            print(f"Finished processing {name} ({position}/{n_datasets})")
            sys.stdout.flush()
        except Exception as e:
            print(f"Error evaluating {name}: {e}")
            print("Moving to next dataset")
            sys.stdout.flush()

    if not collected:
        print("No results were generated")
        return

    # Save combined results
    out_dir = estimator.results_dir
    os.makedirs(out_dir, exist_ok=True)
    summary_df = pd.DataFrame(collected)
    summary_df.to_csv(
        f"{out_dir}/all_datasets_evaluation_sample{int(sample_ratio*100)}.csv",
        index=False
    )
    print("\nCombined results:")
    summary_columns = ['Dataset', 'MAE', 'MAPE', 'Q_Score', 'Model_Size_MB', 'Num_Level_Nodes']
    print(summary_df[summary_columns])

# Script entry point: evaluate every dataset that has a pre-built model,
# using a 20% sample of each dataset's queries.
if __name__ == "__main__":
    evaluate_all_datasets(sample_ratio=0.2)
    print("R-tree distance estimation evaluation complete!")

Initializing R-tree distance estimator...


Loaded metadata for 14 datasets


Loaded 14 pre-built R-tree models


Found 14 pre-built R-tree models to evaluate



DATASET 1/14: yago2


Loading query results from ../../large_files/resultsDistance/yago2_results.csv


Using 20.0% sample: 179788 out of 898942 queries


Processing yago2 queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for yago2 (20.0% sample):
  MAE: 93859.61
  MAPE: 616.62%
  Q-Score: 13.19
  Avg. Estimation Time: 4.9109 ms
  Model Size: 294.98 MB
  Num Level Nodes: 305


Finished processing yago2 (1/14)



DATASET 2/14: craftwaysorted


Loading query results from ../../large_files/resultsDistance/craftwaysorted_results.csv


Using 20.0% sample: 4364 out of 21822 queries


Processing craftwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

79.9%... 

89.9%... 

99.9%... 

100.0%... 

Done!


Evaluation results for craftwaysorted (20.0% sample):
  MAE: 8045.33
  MAPE: 133.65%
  Q-Score: 4.20
  Avg. Estimation Time: 0.1769 ms
  Model Size: 6.90 MB
  Num Level Nodes: 9


Finished processing craftwaysorted (2/14)



DATASET 3/14: zcta5


Loading query results from ../../large_files/resultsDistance/zcta5_results.csv


Using 20.0% sample: 1325 out of 6626 queries


Processing zcta5 queries: 

0.1%... 

10.0%... 

20.0%... 

30.0%... 

39.9%... 

49.9%... 

59.8%... 

69.8%... 

79.8%... 

89.7%... 

99.7%... 

100.0%... 

Done!


Evaluation results for zcta5 (20.0% sample):
  MAE: 4384.91
  MAPE: 413.50%
  Q-Score: 10.98
  Avg. Estimation Time: 0.0691 ms
  Model Size: 2.20 MB
  Num Level Nodes: 3


Finished processing zcta5 (3/14)



DATASET 4/14: areawater


Loading query results from ../../large_files/resultsDistance/areawater_results.csv


Using 20.0% sample: 91710 out of 458552 queries


Processing areawater queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

Done!


Evaluation results for areawater (20.0% sample):
  MAE: 141534.96
  MAPE: 645.58%
  Q-Score: 7.51
  Avg. Estimation Time: 0.4549 ms
  Model Size: 146.79 MB
  Num Level Nodes: 26


Finished processing areawater (4/14)



DATASET 5/14: aerowaythingnodesorted


Loading query results from ../../large_files/resultsDistance/aerowaythingnodesorted_results.csv


Using 20.0% sample: 3168 out of 15843 queries


Processing aerowaythingnodesorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

39.9%... 

49.9%... 

59.9%... 

69.9%... 

79.8%... 

89.8%... 

99.8%... 

100.0%... 

Done!


Evaluation results for aerowaythingnodesorted (20.0% sample):
  MAE: 2627.87
  MAPE: 195.56%
  Q-Score: 5.29
  Avg. Estimation Time: 0.0853 ms
  Model Size: 5.09 MB
  Num Level Nodes: 4


Finished processing aerowaythingnodesorted (5/14)



DATASET 6/14: emergencythingwaysorted


Loading query results from ../../large_files/resultsDistance/emergencythingwaysorted_results.csv


Using 20.0% sample: 32302 out of 161514 queries


Processing emergencythingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for emergencythingwaysorted (20.0% sample):
  MAE: 37218.58
  MAPE: 594.76%
  Q-Score: 6.27
  Avg. Estimation Time: 0.9575 ms
  Model Size: 51.57 MB
  Num Level Nodes: 58


Finished processing emergencythingwaysorted (6/14)



DATASET 7/14: historicthingwaysorted


Loading query results from ../../large_files/resultsDistance/historicthingwaysorted_results.csv


Using 20.0% sample: 71687 out of 358439 queries


Processing historicthingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for historicthingwaysorted (20.0% sample):
  MAE: 104676.86
  MAPE: 67.22%
  Q-Score: 2.08
  Avg. Estimation Time: 1.5618 ms
  Model Size: 114.03 MB
  Num Level Nodes: 94


Finished processing historicthingwaysorted (7/14)



DATASET 8/14: aerowaythingwaysorted


Loading query results from ../../large_files/resultsDistance/aerowaythingwaysorted_results.csv


Using 20.0% sample: 73673 out of 368365 queries


Processing aerowaythingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for aerowaythingwaysorted (20.0% sample):
  MAE: 28790.32
  MAPE: 82.72%
  Q-Score: 2.13
  Avg. Estimation Time: 1.8465 ms
  Model Size: 116.49 MB
  Num Level Nodes: 113


Finished processing aerowaythingwaysorted (8/14)



DATASET 9/14: cyclewaythingwaysorted


Loading query results from ../../large_files/resultsDistance/cyclewaythingwaysorted_results.csv


Using 20.0% sample: 213412 out of 1067063 queries


Processing cyclewaythingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for cyclewaythingwaysorted (20.0% sample):
  MAE: 136427.99
  MAPE: 244.48%
  Q-Score: 3.51
  Avg. Estimation Time: 2.8176 ms
  Model Size: 336.85 MB
  Num Level Nodes: 170


Finished processing cyclewaythingwaysorted (9/14)



DATASET 10/14: powerthingwaysorted


Loading query results from ../../large_files/resultsDistance/powerthingwaysorted_results.csv


Using 20.0% sample: 543457 out of 2717289 queries


Processing powerthingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for powerthingwaysorted (20.0% sample):
  MAE: 101798.81
  MAPE: 96.33%
  Q-Score: 2.05
  Avg. Estimation Time: 6.5313 ms
  Model Size: 874.03 MB
  Num Level Nodes: 403


Finished processing powerthingwaysorted (10/14)



DATASET 11/14: leisurewaysorted


Loading query results from ../../large_files/resultsDistance/leisurewaysorted_results.csv


Using 20.0% sample: 1175314 out of 5876570 queries


Processing leisurewaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for leisurewaysorted (20.0% sample):
  MAE: 260149.78
  MAPE: 35.99%
  Q-Score: 1.42
  Avg. Estimation Time: 11.0375 ms
  Model Size: 1866.94 MB
  Num Level Nodes: 687


Finished processing leisurewaysorted (11/14)



DATASET 12/14: barrierthingwaysorted


Loading query results from ../../large_files/resultsDistance/barrierthingwaysorted_results.csv


Using 20.0% sample: 916334 out of 4581670 queries


Processing barrierthingwaysorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for barrierthingwaysorted (20.0% sample):
  MAE: 301391.93
  MAPE: 44.80%
  Q-Score: 1.55
  Avg. Estimation Time: 9.2125 ms
  Model Size: 1456.06 MB
  Num Level Nodes: 569


Finished processing barrierthingwaysorted (12/14)



DATASET 13/14: powerthingnodesorted


Loading query results from ../../large_files/resultsDistance/powerthingnodesorted_results.csv


Using 20.0% sample: 420502 out of 2102514 queries


Processing powerthingnodesorted queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

60.0%... 

70.0%... 

80.0%... 

90.0%... 

100.0%... 

100.0%... 

Done!


Evaluation results for powerthingnodesorted (20.0% sample):
  MAE: 85130.85
  MAPE: 175.83%
  Q-Score: 2.86
  Avg. Estimation Time: 5.3593 ms
  Model Size: 686.43 MB
  Num Level Nodes: 332


Finished processing powerthingnodesorted (13/14)



DATASET 14/14: arealm


Loading query results from ../../large_files/resultsDistance/arealm_results.csv


Using 20.0% sample: 5166 out of 25833 queries


Processing arealm queries: 

0.0%... 

10.0%... 

20.0%... 

30.0%... 

40.0%... 

50.0%... 

59.9%... 

69.9%... 

79.9%... 

89.9%... 

99.9%... 

100.0%... 

Done!


Evaluation results for arealm (20.0% sample):
  MAE: 21423.16
  MAPE: 659.78%
  Q-Score: 12.44
  Avg. Estimation Time: 0.1198 ms
  Model Size: 8.34 MB
  Num Level Nodes: 6


Finished processing arealm (14/14)



Combined results:
                    Dataset            MAE      MAPE    Q_Score  \
0                     yago2   93859.612093  6.166167  13.192274   
1            craftwaysorted    8045.326077  1.336542   4.203567   
2                     zcta5    4384.912453  4.134996  10.978845   
3                 areawater  141534.959895  6.455758   7.505850   
4    aerowaythingnodesorted    2627.872159  1.955622   5.285887   
5   emergencythingwaysorted   37218.575444  5.947638   6.268320   
6    historicthingwaysorted  104676.863908  0.672205   2.077272   
7     aerowaythingwaysorted   28790.316995  0.827240   2.131138   
8    cyclewaythingwaysorted  136427.985877  2.444777   3.506763   
9       powerthingwaysorted  101798.811214  0.963313   2.046686   
10         leisurewaysorted  260149.780025  0.359936   1.424671   
11    barrierthingwaysorted  301391.932469  0.447968   1.545256   
12     powerthingnodesorted   85130.845832  1.758349   2.858815   
13                   arealm   21423.156407 

R-tree distance estimation evaluation complete!
