In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from time import process_time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import psutil
import os
import re
import multiprocessing
import joblib
import gc
from tqdm.notebook import tqdm  # Progress bars for Jupyter

# --- Run configuration -------------------------------------------------
# Toggle for the per-dataset plot of sample query rectangles.
VISUALIZE_RECTANGLES = False
# When False, only the model trained at the largest scale is written to disk.
SAVE_INTERMEDIATE_MODELS = True

# True: train at every applicable scale; False: train at the maximum scale only.
use_multiple_scales = True

# --- Parallelism -------------------------------------------------------
# Keep one core free for the system, but never drop below a single worker.
n_cores = multiprocessing.cpu_count()
n_jobs = n_cores - 1 if n_cores > 1 else 1
print(f"Using {n_jobs} of {n_cores} available CPU cores")

def monitor_memory():
    """Print the resident set size (RSS) of the current process in megabytes."""
    rss_bytes = psutil.Process().memory_info().rss
    bytes_per_mb = 1024 * 1024
    print(f"Memory usage: {rss_bytes / bytes_per_mb:.2f} MB")

def MAPE(actual_values, predicted_values):
    """Calculate Mean Absolute Percentage Error with special handling for zeros.

    Each element contributes:

    * ``|actual - predicted| / |actual|`` when ``actual`` is non-zero;
    * ``|actual - predicted| / 100`` when ``actual`` is zero (a fixed
      denominator is used so zero targets do not cause a division by zero).

    The per-element errors are averaged over *all* elements.

    Args:
        actual_values: Array-like of ground-truth values (NumPy array, list,
            pandas Series, ...). Any shape; it is flattened.
        predicted_values: Array-like of predictions with the same number of
            elements as ``actual_values``. Any shape; it is flattened.

    Returns:
        The mean error as a fraction (``0.25`` means 25%), or ``nan`` when
        the inputs are empty.

    Raises:
        ValueError: If the two inputs do not contain the same number of
            elements.
    """
    # np.asarray (instead of calling .flatten() on the argument) lets plain
    # lists and pandas objects through; float dtype avoids integer wrap-around.
    actual = np.asarray(actual_values, dtype=float).ravel()
    predicted = np.asarray(predicted_values, dtype=float).ravel()

    if actual.size != predicted.size:
        raise ValueError(
            f"actual_values has {actual.size} elements but "
            f"predicted_values has {predicted.size}"
        )
    if actual.size == 0:
        # Nothing to average; avoid dividing by a zero count.
        return np.nan

    # Per-element denominator: |actual| where it is non-zero, otherwise 100.
    denominators = np.where(actual != 0, np.abs(actual), 100.0)
    per_element_error = np.abs(actual - predicted) / denominators

    return np.sum(per_element_error) / actual.size

# Read the per-dataset spatial statistics; its bounding-box column supplies
# the universe boundaries used further down.
print("Loading spatial statistics...")
spatial_stats = pd.read_csv(filepath_or_buffer='../spatial_statistics.csv')

# Folder holding one query-result CSV per dataset.
data_dir = '../large_files/resultsIntersects/'

# Parse bounding box information
def parse_bbox(bbox_str):
    """Extract ``(xmin, ymin, xmax, ymax)`` from a ``BOX(...)`` string.

    The expected layout is ``BOX(xmin ymin,xmax ymax)``. Numbers may carry a
    sign, a fractional part and an exponent (e.g. ``1e-05``); optional
    whitespace around the separators is tolerated.

    Args:
        bbox_str: The bounding-box text. Non-string values (for example the
            NaN pandas produces for a missing cell) are treated as
            unparseable.

    Returns:
        A tuple ``(xmin, ymin, xmax, ymax)`` of floats, or the whole-world
        default ``(-180, -90, 180, 90)`` when the input cannot be parsed.
    """
    default_bbox = (-180, -90, 180, 90)  # Default if parsing fails

    # re.search raises TypeError on non-strings, so guard before matching.
    if not isinstance(bbox_str, str):
        return default_bbox

    # One well-formed decimal number: optional sign, digits with an optional
    # fraction (or a bare fraction), optional exponent. Being strict here
    # guarantees float() below cannot fail on a matched group.
    number = r"([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"
    pattern = rf"BOX\(\s*{number}\s+{number}\s*,\s*{number}\s+{number}\s*\)"

    match = re.search(pattern, bbox_str)
    if match is None:
        return default_bbox

    xmin, ymin, xmax, ymax = (float(value) for value in match.groups())
    return xmin, ymin, xmax, ymax

# Map each table name to the universe bounding box parsed from its statistics row
universe_boundaries = {
    name: parse_bbox(limits)
    for name, limits in zip(
        spatial_stats['Table Name'],
        spatial_stats['Universe Limits (Bounding Box)'],
    )
}

# Collect every CSV file found in the data directory
print("Finding dataset files...")
csv_files = list(filter(lambda entry: entry.endswith('.csv'), os.listdir(data_dir)))
print(f"Found {len(csv_files)} datasets to process")

# Candidate training-set sizes (the "scales of learning")
scales = [1_000, 5_000, 10_000, 50_000, 100_000, 500_000, 1_000_000]

# Make sure the output folders for models, plots and result tables exist
for out_dir in (
    '../large_files/LearnedModels/intersect/RF',
    '../large_files/LearnedModels/intersect/RF/visualizations',
    '../large_files/LearnedModels/intersect/RF/results',
):
    os.makedirs(out_dir, exist_ok=True)

# Accumulates one result row per (dataset, sample size) across all datasets
all_results_list = []

# Process each dataset.
# For every CSV in `csv_files` this loop: loads the query rectangles and their
# counts, splits them 80/20, trains a RandomForestRegressor at each selected
# training-set size (largest first), evaluates it on the shared test split,
# optionally saves the model, and writes a per-dataset results CSV.
for csv_file in tqdm(csv_files, desc="Processing datasets"):
    # Force garbage collection at the start of each dataset
    gc.collect()
    monitor_memory()
    
    # Extract dataset name (remove "_results.csv")
    # NOTE(review): a file name that does not contain "_results.csv" is left
    # unchanged here (including its ".csv" extension).
    dataset_name = csv_file.replace('_results.csv', '')
    
    print(f"\nProcessing dataset: {dataset_name}")
    
    # Get universe boundaries for this dataset
    if dataset_name in universe_boundaries:
        univ_xmin, univ_ymin, univ_xmax, univ_ymax = universe_boundaries[dataset_name]
    else:
        # Default values if dataset not found in spatial stats
        univ_xmin, univ_ymin, univ_xmax, univ_ymax = -180, -90, 180, 90
    
    # Area of the universe bounding box; used to normalise rectangle areas below.
    Surface_univ = (univ_xmax - univ_xmin) * (univ_ymax - univ_ymin)
    print(f"Universe boundaries for {dataset_name}: ({univ_xmin}, {univ_ymin}, {univ_xmax}, {univ_ymax})")
    
    # Load dataset - only load required columns
    data_path = os.path.join(data_dir, csv_file)
    print(f"Loading data from {data_path}")
    data = pd.read_csv(data_path, usecols=['Query MBR', 'Count MBR'])
    
    # Extract query MBR column (needs parsing as it's in string format)
    def parse_mbr(mbr_str):
        """Turn an MBR string into a list of floats.

        Leading/trailing '"', '(' and ')' characters are stripped and the
        remainder is split on ', '.
        """
        coords = mbr_str.strip('"()').split(', ')
        return [float(coord) for coord in coords]
    
    # Extract columns - use list comprehension for better performance
    print("Parsing MBR coordinates...")
    # One row per query; later code indexes columns 0..3 and unpacks them as
    # x1, y1, x2, y2.
    Rectangles = np.array([parse_mbr(mbr) for mbr in data['Query MBR']])
    # Double brackets keep Y two-dimensional (n, 1); it is ravel()-ed where a
    # 1D target is required.
    Y = data[['Count MBR']].values  # Using Count MBR as target
    
    # Free up memory
    del data
    gc.collect()
    
    # Calculate basic statistics
    max_count = float(np.max(Y))
    min_count = float(np.min(Y))
    mean_count = float(np.mean(Y))
    median_count = float(np.median(Y))
    total_samples = len(Y)

    # Display basic statistics for the dataset
    print(f"\nBasic statistics for {dataset_name} dataset:")
    print(f"Max count: {max_count}")
    print(f"Min count: {min_count}")
    print(f"Mean count: {mean_count:.2f}")
    print(f"Median count: {median_count:.2f}")
    print(f"Total samples: {total_samples}\n")

    # Calculate rectangles density - vectorized version
    # "Density" here is the rectangle area as a fraction of the universe area.
    print("Calculating rectangle densities...")
    width = Rectangles[:, 2] - Rectangles[:, 0]
    height = Rectangles[:, 3] - Rectangles[:, 1]
    rectanglesDensity = np.abs(width * height / Surface_univ).reshape(-1, 1)
    
    # Prepare the dataset
    # The density column is currently NOT part of the features (the line that
    # appended it is disabled); it is only used by the optional visualization.
    # X = np.append(Rectangles, rectanglesDensity, axis=1)
    X = Rectangles
    
    # Split the data into 80% train and 20% test
    print("Splitting data into train and test sets...")
    X_train, X_test_all, y_train, y_test_all = train_test_split(X, Y, test_size=0.2, random_state=3)
    
    # Visualize the first 1000 rectangles (only if enabled)
    if len(Rectangles) > 0 and VISUALIZE_RECTANGLES:
        print("Visualizing rectangles sample...")
        plt.figure(figsize=(10, 8))
        ax = plt.subplot()
        
        # Only visualize a sample to save time
        sample_size = min(1000, len(Rectangles))
        for i in range(sample_size):
            x1, y1, x2, y2 = Rectangles[i]
            # Density drives the patch opacity (scaled by 10, capped at 1.0).
            color_val = float(rectanglesDensity[i][0]) if hasattr(rectanglesDensity[i], '__len__') else float(rectanglesDensity[i])
            rectangle = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, 
                                        linewidth=1, edgecolor='b', facecolor='none', alpha=min(1.0, color_val*10))
            ax.add_patch(rectangle)
            
        plt.xlim(univ_xmin-20, univ_xmax+20)
        plt.ylim(univ_ymin-10, univ_ymax+10)
        plt.title(f"Sample rectangles from {dataset_name}")
        plt.savefig(f"../large_files/LearnedModels/intersect/RF/visualizations/{dataset_name}_rectangles.png", dpi=150)
        plt.close()  # Close to free memory instead of plt.show()
    
    # Adjust scales to the dataset size
    max_size = len(X_train)
    print(f"Training set size: {max_size}")

    if use_multiple_scales:
        # Keep only the predefined scales that fit in the training set
        adjusted_scales = [s for s in scales if s <= max_size]
        
        # Add intermediate 1 million increments for large datasets
        # (2M, 3M, ... strictly below max_size)
        if max_size > 1000000:
            million_increments = list(range(2000000, max_size, 1000000))
            adjusted_scales.extend(million_increments)
            
        # Add the actual max size if it's not already in the list
        if max_size not in adjusted_scales:
            adjusted_scales.append(max_size)
            
        # Sort the scales to ensure they're in ascending order
        adjusted_scales.sort()
    else:
        # Use only the maximum scale
        adjusted_scales = [max_size]

    # List to store dataset-specific results
    dataset_results_list = []

    # Store best parameters from max scale training to reuse
    best_params = None
    
    # Process scales in reversed order (largest first), so the grid search runs
    # on the full training set and its parameters are reused for smaller scales.
    for sample_size in reversed(adjusted_scales):
        print(f"\nTraining with sample size: {sample_size}")
        monitor_memory()
        
        # Create training subset: the first `sample_size` rows of the
        # (already shuffled by train_test_split) training split.
        X_train_sample = X_train[:sample_size, :]
        y_train_sample = y_train[:sample_size]
        
        # Random Forest Regressor parameters - optimized for performance
        params_rf = {
            "n_estimators": [50, 100, 200],
            "max_depth": [10, 20, 30, None],
            "min_samples_split": [2, 5, 10]
        }
        
        # For very large datasets, use smaller parameter grid
        # (a single candidate, so the "search" degenerates to one 3-fold CV run)
        if sample_size > 100000:
            params_rf = {
                "n_estimators": [50],
                "max_depth": [None],
                "min_samples_split": [5]
            }
            
        # Only do GridSearch for the max scale
        if sample_size == max_size or best_params is None:
            print("Performing grid search for optimal parameters...")
            # Use a smaller max_features value to reduce memory usage
            # NOTE(review): the final model below is built WITHOUT
            # max_features='sqrt', so the parameters are tuned on a different
            # configuration than the one that is trained and evaluated —
            # confirm whether this is intended.
            rf = RandomForestRegressor(random_state=3, max_features='sqrt', n_jobs=n_jobs)
            rf_cv = GridSearchCV(rf, params_rf, cv=3, n_jobs=1, verbose=1)  # Use n_jobs=1 here as RF already uses parallelism
            
            # Time the grid search
            # NOTE(review): process_time() reports CPU time of the process, not
            # wall-clock time; with n_jobs > 1 the two can differ — confirm that
            # CPU time is the metric wanted for all timings in this loop.
            t1_start = process_time()
            rf_cv.fit(X_train_sample, y_train_sample.ravel())  # Use ravel for 1D array
            t1_stop = process_time()
            grid_search_time = t1_stop - t1_start
            
            # Store best parameters for reuse
            best_params = rf_cv.best_params_
            print(f"Grid search complete in {grid_search_time:.2f}s")
            print(f"Best parameters: {best_params}")
        else:
            # Skip grid search for smaller scales, use params from max scale
            rf_cv = None
            grid_search_time = 0
            print(f"Using best parameters from max scale: {best_params}")
        
        # Train the model with best parameters
        print("Training random forest model...")
        rf = RandomForestRegressor(random_state=3, **best_params, n_jobs=n_jobs)
        t2_start = process_time()
        rf.fit(X_train_sample, y_train_sample.ravel())  # Use ravel for 1D array
        t2_stop = process_time()
        training_time = t2_stop - t2_start
        
        # Make predictions on the full test split (same test set for every scale)
        print("Making predictions...")
        y_pred = rf.predict(X_test_all).reshape(-1, 1)  # Reshape to match y_test_all format
        
        # Calculate metrics
        r2_score = rf.score(X_test_all, y_test_all.ravel())
        mae_value = MAE(y_test_all, y_pred)
        mape_value = MAPE(y_test_all, y_pred)
        
        # Calculate q-score - vectorized version
        print("Calculating performance metrics...")
        
        # Vectorized q-score calculation
        y_true_flat = y_test_all.flatten()
        y_pred_flat = y_pred.flatten() if y_pred.ndim > 1 else y_pred
        
        # Find indices where both values are non-zero
        # (pairs with a zero on either side are excluded from the q-score)
        valid_indices = (y_true_flat != 0) & (y_pred_flat != 0)
        
        if np.any(valid_indices):
            # q-score per sample = max(pred/true, true/pred); averaged below
            ratios = np.maximum(
                y_pred_flat[valid_indices] / y_true_flat[valid_indices],
                y_true_flat[valid_indices] / y_pred_flat[valid_indices]
            )
            q_score_mean = np.mean(ratios)
        else:
            q_score_mean = 0
        
        # Time prediction performance (10 iterations)
        print("Measuring prediction performance...")
        total_duration = 0
        total_read = 0
        total_write = 0
        
        for _ in range(10):
            # NOTE(review): presumably these are system-wide disk counters, so
            # I/O from other processes would be included — verify against the
            # psutil documentation before interpreting the read/write figures.
            io_before = psutil.disk_io_counters()
            t3_start = process_time()
            rf.predict(X_test_all)
            t3_stop = process_time()
            io_after = psutil.disk_io_counters()
            
            total_duration += (t3_stop - t3_start)
            total_read += io_after.read_count - io_before.read_count
            total_write += io_after.write_count - io_before.write_count
        
        # Average over the 10 runs, then normalise per test sample
        avg_pred_time_microsec = (total_duration / 10) / len(y_pred) * 1000000
        avg_reads = total_read / 10 / len(y_pred)
        avg_writes = total_write / 10 / len(y_pred)
        
        # Save the model using joblib instead of pickle for better efficiency
        if SAVE_INTERMEDIATE_MODELS or sample_size == max_size:
            print("Saving model...")
            # The file name embeds sample size, training time, MAPE and MAE
            filename = f'../large_files/LearnedModels/intersect/RF/{dataset_name}_rf_{sample_size}_{training_time:.2f}s_{mape_value:.2%}_{mae_value:.2f}.joblib'
            joblib.dump(rf, filename, compress=3)
            # Get model file size in KB (size of the compressed file on disk)
            model_size_kb = os.path.getsize(filename) / 1024
            print(f"Model size: {model_size_kb:.2f} KB")
        else:
            model_size_kb = 0  # Set to 0 if model wasn't saved
        
        # Print results
        print(f"\nResults for {dataset_name}, Sample Size: {sample_size}")
        print(f"Grid Search Time: {grid_search_time:.2f}s, Training Time: {training_time:.2f}s")
        print(f"Random Forest Parameters: {best_params}")
        print(f"Performance: R² = {r2_score:.4f}, MAE = {mae_value:.2f}, MAPE = {mape_value:.2%}")
        print(f"q-score: {q_score_mean:.2f}")
        print(f"Prediction time: {avg_pred_time_microsec:.4f} μs/sample")
        print(f"I/O: Reads={avg_reads:.6f}, Writes={avg_writes:.6f}")
        print("-" * 80)
        
        # Plot actual vs predicted only for the maximum scale
        if sample_size == adjusted_scales[-1]:  # Check if this is the maximum scale
            print("Generating prediction scatter plot...")
            plt.figure(figsize=(10, 8))
            plt.scatter(y_test_all, y_pred, s=0.5, alpha=0.5)
            plt.xlabel('True Values')
            plt.ylabel('Predictions')
            plt.title(f"{dataset_name} - Sample Size: {sample_size} (Maximum)")
            plt.grid(True, alpha=0.3)
            
            # Add diagonal line for perfect predictions
            max_val = max(np.max(y_test_all), np.max(y_pred))
            plt.plot([0, max_val], [0, max_val], 'r--', alpha=0.5)
            
            plt.savefig(f"../large_files/LearnedModels/intersect/RF/visualizations/{dataset_name}_{sample_size}_prediction.png", dpi=150)
            plt.close()  # Close to free memory
            
            # Create a scatter plot comparing predicted vs real values for first 100 rectangles
            print("Generating side-by-side comparison plot...")
            
            # Get predictions for first 100 test samples
            sample_indices = range(min(100, len(X_test_all)))
            X_sample = X_test_all[sample_indices]
            y_sample_true = y_test_all[sample_indices].flatten()
            y_sample_pred = rf.predict(X_sample)
            
            plt.figure(figsize=(20, 10))
            plt.scatter(range(len(sample_indices)), y_sample_pred, c='blue', 
                        label='Predicted number of objects (Random Forest)', alpha=0.7, s=100)
            plt.scatter(range(len(sample_indices)), y_sample_true, c='green', 
                        label='Real number of objects', alpha=0.7, s=100)
            
            plt.title(f'{dataset_name} - First {len(sample_indices)} Rectangles: Predicted vs Real Values', fontsize=16)
            plt.xlabel('Rectangle Index', fontsize=14)
            plt.ylabel('Number of objects in rectangle', fontsize=14)
            plt.legend(fontsize=12)
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            
            # Save the plot
            plt.savefig(f"../large_files/LearnedModels/intersect/RF/visualizations/{dataset_name}_comparison_plot.png", dpi=150)
            plt.close()
        
        # Store results in list (more efficient than DataFrame concat)
        # Note: the grid-search time is printed above but is not part of the row.
        result_row = {
            'Dataset': dataset_name,
            'Sample_Size': sample_size,
            'Training_Time': training_time,
            'Best_Params': str(best_params),
            'R2_Score': r2_score,
            'MAE': mae_value,
            'MAPE': float(mape_value),
            'Q_Score': q_score_mean,
            'Pred_Time_Microseconds': avg_pred_time_microsec,
            'IO_Reads': avg_reads,
            'IO_Writes': avg_writes,
            'Model_Size_KB': model_size_kb,
            'Max_Count': max_count,
            'Min_Count': min_count,
            'Mean_Count': mean_count,
            'Median_Count': median_count,
            'Total_Samples': total_samples
        }
        
        # The same dict object is appended to both the per-dataset and global lists
        dataset_results_list.append(result_row)
        all_results_list.append(result_row)
        
        # Clean up to free memory
        if sample_size != max_size:  # Don't delete for max size as we might need it
            del X_train_sample, y_train_sample, rf
            gc.collect()
    
    # Save results for this dataset
    print(f"Saving results for {dataset_name}...")
    dataset_results = pd.DataFrame(dataset_results_list)
    dataset_results.to_csv(f'../large_files/LearnedModels/intersect/RF/results/{dataset_name}_results.csv', index=False)
    
    # Clear memory before next dataset
    del X_train, X_test_all, y_train, y_test_all, Rectangles, Y, rectanglesDensity
    gc.collect()
    
# Write the rows gathered for every dataset and scale into one combined CSV
print("Saving combined results...")
all_results = pd.DataFrame(data=all_results_list)
all_results.to_csv(
    path_or_buf='../large_files/LearnedModels/intersect/RF/all_results.csv',
    index=False,
)

# Final status line and memory report
print("All processing completed and results saved.")
monitor_memory()

Using 29 of 30 available CPU cores
Loading spatial statistics...
Finding dataset files...
Found 14 datasets to process


Processing datasets:   0%|          | 0/14 [00:00<?, ?it/s]

Memory usage: 202.30 MB

Processing dataset: historicthingwaysorted
Universe boundaries for historicthingwaysorted: (-179.99526020000002, -85.0036942, 179.99597930000002, 78.06750650000001)
Loading data from ../large_files/resultsIntersects/historicthingwaysorted_results.csv


Parsing MBR coordinates...



Basic statistics for historicthingwaysorted dataset:
Max count: 1792176.0
Min count: 0.0
Mean count: 29765.11
Median count: 16.00
Total samples: 358439

Calculating rectangle densities...
Splitting data into train and test sets...
Training set size: 286751

Training with sample size: 286751
Memory usage: 247.50 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Grid search complete in 200.49s
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 67233.50 KB

Results for historicthingwaysorted, Sample Size: 286751
Grid Search Time: 200.49s, Training Time: 124.15s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9992, MAE = 827.39, MAPE = 36.88%
q-score: 1.66
Prediction time: 18.4893 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Generating prediction scatter plot...


Generating side-by-side comparison plot...



Training with sample size: 100000
Memory usage: 1223.60 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 25621.84 KB

Results for historicthingwaysorted, Sample Size: 100000
Grid Search Time: 0.00s, Training Time: 36.33s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9983, MAE = 1257.90, MAPE = 77.76%
q-score: 2.35
Prediction time: 13.6373 μs/sample
I/O: Reads=0.000001, Writes=0.000003
--------------------------------------------------------------------------------

Training with sample size: 50000
Memory usage: 770.95 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 13559.34 KB

Results for historicthingwaysorted, Sample Size: 50000
Grid Search Time: 0.00s, Training Time: 16.12s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9973, MAE = 1677.22, MAPE = 135.06%
q-score: 3.27
Prediction time: 12.1604 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 10000
Memory usage: 771.15 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 3101.70 KB

Results for historicthingwaysorted, Sample Size: 10000
Grid Search Time: 0.00s, Training Time: 3.15s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9903, MAE = 3849.15, MAPE = 710.12%
q-score: 11.60
Prediction time: 9.7329 μs/sample
I/O: Reads=0.000001, Writes=0.000010
--------------------------------------------------------------------------------

Training with sample size: 5000
Memory usage: 771.15 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 1687.14 KB

Results for historicthingwaysorted, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 1.73s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9789, MAE = 5924.95, MAPE = 1278.51%
q-score: 19.79
Prediction time: 8.8817 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 1000


Memory usage: 771.15 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 379.43 KB

Results for historicthingwaysorted, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 0.47s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.7788, MAE = 21030.48, MAPE = 10093.99%
q-score: 145.76
Prediction time: 7.4072 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Saving results for historicthingwaysorted...


Memory usage: 762.40 MB

Processing dataset: powerthingnodesorted
Universe boundaries for powerthingnodesorted: (-177.92741900000001, -77.8453164, 178.47197400000002, 78.2256315)
Loading data from ../large_files/resultsIntersects/powerthingnodesorted_results.csv


Parsing MBR coordinates...



Basic statistics for powerthingnodesorted dataset:
Max count: 10512575.0
Min count: 0.0
Mean count: 174964.97
Median count: 41.00
Total samples: 2102514

Calculating rectangle densities...
Splitting data into train and test sets...


Training set size: 1682011

Training with sample size: 1682011
Memory usage: 977.60 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Grid search complete in 1767.36s
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 412494.78 KB

Results for powerthingnodesorted, Sample Size: 1682011
Grid Search Time: 1767.36s, Training Time: 1083.75s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9996, MAE = 3626.45, MAPE = 70.87%
q-score: 2.24
Prediction time: 29.6385 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Generating prediction scatter plot...


Generating side-by-side comparison plot...



Training with sample size: 1000000
Memory usage: 5471.32 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 250520.33 KB

Results for powerthingnodesorted, Sample Size: 1000000
Grid Search Time: 0.00s, Training Time: 608.56s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9994, MAE = 4414.30, MAPE = 109.22%
q-score: 2.88
Prediction time: 25.6077 μs/sample
I/O: Reads=0.000000, Writes=0.000001
--------------------------------------------------------------------------------

Training with sample size: 500000
Memory usage: 2317.92 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 129276.92 KB

Results for powerthingnodesorted, Sample Size: 500000
Grid Search Time: 0.00s, Training Time: 262.25s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9992, MAE = 5434.37, MAPE = 212.09%
q-score: 4.65
Prediction time: 20.2088 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 100000
Memory usage: 2735.47 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 28145.17 KB

Results for powerthingnodesorted, Sample Size: 100000
Grid Search Time: 0.00s, Training Time: 34.94s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9967, MAE = 11023.86, MAPE = 1212.69%
q-score: 21.36
Prediction time: 10.5321 μs/sample
I/O: Reads=0.000000, Writes=0.000004
--------------------------------------------------------------------------------

Training with sample size: 50000
Memory usage: 2765.18 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 14712.89 KB

Results for powerthingnodesorted, Sample Size: 50000
Grid Search Time: 0.00s, Training Time: 18.92s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9943, MAE = 15208.25, MAPE = 2663.58%
q-score: 45.40
Prediction time: 8.8200 μs/sample
I/O: Reads=0.000000, Writes=0.000001
--------------------------------------------------------------------------------

Training with sample size: 10000
Memory usage: 2765.59 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 3353.24 KB

Results for powerthingnodesorted, Sample Size: 10000
Grid Search Time: 0.00s, Training Time: 3.08s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9745, MAE = 36093.96, MAPE = 21541.38%
q-score: 352.73
Prediction time: 6.8112 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 5000
Memory usage: 2765.59 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 1776.25 KB

Results for powerthingnodesorted, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 1.66s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9364, MAE = 56514.19, MAPE = 41941.10%
q-score: 684.94
Prediction time: 6.0264 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------



Training with sample size: 1000
Memory usage: 2765.60 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 409.63 KB

Results for powerthingnodesorted, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 0.48s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.6780, MAE = 117322.96, MAPE = 135309.22%
q-score: 2211.44
Prediction time: 4.6104 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Saving results for powerthingnodesorted...


Memory usage: 2714.27 MB

Processing dataset: cyclewaythingwaysorted
Universe boundaries for cyclewaythingwaysorted: (-175.2093065, -75.1027861, 176.92582230000002, 71.0488105)
Loading data from ../large_files/resultsIntersects/cyclewaythingwaysorted_results.csv


Parsing MBR coordinates...



Basic statistics for cyclewaythingwaysorted dataset:
Max count: 5334900.0
Min count: 0.0
Mean count: 76783.36
Median count: 0.00
Total samples: 1067063

Calculating rectangle densities...
Splitting data into train and test sets...
Training set size: 853650

Training with sample size: 853650
Memory usage: 2652.12 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Grid search complete in 745.64s
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 157680.92 KB

Results for cyclewaythingwaysorted, Sample Size: 853650
Grid Search Time: 745.64s, Training Time: 470.00s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9994, MAE = 1630.39, MAPE = 35.26%
q-score: 1.68
Prediction time: 21.1123 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Generating prediction scatter plot...


Generating side-by-side comparison plot...



Training with sample size: 500000
Memory usage: 3275.54 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 97287.61 KB

Results for cyclewaythingwaysorted, Sample Size: 500000
Grid Search Time: 0.00s, Training Time: 252.17s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9992, MAE = 2047.18, MAPE = 51.13%
q-score: 1.96
Prediction time: 16.8494 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 100000
Memory usage: 2805.72 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 22707.16 KB

Results for cyclewaythingwaysorted, Sample Size: 100000
Grid Search Time: 0.00s, Training Time: 37.06s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9971, MAE = 4211.56, MAPE = 177.27%
q-score: 4.06
Prediction time: 10.1389 μs/sample
I/O: Reads=0.000000, Writes=0.000002
--------------------------------------------------------------------------------

Training with sample size: 50000
Memory usage: 2813.52 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 12158.66 KB

Results for cyclewaythingwaysorted, Sample Size: 50000
Grid Search Time: 0.00s, Training Time: 18.24s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9947, MAE = 5932.51, MAPE = 319.85%
q-score: 6.12
Prediction time: 8.9583 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 10000
Memory usage: 2813.57 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 2902.30 KB

Results for cyclewaythingwaysorted, Sample Size: 10000
Grid Search Time: 0.00s, Training Time: 3.15s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9793, MAE = 13916.00, MAPE = 1910.25%
q-score: 27.69
Prediction time: 7.2423 μs/sample
I/O: Reads=0.000000, Writes=0.000075
--------------------------------------------------------------------------------



Training with sample size: 5000
Memory usage: 2813.57 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 1573.39 KB

Results for cyclewaythingwaysorted, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 1.75s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9464, MAE = 22779.78, MAPE = 4833.90%
q-score: 68.76
Prediction time: 6.3894 μs/sample
I/O: Reads=0.000000, Writes=0.000001
--------------------------------------------------------------------------------

Training with sample size: 1000


Memory usage: 2813.57 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 361.05 KB

Results for cyclewaythingwaysorted, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 0.45s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.7422, MAE = 54332.14, MAPE = 14383.56%
q-score: 183.28
Prediction time: 4.8276 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Saving results for cyclewaythingwaysorted...


Memory usage: 2813.57 MB

Processing dataset: aerowaythingwaysorted
Universe boundaries for aerowaythingwaysorted: (-179.88131460000002, -79.7773063, 179.426138, 85.05258450000001)
Loading data from ../large_files/resultsIntersects/aerowaythingwaysorted_results.csv


Parsing MBR coordinates...



Basic statistics for aerowaythingwaysorted dataset:
Max count: 1841551.0
Min count: 0.0
Mean count: 32185.32
Median count: 227.00
Total samples: 368365

Calculating rectangle densities...
Splitting data into train and test sets...
Training set size: 294692

Training with sample size: 294692
Memory usage: 2815.55 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Grid search complete in 201.61s
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 85873.54 KB

Results for aerowaythingwaysorted, Sample Size: 294692
Grid Search Time: 201.61s, Training Time: 125.42s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9988, MAE = 1260.26, MAPE = 78.47%
q-score: 2.17
Prediction time: 21.1853 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Generating prediction scatter plot...


Generating side-by-side comparison plot...



Training with sample size: 100000
Memory usage: 2847.97 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 31119.99 KB

Results for aerowaythingwaysorted, Sample Size: 100000
Grid Search Time: 0.00s, Training Time: 36.99s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9973, MAE = 2023.40, MAPE = 153.26%
q-score: 3.20
Prediction time: 15.0556 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 50000
Memory usage: 2847.97 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 16213.42 KB

Results for aerowaythingwaysorted, Sample Size: 50000
Grid Search Time: 0.00s, Training Time: 16.98s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9953, MAE = 2768.56, MAPE = 268.04%
q-score: 4.74
Prediction time: 13.1175 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 10000
Memory usage: 2847.97 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 3677.18 KB

Results for aerowaythingwaysorted, Sample Size: 10000
Grid Search Time: 0.00s, Training Time: 3.04s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9800, MAE = 6227.93, MAPE = 1029.77%
q-score: 14.01
Prediction time: 10.1246 μs/sample
I/O: Reads=0.000000, Writes=0.000062
--------------------------------------------------------------------------------

Training with sample size: 5000
Memory usage: 2847.97 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 1969.31 KB

Results for aerowaythingwaysorted, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 1.57s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9502, MAE = 9564.00, MAPE = 1836.95%
q-score: 24.08
Prediction time: 9.4653 μs/sample
I/O: Reads=0.000000, Writes=0.000011
--------------------------------------------------------------------------------



Training with sample size: 1000
Memory usage: 2847.97 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 444.05 KB

Results for aerowaythingwaysorted, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 0.39s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.8047, MAE = 19564.15, MAPE = 5101.08%
q-score: 60.97
Prediction time: 7.3817 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Saving results for aerowaythingwaysorted...


Memory usage: 2847.97 MB

Processing dataset: zcta5
Universe boundaries for zcta5: (-176.684744, -14.373776, 145.830505, 71.341324)
Loading data from ../large_files/resultsIntersects/zcta5_results.csv
Parsing MBR coordinates...

Basic statistics for zcta5 dataset:
Max count: 33136.0
Min count: 0.0
Mean count: 676.56
Median count: 0.00
Total samples: 6626

Calculating rectangle densities...
Splitting data into train and test sets...
Training set size: 5300

Training with sample size: 5300
Memory usage: 2847.97 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 36 candidates, totalling 108 fits


Grid search complete in 168.05s
Best parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 770.95 KB

Results for zcta5, Sample Size: 5300
Grid Search Time: 168.05s, Training Time: 1.22s
Random Forest Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 50}
Performance: R² = 0.9855, MAE = 98.29, MAPE = 112.59%
q-score: 9.10
Prediction time: 56.3718 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Generating prediction scatter plot...


Generating side-by-side comparison plot...



Training with sample size: 5000
Memory usage: 2847.96 MB
Using best parameters from max scale: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 725.92 KB

Results for zcta5, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 1.16s
Random Forest Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 50}
Performance: R² = 0.9826, MAE = 108.54, MAPE = 153.92%
q-score: 12.02
Prediction time: 56.1309 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 1000
Memory usage: 2847.97 MB
Using best parameters from max scale: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 184.12 KB

Results for zcta5, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 0.35s
Random Forest Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 50}
Performance: R² = 0.8722, MAE = 300.90, MAPE = 279.52%
q-score: 18.37
Prediction time: 49.6397 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Saving results for zcta5...


Memory usage: 2847.96 MB

Processing dataset: leisurewaysorted
Universe boundaries for leisurewaysorted: (-179.8728244, -89.6957847, 179.8091866, 81.0280175)
Loading data from ../large_files/resultsIntersects/leisurewaysorted_results.csv


Parsing MBR coordinates...



Basic statistics for leisurewaysorted dataset:
Max count: 29382688.0
Min count: 0.0
Mean count: 489269.22
Median count: 253.00
Total samples: 5000000

Calculating rectangle densities...
Splitting data into train and test sets...


Training set size: 4000000

Training with sample size: 4000000
Memory usage: 2850.97 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Grid search complete in 5023.35s
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 981739.18 KB

Results for leisurewaysorted, Sample Size: 4000000
Grid Search Time: 5023.35s, Training Time: 3002.09s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9998, MAE = 5874.89, MAPE = 36.51%
q-score: 1.59
Prediction time: 36.4353 μs/sample
I/O: Reads=0.000000, Writes=0.000002
--------------------------------------------------------------------------------
Generating prediction scatter plot...


Generating side-by-side comparison plot...



Training with sample size: 3000000
Memory usage: 12158.12 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 745362.14 KB

Results for leisurewaysorted, Sample Size: 3000000
Grid Search Time: 0.00s, Training Time: 2074.49s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9998, MAE = 6547.71, MAPE = 44.10%
q-score: 1.72
Prediction time: 33.9996 μs/sample
I/O: Reads=0.000006, Writes=0.000002
--------------------------------------------------------------------------------

Training with sample size: 2000000
Memory usage: 4353.87 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 505715.64 KB

Results for leisurewaysorted, Sample Size: 2000000
Grid Search Time: 0.00s, Training Time: 1405.79s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9998, MAE = 7440.47, MAPE = 57.63%
q-score: 1.92
Prediction time: 30.9645 μs/sample
I/O: Reads=0.000006, Writes=0.000001
--------------------------------------------------------------------------------

Training with sample size: 1000000
Memory usage: 4047.94 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 260879.05 KB

Results for leisurewaysorted, Sample Size: 1000000
Grid Search Time: 0.00s, Training Time: 587.72s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9996, MAE = 9643.63, MAPE = 88.89%
q-score: 2.39
Prediction time: 26.2512 μs/sample
I/O: Reads=0.000000, Writes=0.000001
--------------------------------------------------------------------------------

Training with sample size: 500000
Memory usage: 4368.09 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 134877.54 KB

Results for leisurewaysorted, Sample Size: 500000
Grid Search Time: 0.00s, Training Time: 249.28s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9993, MAE = 12755.09, MAPE = 150.07%
q-score: 4.23
Prediction time: 20.4930 μs/sample
I/O: Reads=0.000000, Writes=0.000383
--------------------------------------------------------------------------------

Training with sample size: 100000
Memory usage: 4374.20 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 29445.59 KB

Results for leisurewaysorted, Sample Size: 100000
Grid Search Time: 0.00s, Training Time: 37.12s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9979, MAE = 24762.55, MAPE = 581.66%
q-score: 9.47
Prediction time: 10.4511 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 50000
Memory usage: 4374.20 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 15403.85 KB

Results for leisurewaysorted, Sample Size: 50000
Grid Search Time: 0.00s, Training Time: 18.22s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9965, MAE = 34017.94, MAPE = 1215.96%
q-score: 17.84
Prediction time: 8.6862 μs/sample
I/O: Reads=0.000000, Writes=0.000001
--------------------------------------------------------------------------------

Training with sample size: 10000
Memory usage: 4374.20 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 3479.10 KB

Results for leisurewaysorted, Sample Size: 10000
Grid Search Time: 0.00s, Training Time: 3.13s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9848, MAE = 77765.15, MAPE = 8226.19%
q-score: 100.86
Prediction time: 6.5046 μs/sample
I/O: Reads=0.000000, Writes=0.000250
--------------------------------------------------------------------------------

Training with sample size: 5000
Memory usage: 4374.20 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 1864.96 KB

Results for leisurewaysorted, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 1.53s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9672, MAE = 115805.67, MAPE = 17847.51%
q-score: 206.54
Prediction time: 5.6281 μs/sample
I/O: Reads=0.000000, Writes=0.000170
--------------------------------------------------------------------------------



Training with sample size: 1000
Memory usage: 4374.20 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 429.61 KB

Results for leisurewaysorted, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 0.45s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.7109, MAE = 373390.79, MAPE = 124069.78%
q-score: 1442.90
Prediction time: 4.1071 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Saving results for leisurewaysorted...


Memory usage: 4374.20 MB

Processing dataset: areawater
Universe boundaries for areawater: (-179.231086, -14.601813, 179.859681, 71.441059)
Loading data from ../large_files/resultsIntersects/areawater_results.csv


Parsing MBR coordinates...



Basic statistics for areawater dataset:
Max count: 2292737.0
Min count: 0.0
Mean count: 43941.55
Median count: 0.00
Total samples: 458552

Calculating rectangle densities...
Splitting data into train and test sets...
Training set size: 366841

Training with sample size: 366841
Memory usage: 4376.20 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Grid search complete in 242.19s
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 24328.84 KB

Results for areawater, Sample Size: 366841
Grid Search Time: 242.19s, Training Time: 166.69s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9995, MAE = 839.96, MAPE = 17.91%
q-score: 2.22
Prediction time: 8.4062 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Generating prediction scatter plot...


Generating side-by-side comparison plot...



Training with sample size: 100000
Memory usage: 4376.20 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 7377.40 KB

Results for areawater, Sample Size: 100000
Grid Search Time: 0.00s, Training Time: 31.85s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9984, MAE = 1581.00, MAPE = 64.86%
q-score: 5.56
Prediction time: 6.9202 μs/sample
I/O: Reads=0.000048, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 50000
Memory usage: 4374.20 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 3917.96 KB

Results for areawater, Sample Size: 50000
Grid Search Time: 0.00s, Training Time: 13.34s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9972, MAE = 2212.31, MAPE = 148.17%
q-score: 9.94
Prediction time: 6.3492 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 10000
Memory usage: 4374.20 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 968.20 KB

Results for areawater, Sample Size: 10000
Grid Search Time: 0.00s, Training Time: 2.80s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9882, MAE = 4887.32, MAPE = 660.51%
q-score: 37.15
Prediction time: 5.7803 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 5000
Memory usage: 4374.20 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 552.58 KB

Results for areawater, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 1.31s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9765, MAE = 7267.46, MAPE = 1471.06%
q-score: 67.36
Prediction time: 5.2330 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 1000
Memory usage: 4374.20 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 125.65 KB

Results for areawater, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 0.36s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9229, MAE = 16718.65, MAPE = 8105.42%
q-score: 365.71
Prediction time: 4.6732 μs/sample
I/O: Reads=0.000000, Writes=0.000008
--------------------------------------------------------------------------------
Saving results for areawater...


Memory usage: 4374.20 MB

Processing dataset: barrierthingwaysorted
Universe boundaries for barrierthingwaysorted: (-179.7595238, -70.776382, 179.19591350000002, 78.2501675)
Loading data from ../large_files/resultsIntersects/barrierthingwaysorted_results.csv


Parsing MBR coordinates...



Basic statistics for barrierthingwaysorted dataset:
Max count: 22908267.0
Min count: 0.0
Mean count: 399933.77
Median count: 329.00
Total samples: 4581670

Calculating rectangle densities...
Splitting data into train and test sets...


Training set size: 3665336

Training with sample size: 3665336
Memory usage: 4378.20 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Grid search complete in 4443.09s
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 932302.78 KB

Results for barrierthingwaysorted, Sample Size: 3665336
Grid Search Time: 4443.09s, Training Time: 2535.52s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9999, MAE = 4564.90, MAPE = 35.92%
q-score: 1.56
Prediction time: 38.8029 μs/sample
I/O: Reads=0.000000, Writes=0.000002
--------------------------------------------------------------------------------
Generating prediction scatter plot...


Generating side-by-side comparison plot...



Training with sample size: 3000000
Memory usage: 11808.93 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 770867.50 KB

Results for barrierthingwaysorted, Sample Size: 3000000
Grid Search Time: 0.00s, Training Time: 2099.67s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9998, MAE = 4924.53, MAPE = 40.84%
q-score: 1.64
Prediction time: 36.6875 μs/sample
I/O: Reads=0.000000, Writes=0.000003
--------------------------------------------------------------------------------

Training with sample size: 2000000
Memory usage: 4838.32 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 524579.47 KB

Results for barrierthingwaysorted, Sample Size: 2000000
Grid Search Time: 0.00s, Training Time: 1359.75s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9998, MAE = 5721.24, MAPE = 50.66%
q-score: 1.79
Prediction time: 33.0839 μs/sample
I/O: Reads=0.000000, Writes=0.000002
--------------------------------------------------------------------------------

Training with sample size: 1000000
Memory usage: 4654.96 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 271718.72 KB

Results for barrierthingwaysorted, Sample Size: 1000000
Grid Search Time: 0.00s, Training Time: 595.93s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9996, MAE = 7535.82, MAPE = 83.67%
q-score: 2.28
Prediction time: 28.1572 μs/sample
I/O: Reads=0.000000, Writes=0.000001
--------------------------------------------------------------------------------

Training with sample size: 500000
Memory usage: 4884.76 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 140767.90 KB

Results for barrierthingwaysorted, Sample Size: 500000
Grid Search Time: 0.00s, Training Time: 251.20s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9994, MAE = 9733.86, MAPE = 124.70%
q-score: 2.88
Prediction time: 22.1800 μs/sample
I/O: Reads=0.000006, Writes=0.000448
--------------------------------------------------------------------------------

Training with sample size: 100000
Memory usage: 4890.16 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 30706.31 KB

Results for barrierthingwaysorted, Sample Size: 100000
Grid Search Time: 0.00s, Training Time: 36.34s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9980, MAE = 18846.01, MAPE = 409.93%
q-score: 6.69
Prediction time: 10.8613 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 50000
Memory usage: 4890.16 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 16015.42 KB

Results for barrierthingwaysorted, Sample Size: 50000
Grid Search Time: 0.00s, Training Time: 17.50s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9964, MAE = 26019.60, MAPE = 775.38%
q-score: 11.48
Prediction time: 9.0484 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 10000
Memory usage: 4890.16 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 3589.68 KB

Results for barrierthingwaysorted, Sample Size: 10000
Grid Search Time: 0.00s, Training Time: 3.44s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9841, MAE = 62575.97, MAPE = 4278.59%
q-score: 51.57
Prediction time: 6.6606 μs/sample
I/O: Reads=0.000000, Writes=0.000300
--------------------------------------------------------------------------------

Training with sample size: 5000
Memory usage: 4890.16 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 1916.17 KB

Results for barrierthingwaysorted, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 1.63s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9689, MAE = 88018.83, MAPE = 9358.92%
q-score: 112.40
Prediction time: 5.7861 μs/sample
I/O: Reads=0.000000, Writes=0.000170
--------------------------------------------------------------------------------



Training with sample size: 1000
Memory usage: 4890.16 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 436.47 KB

Results for barrierthingwaysorted, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 0.43s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.6708, MAE = 316898.11, MAPE = 83658.35%
q-score: 955.84
Prediction time: 4.2005 μs/sample
I/O: Reads=0.000000, Writes=0.000001
--------------------------------------------------------------------------------
Saving results for barrierthingwaysorted...


Memory usage: 4890.16 MB

Processing dataset: yago2
Universe boundaries for yago2: (-179.98473, -90.0, 180.0, 90.0)
Loading data from ../large_files/resultsIntersects/yago2_results.csv


Parsing MBR coordinates...



Basic statistics for yago2 dataset:
Max count: 4494691.0
Min count: 0.0
Mean count: 100894.17
Median count: 17450.00
Total samples: 898942

Calculating rectangle densities...
Splitting data into train and test sets...
Training set size: 719153

Training with sample size: 719153
Memory usage: 4892.13 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Grid search complete in 619.83s
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 329080.08 KB

Results for yago2, Sample Size: 719153
Grid Search Time: 619.83s, Training Time: 365.49s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9994, MAE = 2415.38, MAPE = 3.10%
q-score: 1.03
Prediction time: 40.4835 μs/sample
I/O: Reads=0.000000, Writes=0.000003
--------------------------------------------------------------------------------
Generating prediction scatter plot...


Generating side-by-side comparison plot...



Training with sample size: 500000
Memory usage: 6292.79 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 229645.12 KB

Results for yago2, Sample Size: 500000
Grid Search Time: 0.00s, Training Time: 252.24s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9992, MAE = 2799.44, MAPE = 3.73%
q-score: 1.04
Prediction time: 36.2114 μs/sample
I/O: Reads=0.000033, Writes=0.002455
--------------------------------------------------------------------------------

Training with sample size: 100000
Memory usage: 4912.00 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 46908.55 KB

Results for yago2, Sample Size: 100000
Grid Search Time: 0.00s, Training Time: 39.04s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9965, MAE = 5763.27, MAPE = 8.34%
q-score: 1.09
Prediction time: 17.5928 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 50000
Memory usage: 4912.00 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 23672.65 KB

Results for yago2, Sample Size: 50000
Grid Search Time: 0.00s, Training Time: 17.61s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9945, MAE = 7692.12, MAPE = 12.02%
q-score: 1.13
Prediction time: 12.7321 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 10000
Memory usage: 4912.00 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 4889.23 KB

Results for yago2, Sample Size: 10000
Grid Search Time: 0.00s, Training Time: 3.15s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9693, MAE = 17883.77, MAPE = 327.89%
q-score: 4.22
Prediction time: 9.1879 μs/sample
I/O: Reads=0.000001, Writes=0.000413
--------------------------------------------------------------------------------

Training with sample size: 5000
Memory usage: 4912.00 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 2510.33 KB

Results for yago2, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 1.64s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9415, MAE = 24876.62, MAPE = 447.43%
q-score: 5.43
Prediction time: 8.0740 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------



Training with sample size: 1000
Memory usage: 4912.00 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 535.14 KB

Results for yago2, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 0.43s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.8264, MAE = 43649.54, MAPE = 1043.34%
q-score: 11.53
Prediction time: 5.5916 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Saving results for yago2...


Memory usage: 4912.00 MB

Processing dataset: arealm
Universe boundaries for arealm: (-179.147236, -14.548699, 179.77847, 71.359879)
Loading data from ../large_files/resultsIntersects/arealm_results.csv
Parsing MBR coordinates...



Basic statistics for arealm dataset:
Max count: 129098.0
Min count: 0.0
Mean count: 2283.77
Median count: 0.00
Total samples: 25833

Calculating rectangle densities...
Splitting data into train and test sets...
Training set size: 20666

Training with sample size: 20666
Memory usage: 4911.11 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 36 candidates, totalling 108 fits


Grid search complete in 477.74s
Best parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 11331.57 KB

Results for arealm, Sample Size: 20666
Grid Search Time: 477.74s, Training Time: 17.67s
Random Forest Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Performance: R² = 0.9943, MAE = 170.85, MAPE = 61.69%
q-score: 4.71
Prediction time: 83.2261 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Generating prediction scatter plot...
Generating side-by-side comparison plot...



Training with sample size: 10000
Memory usage: 4911.11 MB
Using best parameters from max scale: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 5896.83 KB

Results for arealm, Sample Size: 10000
Grid Search Time: 0.00s, Training Time: 7.92s
Random Forest Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Performance: R² = 0.9841, MAE = 278.87, MAPE = 82.07%
q-score: 5.29
Prediction time: 76.5489 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 5000
Memory usage: 4911.11 MB
Using best parameters from max scale: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 3276.05 KB

Results for arealm, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 4.45s
Random Forest Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Performance: R² = 0.9765, MAE = 346.38, MAPE = 121.06%
q-score: 7.66
Prediction time: 72.7006 μs/sample
I/O: Reads=0.000000, Writes=0.000252
--------------------------------------------------------------------------------

Training with sample size: 1000
Memory usage: 4911.11 MB
Using best parameters from max scale: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 783.34 KB

Results for arealm, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 1.64s
Random Forest Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Performance: R² = 0.8223, MAE = 1102.43, MAPE = 1242.85%
q-score: 72.07
Prediction time: 60.8022 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------


Saving results for arealm...
Memory usage: 4911.11 MB

Processing dataset: aerowaythingnodesorted
Universe boundaries for aerowaythingnodesorted: (-179.88088960000002, -90.0, 179.951004, 83.08333590000001)
Loading data from ../large_files/resultsIntersects/aerowaythingnodesorted_results.csv


Parsing MBR coordinates...

Basic statistics for aerowaythingnodesorted dataset:
Max count: 79139.0
Min count: 0.0
Mean count: 1260.61
Median count: 7.00
Total samples: 15843

Calculating rectangle densities...
Splitting data into train and test sets...
Training set size: 12674

Training with sample size: 12674
Memory usage: 4911.00 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 36 candidates, totalling 108 fits


Grid search complete in 408.62s
Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 28501.57 KB

Results for aerowaythingnodesorted, Sample Size: 12674
Grid Search Time: 408.62s, Training Time: 15.50s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Performance: R² = 0.9781, MAE = 252.63, MAPE = 462.65%
q-score: 8.32
Prediction time: 185.8478 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Generating prediction scatter plot...
Generating side-by-side comparison plot...



Training with sample size: 10000
Memory usage: 4910.99 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 22810.09 KB

Results for aerowaythingnodesorted, Sample Size: 10000
Grid Search Time: 0.00s, Training Time: 12.84s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Performance: R² = 0.9784, MAE = 271.26, MAPE = 493.46%
q-score: 8.79
Prediction time: 168.7624 μs/sample
I/O: Reads=0.000000, Writes=0.000316
--------------------------------------------------------------------------------

Training with sample size: 5000
Memory usage: 4911.00 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 12036.63 KB

Results for aerowaythingnodesorted, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 6.47s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Performance: R² = 0.9613, MAE = 372.63, MAPE = 730.61%
q-score: 12.55
Prediction time: 153.5441 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 1000
Memory usage: 4911.00 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 2764.45 KB

Results for aerowaythingnodesorted, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 2.11s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Performance: R² = 0.8061, MAE = 860.53, MAPE = 3130.52%
q-score: 49.29
Prediction time: 126.3392 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Saving results for aerowaythingnodesorted...
Memory usage: 4911.00 MB

Processing dataset: powerthingwaysorted
Universe boundaries for powerthingwaysorted: (-179.5002188, -75.1012051, 178.4574038, 82.5247908)
Loading data from ../large_files/resultsIntersects/powerthingwaysorted_results.csv


Parsing MBR coordinates...



Basic statistics for powerthingwaysorted dataset:
Max count: 13586343.0
Min count: 0.0
Mean count: 236405.41
Median count: 147.00
Total samples: 2717289

Calculating rectangle densities...
Splitting data into train and test sets...


Training set size: 2173831

Training with sample size: 2173831
Memory usage: 4914.99 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Grid search complete in 2385.50s
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 539415.81 KB

Results for powerthingwaysorted, Sample Size: 2173831
Grid Search Time: 2385.50s, Training Time: 1465.60s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9997, MAE = 4036.76, MAPE = 42.74%
q-score: 1.72
Prediction time: 32.3260 μs/sample
I/O: Reads=0.000000, Writes=0.000003
--------------------------------------------------------------------------------
Generating prediction scatter plot...


Generating side-by-side comparison plot...



Training with sample size: 2000000
Memory usage: 7952.78 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 497694.30 KB

Results for powerthingwaysorted, Sample Size: 2000000
Grid Search Time: 0.00s, Training Time: 1276.46s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9997, MAE = 4178.82, MAPE = 46.37%
q-score: 1.78
Prediction time: 30.4944 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 1000000
Memory usage: 4783.88 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 255406.16 KB

Results for powerthingwaysorted, Sample Size: 1000000
Grid Search Time: 0.00s, Training Time: 584.55s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9995, MAE = 5586.65, MAPE = 72.52%
q-score: 2.21
Prediction time: 26.1944 μs/sample
I/O: Reads=0.000000, Writes=0.000008
--------------------------------------------------------------------------------

Training with sample size: 500000
Memory usage: 4873.23 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 131465.05 KB

Results for powerthingwaysorted, Sample Size: 500000
Grid Search Time: 0.00s, Training Time: 249.09s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9991, MAE = 7198.89, MAPE = 124.42%
q-score: 3.03
Prediction time: 20.4749 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 100000
Memory usage: 4907.42 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 28654.33 KB

Results for powerthingwaysorted, Sample Size: 100000
Grid Search Time: 0.00s, Training Time: 38.96s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9975, MAE = 13551.89, MAPE = 501.41%
q-score: 8.58
Prediction time: 10.5173 μs/sample
I/O: Reads=0.000000, Writes=0.000159
--------------------------------------------------------------------------------

Training with sample size: 50000
Memory usage: 4908.11 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 15018.21 KB

Results for powerthingwaysorted, Sample Size: 50000
Grid Search Time: 0.00s, Training Time: 17.21s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9955, MAE = 18944.15, MAPE = 1109.47%
q-score: 16.65
Prediction time: 8.9939 μs/sample
I/O: Reads=0.000000, Writes=0.000001
--------------------------------------------------------------------------------

Training with sample size: 10000
Memory usage: 4908.11 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 3374.78 KB

Results for powerthingwaysorted, Sample Size: 10000
Grid Search Time: 0.00s, Training Time: 3.39s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9736, MAE = 46573.98, MAPE = 4971.44%
q-score: 67.57
Prediction time: 6.8085 μs/sample
I/O: Reads=0.000000, Writes=0.000002
--------------------------------------------------------------------------------

Training with sample size: 5000
Memory usage: 4908.11 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 1818.21 KB

Results for powerthingwaysorted, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 1.61s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9514, MAE = 67815.11, MAPE = 12630.27%
q-score: 162.69
Prediction time: 5.9807 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------



Training with sample size: 1000
Memory usage: 4908.11 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 422.96 KB

Results for powerthingwaysorted, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 0.45s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.6974, MAE = 159314.35, MAPE = 45746.97%
q-score: 545.04
Prediction time: 4.3391 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Saving results for powerthingwaysorted...


Memory usage: 4908.11 MB

Processing dataset: emergencythingwaysorted
Universe boundaries for emergencythingwaysorted: (-175.221337, -53.7941359, 179.3313189, 78.22019230000001)
Loading data from ../large_files/resultsIntersects/emergencythingwaysorted_results.csv


Parsing MBR coordinates...



Basic statistics for emergencythingwaysorted dataset:
Max count: 807533.0
Min count: 0.0
Mean count: 13253.96
Median count: 15.00
Total samples: 161514

Calculating rectangle densities...
Splitting data into train and test sets...
Training set size: 129211

Training with sample size: 129211
Memory usage: 4910.08 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Grid search complete in 79.65s
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 32391.67 KB

Results for emergencythingwaysorted, Sample Size: 129211
Grid Search Time: 79.65s, Training Time: 45.80s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9984, MAE = 467.79, MAPE = 56.74%
q-score: 2.00
Prediction time: 20.2994 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Generating prediction scatter plot...
Generating side-by-side comparison plot...



Training with sample size: 100000
Memory usage: 4909.09 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 25611.41 KB

Results for emergencythingwaysorted, Sample Size: 100000
Grid Search Time: 0.00s, Training Time: 35.57s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9982, MAE = 519.28, MAPE = 66.13%
q-score: 2.15
Prediction time: 18.4719 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 50000
Memory usage: 4909.11 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 13541.54 KB

Results for emergencythingwaysorted, Sample Size: 50000
Grid Search Time: 0.00s, Training Time: 18.12s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9962, MAE = 779.47, MAPE = 132.02%
q-score: 3.21
Prediction time: 17.0749 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 10000
Memory usage: 4909.11 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...


Making predictions...
Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 3159.73 KB

Results for emergencythingwaysorted, Sample Size: 10000
Grid Search Time: 0.00s, Training Time: 3.30s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9851, MAE = 2061.08, MAPE = 570.67%
q-score: 9.55
Prediction time: 14.5177 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 5000
Memory usage: 4909.11 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 1712.95 KB

Results for emergencythingwaysorted, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 1.74s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.9694, MAE = 3227.60, MAPE = 1173.53%
q-score: 18.26
Prediction time: 13.5812 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------



Training with sample size: 1000
Memory usage: 4909.36 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training random forest model...
Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 390.38 KB

Results for emergencythingwaysorted, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 0.41s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Performance: R² = 0.7169, MAE = 9314.28, MAPE = 3573.76%
q-score: 53.40
Prediction time: 11.5074 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------
Saving results for emergencythingwaysorted...


Memory usage: 4909.36 MB

Processing dataset: craftwaysorted
Universe boundaries for craftwaysorted: (-175.2000514, -65.2458821, 175.3397782, 69.6673353)
Loading data from ../large_files/resultsIntersects/craftwaysorted_results.csv
Parsing MBR coordinates...



Basic statistics for craftwaysorted dataset:
Max count: 108929.0
Min count: 0.0
Mean count: 1705.24
Median count: 0.00
Total samples: 21822

Calculating rectangle densities...
Splitting data into train and test sets...
Training set size: 17457

Training with sample size: 17457
Memory usage: 4908.43 MB
Performing grid search for optimal parameters...
Fitting 3 folds for each of 36 candidates, totalling 108 fits


Grid search complete in 503.64s
Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 26119.31 KB

Results for craftwaysorted, Sample Size: 17457
Grid Search Time: 503.64s, Training Time: 20.84s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Performance: R² = 0.9931, MAE = 176.71, MAPE = 76.19%
q-score: 2.63
Prediction time: 145.7200 μs/sample
I/O: Reads=0.000000, Writes=0.000160
--------------------------------------------------------------------------------
Generating prediction scatter plot...
Generating side-by-side comparison plot...



Training with sample size: 10000
Memory usage: 4908.43 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 16074.05 KB

Results for craftwaysorted, Sample Size: 10000
Grid Search Time: 0.00s, Training Time: 12.16s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Performance: R² = 0.9885, MAE = 247.02, MAPE = 142.45%
q-score: 3.89
Prediction time: 129.2658 μs/sample
I/O: Reads=0.000000, Writes=0.000000
--------------------------------------------------------------------------------

Training with sample size: 5000
Memory usage: 4908.43 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...


Model size: 8795.61 KB

Results for craftwaysorted, Sample Size: 5000
Grid Search Time: 0.00s, Training Time: 6.02s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Performance: R² = 0.9710, MAE = 406.57, MAPE = 241.02%
q-score: 5.68
Prediction time: 113.8700 μs/sample
I/O: Reads=0.000000, Writes=0.000115
--------------------------------------------------------------------------------

Training with sample size: 1000
Memory usage: 4908.43 MB
Using best parameters from max scale: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Training random forest model...


Making predictions...


Calculating performance metrics...
Measuring prediction performance...


Saving model...
Model size: 2042.62 KB

Results for craftwaysorted, Sample Size: 1000
Grid Search Time: 0.00s, Training Time: 1.86s
Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Performance: R² = 0.7582, MAE = 1211.56, MAPE = 725.38%
q-score: 14.54
Prediction time: 96.7891 μs/sample
I/O: Reads=0.000000, Writes=0.000069
--------------------------------------------------------------------------------


Saving results for craftwaysorted...
Saving combined results...
All processing completed and results saved.
Memory usage: 4908.43 MB
