In [1]:
import subprocess
import json
from urllib.request import urlopen

def in_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False

def install_dependencies_from_pyproject():
    # URL to raw pyproject.toml file in your GitHub repository
    pyproject_url = "https://raw.githubusercontent.com/rdhyee/isamples-python/exploratory/pyproject.toml"
    
    with urlopen(pyproject_url) as response:
        pyproject_content = response.read().decode()
    
    # Parse the TOML content
    import toml
    pyproject_data = toml.loads(pyproject_content)
    
    # Extract dependencies
    dependencies = pyproject_data.get('tool', {}).get('poetry', {}).get('dependencies', {})
    
    # Install each dependency
    for package, version in dependencies.items():
        if isinstance(version, str):
            subprocess.run(['pip', 'install', f"{package}{version}"])
        elif isinstance(version, dict):
            # Handle more complex version specifications
            version_str = version.get('version', '')
            subprocess.run(['pip', 'install', f"{package}{version_str}"])

if in_colab():
    # Install toml parser first
    subprocess.run(['pip', 'install', 'toml'])
    install_dependencies_from_pyproject()
    # pip install git+https://github.com/rdhyee/isamples-python.git@exploratory#egg=isamples_client
    subprocess.run(['pip', 'install', 'git+https://github.com/rdhyee/isamples-python.git@exploratory#egg=isamples_client'])

    from google.colab import output
    output.enable_custom_widget_manager()

In [2]:
import duckdb

# Connect to a database (in-memory for this example)
con = duckdb.connect(database=':memory:', read_only=False)

# Execute the SQL commands
con.execute("SET VARIABLE parquet_path = 'https://z.rslv.xyz/10.5281/zenodo.15278210/isamples_export_2025_04_21_16_23_46_geo.parquet';")
con.execute("CREATE TEMP VIEW my_data AS SELECT(*) FROM read_parquet(getvariable('parquet_path'));")
result = con.execute("SELECT count(*) from my_data;").fetchone()

# Print the result
print(result[0])

# Close the connection
con.close()

6680932


## Why DuckDB + Remote Parquet is So Fast

The previous cell demonstrates an incredibly efficient approach that leverages several key technologies:

### 1. **HTTP Range Requests (Byte-Range Handling)**
- `z.rslv.xyz` supports HTTP Range requests
- DuckDB can request only the specific bytes it needs from the remote file
- For a `COUNT(*)` operation, DuckDB only needs to read:
  - Parquet file metadata (footer)
  - Row group metadata 
  - NOT the actual data rows

### 2. **Parquet Columnar Format Benefits**
- Parquet stores metadata about row counts in each row group
- DuckDB can sum these counts without reading data
- For a ~300MB file, this might only require reading a few KB

### 3. **DuckDB's Query Optimization**
- Pushdown optimization: operations are pushed to the file level
- Lazy evaluation: only reads what's absolutely necessary
- Efficient metadata parsing

This means a `COUNT(*)` on a 300MB remote file can complete in seconds rather than minutes!

In [3]:
import time
import duckdb

# Demonstrate different types of queries and their efficiency with remote Parquet
con = duckdb.connect(database=':memory:', read_only=False)
remote_url = 'https://z.rslv.xyz/10.5281/zenodo.15278210/isamples_export_2025_04_21_16_23_46_geo.parquet'
con.execute(f"SET VARIABLE parquet_path = '{remote_url}';")
con.execute("CREATE TEMP VIEW my_data AS SELECT(*) FROM read_parquet(getvariable('parquet_path'));")

print("=== DuckDB Remote Parquet Performance Demo ===\n")

# Test 1: COUNT(*) - Only needs metadata
print("1. COUNT(*) - Metadata only")
start_time = time.time()
result = con.execute("SELECT count(*) from my_data;").fetchone()
elapsed = time.time() - start_time
print(f"   Result: {result[0]:,} records")
print(f"   Time: {elapsed:.2f} seconds")
print(f"   Data read: Minimal (just metadata)\n")

# Test 2: Count by groups - Still mostly metadata
print("2. COUNT by source_collection - Lightweight aggregation")
start_time = time.time()
result = con.execute("SELECT source_collection, count(*) FROM my_data GROUP BY source_collection ORDER BY count(*) DESC;").fetchall()
elapsed = time.time() - start_time
print("   Results:")
for source, count in result:
    print(f"     {source}: {count:,}")
print(f"   Time: {elapsed:.2f} seconds")
print(f"   Data read: Only source_collection column + metadata\n")

# Test 3: Simple column stats - Reads one column
print("3. Latitude statistics - Single column read")
start_time = time.time()
result = con.execute("""
    SELECT 
        count(*) as total,
        count(sample_location_latitude) as non_null,
        min(sample_location_latitude) as min_lat,
        max(sample_location_latitude) as max_lat,
        avg(sample_location_latitude) as avg_lat
    FROM my_data;
""").fetchone()
elapsed = time.time() - start_time
print(f"   Total records: {result[0]:,}")
print(f"   Non-null coordinates: {result[1]:,}")
print(f"   Latitude range: {result[2]:.3f} to {result[3]:.3f}")
print(f"   Average latitude: {result[4]:.3f}")
print(f"   Time: {elapsed:.2f} seconds")
print(f"   Data read: Only latitude column\n")

# Test 4: More complex query - Still efficient due to columnar format
print("4. Geographic bounding box filter - Selective read")
start_time = time.time()
result = con.execute("""
    SELECT count(*) 
    FROM my_data 
    WHERE sample_location_longitude BETWEEN -125 AND -66
      AND sample_location_latitude BETWEEN 24 AND 50;
""").fetchone()
elapsed = time.time() - start_time
print(f"   Records in continental US bounds: {result[0]:,}")
print(f"   Time: {elapsed:.2f} seconds")
print(f"   Data read: Only lon/lat columns + pushdown filtering\n")

con.close()

print("=== Key Insights ===")
print("• COUNT(*) is nearly instant - uses only Parquet metadata")
print("• Aggregations by categorical columns are very fast")
print("• Single-column operations read only that column")
print("• Filtering is pushed down to the file level")
print("• This approach scales to files much larger than available RAM")
print("\nThis is why DuckDB + remote Parquet is perfect for exploratory data analysis!")

=== DuckDB Remote Parquet Performance Demo ===

1. COUNT(*) - Metadata only
   Result: 6,680,932 records
   Time: 2.93 seconds
   Data read: Minimal (just metadata)

2. COUNT by source_collection - Lightweight aggregation
   Result: 6,680,932 records
   Time: 2.93 seconds
   Data read: Minimal (just metadata)

2. COUNT by source_collection - Lightweight aggregation
   Results:
     SESAR: 4,688,386
     OPENCONTEXT: 1,064,831
     GEOME: 605,554
     SMITHSONIAN: 322,161
   Time: 3.78 seconds
   Data read: Only source_collection column + metadata

3. Latitude statistics - Single column read
   Results:
     SESAR: 4,688,386
     OPENCONTEXT: 1,064,831
     GEOME: 605,554
     SMITHSONIAN: 322,161
   Time: 3.78 seconds
   Data read: Only source_collection column + metadata

3. Latitude statistics - Single column read


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

   Total records: 6,680,932
   Non-null coordinates: 5,980,282
   Latitude range: -89.983 to 89.981
   Average latitude: 16.281
   Time: 8.82 seconds
   Data read: Only latitude column

4. Geographic bounding box filter - Selective read


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

   Records in continental US bounds: 1,153,603
   Time: 12.05 seconds
   Data read: Only lon/lat columns + pushdown filtering

=== Key Insights ===
• COUNT(*) is nearly instant - uses only Parquet metadata
• Aggregations by categorical columns are very fast
• Single-column operations read only that column
• Filtering is pushed down to the file level
• This approach scales to files much larger than available RAM

This is why DuckDB + remote Parquet is perfect for exploratory data analysis!


In [4]:
# Compare with what traditional approaches would require
print("=== Traditional vs DuckDB Approach Comparison ===\n")

file_size_mb = 300  # Approximate size of the parquet file

print("Traditional Approach (e.g., pandas.read_parquet()):")
print(f"• Download entire file: {file_size_mb} MB")
print("• Load into memory: ~300-600 MB (depending on data types)")
print("• Process in Python: Limited by single-core performance")
print("• Time for COUNT(*): 30-60 seconds + download time")
print("• Memory requirement: > 1GB")
print()

print("DuckDB + Remote Parquet Approach:")
print("• Download for COUNT(*): < 1 KB (just metadata)")
print("• Memory usage: < 10 MB")
print("• Process with optimized engine: Multi-threaded, vectorized")
print("• Time for COUNT(*): 1-3 seconds")
print("• Memory requirement: Minimal")
print()

print("=== When to Use Each Approach ===")
print()
print("Use DuckDB + Remote Parquet when:")
print("✅ Doing exploratory analysis (counts, aggregations, sampling)")
print("✅ Working with large files that don't fit in memory")
print("✅ Need fast iteration on different queries")
print("✅ Bandwidth is limited")
print("✅ Working in cloud environments (Colab, Binder)")
print()

print("Consider local download when:")
print("• Need to do complex row-by-row operations")
print("• Performing many different analyses on the same data")
print("• Have unreliable network connection")
print("• Need to use libraries that require full data in memory")
print()

print("=== Best Practices for Large Remote Parquet Files ===")
print("1. Start with DuckDB for exploration and understanding")
print("2. Use COUNT(*), value_counts(), and aggregations to understand structure")
print("3. Filter data remotely before downloading subsets")
print("4. Cache filtered/sampled results locally for visualization")
print("5. Only download full dataset when absolutely necessary")

=== Traditional vs DuckDB Approach Comparison ===

Traditional Approach (e.g., pandas.read_parquet()):
• Download entire file: 300 MB
• Load into memory: ~300-600 MB (depending on data types)
• Process in Python: Limited by single-core performance
• Time for COUNT(*): 30-60 seconds + download time
• Memory requirement: > 1GB

DuckDB + Remote Parquet Approach:
• Download for COUNT(*): < 1 KB (just metadata)
• Memory usage: < 10 MB
• Process with optimized engine: Multi-threaded, vectorized
• Time for COUNT(*): 1-3 seconds
• Memory requirement: Minimal

=== When to Use Each Approach ===

Use DuckDB + Remote Parquet when:
✅ Doing exploratory analysis (counts, aggregations, sampling)
✅ Working with large files that don't fit in memory
✅ Need fast iteration on different queries
✅ Bandwidth is limited
✅ Working in cloud environments (Colab, Binder)

Consider local download when:
• Need to do complex row-by-row operations
• Performing many different analyses on the same data
• Have unreliable

In [5]:
# Practical example: Efficiently preparing data for visualization
print("=== Efficient Data Preparation for Visualization ===\n")

con = duckdb.connect(database=':memory:', read_only=False)
remote_url = 'https://z.rslv.xyz/10.5281/zenodo.15278210/isamples_export_2025_04_21_16_23_46_geo.parquet'
con.execute(f"SET VARIABLE parquet_path = '{remote_url}';")
con.execute("CREATE TEMP VIEW my_data AS SELECT(*) FROM read_parquet(getvariable('parquet_path'));")

# Step 1: Understand the data structure
print("1. Understanding data structure...")
start_time = time.time()

# Get basic counts
total_count = con.execute("SELECT count(*) FROM my_data").fetchone()[0]
geo_count = con.execute("""
    SELECT count(*) FROM my_data 
    WHERE sample_location_latitude IS NOT NULL 
    AND sample_location_longitude IS NOT NULL
""").fetchone()[0]

print(f"   Total records: {total_count:,}")
print(f"   Records with coordinates: {geo_count:,} ({geo_count/total_count*100:.1f}%)")
print(f"   Time: {time.time() - start_time:.2f} seconds\n")

# Step 2: Sample data efficiently for visualization
print("2. Creating stratified sample for visualization...")
start_time = time.time()

# Get sample that maintains source collection proportions
# Fixed version - avoid correlated subqueries in LIMIT clause
sample_query = """
    WITH collection_counts AS (
        SELECT source_collection, count(*) as total_count
        FROM my_data 
        WHERE sample_location_latitude IS NOT NULL 
        AND sample_location_longitude IS NOT NULL
        GROUP BY source_collection
    ),
    collection_samples AS (
        SELECT 
            source_collection,
            CASE 
                WHEN total_count > 5000 THEN 5000
                ELSE total_count 
            END as sample_size
        FROM collection_counts
    ),
    numbered_data AS (
        SELECT 
            sample_identifier,
            source_collection,
            sample_location_longitude as longitude,
            sample_location_latitude as latitude,
            has_material_category,
            label,
            row_number() OVER (PARTITION BY source_collection ORDER BY RANDOM()) as rn
        FROM my_data 
        WHERE sample_location_latitude IS NOT NULL 
        AND sample_location_longitude IS NOT NULL
    )
    SELECT 
        nd.sample_identifier,
        nd.source_collection,
        nd.longitude,
        nd.latitude,
        nd.has_material_category,
        nd.label
    FROM numbered_data nd
    INNER JOIN collection_samples cs ON nd.source_collection = cs.source_collection
    WHERE nd.rn <= cs.sample_size
    LIMIT 50000;
"""

# Execute the sampling query
sample_result = con.execute(sample_query).fetchdf()
elapsed = time.time() - start_time

print(f"   Sample size: {len(sample_result):,} records")
print(f"   Columns: {list(sample_result.columns)}")
print(f"   Memory usage: ~{sample_result.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")
print(f"   Time: {elapsed:.2f} seconds")
print(f"   Data transferred: ~{len(sample_result) * 6 * 8 / 1024 / 1024:.1f} MB (estimated)\n")

# Show sample distribution
print("   Sample distribution by source:")
sample_dist = sample_result['source_collection'].value_counts()
for source, count in sample_dist.items():
    print(f"     {source}: {count:,}")

print()

# Step 3: Show how this could be saved for efficient reuse
print("3. Efficient caching strategy...")
print("   • Save sample as local Parquet file for reuse")
print("   • Use compressed format to minimize storage")
print("   • Include metadata about sampling method")

# Example of saving (uncommented for demo)
# sample_result.to_parquet('/tmp/isamples_visualization_sample.parquet', compression='snappy')

con.close()

print("\n=== Key Takeaways ===")
print("• Remote querying allows efficient exploration without large downloads")
print("• Stratified sampling maintains data representativeness")
print("• 50K sample points are sufficient for most visualization needs")
print("• Transferring 50K records vs 6M records: ~40x less data transfer")
print("• This approach works well for both local analysis and cloud environments")
print("\nThis sampled data would be perfect for Lonboard visualization!")

=== Efficient Data Preparation for Visualization ===

1. Understanding data structure...
1. Understanding data structure...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

   Total records: 6,680,932
   Records with coordinates: 5,980,282 (89.5%)
   Time: 13.08 seconds

2. Creating stratified sample for visualization...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

   Sample size: 20,000 records
   Columns: ['sample_identifier', 'source_collection', 'longitude', 'latitude', 'has_material_category', 'label']
   Memory usage: ~6.7 MB
   Time: 60.76 seconds
   Data transferred: ~0.9 MB (estimated)

   Sample distribution by source:
     SESAR: 5,000
     OPENCONTEXT: 5,000
     SMITHSONIAN: 5,000
     GEOME: 5,000

3. Efficient caching strategy...
   • Save sample as local Parquet file for reuse
   • Use compressed format to minimize storage
   • Include metadata about sampling method

=== Key Takeaways ===
• Remote querying allows efficient exploration without large downloads
• Stratified sampling maintains data representativeness
• 50K sample points are sufficient for most visualization needs
• Transferring 50K records vs 6M records: ~40x less data transfer
• This approach works well for both local analysis and cloud environments

This sampled data would be perfect for Lonboard visualization!
