In [1]:
from pathlib import Path as P

import duckdb
import geopandas as gpd
import pandas as pd
import shapely
from palettable.colorbrewer.diverging import BrBG_10
from sidecar import Sidecar

from lonboard import Map, ScatterplotLayer
# from lonboard.colormap import apply_continuous_cmap

import ibis


In [2]:
ibis.options.interactive = True


In [3]:
# https://storage.googleapis.com/opencontext-parquet/oc_isamples_pqg.parquet
# wget https://storage.googleapis.com/opencontext-parquet/oc_isamples_pqg.parquet -O /Users/raymondyee/Data/iSample/oc_isamples_pqg.parquet

local_path = P("/Users/raymondyee/Data/iSample/oc_isamples_pqg.parquet")
local_path.exists()

True

# using pandas and geopandas

In [29]:
df = pd.read_parquet(local_path)
df.columns

Index(['row_id', 'pid', 'tcreated', 'tmodified', 'otype', 's', 'p', 'o', 'n',
       'altids', 'geometry', 'authorized_by', 'has_feature_of_interest',
       'affiliation', 'sampling_purpose', 'complies_with', 'project',
       'alternate_identifiers', 'relationship', 'elevation',
       'sample_identifier', 'dc_rights', 'result_time', 'contact_information',
       'latitude', 'target', 'role', 'scheme_uri', 'is_part_of', 'scheme_name',
       'name', 'longitude', 'obfuscated', 'curation_location',
       'last_modified_time', 'access_constraints', 'place_name', 'description',
       'label', 'thumbnail_url'],
      dtype='object')

In [31]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import numpy as np

# Assuming you already have your DataFrame loaded
# df = pd.read_parquet('your_file.parquet')

# Create a geometry column by converting lat/lon to Points
# Handle null values by setting them to None
def create_point(row):
    if pd.isna(row['latitude']) or pd.isna(row['longitude']):
        return None
    else:
        return Point(row['longitude'], row['latitude'])

# Apply the function to create geometry
df['geometry'] = df.apply(create_point, axis=1)

# Convert to GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry='geometry')

# Set the coordinate reference system (CRS) - commonly WGS 84 for lat/lon
gdf.set_crs(epsg=4326, inplace=True)

# Save as geoparquet
gdf.to_parquet('/Users/raymondyee/Data/iSample/opencontext/output_geoparquet.parquet')

# using ibis

In [4]:
# ibis to read the parquet file
table = ibis.read_parquet(local_path)
table

In [24]:
def create_geometry_from_latlon(df, lat_col='latitude', lon_col='longitude', crs='EPSG:4326'):
    """
    Create a geometry column from latitude and longitude columns in a dataframe.
    
    Parameters:
    -----------
    df : Table-like object
        The dataframe or table containing latitude and longitude columns
    lat_col : str, default='latitude'
        Name of the latitude column
    lon_col : str, default='longitude'
        Name of the longitude column
    crs : str, default='EPSG:4326'
        Coordinate reference system for the geometry
        
    Returns:
    --------
    geopandas.GeoDataFrame
        DataFrame with geometry column created from lat/lon values
    """
    import pandas as pd
    import geopandas as gpd
    from shapely.geometry import Point
    import numpy as np
    
    # Convert to pandas DataFrame if it's not already one
    try:
        # Try to convert to pandas DataFrame if it's not already one
        if not isinstance(df, pd.DataFrame):
            df_copy = pd.DataFrame(df)
        else:
            df_copy = df.copy()
    except:
        # If conversion fails, try to work with the original object
        df_copy = df
    
    # Check if latitude and longitude columns exist and are not entirely null
    if (lat_col in df_copy.columns and lon_col in df_copy.columns):
        # Check if all values are null - handle different null checking approaches
        try:
            lat_all_null = df_copy[lat_col].isna().all()
            lon_all_null = df_copy[lon_col].isna().all()
        except AttributeError:
            # Alternative approach if isna() is not available
            try:
                lat_all_null = all(x is None for x in df_copy[lat_col])
                lon_all_null = all(x is None for x in df_copy[lon_col])
            except:
                # If we can't check for nulls, assume there are some valid values
                lat_all_null = False
                lon_all_null = False
        
        if not (lat_all_null or lon_all_null):
            # Create a mask for valid coordinate pairs
            try:
                valid_coords_mask = ~(pd.isna(df_copy[lat_col]) | pd.isna(df_copy[lon_col]))
            except:
                # Alternative approach if pandas isna doesn't work
                valid_coords_mask = [(x is not None and y is not None) 
                                    for x, y in zip(df_copy[lat_col], df_copy[lon_col])]
            
            # Create a new DataFrame with just the data we need
            valid_data = {
                'latitude': df_copy[lat_col],
                'longitude': df_copy[lon_col]
            }
            
            # Add all other columns if possible
            try:
                for col in df_copy.columns:
                    if col not in [lat_col, lon_col, 'geometry']:
                        valid_data[col] = df_copy[col]
            except:
                pass
            
            # Create a pandas DataFrame
            pdf = pd.DataFrame(valid_data)
            
            # Create geometry column
            geometries = [
                Point(lon, lat) if pd.notna(lon) and pd.notna(lat) else None
                for lon, lat in zip(pdf['longitude'], pdf['latitude'])
            ]
            
            # Create GeoDataFrame
            gdf = gpd.GeoDataFrame(pdf, geometry=geometries, crs=crs)
            
            print(f"Geometry column created from lat/long values. "
                  f"{sum(g is not None for g in geometries)} valid geometries "
                  f"created out of {len(gdf)} records.")
            
            return gdf
        else:
            print(f"Could not create geometry column. All values in {lat_col} or {lon_col} are null.")
    else:
        print(f"Could not create geometry column. Missing {lat_col} or {lon_col} columns.")
    
    # If we can't create a GeoDataFrame, return the original table
    return df

# Call the function to create a new GeoDataFrame with geometry column
gdf = create_geometry_from_latlon(table)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Could not create geometry column. Missing latitude or longitude columns.


In [26]:
def validate_and_save_geoparquet(gdf, output_path="output.parquet", min_valid_geoms=1):
    """
    Validates a GeoDataFrame and saves it as a GeoParquet file if valid.
    
    Parameters:
    -----------
    gdf : geopandas.GeoDataFrame
        The GeoDataFrame to validate and save
    output_path : str, default="output.parquet"
        Path where the GeoParquet file will be saved
    min_valid_geoms : int, default=1
        Minimum number of valid geometries required
        
    Returns:
    --------
    bool
        True if validation passed and file was saved, False otherwise
    """
    import geopandas as gpd
    import pandas as pd
    import os
    
    # Check if the input is a GeoDataFrame
    if not isinstance(gdf, gpd.GeoDataFrame):
        print("Error: Input is not a GeoDataFrame")
        return False
    
    # Check if the GeoDataFrame has a geometry column
    if gdf.geometry is None or gdf.geometry.name not in gdf.columns:
        print("Error: GeoDataFrame does not have a valid geometry column")
        return False
    
    # Check if the GeoDataFrame has a CRS
    if gdf.crs is None:
        print("Warning: GeoDataFrame has no CRS (Coordinate Reference System)")
        print("Setting default CRS to EPSG:4326 (WGS84)")

In [27]:
validate_and_save_geoparquet(gdf, output_path="/Users/raymondyee/Data/iSample/opencontext/output_geoparquet.parquet", min_valid_geoms=1)

Error: Input is not a GeoDataFrame


False

In [7]:
table['otype'].value_counts()

In [8]:
table['geometry'].notnull().sum()

┌────────┐
│ [1;36m194040[0m │
└────────┘

In [19]:
# Initialize DuckDB connection
conn = duckdb.connect(':memory:')  # or specify a database file

# Install and load spatial extension
conn.execute("INSTALL spatial;")
conn.execute("LOAD spatial;")

<duckdb.duckdb.DuckDBPyConnection at 0x13af9f6f0>

In [20]:
query = f"""
SET VARIABLE parquet_path = '{local_path}';

CREATE VIEW my_data AS
    SELECT * FROM read_parquet(getvariable('parquet_path'));

SELECT COUNT(pid) AS ss_count, s 
FROM my_data
WHERE p = 'sampling_site' 
GROUP BY s 
HAVING ss_count > 1
ORDER BY ss_count DESC;
"""

r = conn.execute(query)

In [21]:
r.fetchall()

[]

In [22]:
r = conn.execute("""
  DESCRIBE my_data;
""")
r.fetchall()


[('row_id', 'INTEGER', 'YES', None, None, None),
 ('pid', 'VARCHAR', 'YES', None, None, None),
 ('tcreated', 'INTEGER', 'YES', None, None, None),
 ('tmodified', 'INTEGER', 'YES', None, None, None),
 ('otype', 'VARCHAR', 'YES', None, None, None),
 ('s', 'INTEGER', 'YES', None, None, None),
 ('p', 'VARCHAR', 'YES', None, None, None),
 ('o', 'INTEGER[]', 'YES', None, None, None),
 ('n', 'VARCHAR', 'YES', None, None, None),
 ('altids', 'VARCHAR[]', 'YES', None, None, None),
 ('geometry', 'BLOB', 'YES', None, None, None),
 ('authorized_by', 'VARCHAR[]', 'YES', None, None, None),
 ('has_feature_of_interest', 'VARCHAR', 'YES', None, None, None),
 ('affiliation', 'VARCHAR', 'YES', None, None, None),
 ('sampling_purpose', 'VARCHAR', 'YES', None, None, None),
 ('complies_with', 'VARCHAR[]', 'YES', None, None, None),
 ('project', 'VARCHAR', 'YES', None, None, None),
 ('alternate_identifiers', 'VARCHAR[]', 'YES', None, None, None),
 ('relationship', 'VARCHAR', 'YES', None, None, None),
 ('elevatio

In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

def create_geometry(row):
    # We'll directly check for latitude and longitude without looking for geometry_type
    # since it doesn't exist in the dataframe
    if pd.notna(row.get('item__latitude', None)) and pd.notna(row.get('item__longitude', None)):
        return Point(row['item__longitude'], row['item__latitude'])
    else:
        return None

# load the dataset into a dataframe
df = table.to_pandas()

# Let's first check what columns actually exist in the dataframe
print("Available columns:", df.columns.tolist())

# Create geometry column - wrapped in try-except to handle potential errors
try:
    df['geometry'] = df.apply(create_geometry, axis=1)
    
    # Count how many rows have geometry data
    geo_count = df['geometry'].notna().sum()
    total_count = len(df)
    print(f"Rows with geometry: {geo_count} out of {total_count} ({geo_count/total_count:.2%})")
    
    # Create a GeoDataFrame with only rows that have geometry
    if geo_count > 0:
        gdf = gpd.GeoDataFrame(df.loc[df['geometry'].notna()], geometry='geometry')
        
        # Set CRS to WGS 84 since we're using lat/long
        gdf.set_crs(epsg=4326, inplace=True)
        
        # Write to GeoParquet
        output_path = P.home() / 'data/iSample/opencontext' / 'output_geoparquet.parquet'
        gdf.to_parquet(output_path)
        print(f"Saved {len(gdf)} rows with geometry to {output_path}")
    else:
        print("No rows with valid geometry found.")
        
    # Optionally save the rows without geometry to a separate file
    non_geo_df = df[df['geometry'].isna()]
    if not non_geo_df.empty:
        non_geo_path = P.home() / 'data/iSample/opencontext' / 'non_geo_output.parquet'
        non_geo_df.drop(columns=['geometry'], errors='ignore').to_parquet(non_geo_path)
        print(f"Saved {len(non_geo_df)} rows without geometry to {non_geo_path}")
        
except Exception as e:
    print(f"Error processing data: {str(e)}")
    # Check if latitude and longitude columns exist
    lat_col = [col for col in df.columns if 'lat' in col.lower()]
    lon_col = [col for col in df.columns if 'lon' in col.lower()]
    print(f"Potential latitude columns: {lat_col}")
    print(f"Potential longitude columns: {lon_col}")

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
from concurrent.futures import ThreadPoolExecutor
import multiprocessing

# Load the dataset into a dataframe
df = table.to_pandas()

# Vectorized approach for creating Point geometries
def create_geometries_vectorized(df):
    # Extract longitude and latitude columns
    longitudes = df['item__longitude'].values
    latitudes = df['item__latitude'].values
    
    # Create a mask for valid coordinates (both lat and long are not NA)
    valid_mask = ~(np.isnan(longitudes) | np.isnan(latitudes))
    
    # Initialize an empty geometry array with the same length as the DataFrame
    geometries = np.empty(len(df), dtype=object)
    geometries[:] = None  # Explicitly set all to None initially
    
    # Only create Points for valid coordinates
    valid_indices = np.where(valid_mask)[0]
    
    # Create Point objects in parallel using multiple cores
    num_cores = multiprocessing.cpu_count()
    
    def create_point_batch(indices):
        points = []
        for i in indices:
            try:
                points.append(Point(longitudes[i], latitudes[i]))
            except (ValueError, TypeError):
                # If there's any issue creating the point, append None
                points.append(None)
                # Update the valid mask to mark this as invalid
                valid_mask[i] = False
        return points, indices
    
    # Split the valid indices into batches for parallel processing
    batch_size = max(1, len(valid_indices) // (num_cores * 4))
    batches = [valid_indices[i:i + batch_size] for i in range(0, len(valid_indices), batch_size)]
    
    # Process batches in parallel
    with ThreadPoolExecutor(max_workers=num_cores) as executor:
        results = list(executor.map(create_point_batch, batches))
    
    # Flatten the results and assign to corresponding indices
    for batch_points, batch_indices in results:
        for idx, point in enumerate(batch_points):
            if point is not None:  # Only assign valid points
                geometries[batch_indices[idx]] = point
    
    # Recompute valid_mask based on the actual Points created
    valid_mask = np.array([geom is not None for geom in geometries])
    
    return geometries, valid_mask

# Create geometries using the vectorized approach
geometries, valid_mask = create_geometries_vectorized(df)

# Filter the DataFrame to keep only rows with valid geometries
df_valid = df[valid_mask].copy()
df_valid['geometry'] = geometries[valid_mask]

# Convert to GeoDataFrame
gdf = gpd.GeoDataFrame(df_valid, geometry='geometry')

# Set CRS to WGS 84 since we're using lat/long
gdf.set_crs(epsg=4326, inplace=True)

# Write to GeoParquet
gdf.to_parquet(P.home() / 'data/iSample/opencontext' / 'output_geoparquet.parquet')

In [None]:
df.columns

In [None]:
df['latitude'].notnull().sum(), df['longitude'].notnull().sum()