In [1]:
import os

import ibis

# Remote location the parquet file is downloaded from when no local copy exists.
file_url = "https://storage.googleapis.com/opencontext-parquet/oc_isamples_pqg.parquet"
# Local copy of the parquet file. The path is resolved against the current
# user's home directory instead of being pinned to one machine's absolute path,
# so the notebook runs unchanged for any user.
LOCAL_PATH = os.path.expanduser("~/Data/iSample/oc_isamples_pqg.parquet")



In [2]:
import os
import urllib.request
from pathlib import Path

# Make sure the parquet file is available locally, downloading it on first use.
# The download is written to a sibling ".part" file and renamed into place only
# after it completes, so an interrupted transfer never leaves a truncated file
# that a later run would mistake for a valid local copy.
local_file = Path(LOCAL_PATH)
if not local_file.exists():
    print(f"Local file not found at {LOCAL_PATH}")

    # Create directory if it doesn't exist (also works for a bare filename,
    # where the parent is the current directory)
    local_file.parent.mkdir(parents=True, exist_ok=True)

    print(f"Downloading {file_url} to {LOCAL_PATH}...")
    partial_file = local_file.with_name(local_file.name + ".part")
    try:
        urllib.request.urlretrieve(file_url, str(partial_file))
        # Atomic rename: the final path only ever holds a complete file
        os.replace(partial_file, local_file)
    finally:
        # Clean up a leftover partial download if the transfer failed midway;
        # after a successful rename there is nothing left to remove
        partial_file.unlink(missing_ok=True)
    print("Download completed!")
else:
    print(f"Local file already exists at {LOCAL_PATH}")

# Use local path for parquet operations
parquet_path = LOCAL_PATH
print(f"Using parquet file: {parquet_path}")

Local file already exists at /Users/raymondyee/Data/iSample/oc_isamples_pqg.parquet
Using parquet file: /Users/raymondyee/Data/iSample/oc_isamples_pqg.parquet


In [3]:
# Simple DuckDB starter code
import duckdb

# Create a DuckDB connection
conn = duckdb.connect()

# Expose the local parquet file as the view `oc_pqg`. The path is embedded in a
# SQL string literal, so single quotes are doubled to keep a path that contains
# an apostrophe from terminating the literal early and breaking the statement.
escaped_parquet_path = str(parquet_path).replace("'", "''")
conn.execute(f"SET VARIABLE parquet_path = '{escaped_parquet_path}';")
conn.execute("CREATE VIEW oc_pqg AS SELECT * FROM read_parquet(getvariable('parquet_path'));")

# Count records
result = conn.execute("SELECT COUNT(*) FROM oc_pqg;").fetchone()
print(f"Total records: {result[0]:,}")
print(f"Using parquet file: {parquet_path}")

Total records: 11,637,144
Using parquet file: /Users/raymondyee/Data/iSample/oc_isamples_pqg.parquet


In [7]:
# Quick look at the view: list each column with its type, then show a few rows
print("Schema information:")
schema_result = conn.execute("DESCRIBE oc_pqg;").fetchall()
# Only the first two fields of each DESCRIBE row (name, type) are printed
for column_name, column_type, *_ in schema_result:
    print(f"{column_name:25} | {column_type}")

print(f"\nFirst 5 rows:")
sample_query = "SELECT * FROM oc_pqg LIMIT 5;"
sample_result = conn.execute(sample_query).fetchdf()
print(sample_result)

Schema information:
row_id                    | INTEGER
pid                       | VARCHAR
tcreated                  | INTEGER
tmodified                 | INTEGER
otype                     | VARCHAR
s                         | INTEGER
p                         | VARCHAR
o                         | INTEGER[]
n                         | VARCHAR
altids                    | VARCHAR[]
geometry                  | BLOB
authorized_by             | VARCHAR[]
has_feature_of_interest   | VARCHAR
affiliation               | VARCHAR
sampling_purpose          | VARCHAR
complies_with             | VARCHAR[]
project                   | VARCHAR
alternate_identifiers     | VARCHAR[]
relationship              | VARCHAR
elevation                 | VARCHAR
sample_identifier         | VARCHAR
dc_rights                 | VARCHAR
result_time               | VARCHAR
contact_information       | VARCHAR
latitude                  | DOUBLE
target                    | VARCHAR
role                      | VARCHAR
sc

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

   row_id                                              pid  tcreated  \
0   10241  geoloc_f401f04667bf510a353d06b7025a7c66e13ea56b      <NA>   
1   10242  geoloc_a133d0d8e1ca8888388c7a22073d2b6441fe3fe1      <NA>   
2   10243  geoloc_09edf9bc6e1a3588eec87b7a538dc16ea9790c4c      <NA>   
3   10244  geoloc_fdaa27592833b5745dbabd9138b12e3d9eef0c6f      <NA>   
4   10245  geoloc_210c6b24821fb1618d50d7fb00a40e5f84c0be73      <NA>   

   tmodified                    otype     s     p     o     n altids  ...  \
0       <NA>  GeospatialCoordLocation  <NA>  None  <NA>  None   <NA>  ...   
1       <NA>  GeospatialCoordLocation  <NA>  None  <NA>  None   <NA>  ...   
2       <NA>  GeospatialCoordLocation  <NA>  None  <NA>  None   <NA>  ...   
3       <NA>  GeospatialCoordLocation  <NA>  None  <NA>  None   <NA>  ...   
4       <NA>  GeospatialCoordLocation  <NA>  None  <NA>  None   <NA>  ...   

   name  longitude obfuscated curation_location last_modified_time  \
0  None  23.131286      False     

## DuckDB CLI Commands

Copy these commands to use directly in the DuckDB CLI terminal:

In [4]:
# Reference for repeating the steps above in the DuckDB CLI.
#
# Launch the CLI:
#   duckdb
#
# Point a variable at the local parquet file and build the view:
#   SET VARIABLE parquet_path = '/Users/raymondyee/Data/iSample/oc_isamples_pqg.parquet';
#   CREATE VIEW oc_pqg AS SELECT * FROM read_parquet(getvariable('parquet_path'));
#
# Query it:
#   SELECT COUNT(*) FROM oc_pqg;
#   DESCRIBE oc_pqg;
#   SELECT * FROM oc_pqg LIMIT 5;
#
# Leave the CLI:
#   .exit

# Emit the same commands with the configured path filled in, one per line
print(
    "CLI commands updated to use local file path",
    f"Local parquet path: {LOCAL_PATH}",
    "\nTo use in DuckDB CLI:",
    "duckdb",
    f"SET VARIABLE parquet_path = '{LOCAL_PATH}';",
    "CREATE VIEW oc_pqg AS SELECT * FROM read_parquet(getvariable('parquet_path'));",
    "SELECT COUNT(*) FROM oc_pqg;",
    sep="\n",
)

CLI commands updated to use local file path
Local parquet path: /Users/raymondyee/Data/iSample/oc_isamples_pqg.parquet

To use in DuckDB CLI:
duckdb
SET VARIABLE parquet_path = '/Users/raymondyee/Data/iSample/oc_isamples_pqg.parquet';
CREATE VIEW oc_pqg AS SELECT * FROM read_parquet(getvariable('parquet_path'));
SELECT COUNT(*) FROM oc_pqg;
