<a href="https://colab.research.google.com/github/kyle-woodward/bq-ee-vectorsearch/blob/main/src/01_earthgenome_embeddings_bq_vectorsearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## BigQuery ELT of EarthGenome Embeddings for Vector Search

*Note: This notebook will create and consume resources on Google Cloud. Though it should be minimal, be mindful of cost and always delete resources after running demos.*

In order to run this demo you will need Google Cloud IAM permissions to:
* read, write, and create Cloud Storage objects
* read, write, and create BigQuery resources

Refer to [docs](https://cloud.google.com/iam/docs/roles-overview) for more info if you get a permissions-related error

In [1]:
import os
import json
import geopandas as gpd
import subprocess
from google.cloud import bigquery
import datetime

### Configure AWS credentials (you'll need an AWS account and a key created)

In [2]:
# Install the AWS CLI into the runtime; a later cell uses `aws s3 cp` to pull the parquet files.
!pip install awscli
# Print the installed CLI version as a quick check that the install worked.
!aws --version

Collecting awscli
  Downloading awscli-1.41.3-py3-none-any.whl.metadata (11 kB)
Collecting botocore==1.39.3 (from awscli)
  Downloading botocore-1.39.3-py3-none-any.whl.metadata (5.7 kB)
Collecting docutils<=0.19,>=0.18.1 (from awscli)
  Downloading docutils-0.19-py3-none-any.whl.metadata (2.7 kB)
Collecting s3transfer<0.14.0,>=0.13.0 (from awscli)
  Downloading s3transfer-0.13.0-py3-none-any.whl.metadata (1.7 kB)
Collecting colorama<0.4.7,>=0.2.5 (from awscli)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting rsa<4.8,>=3.1.2 (from awscli)
  Downloading rsa-4.7.2-py3-none-any.whl.metadata (3.6 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from botocore==1.39.3->awscli)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Downloading awscli-1.41.3-py3-none-any.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.39.3-py3-none-any.whl (13.8 MB)
[2K   

In [3]:
# Collect the AWS settings without echoing the key pair into the saved cell output.
# (`aws configure` prints every answer, so the secret would be stored with the notebook.)
# The values are exported as the standard AWS CLI environment variables, which both
# `!aws ...` shell escapes and `subprocess.run` inherit from this kernel process.
from getpass import getpass

os.environ["AWS_ACCESS_KEY_ID"] = getpass("AWS Access Key ID: ")
os.environ["AWS_SECRET_ACCESS_KEY"] = getpass("AWS Secret Access Key: ")

# Region and output format are not secret, so a plain prompt is fine.
# A blank answer leaves the CLI default untouched.
for env_name, prompt in (
    ("AWS_DEFAULT_REGION", "Default region name: "),
    ("AWS_DEFAULT_OUTPUT", "Default output format: "),
):
    answer = input(prompt).strip()
    if answer:
        os.environ[env_name] = answer

AWS Access Key ID [None]: <redacted>
AWS Secret Access Key [None]: <redacted>
Default region name [None]: us-central-1
Default output format [None]: text


### Configure Google Cloud Credentials & Resources

In [19]:
# Google Cloud settings -- change these to match your own project.

# Cloud Storage bucket (gs:// URL) the parquet files are uploaded to
BUCKET = "gs://embeddings-kenya"

# Google Cloud project that owns the bucket and the BigQuery resources
PROJECT_ID = "g4g-eaas"

# Region used for the bucket, the dataset and the load job
LOCATION = "us-central1"

# BigQuery dataset that holds the embeddings tables
DATASET_ID = "embeddings_kenya"

# BigQuery table the raw parquet is loaded into
TABLE_ID = "earthgenome_kenya_demo_geedevs_2"

In [5]:
# other auth methods that play nice outside of colab
# !gcloud auth login
# !gcloud config set project {PROJECT_ID}

# import google.auth
# scopes = ['https://www.googleapis.com/auth/cloud-platform']
# creds, _ = google.auth.default(scopes=scopes, default_scopes=scopes, quota_project_id=PROJECT_ID)

In [8]:
# Colab-only authentication: authenticate this runtime's user against PROJECT_ID.
from google.colab import auth

auth.authenticate_user(project_id=PROJECT_ID)

## Downloading Earth Genome GeoParquet files

### Earth Genome hosts the embeddings on Source Cooperative - let's check how they're organized -> [link](https://source.coop/repositories/earthgenome/earthindexembeddings/description)

### In [00_s2_tile_management.ipynb](./00_s2_tile_management.ipynb) we've already aggregated UTM tile IDs to country boundaries

#### we'll use that JSON file to help us pull only the EG parquet files we need for a country..

In [9]:
# Read in our country-tile JSON reference
!mkdir -p ../esa_grid && wget https://raw.githubusercontent.com/kyle-woodward/bq-ee-vectorsearch/refs/heads/main/esa_grid/adm0_tiles_by_country.json -O ../esa_grid/adm0_tiles_by_country.json
tile_dict = json.load(open("../esa_grid/adm0_tiles_by_country.json"))
print(tile_dict.keys())

--2025-07-09 14:56:48--  https://raw.githubusercontent.com/kyle-woodward/bq-ee-vectorsearch/refs/heads/main/esa_grid/adm0_tiles_by_country.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1536 (1.5K) [text/plain]
Saving to: ‘../esa_grid/adm0_tiles_by_country.json’


2025-07-09 14:56:48 (16.9 MB/s) - ‘../esa_grid/adm0_tiles_by_country.json’ saved [1536/1536]

dict_keys(['Kenya'])


In [10]:
# Pick the country whose tile IDs we want from the reference dictionary.
country = "Kenya"
tiles = tile_dict[country]
# Sort in place so the listing (and the download order below) is stable.
tiles.sort()
print("{} S2 tiles covering {}".format(len(tiles), country))
for tile_id in tiles:
    print(tile_id)

89 S2 tiles covering Kenya
36MWD
36MWE
36MXD
36MXE
36MYC
36MYD
36MYE
36MZC
36MZD
36MZE
36NWF
36NXF
36NXG
36NXH
36NXJ
36NXK
36NXL
36NYF
36NYG
36NYH
36NYJ
36NYK
36NYL
36NZF
36NZG
36NZH
36NZJ
36NZK
36NZL
37MBS
37MBT
37MBU
37MBV
37MCR
37MCS
37MCT
37MCU
37MCV
37MDQ
37MDR
37MDS
37MDT
37MDU
37MDV
37MEQ
37MER
37MES
37MET
37MEU
37MEV
37MFS
37MFT
37MFU
37MFV
37MGT
37MGU
37MGV
37NBA
37NBB
37NBC
37NBD
37NBE
37NBF
37NCA
37NCB
37NCC
37NCD
37NCE
37NDA
37NDB
37NDC
37NDD
37NDE
37NEA
37NEB
37NEC
37NED
37NEE
37NFA
37NFB
37NFC
37NFD
37NFE
37NGA
37NGB
37NGC
37NGD
37NGE
37NHE


In [11]:
# Set to True to only print the copy commands without running them.
dryrun = False
# Limit how much data we download; raise this (or set it to None) to fetch more tiles.
max_tiles = 1

suffix = "2024-01-01_2025-01-01.parquet"
local_dir = "../embeddings/earthgenome/2024"

for t in tiles[:max_tiles]:
    pattern = f"s3://earthgenome/earthindexembeddings/2024/{t}_{suffix}"
    destination = f"{local_dir}/{t}_{suffix}"
    # Pass an argument list instead of a shell string: tile IDs come from a downloaded
    # JSON file and should never be interpreted by a shell.
    cmd = ["aws", "s3", "cp", pattern, destination, "--endpoint-url=https://data.source.coop"]
    cmd_text = " ".join(cmd)
    if dryrun:
        print(cmd_text)
        continue

    print(f"Running {cmd_text}")
    try:
        subprocess.run(cmd, capture_output=True, check=True)
    except subprocess.CalledProcessError as e:
        stderr_text = e.stderr.decode(errors="replace")
        print(f"Error copying {t}: {e}")
        # If the file does not exist, we can skip it
        if "does not exist" in stderr_text:
            print(f"File {t} does not exist, skipping.")
            continue
        # capture_output hides the CLI's own message, so show it before re-raising.
        print(stderr_text)
        raise

Running aws s3 cp s3://earthgenome/earthindexembeddings/2024/36MWD_2024-01-01_2025-01-01.parquet ../embeddings/earthgenome/2024/36MWD_2024-01-01_2025-01-01.parquet --endpoint-url=https://data.source.coop


Look at a geoparquet file

In [12]:
# look at one
embeddings_dir = "../embeddings/earthgenome/2024"
# os.listdir returns entries in arbitrary order and includes any stray non-parquet
# entries; sort and filter so files[0] is deterministic and always a parquet file.
files = sorted(name for name in os.listdir(embeddings_dir) if name.endswith(".parquet"))
if not files:
    raise FileNotFoundError(
        f"No .parquet files found in {embeddings_dir}; run the download cell first."
    )
print(f"{len(files)} files:\n {files}")
file = os.path.join(embeddings_dir, files[0])
print(file)
df = gpd.read_parquet(file)
print(df.head())


1 files:
 ['36MWD_2024-01-01_2025-01-01.parquet']
../embeddings/earthgenome/2024/36MWD_2024-01-01_2025-01-01.parquet
                  id                                          embedding  \
0  21319106322835290  [4.1400094, 0.23891862, 1.5696436, 0.7769202, ...   
1  21319106300812378  [4.6409683, 0.33002034, 1.5227464, 0.914815, 1...   
2  21319107040076635  [4.379927, 0.3128666, 1.6951394, 0.8385155, 1....   
3  21319107018053723  [4.313345, 0.7920954, 1.5458571, 0.7641265, 2....   
4  21319107044276059  [4.162823, 0.43091944, 2.015767, 1.0073973, 1....   

                    geometry  
0  POINT (33.00129 -1.89434)  
1  POINT (33.00273 -1.89578)  
2  POINT (33.00417 -1.89434)  
3  POINT (33.00561 -1.89578)  
4  POINT (33.00705 -1.89434)  


we'll add a tile column to help us stay organized

In [13]:
# Add a `tile` column and overwrite the parquet file in place.
# Only the first listed file is processed, matching the one-tile limit used elsewhere.
for file in files[:1]:
    file_path = os.path.join("../embeddings/earthgenome/2024", file)
    # File names look like "<tile>_<start>_<end>.parquet"; the tile ID is the first token.
    tile_id = os.path.basename(file).split("_")[0]
    df = gpd.read_parquet(file_path)
    df.loc[:, 'tile'] = tile_id
    df.to_parquet(file_path, index=False)
    print(f"Updated {file} with tile column.")


Updated 36MWD_2024-01-01_2025-01-01.parquet with tile column.


In [14]:
# Re-read the file that was just rewritten to confirm the tile column is present.
updated_df = gpd.read_parquet(file_path)
print(updated_df.head())

                  id                                          embedding  \
0  21319106322835290  [4.1400094, 0.23891862, 1.5696436, 0.7769202, ...   
1  21319106300812378  [4.6409683, 0.33002034, 1.5227464, 0.914815, 1...   
2  21319107040076635  [4.379927, 0.3128666, 1.6951394, 0.8385155, 1....   
3  21319107018053723  [4.313345, 0.7920954, 1.5458571, 0.7641265, 2....   
4  21319107044276059  [4.162823, 0.43091944, 2.015767, 1.0073973, 1....   

                    geometry   tile  
0  POINT (33.00129 -1.89434)  36MWD  
1  POINT (33.00273 -1.89578)  36MWD  
2  POINT (33.00417 -1.89434)  36MWD  
3  POINT (33.00561 -1.89578)  36MWD  
4  POINT (33.00705 -1.89434)  36MWD  


### Loading Data into BigQuery

You'll need a GCS bucket and a BigQuery Dataset

In [15]:
# create the storage bucket and BigQuery dataset
# Both commands report an "already exists" error when the resource is already present
# (see the recorded output below); the later cells still ran against the existing resources.
!gcloud storage buckets create {BUCKET} --location {LOCATION} --project {PROJECT_ID}
!bq mk -d --data_location={LOCATION} --project_id {PROJECT_ID} {DATASET_ID}

Creating gs://embeddings-kenya/...
[1;31mERROR:[0m (gcloud.storage.buckets.create) HTTPError 409: Your previous request to create the named bucket succeeded and you already own it.
BigQuery error in mk operation: Dataset 'g4g-eaas:embeddings_kenya' already
exists.


In [16]:
# upload parquet files to gcs
# try gcloud storage sync..
gcloud_folder = f"{BUCKET}/earthgenome/2024"
!gcloud storage rsync ../embeddings/earthgenome/2024 $gcloud_folder \
    --project=$PROJECT_ID

At file://../embeddings/earthgenome/2024/*, worker process 1222 thread 133769295564800 listed 1...
At gs://embeddings-kenya/earthgenome/2024/*, worker process 1222 thread 133769295564800 listed 88...
uploading large objects. If you would like to opt-out and instead
perform a normal upload, run:
`gcloud config set storage/parallel_composite_upload_enabled False`
`gcloud config set storage/parallel_composite_upload_enabled True`
Note that with parallel composite uploads, your object might be
uploaded as a composite object
(https://cloud.google.com/storage/docs/composite-objects), which means
that any user who downloads your object will need to use crc32c
checksums to verify data integrity. gcloud storage is capable of
computing crc32c checksums, but this might pose a problem for other
clients.

Copying file://../embeddings/earthgenome/2024/36MWD_2024-01-01_2025-01-01.parquet to gs://embeddings-kenya/earthgenome/2024/36MWD_2024-01-01_2025-01-01.parquet

Average throughput: 121.9MiB/s


In [21]:
FULL_TABLE = f"{PROJECT_ID}:{DATASET_ID}.{TABLE_ID}"
FOLDER = "earthgenome/2024"
print(FULL_TABLE)
for i,file in enumerate(files):
    # limit what we're ingesting to BQ
    if i > 0:
        break
    URI = f"{BUCKET}/{FOLDER}/{file}"

    print(URI)
    !bq --location=$LOCATION --project_id=$PROJECT_ID \
            load \
                --source_format=PARQUET \
                $FULL_TABLE \
                $URI

g4g-eaas:embeddings_kenya.earthgenome_kenya_demo_geedevs_2
gs://embeddings-kenya/earthgenome/2024/36MWD_2024-01-01_2025-01-01.parquet
Waiting on bqjob_r1fb6065a0ba7ac5d_00000197efb26803_1 ... (34s) Current status: DONE   


### Minor transforms of the BQ table

we will do a small post-processing query on the loaded embeddings table to get the embedding field converted correctly for vector search..

vector search indexing requires the embedding field to be of type `ARRAY<FLOAT64>`

the load operation turns 'embedding' field into a double-nested STRUCT data type, innermost child containing list of floats..

so we have to unpack that list from the nested structure, final data type being `ARRAY<FLOAT64>`

In [22]:
# Flatten the nested `embedding.list[].element` structure produced by the parquet load
# into a plain array column.
# UNNEST does not preserve element order and ARRAY_AGG without ORDER BY makes no ordering
# guarantee, so the original position is carried through WITH OFFSET and used to order
# the aggregate. Without this the embedding dimensions could be shuffled per row.
query = f"""
SELECT
  eg.id,
  eg.tile,
  ST_GEOGFROMTEXT(grouped.geometry_text) AS geometry,
  ARRAY_AGG(e.element ORDER BY pos) AS embedding
FROM
  `{PROJECT_ID}`.`{DATASET_ID}`.`{TABLE_ID}` AS eg
CROSS JOIN
  UNNEST(eg.embedding.list) AS e WITH OFFSET AS pos
JOIN (
  SELECT id, tile, ST_ASTEXT(geometry) AS geometry_text
  FROM `{PROJECT_ID}`.`{DATASET_ID}`.`{TABLE_ID}`
  GROUP BY id, tile, geometry_text
) AS grouped ON eg.id = grouped.id AND eg.tile = grouped.tile AND ST_ASTEXT(eg.geometry) = grouped.geometry_text
GROUP BY eg.id, eg.tile, grouped.geometry_text
"""

# Run the query and save the result to a new table
result_table = f"{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}_v1"
job_config = bigquery.QueryJobConfig(destination=result_table)
client = bigquery.Client(project=PROJECT_ID)
job = client.query(query, job_config=job_config)
job.result()  # Wait for the job to complete

<google.cloud.bigquery.table.RowIterator at 0x7f2990be0e90>

In [23]:
# Check if the result_table exists

def table_exists(client, table_id):
    """Report whether ``client.get_table(table_id)`` succeeds.

    Args:
        client: Object exposing a ``get_table(table_id)`` method (a BigQuery client here).
        table_id: Identifier of the table to look up.

    Returns:
        True when the lookup succeeds; False when it raises any ``Exception``.
        A message describing the outcome is printed in both cases. Note that any
        lookup failure (not only a missing table) is reported as "does not exist".
    """
    try:
        client.get_table(table_id)
    except Exception as err:
        print(f"Table {table_id} does not exist. Error: {err}")
        return False
    else:
        print(f"Table {table_id} exists.")
        return True

# Confirm the transformed table written by the previous query is reachable.
table_exists(client, result_table)

Table g4g-eaas.embeddings_kenya.earthgenome_kenya_demo_geedevs_2_v1 exists.


True

In [24]:
# Preview the transformed table: column types first, then up to ten rows.
query = f"SELECT * FROM `{result_table}` LIMIT 10"
query_job = client.query(query)

# Schema of the query result (available once the job has finished)
schema = query_job.result().schema
for column in schema:
    print(f"{column.name}: {column.field_type}")

# Row contents
for record in query_job.result():
    print(record)

id: INTEGER
tile: STRING
geometry: GEOGRAPHY
embedding: FLOAT
Row((21320946583792397, '36MWD', 'POINT(33.1768483096263 -1.15701555463285)', [3.891666889190674, 0.19801615178585052, 2.2466237545013428, 0.800146758556366, 1.2050422430038452, -1.510266900062561, 3.666425943374634, -0.21394479274749756, 3.2591750621795654, 1.6400431394577026, 1.8089557886123657, -3.1976563930511475, -0.12565721571445465, 0.25345203280448914, -1.4848551750183105, 0.1947350949048996, 0.18887382745742798, 0.4111557602882385, -2.4643473625183105, 2.2719671726226807, 1.135674238204956, -0.06623528897762299, 1.68812096118927, -2.2421677112579346, -0.5186740756034851, 0.6467905640602112, 0.1985785812139511, -0.3816933035850525, -0.2771374583244324, 4.188538074493408, -1.1120436191558838, -1.9023206233978271, -2.383863925933838, 1.3219537734985352, -1.7336243391036987, -1.2478221654891968, -0.6090970039367676, -0.7042137384414673, 0.9103711247444153, 0.5013507008552551, -0.2980897128582001, -0.32236552238464355, -

### Index BQ table to enable Vector Search

Following [docs](https://cloud.google.com/bigquery/docs/vector-search#create_a_vector_index) guidance

In [25]:
# Build an IVF vector index on the embedding column of the transformed table.
# Drop the project prefix: the table is referenced as dataset.table and resolved
# against the client's project.
in_table = '.'.join(result_table.split(".")[1:])
print(f'indexing {in_table} for vector search')
# IF NOT EXISTS keeps the cell re-runnable: without it a second run fails because
# the index was already created by the first run.
query = f"""
CREATE VECTOR INDEX IF NOT EXISTS my_index ON {in_table}(embedding)
OPTIONS(distance_type='COSINE', index_type='IVF', ivf_options='{{"num_lists": 1000}}');
"""

# Run the query to create the index
client = bigquery.Client(project=PROJECT_ID)
job = client.query(query)
job.result()  # Wait for the job to complete

indexing embeddings_kenya.earthgenome_kenya_demo_geedevs_2_v1 for vector search


<google.cloud.bigquery.table._EmptyRowIterator at 0x7f2990bf7ad0>

Create a test target table of 1 record to perform vector search with

In [26]:
# Write a single row of the indexed table to a separate "target" table to search with.
# Append the suffix only once so re-running this cell does not produce
# "..._test_target_test_target". `result_table` still ends up naming the target table,
# which the next cell relies on.
target_suffix = "_test_target"
if not result_table.endswith(target_suffix):
    result_table = result_table + target_suffix
query = f"SELECT * FROM {in_table} LIMIT 1"

# WRITE_TRUNCATE lets a re-run replace the one-row target table instead of failing
# because the destination already exists.
job_config = bigquery.QueryJobConfig(
    destination=result_table,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
job = client.query(query, job_config=job_config)
job.result()  # Wait for the job to complete

<google.cloud.bigquery.table.RowIterator at 0x7f2990be68d0>

Run a [Vector Search](https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions#vector_search)!

In [27]:
# Strip the project prefix so the target table is referenced as dataset.table.
target_table = result_table.partition(".")[2]
print(target_table)

# Search the indexed table for the nearest neighbours of the target row.
# top_k is 11 and the closest hit is dropped with OFFSET 1 -- presumably because the
# target row was sampled from the indexed table and would otherwise match itself.
query = f"""
SELECT query.id AS target_id,
  query.tile AS target_tile,
  base.id AS base_id,
  base.tile AS base_tile,
  distance
FROM
  VECTOR_SEARCH(
    TABLE {in_table},
    'embedding',
    TABLE {target_table},
    top_k => 11,
    distance_type => 'COSINE',
    options => '{{"fraction_lists_to_search": 0.005}}')
ORDER BY distance
LIMIT 10
OFFSET 1;
"""

# Run the vector search and store the matches in a timestamped results table
client = bigquery.Client(project=PROJECT_ID)
run_stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
search_result_table = f"{PROJECT_ID}.{DATASET_ID}.vector_search_results_{run_stamp}"
job_config = bigquery.QueryJobConfig(destination=search_result_table)
job = client.query(query, job_config=job_config)
job.result()  # Wait for the job to complete

embeddings_kenya.earthgenome_kenya_demo_geedevs_2_v1_test_target


<google.cloud.bigquery.table.RowIterator at 0x7f2990bc7c10>

In [28]:
# Preview the stored search results: column types first, then up to ten rows.
query = f"SELECT * FROM `{search_result_table}` LIMIT 10"
query_job = client.query(query)

# Schema of the query result (available once the job has finished)
schema = query_job.result().schema
for column in schema:
    print(f"{column.name}: {column.field_type}")

# Row contents
for record in query_job.result():
    print(record)

target_id: INTEGER
target_tile: STRING
base_id: INTEGER
base_tile: STRING
distance: FLOAT
Row((21320946583792397, '36MWD', 21321050490119356, '36MWD', 0.004632610137886961), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((21320946583792397, '36MWD', 21320959822310775, '36MWD', 0.00463420661975622), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((21320946583792397, '36MWD', 21321048012760760, '36MWD', 0.0046818560164269), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((21320946583792397, '36MWD', 21319394043992838, '36MWD', 0.0049808743419779145), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((21320946583792397, '36MWD', 21321070070564923, '36MWD', 0.0050157523354722455), {'target_id': 0, 'target_tile': 1, 'base_id': 2, 'base_tile': 3, 'distance': 4})
Row((21320946583792397, '36MWD', 21321061160033990, '36MWD', 0.005157854678888074), {'

### You can take a look at your newly created BQ tables and the vector search results in [BQ studio](https://console.cloud.google.com/bigquery)