In [13]:
#%% @title Imports
from IPython.display import display
import ipywidgets as widgets
from etils import epath
import numpy as np

from perch_hoplite.agile import colab_utils
from perch_hoplite.agile import embed
from perch_hoplite.agile import source_info
from perch_hoplite.db import interface
from perch_hoplite.db import brutalism

## Embed

In [14]:
#@title Configuration { vertical-output: true }

#@markdown Configure the raw dataset location(s).  The format is a mapping from
#@markdown a dataset_name to a (base_path, fileglob) pair.  Note that the file
#@markdown globs are case sensitive.  The dataset name can be anything you want.
#
#@markdown This structure allows you to move your data around without having to
#@markdown re-embed the dataset.  The generated embedding database will be
#@markdown placed in the base path. This allows you to simply swap out
#@markdown the base path here if you ever move your dataset.

#@markdown By default we only process one dataset at a time.  Re-run this entire notebook
#@markdown once per dataset.  The embeddings database will be located in
#@markdown `dataset_base_path` (see `db_path` below for the override).

#@markdown For example, we might set dataset_base_path to '/home/me/myproject',
#@markdown and use the glob '*/*.wav' if all of the audio files have filepaths
#@markdown like '/home/me/myproject/site_XYZ/audio_ABC.wav'
dataset_name = 'INCT17'  #@param {type:'string'}
dataset_base_path = '/home/Sedlar/data/INCT17'  #@param {type:'string'}
dataset_fileglob = '*.wav'  #@param {type:'string'}
#@markdown You do not need to change this unless you want to maintain multiple
#@markdown distinct embedding databases.  Leave it as `None` to keep the
#@markdown database in `dataset_base_path`.
db_path = None #@param

#@markdown Choose a supported model: `perch_8` or `birdnet_V2.3` are most common
#@markdown for birds. Other choices include `surfperch` for coral reefs or
#@markdown `multispecies_whale` for marine mammals.  The key must match one of
#@markdown the dropdown entries exactly (keys are case sensitive).
model_choice = 'perch_8'  #@param['perch_8', 'humpback', 'multispecies_whale', 'surfperch', 'birdnet_V2.3']

#@markdown File sharding automatically splits audio files into one-minute chunks
#@markdown for embedding. This limits both system and GPU memory usage,
#@markdown especially useful when working with long files (>1 hour).
use_file_sharding = True  #@param {type:'boolean'}

# Tunable constants for the audio source, named so their intent is visible
# instead of being buried as literals in the call below.
# Passed as `min_audio_len_s`.
MIN_AUDIO_LEN_S = 1.0
# Negative values are sentinels rather than literal rates.
# NOTE(review): per the perch_hoplite AudioSourceConfig docs, -2 should mean
# "use the embedding model's own sample rate" -- confirm against the
# installed version.
TARGET_SAMPLE_RATE_HZ = -2
# One-minute shards, applied only when `use_file_sharding` is enabled.
SHARD_LEN_S = 60.0

# Describes where the audio lives and how it is read.  Later cells reference
# `audio_glob.dataset_name`, so the name must stay stable.
audio_glob = source_info.AudioSourceConfig(
    dataset_name=dataset_name,
    base_path=dataset_base_path,
    file_glob=dataset_fileglob,
    min_audio_len_s=MIN_AUDIO_LEN_S,
    target_sample_rate_hz=TARGET_SAMPLE_RATE_HZ,
    shard_len_s=SHARD_LEN_S if use_file_sharding else None,
)

# Bundle the audio source, database and model settings into one object that
# the remaining cells read from (`configs.db_config`, `configs.model_config`,
# `configs.audio_sources_config`).
configs = colab_utils.load_configs(
    source_info.AudioSources((audio_glob,)),
    db_path,
    model_config_key=model_choice,
    db_key='sqlite_usearch',
)
# Bare expression so the notebook displays the resolved configuration.
configs

AgileConfigs(audio_sources_config=AudioSources(audio_globs=(AudioSourceConfig(dataset_name='INCT17', base_path='/home/Sedlar/data/INCT17', file_glob='*.wav', min_audio_len_s=1.0, target_sample_rate_hz=-2, shard_len_s=60.0, max_shards_per_file=None),)), db_config=DBConfig(db_key='sqlite_usearch', db_config=db_path: !!python/object/apply:etils.epath.gpath.PosixGPath
- /
- home
- Sedlar
- data
- INCT17
usearch_cfg:
  dtype: float16
  embedding_dim: 1280
  expansion_add: 256
  expansion_search: 128
  metric_name: IP
), model_config=ModelConfig(model_key='taxonomy_model_tf', embedding_dim=1280, model_config=hop_size_s: 5.0
model_path: ''
sample_rate: 32000
tfhub_version: 8
window_size_s: 5.0
))

In [15]:
#@title Initialize the DB { vertical-output: true }
# Load the database described by `configs.db_config` and remember how many
# embeddings it currently holds.  Names assigned at cell level are already
# notebook globals, so no `global` statement is required here.
db = configs.db_config.load_db()
num_embeddings = db.count_embeddings()

print('Initialized DB located at ', configs.db_config.db_config.db_path)

def drop_and_reload_db(_) -> interface.HopliteDBInterface:
  """Deletes the on-disk database files and rebinds the notebook-level `db`.

  Written to be used as a button `on_click` callback: it removes every
  `hoplite.sqlite*` file and the `usearch.index` file found under the
  configured database path, then loads the database again from `configs`.

  Args:
    _: The object passed by the click handler; unused.

  Returns:
    The newly loaded database, which is also stored in the global `db`.
  """
  # Without this declaration the assignment at the end would only bind a
  # local name, leaving every other cell holding the handle to the database
  # whose files were just deleted.
  global db
  db_dir = epath.Path(configs.db_config.db_config.db_path)
  for fp in db_dir.glob('hoplite.sqlite*'):
    fp.unlink()
  # The index file can be absent (for example after an interrupted drop);
  # skip it instead of raising before the database has been reloaded.
  index_path = db_dir / 'usearch.index'
  if index_path.exists():
    index_path.unlink()
  print('\n Deleted previous db at: ', configs.db_config.db_config.db_path)
  db = configs.db_config.load_db()
  return db

drop_existing_db = True  #@param[True, False]

# Only offer the destructive button when it was requested and there is
# actually something to delete.
if drop_existing_db and num_embeddings > 0:
  print('Existing DB contains datasets: ', db.get_dataset_names())
  print('num embeddings: ', num_embeddings)
  # A single print of newline-joined lines emits the same text as printing
  # each line separately.
  print('\n'.join((
      '\n\nClick the button below to confirm you really want to drop the database at ',
      f'{configs.db_config.db_config.db_path}\n',
      f'This will permanently delete all {num_embeddings} embeddings from the existing database.\n',
      'If you do NOT want to delete this data, set `drop_existing_db` above to `False` and re-run this cell.\n',
  )))

  # Nothing is deleted until the user clicks; the click runs
  # `drop_and_reload_db`.
  button = widgets.Button(description='Delete database?')
  button.on_click(drop_and_reload_db)
  display(button)

Initialized DB located at  /home/Sedlar/data/INCT17


In [20]:
#@title Run the embedding { vertical-output: true }

# Announce which dataset is being processed before the long-running step.
print('Embedding dataset: {}'.format(audio_glob.dataset_name))

# The worker is built from the audio sources and model settings in `configs`
# and writes into the `db` handle opened earlier.
worker = embed.EmbedWorker(
    db=db,
    model_config=configs.model_config,
    audio_sources=configs.audio_sources_config,
)

# Restrict processing to the dataset configured in this notebook run.
worker.process_all(target_dataset_name=audio_glob.dataset_name)

print('\n\nEmbedding complete, total embeddings: ', db.count_embeddings())

Embedding dataset: INCT17


2025-05-07 17:19:56.550044: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 55971840 exceeds 10% of free system memory.
2025-05-07 17:20:05.898316: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:38] Ignoring Assert operator jax2tf_infer_fn_/assert_equal_1/Assert/AssertGuard/Assert
 55%|█████▍    | 11200/20532 [02:27<02:02, 75.98it/s]


RuntimeError: Operation has been terminated

In [21]:
#@title Per dataset statistics { vertical-output: true }

# Report, for every dataset name stored in the DB, how many embeddings it has.
for dataset in db.get_dataset_names():
  print(f"\nDataset '{dataset}':")
  # source_id=None is passed so the lookup is not restricted to one source.
  dataset_embeddings = db.get_embeddings_by_source(dataset, source_id=None)
  print('\tnum embeddings: ', dataset_embeddings.shape[0])


Dataset 'INCT17':
	num embeddings:  11201


In [22]:
#@title Show example embedding

q = db.get_embedding(db.get_one_embedding_id())
%time results, scores = brutalism.brute_search(worker.db, query_embedding=q, search_list_size=128, score_fn=np.dot)
print([int(r.embedding_id) for r in results])

CPU times: user 376 ms, sys: 6.06 ms, total: 382 ms
Wall time: 374 ms
[1, 2811, 10832, 7236, 8903, 95, 6694, 10712, 7773, 1560, 502, 7602, 5129, 4595, 5543, 742, 11057, 6562, 5035, 7059, 9862, 6149, 9483, 9223, 5661, 9289, 6624, 3861, 5562, 3281, 4299, 2559, 10410, 5224, 511, 5854, 6961, 8396, 4297, 7863, 2582, 8777, 3600, 1913, 8858, 1575, 5584, 10016, 8424, 1722, 6357, 4303, 7217, 3810, 10402, 4113, 8253, 4489, 10396, 10550, 188, 3135, 5910, 8169, 7319, 10988, 265, 2190, 3762, 1403, 10564, 10972, 7362, 7278, 7526, 10056, 5581, 755, 6651, 2097, 9396, 2431, 7788, 1153, 3062, 5331, 6659, 3257, 3003, 98, 9390, 4094, 10172, 5666, 6006, 8468, 2488, 8689, 6229, 2158, 331, 7706, 4885, 800, 9639, 1412, 8697, 3147, 1895, 5586, 10442, 1774, 10221, 9062, 2830, 6785, 7740, 4915, 6428, 11192, 8664, 3776, 3417, 2356, 8808, 3911, 4462, 3427]
