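This notebook walks through an end-to-end entity-matching example: it configures the run through environment variables, verifies access to the Amazon Bedrock embedding model, vectorizes a reference dataset, sets up an OpenSearch index, ingests the vectorized data, and runs a search against it. A few utility commands are listed at the end.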

### 1. Set environment variables

In [None]:
# Configuration for every step in this notebook. `%env` exports each value into
# the kernel's environment, so the `!` shell cells below can expand it as $NAME
# and Python cells can read it through os.environ.
%env AWS_PROFILE=jxprtn
# Embedding model used by the model-access check and by `vectorize` / `search`.
%env BEDROCK_MODEL_ID=us.cohere.embed-v4:0
# EF_CONSTRUCTION, EF_SEARCH, ENGINE, M and SPACE_TYPE are forwarded to `setup`.
%env EF_CONSTRUCTION=512
%env EF_SEARCH=512
%env EMBEDDING_COLUMN_SUFFIX=_cohere4
%env ENGINE=faiss
# NOTE(review): EVALUATION_FILE is left empty and is not referenced by any cell visible here.
%env EVALUATION_FILE=
# FILE is the input of `vectorize`; FILE_VECTORIZED is its output and the input of `ingest`.
%env FILE=data/source/Loinc_2.81/LoincTableCore/LoincTableCore.csv
%env FILE_VECTORIZED=data/source/Loinc_2.81/LoincTableCore/LoincTableCore_cohere4_1536_vectorized.csv
# FILTER_FIELD / FILTER_VALUE are forwarded to `search`.
%env FILTER_FIELD=CLASS
%env FILTER_VALUE=ABXBACT
%env INDEX=loinc_data_cohere4_1536
# presumably VECTORIZE_COLUMNS + EMBEDDING_COLUMN_SUFFIX (the values line up) — TODO confirm
%env KNN_COLUMNS=LONG_COMMON_NAME_cohere4
# LIMIT_ROWS, MAX_ATTEMPTS and SKIP_ROWS are forwarded to `vectorize`.
%env LIMIT_ROWS=5000
%env M=48
%env MAX_ATTEMPTS=10
# NOTE(review): ML_CONNECTOR_ROLE is not referenced by any cell visible here.
%env ML_CONNECTOR_ROLE=arn:aws:iam::445236798872:role/opensearch-ingest-domain-RoleForMLConnector
%env OPENSEARCH_HOST=localhost
%env OPENSEARCH_PORT=9200
%env SEARCH_COLUMN=LONG_COMMON_NAME_cohere4
%env SEARCH_QUERY=Rifapentine
%env SKIP_ROWS=0   
%env SPACE_TYPE=l2
# Passed to `vectorize`, `setup` and `search`, and used as the output dimension in the model-access check.
%env VECTOR_DIMENSION=1536
%env VECTORIZE_COLUMNS=LONG_COMMON_NAME
%env VECTORIZE_STRATEGY=combined
# NOTE(review): WAIT_TIME is not referenced by any cell visible here.
%env WAIT_TIME=0.1

In [None]:
import sys
from pathlib import Path


def load_project_root():
    """Make the project root importable by putting it at the front of ``sys.path``.

    Resolution order:

    1. If the working directory is named ``examples``, use its parent.
    2. Otherwise use the nearest directory, starting at the working directory
       and moving upwards, that contains ``pyproject.toml``. The filesystem
       root itself is not inspected.
    3. If no such directory is found, use the parent of the working directory.

    Nothing is inserted when the chosen path is already listed in ``sys.path``.
    """
    start_dir = Path.cwd()

    if start_dir.name == 'examples':
        root_dir = start_dir.parent
    else:
        # The working directory followed by its ancestors, nearest first. The
        # filesystem root (the only path equal to its own parent) is left out.
        search_chain = [d for d in (start_dir, *start_dir.parents) if d != d.parent]
        root_dir = next(
            (d for d in search_chain if (d / 'pyproject.toml').exists()),
            start_dir.parent,
        )

    root_entry = str(root_dir)
    if root_entry not in sys.path:
        sys.path.insert(0, root_entry)

load_project_root()


### 2. Log in to AWS with SSO (optional)

In [None]:
# Starts `aws sso login` for the profile named by the AWS_PROFILE environment variable set in step 1.
!aws sso login --profile $AWS_PROFILE

### 2.b Verify model access

In [None]:
import os

from lib.bedrock import (
    BedrockClient,
    EmbeddingModelOutput,
    EmbeddingType,
    InputType,
    InvokeEmbeddingModelCommand,
    InvokeModelCommand,
)

result = await InvokeEmbeddingModelCommand(
    InvokeModelCommand(client=BedrockClient(profile=os.environ["AWS_PROFILE"]))
).execute(
    model_id=InvokeEmbeddingModelCommand.get_model_id(os.environ["BEDROCK_MODEL_ID"]),
    inputs=["test"],
    input_type=InputType.CLASSIFICATION,
    embedding_types=[EmbeddingType.FLOAT],
    output_dimension=int(os.environ["VECTOR_DIMENSION"]),
)

for embedding in result:
    assert isinstance(embedding, EmbeddingModelOutput)


### 3. Vectorize reference dataset (offline tower)


In [None]:
# Runs the `vectorize` subcommand of apps.cli.main from the parent directory via
# `uv run`, reading $FILE and passing $FILE_VECTORIZED as --output with --overwrite.
# All values come from the environment variables set in step 1.
# NOTE(review): load_project_root() only edits this kernel's sys.path; the shell
# command relies on `cd ..`, so it assumes the kernel's working directory is
# one level below the project root — confirm.
load_project_root()
!cd .. && uv run python -m apps.cli.main vectorize \
    --bedrock-model-id $BEDROCK_MODEL_ID \
    --columns $VECTORIZE_COLUMNS \
    --embedding-column-suffix $EMBEDDING_COLUMN_SUFFIX \
    --file $FILE \
    --limit-rows $LIMIT_ROWS \
    --max-attempts $MAX_ATTEMPTS \
    --output $FILE_VECTORIZED \
    --overwrite \
    --profile $AWS_PROFILE \
    --skip-rows $SKIP_ROWS \
    --vector-dimension $VECTOR_DIMENSION \
    --vectorize-strategy $VECTORIZE_STRATEGY


### 4. Set up OpenSearch

In [None]:
# Runs the `setup` subcommand of apps.cli.main from the parent directory for index
# $INDEX on $OPENSEARCH_HOST:$OPENSEARCH_PORT, using the k-NN settings from step 1.
# NOTE(review): `--delete --no-confirm` presumably drops an existing index
# without prompting — confirm against the CLI help before running on shared data.
load_project_root()
!cd .. && uv run python -m apps.cli.main setup \
    --columns $KNN_COLUMNS \
    --ef-construction $EF_CONSTRUCTION \
    --ef-search $EF_SEARCH \
    --embedding-column-suffix $EMBEDDING_COLUMN_SUFFIX \
    --engine $ENGINE \
    --index $INDEX \
    --m $M \
    --opensearch-host $OPENSEARCH_HOST \
    --opensearch-port $OPENSEARCH_PORT \
    --profile $AWS_PROFILE \
    --space-type $SPACE_TYPE \
    --vector-dimension $VECTOR_DIMENSION \
    --delete --no-confirm


### 5. Ingest data into OpenSearch

In [None]:
# Runs the `ingest` subcommand of apps.cli.main from the parent directory, passing
# the vectorized file ($FILE_VECTORIZED) and the target index ($INDEX).
# NOTE(review): `--delete` presumably removes existing data first — confirm
# against the CLI help.
load_project_root()
!cd .. && uv run python -m apps.cli.main ingest \
    --file $FILE_VECTORIZED \
    --index $INDEX \
    --knn-columns $KNN_COLUMNS \
    --opensearch-host $OPENSEARCH_HOST \
    --opensearch-port $OPENSEARCH_PORT \
    --profile $AWS_PROFILE \
    --delete


### 6. Run entity matching

In [None]:
# Runs the `search` subcommand of apps.cli.main from the parent directory, passing
# $SEARCH_QUERY as the query, $SEARCH_COLUMN as the column and $INDEX as the index,
# with a $FILTER_FIELD / $FILTER_VALUE filter.
# FILTER_VALUE and SEARCH_QUERY are free-form values, so they are double-quoted:
# the shell then passes each one as a single argument even when it contains
# spaces (e.g. a multi-word query) or is empty.
load_project_root()
!cd .. && uv run python -m apps.cli.main search \
    --bedrock-model-id $BEDROCK_MODEL_ID \
    --column $SEARCH_COLUMN \
    --embedding-column-suffix $EMBEDDING_COLUMN_SUFFIX \
    --filter-field $FILTER_FIELD \
    --filter-value "$FILTER_VALUE" \
    --index $INDEX \
    --opensearch-host $OPENSEARCH_HOST \
    --opensearch-port $OPENSEARCH_PORT \
    --profile $AWS_PROFILE \
    --vector-dimension $VECTOR_DIMENSION \
    --query "$SEARCH_QUERY"


## Utility commands

### Count tokens in a file

In [None]:
# Runs the `tokens` subcommand of apps.cli.main from the parent directory on the source file $FILE.
load_project_root()
!cd .. && uv run python -m apps.cli.main tokens --file $FILE

### Direct query to OpenSearch

In [None]:
# Runs the `dev` subcommand of apps.cli.main from the parent directory against
# the OpenSearch host and port configured in step 1.
load_project_root()
!cd .. && uv run python -m apps.cli.main dev \
    --opensearch-host $OPENSEARCH_HOST \
    --opensearch-port $OPENSEARCH_PORT