In [1]:
import base64
import json
from io import BytesIO
import time
import numpy as np
from PIL import Image, UnidentifiedImageError
import faiss
import boto3
import psycopg
import requests
import torch
import vecs
from datetime import datetime
import database_admin

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
STACK_NAME = "pg-vectors-similarity-search-test"

cloudformation = boto3.client("cloudformation", region_name="us-east-2")
stack_outputs = cloudformation.describe_stacks(StackName=STACK_NAME)["Stacks"][0]["Outputs"]

[secrets_arn] = [o["OutputValue"] for o in stack_outputs if o["OutputKey"]== "databasesecretsarn"]

secretsmanager = boto3.client("secretsmanager", region_name="us-east-2")
database_secrets = json.loads(secretsmanager.get_secret_value(SecretId=secrets_arn)['SecretString'])
database_secrets = {**{k:v for k,v in database_secrets.items() if k in ["host", "password", "port", "dbname"]}, "user":database_secrets["username"]}

[task_definition] = [o["OutputValue"] for o in stack_outputs if o["OutputKey"]== "taskdefinitionarn"]

ecs = boto3.client("ecs", region_name="us-east-2")

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [20]:
def b64_image_to_tensor(image: str) -> torch.Tensor:
    """convert input b64image to torch tensor"""
    # handle image
    img_bytes = base64.b64decode(image)
    tmp = BytesIO()
    tmp.write(img_bytes)
    try:
        img = Image.open(tmp)
    except UnidentifiedImageError:
        raise HTTPException(status_code=400, detail="Cannot recognize image format.")
    np_img = np.expand_dims(np.moveaxis(np.array(img), -1, 0), axis=0)
    if len(np_img.shape) != 4:
        raise HTTPException(
            status_code=400,
            detail=f"Image with shape {np.moveaxis(np_img, 1, -1).shape[1:]} is not processable. Use image with 3 channels.",
        )
    return torch.tensor(np_img).float()

def get_model():
    """start torch model"""
    # load encoder
    model = torch.jit.load("./encoder.pt", map_location=torch.device("cpu"))
    return model

def prep_query_vector(image_bytes): 

    tensor = b64_image_to_tensor(encoded_image_bytes)
    embedding = model(tensor)[0].detach().numpy()
    
    query_vector = embedding[0]
    return query_vector

model = get_model()

gibs_image_url = "https://gibs.earthdata.nasa.gov/wmts/epsg3857/best/MODIS_Terra_CorrectedReflectance_TrueColor/default/2023-08-08/GoogleMapsCompatible_Level9/8/111/15.jpg"
encoded_image_bytes = base64.b64encode(requests.get(gibs_image_url).content)


In [9]:
task = ecs.run_task(
    taskDefinition="arn:aws:ecs:us-east-2:350996086543:task-definition/pgvectorssimilaritysearchtestpgvectorsingesttaskBC438D7F:60",
    cluster="arn:aws:ecs:us-east-2:350996086543:cluster/pg-vectors-similarity-search-test-ecscluster7830E7B5-eJ0S8m1U6RKk",
    count=1,
    launchType="FARGATE",
    networkConfiguration={
        'awsvpcConfiguration': {
            'subnets': [
                'subnet-0b7ed75d0fa9de2e4',
                'subnet-0d19d2d8e844c6107' 
            ],
            'securityGroups': [
                'sg-03aa718ad98e4588e',
            ],
            'assignPublicIp': 'DISABLED'
        }
    }
)
pgvector_client =  database_admin.PGVectorClient(
    collection_name="similaritysearch",
    database_connection_string=f"postgresql://{database_secrets['user']}:{database_secrets['password']}@{database_secrets['host']}:{database_secrets['port']}/{database_secrets['dbname']}", 
    vector_dimensions=128, 
    index_type="IVF"
)


INFO:database_admin:Created metadata table and collection with name similaritysearch and vector dimensions 128


In [4]:
task

{'tasks': [{'attachments': [{'id': '0a5f613a-b8d5-4340-9c89-7cf02a41cb4e',
     'type': 'ElasticNetworkInterface',
     'status': 'PRECREATED',
     'details': [{'name': 'subnetId', 'value': 'subnet-0b7ed75d0fa9de2e4'}]}],
   'attributes': [{'name': 'ecs.cpu-architecture', 'value': 'x86_64'}],
   'availabilityZone': 'us-east-2b',
   'clusterArn': 'arn:aws:ecs:us-east-2:350996086543:cluster/pg-vectors-similarity-search-test-ecscluster7830E7B5-eJ0S8m1U6RKk',
   'containers': [{'containerArn': 'arn:aws:ecs:us-east-2:350996086543:container/pg-vectors-similarity-search-test-ecscluster7830E7B5-eJ0S8m1U6RKk/d55e13ba19a94915bae6102f84bd7c57/ef1debc5-6ed8-4c9c-b6b4-752a1f2c69bd',
     'taskArn': 'arn:aws:ecs:us-east-2:350996086543:task/pg-vectors-similarity-search-test-ecscluster7830E7B5-eJ0S8m1U6RKk/d55e13ba19a94915bae6102f84bd7c57',
     'name': 'pg-vectors-ingest-container',
     'image': '350996086543.dkr.ecr.us-east-2.amazonaws.com/cdk-hnb659fds-container-assets-350996086543-us-east-2:7a86

In [27]:
query_vector = prep_query_vector(encoded_image_bytes)


distances = [1,2,4,8]
neighors = [10, 25, 100, 1000]
filters = {"and": {"image_dt": {"$lt": "2020-" }}, {"image_dt": {"$gt":}}}
        
for neighbors in 10, 25, 100, 1000: 
        
res = pgvector_client.query(
    vector=query_vector, 
    neighbors=100,
    distance=10,
)

NoResultFound: No row was found when one was required

In [14]:
res

[('2079040', 2.087820976093045, {'quadkey': '03132123', 'image_dt': '2000-07-10'}),
 ('2156041', 2.3478170901896265, {'quadkey': '12221102', 'image_dt': '2000-07-25'}),
 ('744376', 2.551118138849677, {'quadkey': '31011130', 'image_dt': '2000-05-05'}),
 ('2018752', 2.7679285860275202, {'quadkey': '03302233', 'image_dt': '2000-07-26'}),
 ('727639', 2.818507946133093, {'quadkey': '13233001', 'image_dt': '2000-05-18'}),
 ('1201294', 2.9839941209657828, {'quadkey': '03323030', 'image_dt': '2000-06-07'}),
 ('312127', 3.0822567377655448, {'quadkey': '03231323', 'image_dt': '2000-05-29'}),
 ('1744142', 3.171083964059287, {'quadkey': '02030021', 'image_dt': '2000-07-11'}),
 ('1714104', 3.173217690466541, {'quadkey': '02201200', 'image_dt': '2000-07-27'}),
 ('2497736', 3.1863927115855737, {'quadkey': '13312232', 'image_dt': '2000-07-26'}),
 ('2012007', 3.2087059517234855, {'quadkey': '03120201', 'image_dt': '2000-07-14'}),
 ('2015388', 3.2511171431802306, {'quadkey': '03122010', 'image_dt': '200

In [11]:
start = time.time()

query_vector = prep_query_vector(encoded_image_bytes)
neighbors_query = f"""
SELECT *, embedding <-> '{query_vector}' as distance 
FROM images 
ORDER BY embedding <-> '{query_vector}' 
LIMIT 5"""

with psycopg.connect(**database_secrets) as conn:
    with conn.cursor() as cursor:     
        results = list(cursor.execute(neighbors_query).fetchall())

elapsed = round(time.time() - start, 2)
print(f"Total results: {len(results)}. Took {elapsed} seconds")

DataException: different vector dimensions 128 and 512

In [32]:
distance_query = f"""
SELECT *, embedding <-> '{query_vector}' as distance 
FROM images 
WHERE embedding <-> '{query_vector}' < 5 
ORDER BY embedding <-> '{query_vector}'
"""

start = time.time()
query_vector = prep_query_vector(encoded_image_bytes)
with psycopg.connect(**database_secrets) as conn:
    with conn.cursor() as cursor:     
        results = list(cursor.execute(distance_query).fetchall())

elapsed = round(time.time() - start, 2)
print(f"Total results: {len(results)}. Took {elapsed} seconds")

Total results: 938. Took 3.07 seconds


In [36]:
distance_query = f"""
SELECT *, embedding <-> '{query_vector}' as distance 
FROM images 
WHERE embedding <-> '{query_vector}' < 5 
AND datetime BETWEEN '2020-01-01'::timestamp AND '2020-06-01'::timestamp
ORDER BY embedding <-> '{query_vector}'
"""

start = time.time()
query_vector = prep_query_vector(encoded_image_bytes)
with psycopg.connect(**database_secrets) as conn:
    with conn.cursor() as cursor:     
        results = list(cursor.execute(distance_query).fetchall())

elapsed = round(time.time() - start, 2)
print(f"Total results: {len(results)}. Took {elapsed} seconds")

Total results: 305. Took 2.67 seconds


In [14]:
STACK_NAME = "similarity-search-api-v2-dev"
cloudformation = boto3.client("cloudformation")
stack_outputs = cloudformation.describe_stacks(StackName=STACK_NAME)["Stacks"][0]["Outputs"]
[endpoint_url] = [o["OutputValue"] for o in stack_outputs if o["OutputKey"].startswith("apiEndpoint")]


In [25]:
start = time.time()
resp = requests.post(f"https://{endpoint_url}/search", data=json.dumps({"image":encoded_image_bytes.decode(), "neighbors":3})).json()
elapsed = round(time.time() - start, 2)
print(f"Total results: {resp['numberMatched']}. Took {elapsed} seconds")

Total results: 3. Took 1.43 seconds


In [30]:
start = time.time()
resp = requests.post(f"https://{endpoint_url}/distance", data=json.dumps({"image":encoded_image_bytes.decode(), "distance":5})).json()
elapsed = round(time.time() - start, 2)
print(f"Total results: {resp}. Took {elapsed} seconds")

Total results: {'type': 'FeatureCollection', 'features': [{'type': 'Feature', 'bbox': [-157.5, -21.943045533438177, -135.0, 0.0], 'id': 'Tile(x=1, y=8, z=4)', 'geometry': {'type': 'Polygon', 'coordinates': [[[-157.5, -21.943045533438177], [-157.5, 0.0], [-135.0, 0.0], [-135.0, -21.943045533438177], [-157.5, -21.943045533438177]]]}, 'properties': {'title': 'XYZ tile Tile(x=1, y=8, z=4)', 'bbox': [-157.5, -21.943045533438177, -135.0, 0.0], 'bin_start_time': '2001-01-01T00:00:00+00:00', 'count': 6}}, {'type': 'Feature', 'bbox': [-135.0, -21.943045533438177, -112.5, 0.0], 'id': 'Tile(x=2, y=8, z=4)', 'geometry': {'type': 'Polygon', 'coordinates': [[[-135.0, -21.943045533438177], [-135.0, 0.0], [-112.5, 0.0], [-112.5, -21.943045533438177], [-135.0, -21.943045533438177]]]}, 'properties': {'title': 'XYZ tile Tile(x=2, y=8, z=4)', 'bbox': [-135.0, -21.943045533438177, -112.5, 0.0], 'bin_start_time': '2001-01-01T00:00:00+00:00', 'count': 3}}, {'type': 'Feature', 'bbox': [45.0, 0.0, 67.5, 21.943