# Search and Retrieval

## Setup

In [1]:
from curate_gpt.store import get_store

# assumes pre-populated
store = get_store("chromadb", "../../db")

## Similarity Search

Searches based on embedding of text in query against vector database

In [2]:
for obj, distance, info in store.search("forebrain neurons", collection="ont_cl"):
    print(f"* {obj['id']}\n    - Distance: {distance} Text: {info['document'][0:20]}...")

## Retrieval

Boolean queries

In [35]:
objs = list(store.find({"id": "NeuronOfTheForebrain"}, collection="ont_cl"))
objs

## Including Vector Embeddings in results



In [14]:
objs = list(store.find({"id": "NeuronOfTheForebrain"}, collection="ont_cl", include=["metadatas", "documents", "embeddings"]))
_, __, info = objs[0]
info["_embeddings"][0:20]

## Lookup by ID

Assumes that there is an identifier column

In [6]:
store.lookup("NeuronOfTheForebrain", collection="ont_cl")

## All by All

In [15]:
objs = list(store.find({}, collection="ont_cl", include=["metadatas", "documents", "embeddings"]))
len(objs)

In [32]:
import numpy as np

def compute_cosine_similarity(list1, list2):
    # Convert lists to numpy arrays
    matrix1 = np.array(list1)
    matrix2 = np.array(list2)
    
    # Normalize the vectors in both matrices
    matrix1_norm = matrix1 / np.linalg.norm(matrix1, axis=1)[:, np.newaxis]
    matrix2_norm = matrix2 / np.linalg.norm(matrix2, axis=1)[:, np.newaxis]
    
    # Compute dot products (resulting in cosine similarity values)
    cosine_similarity_matrix = np.dot(matrix1_norm, matrix2_norm.T)
    
    return cosine_similarity_matrix

# Sample lists of vectors
list1 = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
list2 = [[7, 8, 9], [1, 0, 0], [0, 1, 0], [0, 0, 1]]

cosine_similarity_matrix = compute_cosine_similarity(list1, list2)
cosine_similarity_matrix


In [33]:
cosine_similarity_matrix[2][0]

In [17]:
vectors = [info["_embeddings"] for _, __, info in objs]

In [18]:
cosine_similarity_matrix = compute_cosine_similarity(vectors, vectors)

In [19]:
len(cosine_similarity_matrix)

In [20]:
def top_matches(cosine_similarity_matrix):
    # Find the index of the maximum value in each row
    top_match_indices = np.argmax(cosine_similarity_matrix, axis=1)
    
    # Find the maximum similarity value in each row
    top_match_values = np.amax(cosine_similarity_matrix, axis=1)
    
    return top_match_indices, top_match_values

In [34]:
list1 = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
list2 = [[0.5, 0.5, 0.5], [1, 0, 0], [0, 1, 0], [0, 0, 1]]

# Re-compute the cosine similarity matrix
test_matrix = compute_cosine_similarity(list1, list2)

# Find the top matches
top_indices, top_values = top_matches(test_matrix)
top_indices, top_values

In [22]:
tm_ix, tm_vals = top_matches(cosine_similarity_matrix)
len(tm_ix)

In [24]:
tm_ix[0:5]

In [25]:
tm_vals[0:5]

## Latency Check

In [74]:
store = get_store("chromadb", "../../db")
client = store.client

In [75]:
cxn = client.get_collection("ont_ecosim")

In [76]:
x = cxn.get(where={}, include=['metadatas', 'documents', 'embeddings'])

In [77]:
len(x["embeddings"])

In [78]:
len(x["embeddings"][0])

In [69]:
x = cxn.get(where={}, include=['metadatas', 'documents', 'embeddings'])

## DuckDBAdapter

In [7]:
from curate_gpt.store import get_store
duck_store = get_store("duckdb", "./duckdb_jn.db")

ValueError: Unknown view duckdb_vss, not found in [<class 'curate_gpt.store.chromadb_adapter.ChromaDBAdapter'>, <class 'curate_gpt.store.duckdb_adapter.DuckDBAdapter'>, <class 'curate_gpt.store.in_memory_adapter.InMemoryAdapter'>]

In [8]:
duck_store.list_collection_names()

NameError: name 'duck_store' is not defined

In [3]:
obj_to_insert = {
    'id': '10MinuteAPGARScoreOf0',
    'label': '10-minute APGAR score of 0',
    'definition': None,
    'aliases': None,
    'relationships': [{'predicate': 'subClassOf', 'target': 'Low10MinuteAPGARScore'}],
    'logical_definition': None,
    'original_id': 'HP:0033468'
}

In [4]:
duck_store.insert([obj_to_insert], collection="test_collection")
duck_store.insert([{"id": "test7", "label": "This is no test"}])
duck_store.insert([{"id": "test2", "label": "This is no test"}])
duck_store.insert([{"id": "test3", "label": "This is a test"}])
duck_store.insert([{"id": "test4", "label": "This is a test"}])
duck_store.insert([{"id": "test5", "label": "This is no test"}])
duck_store.insert([{"id": "test6", "label": "This is no test"}])

NameError: name 'duck_store' is not defined

In [5]:
for obj, distance, info in duck_store.matches({"id":"test3"}, collection="test_collection")
    print(f"* {obj['id']}\n    - Distance: {distance} Text: {info['document'][0:20]}...")

SyntaxError: expected ':' (2451211334.py, line 2)

In [8]:
duck_store.lookup("test1", collection="test_collection")

In [9]:
objs_find = list(duck_store.find({"id": "test2"}, collection="test_collection"))
_, __, info = objs_find[0]
info["_embeddings"][0:20]

In [10]:
x = list(duck_store.find({}, collection="test_collection"))
x

In [9]:
import json
for o, d, m in duck_store.search("test", collection="test_collection"):
    print(f"* {json.dumps(o)}\n    - Distance: {d}")

NameError: name 'duck_store' is not defined