In [16]:
from numpy import ndarray
from sentence_transformers import SentenceTransformer

# Load the sentence encoder once at notebook level; generate_embeddings()
# reads this module-level `model` instead of reloading weights per call.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1625.69it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [17]:
import os

# libpq keyword/value connection string used by the query cell below.
# Set the APPDB_CONNINFO environment variable to point at another database or
# to keep real credentials out of the notebook; when it is unset we fall back
# to the local development database.
database_connection_string = os.environ.get(
    "APPDB_CONNINFO",
    "host=localhost port=5432 dbname=appdb user=app password=password",
)

In [11]:
async def generate_embeddings(string: list, device=None) -> ndarray:
    """Encode a batch of texts with the notebook-level `model`.

    Args:
        string: List of texts to embed (the name is kept for backward
            compatibility even though it holds a list).
        device: Optional device passed straight to ``model.encode``. The
            default ``None`` lets the model use the device it was loaded on,
            so the notebook also runs on machines without Apple "mps".
            Passing a list of devices asks sentence-transformers for
            multi-process encoding, which is rarely worth it for the small
            batches used here.

    Returns:
        The value of ``model.encode(..., convert_to_numpy=True,
        normalize_embeddings=True)`` — presumably one normalized row per
        input text; confirm the exact shape against the model in use.
    """
    import asyncio  # local import: this cell has no import block of its own

    # encode() is synchronous and compute-heavy; running it in a worker thread
    # keeps the event loop (and the notebook kernel) responsive while awaiting.
    return await asyncio.to_thread(
        model.encode,
        string,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
        device=device,
    )

In [12]:
import psycopg

BATCH_SIZE = 100  # default number of rows fetched, embedded and committed per round


async def update_embeddings_in_batches(
    conn_str: str = "dbname=appdb user=app password=password host=localhost",
    batch_size: int = BATCH_SIZE,
):
    """Compute and store an embedding for every row of ``sentences``.

    Rows are streamed through a named cursor on one connection while the
    UPDATEs are committed batch by batch on a second connection.

    Args:
        conn_str: libpq connection string used for both connections. Defaults
            to the local development database.
        batch_size: Number of rows fetched, embedded and committed per round.
            Must be a positive integer.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Processing is best-effort: a batch that fails is rolled back and reported,
    and the loop moves on to the next batch. The ids of all failed rows are
    printed once at the end so they can be retried.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    failed_ids = []

    # Two connections: presumably so that the per-batch commit on the write
    # side does not end the transaction that keeps the read cursor open.
    async with await psycopg.AsyncConnection.connect(conn_str) as read_conn, \
               await psycopg.AsyncConnection.connect(conn_str) as write_conn:

        async with read_conn.transaction():
            # A named cursor is fetched from incrementally via fetchmany()
            # instead of loading the whole result set at once.
            async with read_conn.cursor(name="embedding_cursor") as read_cur:
                await read_cur.execute(
                    "SELECT id, sentence FROM sentences WHERE true"
                )

                async with write_conn.cursor() as write_cur:
                    while True:
                        batch = await read_cur.fetchmany(batch_size)
                        if not batch:
                            break

                        batch_ids = [r[0] for r in batch]
                        batch_texts = [r[1] for r in batch]

                        try:
                            embeddings = await generate_embeddings(batch_texts)

                            # Convert numpy rows to plain lists before binding
                            # them as query parameters.
                            if hasattr(embeddings[0], "tolist"):
                                embeddings = [e.tolist() for e in embeddings]

                            update_data = list(zip(embeddings, batch_ids))

                            await write_cur.executemany(
                                "UPDATE sentences SET embedding = %s WHERE id = %s",
                                update_data
                            )
                            await write_conn.commit()
                            print(f"Committed {len(batch)} records.")

                        except Exception as e:
                            # Best-effort: undo the partial batch, remember
                            # which rows were affected, and keep going.
                            await write_conn.rollback()
                            failed_ids.extend(batch_ids)
                            print(f"Error: {e}")

    if failed_ids:
        print(f"Failed to update {len(failed_ids)} records; ids: {failed_ids}")

# Top-level await: relies on the notebook kernel's already-running event loop
# to drive the coroutine (this line is not valid in a plain .py module).
await update_embeddings_in_batches()

Committed 100 records.
Committed 99 records.


In [18]:
from pgvector.psycopg import register_vector
import psycopg

query = "5 buckets of life"
query_embedding = await generate_embeddings([query])

# Flatten properly
query_embedding = query_embedding[0]

with psycopg.connect(database_connection_string) as conn:
    register_vector(conn)

    with conn.cursor() as cur:

        cur.execute(
            """
            SELECT sentence, embedding
            FROM sentences
            ORDER BY embedding <=> %s
            LIMIT 10
            """,
            (query_embedding,)
        )

        results = cur.fetchall()

for result in results:
    print(result[0])

If you look at what you have in life, you'll always have more.
Trust is gained in drops and lost in buckets.
You are the average of the five people you spend the most time with.
Hard choices, easy life. Easy choices, hard life.
Life is 10% what happens to you and 90% how you react to it.
The meaning of life is to find your gift. The purpose of life is to give it away.
An unexamined life is not worth living.
Every cloud has a silver lining.
Make your life a masterpiece; imagine no limitations on what you can be.
Live as if you were to die tomorrow. Learn as if you were to live forever.
