In [1]:
import json
import os
from pathlib import Path

import pandas as pd
import psycopg2
from psycopg2.extras import execute_batch
from psycopg2.extras import execute_values
from sentence_transformers import SentenceTransformer

# Sentence-embedding model. It yields 384-dimensional vectors, which is why
# the table below declares its column as vector(384).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Database connection parameters.
# Each value can be overridden through the standard libpq environment
# variables so real credentials never need to be saved in the notebook; the
# fallbacks are the original local-development values.
DB_PARAMS = {
    "dbname": os.environ.get("PGDATABASE", "postgres"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "postgres"),
    "host": os.environ.get("PGHOST", "localhost"),
    "port": os.environ.get("PGPORT", "5433"),  # non-standard port; PostgreSQL's default is 5432
}

# Connect to PostgreSQL
try:
    conn = psycopg2.connect(**DB_PARAMS)
except psycopg2.Error as e:
    print(f"Error connecting to database: {e}")
    # Re-raise rather than calling exit(1): the traceback stays visible in the
    # notebook and later cells fail loudly instead of running without `conn`.
    raise

# Every statement is committed as soon as it is executed.
conn.autocommit = True

# SQL used by the cells below.
init_pgvector = "CREATE EXTENSION IF NOT EXISTS vector;"  # execute this first
# IF NOT EXISTS keeps the setup cell re-runnable after a kernel restart.
create_table = """CREATE TABLE IF NOT EXISTS "example" (id bigserial PRIMARY KEY, keyword VARCHAR(100), embedding vector(384));"""
# The single %s placeholder is expanded by psycopg2.extras.execute_values.
insert_embeddings = "INSERT INTO example (keyword, embedding) VALUES %s"

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Enable the pgvector extension, create the demo table, then list the tables
# whose names do not start with "pg_" or "sql_".
# The `with` block closes the cursor when the cell finishes.
with conn.cursor() as cur:
    cur.execute(init_pgvector)
    cur.execute(create_table)
    conn.commit()  # harmless when the connection is already in autocommit mode
    cur.execute("select relname from pg_class where relkind='r' and relname !~ '^(pg_|sql_)';")
    print(cur.fetchall())


[('example',), ('example1',)]


In [2]:
# Short English and Spanish sample phrases used to populate the "example" table.
# NOTE(review): the original note here read "maximum limit is 384 tokens".
# 384 is the size of each embedding (see the printed length in the next cell);
# confirm the model's real input limit via model.max_seq_length.
sentences = [
    "hello world",
    "hello python",
    "bye python",
    "la girafa es roja",
    "python",
    "bython",
    "el mico esta trepando",
    "mico",
    "el mico es cafe",
]

# One normalised embedding per sentence, converted to plain Python lists.
embeddings = model.encode(
    sentences,
    normalize_embeddings=True,
).tolist()

In [3]:
# Sanity check: number of sentences, then the length of one embedding.
print(len(sentences), len(embeddings[0]), sep="\n")

9
384


In [4]:
# Pair every sentence with its embedding and bulk-insert the rows.
# Note: this is a plain INSERT, so re-running the cell adds the same rows again.
data = list(zip(sentences, embeddings))
with conn.cursor() as curs:
    execute_values(curs, insert_embeddings, data, template="(%s, %s)")

In [5]:
# Embed the search term and serialise it as a "[x, y, ...]" string, which is
# passed to the query as the %(data)s parameter.
data = str(model.encode("python", normalize_embeddings=False).tolist())

# Five nearest rows by the pgvector `<=>` distance, reported as 1 - distance.
# psycopg2 substitutes %(data)s everywhere in the text, including the
# commented-out WHERE line, so that line must stay a valid placeholder.
top_matches_sql = """
            SELECT keyword,
            1-(embedding <=> %(data)s) as similarity
            FROM example
            --WHERE 1-(embedding <=> %(data)s) >= 0.5
            ORDER BY embedding <=> %(data)s
            LIMIT 5;
        """

with conn.cursor() as curs:
    curs.execute(top_matches_sql, {"data": data})
    # Each row is a (keyword, similarity) tuple.
    result = curs.fetchall()
result

[('python', 1.0),
 ('python', 1.0),
 ('hello python', 0.8450508713722229),
 ('hello python', 0.8450508713722229),
 ('bye python', 0.6831845045089722)]

In [6]:
# Build an IVFFlat index for cosine-distance searches on example.embedding.
# The index is named and guarded with IF NOT EXISTS so that re-running this
# cell is a no-op instead of creating one more unnamed index every time.
# NOTE(review): lists = 100 is kept from the original; it looks large for the
# handful of rows inserted above — confirm against pgvector's sizing guidance.
with conn.cursor() as curs:
    curs.execute("CREATE INDEX IF NOT EXISTS example_embedding_idx ON example USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);")

In [2]:
# Read every row of "example1" into a DataFrame.
# NOTE(review): "example1" is not created anywhere in the visible part of this
# notebook — it presumably has (id, keyword, <vector>) columns in that order;
# verify before relying on the column names below.
# The `with` block closes the cursor once the rows have been fetched.
with conn.cursor() as cur:
    cur.execute("SELECT * FROM example1")
    tuples_list = cur.fetchall()

df = pd.DataFrame(tuples_list, columns=["id","keyword","vector"])
df.head(5)

Unnamed: 0,id,keyword,vector
0,1,hello world,"[-0.03447727,0.031023229,0.006734971,0.0261089..."
1,2,hello python,"[-0.050150678,0.027626721,-0.014199368,0.05738..."
2,3,bye python,"[-0.0173582,0.055295315,0.044234786,0.05157163..."
3,4,la girafa es roja,"[-0.017561583,0.027168112,-0.05538615,0.040346..."
4,5,python,"[-0.05615145,0.017742552,-0.059133362,0.040266..."


In [3]:
from ast import literal_eval

# The vectors arrive from the database as "[x,y,...]" strings; turn them into
# Python lists. Values that are no longer strings are left untouched so the
# cell can be re-run safely (literal_eval raises ValueError on a list).
df["vector"] = df["vector"].apply(
    lambda value: literal_eval(value) if isinstance(value, str) else value
)

# Number of components in each parsed vector.
df["vector_length"] = df["vector"].apply(len)

In [4]:
# Preview the first five rows (head() defaults to n=5), now including vector_length.
df.head()

Unnamed: 0,id,keyword,vector,vector_length
0,1,hello world,"[-0.03447727, 0.031023229, 0.006734971, 0.0261...",384
1,2,hello python,"[-0.050150678, 0.027626721, -0.014199368, 0.05...",384
2,3,bye python,"[-0.0173582, 0.055295315, 0.044234786, 0.05157...",384
3,4,la girafa es roja,"[-0.017561583, 0.027168112, -0.05538615, 0.040...",384
4,5,python,"[-0.05615145, 0.017742552, -0.059133362, 0.040...",384


In [13]:
# Create the "chunks" table: a text chunk of up to 400 characters plus its
# 384-dimensional embedding. IF NOT EXISTS keeps the cell re-runnable.
create_table = """CREATE TABLE IF NOT EXISTS "chunks" (id bigserial PRIMARY KEY, keyword VARCHAR(400), embedding vector(384));"""

# The `with` block closes the cursor when the cell finishes.
with conn.cursor() as cur:
    cur.execute(create_table)
    conn.commit()  # harmless when the connection is already in autocommit mode

In [14]:
# Six consecutive text chunks (in Spanish) of a short summary of The Lord of
# the Rings. Each literal keeps its leading/trailing newline and trailing
# spaces, and is stored verbatim in the "chunks" table.
# NOTE(review): the original note here read "maximum limit is 384 tokens".
# 384 is the embedding size used by this notebook (vector(384)); confirm the
# model's real input limit via model.max_seq_length.
chunk1 = """
El Señor de los Anillos es una épica de fantasía escrita por J.R.R. Tolkien. 
"""
chunk2 = """
La historia sigue a Frodo Bolsón, un hobbit que hereda un anillo poderoso y maligno creado por el Señor Oscuro, 
Sauron
"""
chunk3 = """
para dominar la Tierra Media. Acompañado por un grupo de aliados conocidos como la Comunidad del Anillo 
"""
chunk4 = """
incluyendo a Aragorn, Gandalf, Legolas, Gimli, Sam, Merry y Pippin, 
Frodo debe viajar a Mordor para destruir el Anillo en el único lugar donde puede ser destruido: el Monte del Destino.
"""
chunk5 = """
En su travesía, la Comunidad enfrenta numerosos peligros: desde orcos y traiciones hasta el constante acecho de Gollum, 
una criatura corrompida por el Anillo. 
"""
chunk6 = """
La historia explora temas como la amistad, la corrupción del poder y la lucha entre
el bien y el mal. Al final, tras arduas batallas y sacrificios, el Anillo es destruido y la paz retorna a la Tierra Media.
"""


In [15]:
# Collect the chunks in order and compute one normalised embedding per chunk.
sentences = [
    chunk1,
    chunk2,
    chunk3,
    chunk4,
    chunk5,
    chunk6,
]
embeddings = model.encode(
    sentences,
    normalize_embeddings=True,
).tolist()

# Sanity check: number of chunks, then the length of one embedding.
print(len(sentences), len(embeddings[0]), sep="\n")

6
384


In [16]:
# Bulk-insert every (chunk text, embedding) pair into "chunks".
# The single %s placeholder is expanded by execute_values.
# Note: this is a plain INSERT, so re-running the cell adds the same rows again.
insert_embeddings = "INSERT INTO chunks (keyword, embedding) VALUES %s"
data = list(zip(sentences, embeddings))
with conn.cursor() as curs:
    execute_values(curs, insert_embeddings, data, template="(%s, %s)")

In [21]:
# Embed the search phrase and serialise it as a "[x, y, ...]" string, which is
# passed to the query as the %(data)s parameter.
data = str(model.encode("grupo de personajes", normalize_embeddings=False).tolist())

# Five nearest chunks by the pgvector `<=>` distance, reported as 1 - distance.
# psycopg2 substitutes %(data)s everywhere in the text, including the
# commented-out WHERE line, so that line must stay a valid placeholder.
top_chunks_sql = """
            SELECT keyword,
            1-(embedding <=> %(data)s) as similarity
            FROM chunks
            --WHERE 1-(embedding <=> %(data)s) >= 0.5
            ORDER BY embedding <=> %(data)s
            LIMIT 5;
        """

with conn.cursor() as curs:
    curs.execute(top_chunks_sql, {"data": data})
    # Each row is a (chunk text, similarity) tuple.
    result = curs.fetchall()
result

[('\npara dominar la Tierra Media. Acompañado por un grupo de aliados conocidos como la Comunidad del Anillo \n',
  0.462298136558964),
 ('\nLa historia explora temas como la amistad, la corrupción del poder y la lucha entre\nel bien y el mal. Al final, tras arduas batallas y sacrificios, el Anillo es destruido y la paz retorna a la Tierra Media.\n',
  0.4284088994630204),
 ('\nincluyendo a Aragorn, Gandalf, Legolas, Gimli, Sam, Merry y Pippin, \nFrodo debe viajar a Mordor para destruir el Anillo en el único lugar donde puede ser destruido: el Monte del Destino.\n',
  0.4005504012599963),
 ('\nLa historia sigue a Frodo Bolsón, un hobbit que hereda un anillo poderoso y maligno creado por el Señor Oscuro, \nSauron\n',
  0.3714124296378296),
 ('\nEn su travesía, la Comunidad enfrenta numerosos peligros: desde orcos y traiciones hasta el constante acecho de Gollum, \nuna criatura corrompida por el Anillo. \n',
  0.3212281369797021)]