In [1]:
import json
import psycopg2
import pandas as pd
from psycopg2.extras import execute_batch
from pathlib import Path
from sentence_transformers import SentenceTransformer
from psycopg2.extras import execute_values

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Database connection parameters
DB_PARAMS = {
    "dbname": "postgres",
    "user": "postgres",
    "password": "postgres",
    "host": "localhost",
    "port": "5433"  # Default is 5432
}

# Connect to PostgreSQL
try:
    conn = psycopg2.connect(**DB_PARAMS)
    conn.autocommit = True
    # cursor = conn.cursor()
except Exception as e:
    print(f"Error connecting to database: {e}")
    exit(1)

init_pgvector = "CREATE EXTENSION IF NOT EXISTS vector;" # execute this first
create_table = """CREATE TABLE "example1" (id bigserial PRIMARY KEY, keyword VARCHAR(100), embedding vector(384));"""
insert_embeddings = "INSERT INTO example1 (keyword, embedding) VALUES %s"

  from tqdm.autonotebook import tqdm, trange


In [None]:
#install pgvector
cur = conn.cursor()
cur.execute(init_pgvector)
cur.execute(create_table)
conn.commit()
cur.execute("select relname from pg_class where relkind='r' and relname !~ '^(pg_|sql_)';")
print(cur.fetchall())


[('example',), ('example1',)]


In [2]:
#limite maximo es de 384 tokens
sentences = ["hello world", "hello python", "bye python", "la girafa es roja", "python", "bython", "el mico esta trepando", "mico", "el mico es cafe"]
embeddings = model.encode(sentences, normalize_embeddings=True).tolist()

In [3]:
print(len(sentences))
print(len(embeddings[0]))

9
384


In [4]:
# embeddings = model.encode(sentences, normalize_embeddings=True).tolist()
data = [(sentence, embedding) for sentence, embedding in zip(sentences, embeddings)]
with conn.cursor() as curs:
    execute_values(
                curs,
                insert_embeddings,
                data,
                template="(%s, %s)"
            )

In [5]:
data = str(model.encode("python", normalize_embeddings=False).tolist())
with conn.cursor() as curs:
    curs.execute("""
            SELECT keyword,
            1-(embedding <=> %(data)s) as similarity
            FROM example
            --WHERE 1-(embedding <=> %(data)s) >= 0.5
            ORDER BY embedding <=> %(data)s
            LIMIT 5;
        """,
        {"data": data}
    )
    result = [(keyword) for keyword in curs.fetchall()]
result

[('python', 1.0),
 ('python', 1.0),
 ('hello python', 0.8450508713722229),
 ('hello python', 0.8450508713722229),
 ('bye python', 0.6831845045089722)]

In [6]:
with conn.cursor() as curs:
    curs.execute("CREATE INDEX ON example USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);")

In [25]:
cur = conn.cursor()
cur.execute("SELECT * FROM example1")
tuples_list = cur.fetchall()
#print(tuples_list)

df = pd.DataFrame(tuples_list, columns=["id","keyword","vector"])
df.head(5)

Unnamed: 0,id,keyword,vector
0,1,hello world,"[-0.03447727,0.031023229,0.006734971,0.0261089..."
1,2,hello python,"[-0.050150678,0.027626721,-0.014199368,0.05738..."
2,3,bye python,"[-0.0173582,0.055295315,0.044234786,0.05157163..."
3,4,la girafa es roja,"[-0.017561583,0.027168112,-0.05538615,0.040346..."
4,5,python,"[-0.05615145,0.017742552,-0.059133362,0.040266..."


In [52]:
from ast import literal_eval

df["vector"] = df.vector.apply(literal_eval)

df["vector_length"] = df["vector"].apply(len)

ValueError: malformed node or string: [-0.03447727, 0.031023229, 0.006734971, 0.026108956, -0.039362054, -0.1603025, 0.06692399, -0.0064414283, -0.04745049, 0.014758861, 0.070875295, 0.05552758, 0.019193375, -0.026251333, -0.01010954, -0.026940506, 0.022307457, -0.022226686, -0.1496926, -0.017493034, 0.0076762526, 0.054352295, 0.0032544436, 0.031725906, -0.08462135, -0.029406007, 0.05159558, 0.048124082, -0.0033147954, -0.058279198, 0.04196929, 0.022210686, 0.12818883, -0.022338957, -0.011656217, 0.062928334, -0.03287631, -0.091226056, -0.031175377, 0.05269955, 0.0470348, -0.084203094, -0.030056167, -0.02074481, 0.009517839, -0.0037218125, 0.007343314, 0.039324336, 0.093274035, -0.0037885893, -0.052742094, -0.05805821, -0.0068643717, 0.0052832714, 0.08289297, 0.019362831, 0.006284503, -0.010330768, 0.009032373, -0.037683733, -0.04520607, 0.024016323, -0.006944155, 0.013491643, 0.10005486, -0.07168391, -0.021695092, 0.03161841, -0.051634617, -0.082247764, -0.06569329, -0.009895369, 0.005816376, 0.0735546, -0.034050286, 0.024886087, 0.014488119, 0.026457386, 0.009656749, 0.030217245, 0.052803975, -0.07535987, 0.009897141, 0.029836833, 0.017555563, 0.023091994, 0.0019338616, 0.0014002392, -0.047175948, -0.011194309, -0.114201404, -0.019811928, 0.04026621, 0.0021929985, -0.0797922, -0.025382321, 0.09448301, -0.028981108, -0.14500257, 0.23097746, 0.027731156, 0.03211152, 0.031065047, 0.042832874, 0.06423779, 0.03216321, -0.004876744, 0.05569943, -0.037532415, -0.021505516, -0.02834265, -0.028846953, 0.038353045, -0.017468644, 0.05248531, -0.074876025, -0.03125975, 0.02184157, -0.03989569, -0.008587065, 0.026956527, -0.048495486, 0.0114698745, 0.02961826, -0.020572208, 0.0131038455, 0.028833456, -3.194199e-33, 0.064782135, -0.01813022, 0.05178991, 0.121982746, 0.02878016, 0.00872198, -0.07052119, -0.016907321, 0.040739674, 0.04211619, 0.025447167, 0.035746258, -0.04914478, 0.0021290418, -0.015546542, 0.05073059, -0.048185244, 0.03588061, -0.004067046, 0.10172473, -0.055970006, -0.010681048, 0.011235791, 0.09068652, 0.004234447, 0.035138693, -0.009702851, -0.093865186, 0.092855595, 0.0080049345, -0.007705391, -0.052086715, -0.0125880195, 0.0032669476, 0.0060135256, 0.0075815828, 0.010517144, -0.08634556, -0.06987879, -0.0025339008, -0.0909766, 0.046887293, 0.052076533, 0.007193838, 0.010903631, -0.0052295118, 0.013937317, 0.021968327, 0.03420856, 0.06022471, 0.0001166287, 0.014732014, -0.070089236, 0.028499085, -0.027601719, 0.010768415, 0.034830917, -0.022487888, 0.0097690625, 0.0772278, 0.021588406, 0.11495623, -0.06800117, 0.02376095, -0.015983969, -0.01782696, 0.06439487, 0.032025676, 0.050270278, -0.0059137046, -0.03370804, 0.017840274, 0.016573349, 0.06329653, 0.03467719, 0.04647347, 0.09790612, -0.0066355206, 0.025207091, -0.07798831, 0.01692645, -0.0009458001, 0.022471858, -0.03825319, 0.09570478, -0.005350807, 0.010469062, -0.115240544, -0.013262538, -0.010709436, -0.08311731, 0.07327359, 0.049392264, -0.008994344, -0.095845595, 3.3661493e-33, 0.12493183, 0.019349722, -0.05822576, -0.035988245, -0.050746743, -0.045662414, -0.082603365, 0.14819477, -0.08842121, 0.060274433, 0.05103016, 0.010303189, 0.14121422, 0.030813828, 0.0610331, -0.052851237, 0.13664895, 0.009189892, -0.017325232, -0.012848629, -0.007995269, -0.05098006, -0.052350637, 0.007593023, -0.015166365, 0.016960315, 0.021270562, 0.020558035, -0.12002814, 0.014461805, 0.026759928, 0.02533069, -0.04275463, 0.0067684664, -0.014458545, 0.045261968, -0.091476545, -0.019439096, -0.017833486, -0.054910176, -0.052641056, -0.010459077, -0.05201606, 0.020891989, -0.07997032, -0.01211131, -0.0577314, 0.02317825, -0.008031736, -0.025989288, -0.07995674, -0.020728845, 0.048817642, -0.020389147, -0.0491766, 0.0141596915, -0.063622035, -0.0078074057, 0.01643153, -0.025682492, 0.013381166, 0.02624878, 0.009978411, 0.06322888, 0.00267219, -0.006582785, 0.016631931, 0.032366455, 0.037942518, -0.03637604, -0.006910864, 0.00015970584, -0.0016334952, -0.027278189, -0.02803813, 0.049681455, -0.02886723, -0.002418075, 0.014774918, 0.009764575, 0.005797604, 0.0134861665, 0.005567906, 0.03722711, 0.007232497, 0.040156234, 0.08150331, 0.07199169, -0.013056158, -0.042882044, -0.011011214, 0.00489779, -0.009229707, 0.03519152, -0.05103506, -1.571438e-08, -0.088624395, 0.023909308, -0.016238755, 0.031700518, 0.027284278, 0.052468795, -0.047071014, -0.058847513, -0.06320818, 0.040888563, 0.049828034, 0.10655168, -0.07450239, -0.012495374, 0.018370701, 0.039474104, -0.024797896, 0.0145163, -0.037069198, 0.020015689, -4.85896e-05, 0.009866573, 0.024838768, -0.052458182, 0.029314188, -0.0871919, -0.014499757, 0.02601911, -0.018746357, -0.0762052, 0.03504326, 0.103639536, -0.028050505, 0.012718171, -0.07632548, -0.018652355, 0.024976725, 0.08144535, 0.068758845, -0.064056635, -0.08389384, 0.061362334, -0.03354556, -0.106153384, -0.04008058, 0.03253021, 0.07662487, -0.07301625, 0.00033765417, -0.040871613, -0.075788505, 0.027527638, 0.07462542, 0.017717304, 0.0912185, 0.1102202, 0.0005698155, 0.051463317, -0.014551324, 0.03323203, 0.023792308, -0.02288983, 0.03893752, 0.03020686]

In [None]:
df.head(5)

Unnamed: 0,id,keyword,vector,vector_length
0,1,hello world,"[-0.03447727, 0.031023229, 0.006734971, 0.0261...",384
1,2,hello python,"[-0.050150678, 0.027626721, -0.014199368, 0.05...",384
2,3,bye python,"[-0.0173582, 0.055295315, 0.044234786, 0.05157...",384
3,4,la girafa es roja,"[-0.017561583, 0.027168112, -0.05538615, 0.040...",384
4,5,python,"[-0.05615145, 0.017742552, -0.059133362, 0.040...",384
