In [1]:
import duckdb

In [9]:
# Connect to the DuckDB database file 'jus.duckdb' in read-write mode
# (read_only=False); every later cell issues its SQL through `con`.
con = duckdb.connect("jus.duckdb", read_only=False)

In [10]:
# Create the BigBird table: one row per vector, keyed by an integer `id`,
# with the vector components stored as a FLOAT8 list in `data`.
# NOTE(review): there is no IF NOT EXISTS guard; presumably this raises if the
# table already exists in the connected database file - confirm before re-running.
con.execute("CREATE TABLE BigBird(id INT PRIMARY KEY, data FLOAT8[]);")

<duckdb.duckdb.DuckDBPyConnection at 0x7fb23e2274f0>

In [11]:
# Create BigBird_norm: one FLOAT8 `norm` per BigBird row. `id` is declared
# UNIQUE and as a foreign key to BigBird(id); the norm upsert further down
# uses ON CONFLICT (id) against this column.
con.execute("CREATE TABLE BigBird_norm(id INT UNIQUE REFERENCES BigBird(id), norm FLOAT8);")

<duckdb.duckdb.DuckDBPyConnection at 0x7fb23e2274f0>

In [12]:
# Read every row of BigBird into a DataFrame and display it.
data = con.execute("SELECT * FROM BigBird").df()
data

Unnamed: 0,id,data


In [13]:
import pandas as pd
import numpy as np

# Example dataset: each key is a feature column and each list position is one
# candidate, so row i of the frame is candidate i's feature vector.
dados = dict(
    x=[25, 29, 30, 1],
    y=[3, 5, 6, 0],
    z=[3, 4, -3, 0],
    k=[1, 0, 0, 1],
)
dados_selecionados = pd.DataFrame.from_dict(dados)

def cosine_similarity(A, B):
    """Return the cosine similarity between two numeric vectors.

    Parameters
    ----------
    A, B : array-like
        Vectors of equal length (lists, NumPy arrays, pandas Series, ...).

    Returns
    -------
    float
        ``dot(A, B) / (norm(A) * norm(B))``, or ``0.0`` when either vector
        has zero norm, where the ratio is undefined.

    Notes
    -----
    Both inputs are promoted to at least float64 before any arithmetic.
    ``np.dot`` keeps the dtype of its inputs, so narrow integer vectors
    (e.g. int8) could otherwise overflow and silently yield a wrong dot
    product, while ``np.linalg.norm`` already works in floating point.
    """
    vec_a = np.asarray(A)
    vec_b = np.asarray(B)

    # Common working dtype: float64 or wider (complex inputs stay complex).
    work_dtype = np.result_type(vec_a, vec_b, np.float64)
    vec_a = vec_a.astype(work_dtype, copy=False)
    vec_b = vec_b.astype(work_dtype, copy=False)

    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)

    # Guard against division by zero for all-zero vectors.
    if norm_a == 0 or norm_b == 0:
        return 0.0

    return np.dot(vec_a, vec_b) / (norm_a * norm_b)

# Pairwise cosine similarity between candidates (rows of the frame).
num_candidatos = len(dados_selecionados)
similaridades = np.zeros((num_candidatos, num_candidatos))

for i in range(num_candidatos):
    linha_i = dados_selecionados.iloc[i]
    # Only visit j > i: each unordered pair is computed once and mirrored,
    # and the diagonal keeps its initial value of zero.
    for j in range(i + 1, num_candidatos):
        sim = cosine_similarity(linha_i, dados_selecionados.iloc[j])
        similaridades[i, j] = similaridades[j, i] = sim

similaridades

array([[0.        , 0.99779337, 0.97293301, 0.72446168],
       [0.99779337, 0.        , 0.97266384, 0.69047619],
       [0.97293301, 0.97266384, 0.        , 0.69006556],
       [0.72446168, 0.69047619, 0.69006556, 0.        ]])

In [14]:
# Load four vectors into BigBird. Row id i carries the same values as row i
# of `dados_selecionados` (columns x, y, z, k), written as float literals.
con.execute("""
INSERT INTO BigBird (id,data) VALUES
	(0,ARRAY[25.0,3.0,3.0,1.0]),
	(1,ARRAY[29.0,5.0,4.0,0.0]),
	(2,ARRAY[30.0,6.0,-3.0,0.0]),
  (3,ARRAY[1.0,0.0,0.0,1.0])
;
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7fb23e2274f0>

In [15]:
# Compute the Euclidean norm of every BigBird vector and upsert it into
# BigBird_norm:
#   - the inner subquery UNNESTs each `data` list into one row per component;
#   - the `norms` CTE sums the squared components per id and takes the square root;
#   - ON CONFLICT (id) DO UPDATE overwrites the stored norm for ids that are
#     already present instead of failing on the UNIQUE constraint.
con.execute("""
WITH norms AS (
    SELECT 
        id,
        SQRT(SUM(POWER(val, 2))) AS norm
    FROM 
    (
        SELECT
            id,
            UNNEST(data) AS val
        FROM 
            BigBird
    ) AS subquery
    GROUP BY id
)

INSERT INTO BigBird_norm (id, norm)
SELECT id, norm
FROM norms
ON CONFLICT (id) 
DO UPDATE SET norm = EXCLUDED.norm;

""")

<duckdb.duckdb.DuckDBPyConnection at 0x7fb23e2274f0>

In [16]:
# Display the stored norms to check the result of the upsert.
con.execute("SELECT * FROM BigBird_norm").df()

Unnamed: 0,id,norm
0,0,25.377155
1,1,29.698485
2,2,30.740852
3,3,1.414214


In [17]:
# Pairwise cosine similarity computed entirely in SQL.
#
# The query text is bound to `query_cosine_similarity` because a later cell
# runs `con.execute(query_cosine_similarity)`; without this assignment that
# cell raises NameError on a fresh kernel.
#
# Steps:
#   expanded    - every pair (a, b) with a.id < b.id, with both `data` lists
#                 unnested in the same SELECT so that a_data and b_data on a
#                 row come from the same vector position;
#   DotProducts - sum of component products per pair;
#   final       - divide by the pre-computed norms from BigBird_norm, returning
#                 0 when either norm is 0 (same convention as the Python
#                 `cosine_similarity` helper).
# The unused row_number() window column was dropped, and ORDER BY makes the
# row order deterministic.
query_cosine_similarity = """
WITH expanded AS (
    SELECT
        a.id AS idA,
        b.id AS idB,
        unnest(a.data) AS a_data,
        unnest(b.data) AS b_data
    FROM
        BigBird a
    CROSS JOIN
        BigBird b
    WHERE
        a.id < b.id
),
DotProducts AS (
    SELECT
        idA,
        idB,
        sum(a_data * b_data) AS dot_product
    FROM
        expanded
    GROUP BY
        idA, idB
)
SELECT
    d.idA,
    d.idB,
    CASE
        WHEN n1.norm = 0 OR n2.norm = 0 THEN 0
        ELSE d.dot_product / (n1.norm * n2.norm)
    END AS cosine_similarity
FROM
    DotProducts d
JOIN
    BigBird_norm n1 ON d.idA = n1.id
JOIN
    BigBird_norm n2 ON d.idB = n2.id
ORDER BY
    d.idA, d.idB;
"""

con.execute(query_cosine_similarity).df()

Unnamed: 0,idA,idB,cosine_similarity
0,0,1,0.997793
1,0,2,0.972933
2,0,3,0.724462
3,1,2,0.972664
4,1,3,0.690476
5,2,3,0.690066


In [18]:
# Show the NumPy similarity matrix again, to compare with the SQL result above.
similaridades

array([[0.        , 0.99779337, 0.97293301, 0.72446168],
       [0.99779337, 0.        , 0.97266384, 0.69047619],
       [0.97293301, 0.97266384, 0.        , 0.69006556],
       [0.72446168, 0.69047619, 0.69006556, 0.        ]])

In [32]:
# Run the cosine-similarity query and fetch the result as a dict of NumPy
# arrays, one per output column.
# NOTE(review): `query_cosine_similarity` must already hold the SQL text;
# confirm an earlier cell assigns it, otherwise this raises NameError on a
# fresh kernel.
con.execute(query_cosine_similarity).fetchnumpy()

{'idA': array([0, 0, 0, 1, 1, 2], dtype=int32),
 'idB': array([1, 2, 3, 2, 3, 3], dtype=int32),
 'cosine_similarity': array([0.99779337, 0.97293301, 0.72446168, 0.97266384, 0.69047619,
        0.69006556])}

In [35]:
# Reference NumPy matrix, displayed to compare against the fetched SQL arrays.
similaridades

array([[0.        , 0.99779337, 0.97293301, 0.72446168],
       [0.99779337, 0.        , 0.97266384, 0.69047619],
       [0.97293301, 0.97266384, 0.        , 0.69006556],
       [0.72446168, 0.69047619, 0.69006556, 0.        ]])

In [19]:
# Cleanup: drop BigBird_norm first. Its `id` column references BigBird(id),
# so presumably it has to go before BigBird can be dropped.
con.execute("drop table BigBird_norm")

<duckdb.duckdb.DuckDBPyConnection at 0x7fb23e2274f0>

In [20]:
# Cleanup: drop the BigBird table that holds the vectors.
con.execute("drop table BigBird")

<duckdb.duckdb.DuckDBPyConnection at 0x7fb23e2274f0>

In [45]:
# Close the DuckDB connection opened at the top of the notebook.
con.close()