In [1]:
import duckdb
import numpy as np

In [25]:
con = duckdb.connect(database='jus.duckdb', read_only=False)

In [45]:
table = f"CREATE TABLE BigBird(id INT, {', '.join([f'col_{j} FLOAT8' for j in range(4)])});"
table

'CREATE TABLE BigBird(id INT, col_0 FLOAT8, col_1 FLOAT8, col_2 FLOAT8, col_3 FLOAT8);'

In [46]:
con.execute(table)

<duckdb.DuckDBPyConnection at 0x7fdd6463fa70>

In [None]:
concedendo_label = 1
negando_label = 0

In [34]:
data = con.execute("SELECT * FROM BigBird").fetchnumpy()
data

{'id': array([], dtype=int32),
 'col_0': array([], dtype=float64),
 'col_1': array([], dtype=float64),
 'col_2': array([], dtype=float64),
 'col_3': array([], dtype=float64)}

In [35]:
import pandas as pd
import numpy as np

# Exemplo de conjunto de dados
dados = {
    'x': [25, 29, 30,1],
    'y': [3, 5, 6,0],
    'z': [3, 4, -3,0],
    'k': [1,0,0,1]
}
dados_selecionados = pd.DataFrame(dados)

def cosine_similarity(A, B):

    # Calcula o produto escalar
    dot_product = np.dot(A, B)
    
    # Calcula as normas dos vetores
    norm_A = np.linalg.norm(A)
    norm_B = np.linalg.norm(B)
    
    if norm_A == 0 or norm_B == 0:
        return 0.0
    
    # Calcula a similaridade de cosseno
    cosine_sim = dot_product / (norm_A * norm_B)
    
    return cosine_sim

# Calculando a similaridade entre candidatos
num_candidatos = len(dados_selecionados)
similaridades = np.zeros((num_candidatos, num_candidatos))

for i in range(num_candidatos):
    for j in range(i+1, num_candidatos):  # começar a partir de i+1 
        sim = cosine_similarity(dados_selecionados.iloc[i], dados_selecionados.iloc[j])
        similaridades[i][j] = sim
        similaridades[j][i] = sim  # preenchendo a matriz simétrica

similaridades

array([[0.        , 0.99779337, 0.97293301, 0.72446168],
       [0.99779337, 0.        , 0.97266384, 0.69047619],
       [0.97293301, 0.97266384, 0.        , 0.69006556],
       [0.72446168, 0.69047619, 0.69006556, 0.        ]])

In [47]:
con.execute("""
INSERT INTO BigBird (id,col_0,col_1,col_2,col_3) VALUES
	(0,25.0,3.0,3.0,1.0),
	(1,29.0,5.0,4.0,0.0),
	(2,30.0,6.0,-3.0,0.0),
  (3,1.0,0.0,0.0,1.0)
;
""")

<duckdb.DuckDBPyConnection at 0x7fdd6463fa70>

In [41]:
query_cosine_similarity = f"""
WITH Norms AS (
    SELECT 
        id,
        SQRT(
            {'+'.join([f'col_{j}*col_{j}' for j in range(4)])}
            ) AS norm
    FROM 
        BigBird
),
DotProducts AS (
    SELECT 
        a.id AS idA,
        b.id AS idB,
        SUM(
            {'+'.join([f'a.col_{j}*b.col_{j}' for j in range(4)])}
            ) AS dot_product
    FROM 
        BigBird a
    CROSS JOIN 
        BigBird b
    WHERE 
        a.id < b.id
    GROUP BY 
        a.id, b.id
)
SELECT 
    d.idA,
    d.idB,
    CASE 
        WHEN n1.norm = 0 OR n2.norm = 0 THEN 0
        ELSE d.dot_product / (n1.norm * n2.norm)
    END AS cosine_similarity
FROM 
    DotProducts d
JOIN 
    Norms n1 ON d.idA = n1.id
JOIN 
    Norms n2 ON d.idB = n2.id;
"""
print(query_cosine_similarity)


WITH Norms AS (
    SELECT 
        id,
        SQRT(
            col_0*col_0+col_1*col_1+col_2*col_2+col_3*col_3
            ) AS norm
    FROM 
        BigBird
),
DotProducts AS (
    SELECT 
        a.id AS idA,
        b.id AS idB,
        SUM(
            a.col_0*b.col_0+a.col_1*b.col_1+a.col_2*b.col_2+a.col_3*b.col_3
            ) AS dot_product
    FROM 
        BigBird a
    CROSS JOIN 
        BigBird b
    WHERE 
        a.id < b.id
    GROUP BY 
        a.id, b.id
)
SELECT 
    d.idA,
    d.idB,
    CASE 
        WHEN n1.norm = 0 OR n2.norm = 0 THEN 0
        ELSE d.dot_product / (n1.norm * n2.norm)
    END AS cosine_similarity
FROM 
    DotProducts d
JOIN 
    Norms n1 ON d.idA = n1.id
JOIN 
    Norms n2 ON d.idB = n2.id;



In [48]:
con.execute(query_cosine_similarity).fetchnumpy()

{'idA': array([0, 0, 0, 1, 1, 2], dtype=int32),
 'idB': array([1, 2, 3, 2, 3, 3], dtype=int32),
 'cosine_similarity': array([0.99779337, 0.97293301, 0.72446168, 0.97266384, 0.69047619,
        0.69006556])}

In [43]:
similaridades

array([[0.        , 0.99779337, 0.97293301, 0.72446168],
       [0.99779337, 0.        , 0.97266384, 0.69047619],
       [0.97293301, 0.97266384, 0.        , 0.69006556],
       [0.72446168, 0.69047619, 0.69006556, 0.        ]])

In [49]:
con.execute("drop table BigBird")

<duckdb.DuckDBPyConnection at 0x7fdd6463fa70>

In [None]:
con.close()