In [1]:
import duckdb
import torch
from tqdm import tqdm
from transformers import BigBirdModel, BigBirdTokenizerFast

In [2]:
# One checkpoint name feeds both the tokenizer and the encoder so they cannot
# drift apart (the original note mentions "large" as an alternative variant).
checkpoint = "google/bigbird-roberta-base"

tokenizer = BigBirdTokenizerFast.from_pretrained(checkpoint)

# Full attention is selected here; "block_sparse" is the alternative value
# that was left commented out in the original cell.
model = BigBirdModel.from_pretrained(checkpoint, attention_type="original_full")

In [2]:
# Connect to (or create) a database.
con = duckdb.connect(
    database='jus4.duckdb',
    read_only=False,
)

In [3]:
# Read the whole decisions_description table into a DataFrame and display it.
description_query = "SELECT * FROM decisions_description"
decisions_description = con.execute(description_query).df()
decisions_description

Unnamed: 0,ID,content,summarize
0,145522785,EMENT\nADMINISTRATIVE. COMPETITION POLICY CONF...,Gulliver explains the conflict between the Fed...
1,145501377,DECISION > It is treated of habeas corpus impe...,the court decides that it is proper for Valter...
2,145496750,DECISION<br> It is considered a special appeal...,"In this special appeal, the Appellate Court co..."
3,146037675,DECISION<br> It is considered an ordinary appe...,the author defends an ordinary appeal brought ...
4,145049719,DECISION<br> It is treated of habeas corpus im...,Gulliver defends Oscarino's recognition of him...
...,...,...,...
267,145633556,DECISION is dealt with by the STATE OF SÃO PAU...,Gulliver argues that the State's decision to d...
268,143697533,DECISION<br> This is the case of the Court of ...,Gulliver defends the use of photographs as evi...
269,145050021,DECISION<br> This is the case of a misconduct ...,This paper focuses on the case of Elenice Mari...
270,145717143,DECISION<br>1. Take care of agravo interposed ...,"In this case, it is sufficient to prove that t..."


In [None]:
#import spacy
#nlp = spacy.load('pt_core_news_lg')

In [6]:
# Smoke test: run a single short sentence through the encoder so the shape of
# the output can be inspected in the following cells.
inputs = tokenizer("the court decides that it is proper for Valter", return_tensors="pt")

# Inference only: disabling autograd avoids building a graph that is never
# used and keeps the resulting tensors free of gradient bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)

In [7]:
# Shape of the hidden states for the first sequence in the batch.
outputs.last_hidden_state[0].shape

torch.Size([12, 768])

In [10]:
# Collapse the 2-D hidden-state matrix of the first sequence into one 1-D vector.
primeira_sequencia = outputs.last_hidden_state[0]
vetor_achatado = primeira_sequencia.reshape(-1)
vetor_achatado.shape

torch.Size([9216])

In [11]:
# Undo the flattening and recover the 2-D layout. The row width is taken from
# the loaded model's config rather than a hard-coded 768, so the round trip
# keeps working if a checkpoint with a different hidden size is loaded.
vetor_original = vetor_achatado.reshape(-1, model.config.hidden_size)
vetor_original.shape

torch.Size([12, 768])

In [None]:
# Table for the flattened embeddings, keyed by decision id.
# IF NOT EXISTS keeps this cell re-runnable: the database file persists between
# kernel sessions, so a plain CREATE TABLE fails on every run after the first.
con.execute("CREATE TABLE IF NOT EXISTS embedding(id INT UNIQUE PRIMARY KEY, data FLOAT8[]);")

In [None]:
# Embed every summary and store its flattened hidden states in `embedding`.
#
# NOTE(review): the stored vector is the flattened per-token output, so its
# length appears to vary with the number of tokens in each summary — confirm
# that variable-length rows are what the downstream similarity query expects.

# Ids that already have a row are skipped, so the cell can be re-run (or resumed
# after an interruption) without violating the primary key on embedding.id.
already_embedded = {row[0] for row in con.execute("SELECT id FROM embedding").fetchall()}

# Pair ids and summaries positionally. The previous `["ID"][index]` lookup was
# label-based and only matched the enumerate counter for a default RangeIndex.
id_summary_pairs = zip(decisions_description["ID"], decisions_description["summarize"])

for decision_id, content in tqdm(id_summary_pairs, total=len(decisions_description)):
  decision_id = int(decision_id)  # plain Python int for the INT column
  if decision_id in already_embedded:
    continue

  inputs = tokenizer(content, return_tensors="pt")
  # Inference only: no autograd graph is needed, which also saves memory per row.
  with torch.no_grad():
    outputs = model(**inputs)

  vetor = outputs.last_hidden_state[0].flatten().numpy()
  con.execute('INSERT INTO embedding (id,data) VALUES (?,?)', (decision_id, vetor))
  already_embedded.add(decision_id)  # also guards against duplicate ids in the frame

con.commit()

In [11]:
# Side table with one norm value per embedding id, plus an index on the norm
# column so range lookups by norm do not need a full scan.
# IF NOT EXISTS on both statements keeps the cell re-runnable against the
# persistent database file.
con.execute("""
CREATE TABLE IF NOT EXISTS embedding_norm(
    id INT UNIQUE REFERENCES embedding(id),
    norm FLOAT8
);
""")
con.execute("CREATE INDEX IF NOT EXISTS embedding_norm_index ON embedding_norm (norm);")

<duckdb.duckdb.DuckDBPyConnection at 0x7f616b52fe30>

In [10]:
# Manual reset for embedding_norm. This cell sits after the CREATE TABLE cell in
# the notebook, so running it unconditionally during a top-to-bottom run would
# drop the table that the following cells read. It is therefore opt-in: set the
# flag to True, run this cell, then re-run the CREATE TABLE cell.
RESET_EMBEDDING_NORM = False

if RESET_EMBEDDING_NORM:
    # IF EXISTS makes the reset safe even when the table was never created.
    con.execute("DROP TABLE IF EXISTS embedding_norm")

<duckdb.duckdb.DuckDBPyConnection at 0x7f616b52fe30>

In [16]:
# Display the stored norms as a DataFrame.
norms_query = "SELECT * FROM embedding_norm"
con.execute(norms_query).df()

Unnamed: 0,id,norm
0,145613114,143.773278
1,146031494,193.246068
2,144239525,108.100366
3,145185793,167.892896
4,145522785,365.619710
...,...,...
267,145613136,219.633012
268,146030861,145.658971
269,145089879,185.467047
270,143130016,114.084902


In [13]:
# Compute the Euclidean norm (square root of the sum of squared components) of
# each stored embedding and record it in embedding_norm.
#
# Only embeddings that do not have a norm row yet are processed. This keeps the
# cell re-runnable (a second run no longer violates the UNIQUE constraint on
# embedding_norm.id) and avoids unnesting vectors whose norm is already known.
# GROUP BY id already yields one row per id, so no DISTINCT is needed.
con.execute("""
WITH norms AS (
    SELECT
        id,
        SQRT(SUM(POWER(val, 2))) AS norm
    FROM
    (
        SELECT
            e.id,
            UNNEST(e.data) AS val
        FROM
            embedding e
        WHERE NOT EXISTS (
            SELECT 1 FROM embedding_norm n WHERE n.id = e.id
        )
    ) AS pending
    GROUP BY id
)

INSERT INTO embedding_norm (id, norm)
SELECT id, norm
FROM norms;
""")
con.commit()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x7f616b52fe30>

In [15]:
# Find pairs of stored embeddings with high cosine similarity.
#
# Steps:
#   1. norm_prox: pick candidate ids whose norm is closest to TARGET_NORM, up to
#      CANDIDATES_PER_SIDE from each side of it.
#   2. expanded / DotProducts: for every unordered candidate pair (idA < idB),
#      unnest both vectors side by side and sum the element-wise products.
#   3. CosineSimilarities: divide by the product of the stored norms, returning
#      0 when either norm is 0 to avoid a division by zero.
#   4. Keep only pairs whose similarity exceeds MIN_COSINE_SIMILARITY.
#
# The values below were previously hard-coded inside the SQL text; they are now
# passed as prepared-statement parameters ($1, $2, $3).
TARGET_NORM = 42.0           # norm around which candidates are selected
CANDIDATES_PER_SIDE = 128    # max candidates taken on each side of TARGET_NORM
MIN_COSINE_SIMILARITY = 0.7  # pairs at or below this similarity are discarded

# The lower branch uses <= so that a row whose norm equals TARGET_NORM exactly is
# still a candidate; with strict < and > on both branches it was silently dropped.
#
# NOTE(review): when two lists are unnested in the same SELECT, DuckDB is
# documented to pad the shorter one with NULL. If the stored vectors differ in
# length, the dot product would then cover only their common prefix while the
# norms cover the full vectors — confirm this is the intended comparison.
con.execute("""
WITH norm_prox AS (
    (SELECT id, norm
     FROM embedding_norm
     WHERE norm <= $1
     ORDER BY norm DESC
     LIMIT $2)

    UNION ALL

    (SELECT id, norm
     FROM embedding_norm
     WHERE norm > $1
     ORDER BY norm ASC
     LIMIT $2)
),
expanded AS (
    SELECT
        a.id AS idA,
        b.id AS idB,
        unnest(a_data.data) AS a_val,
        unnest(b_data.data) AS b_val
    FROM
        norm_prox a
    INNER JOIN
        norm_prox b ON a.id < b.id
    JOIN embedding a_data ON a.id = a_data.id
    JOIN embedding b_data ON b.id = b_data.id
),
DotProducts AS (
    SELECT
        idA,
        idB,
        sum(a_val * b_val) AS dot_product
    FROM
        expanded
    GROUP BY
        idA, idB
),
CosineSimilarities AS (
    -- idA < idB is already guaranteed by the join in `expanded`.
    SELECT
        d.idA,
        d.idB,
        CASE
            WHEN n1.norm = 0 OR n2.norm = 0 THEN 0
            ELSE d.dot_product / (n1.norm * n2.norm)
        END AS cosine_similarity
    FROM
        DotProducts d
    JOIN
        embedding_norm n1 ON d.idA = n1.id
    JOIN
        embedding_norm n2 ON d.idB = n2.id
)
SELECT
    idA,
    idB,
    cosine_similarity
FROM
    CosineSimilarities
WHERE
    cosine_similarity > $3;
""", [TARGET_NORM, CANDIDATES_PER_SIDE, MIN_COSINE_SIMILARITY]).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,idA,idB,cosine_similarity
0,145050015,145495279,1.0
1,145049719,146029836,1.0
2,145259043,145613118,1.0


In [17]:
# Close the DuckDB connection opened at the top of the notebook.
con.close()