In [67]:
import duckdb
import numpy as np
from transformers import BigBirdModel, BigBirdTokenizerFast
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [68]:
con = duckdb.connect(database='jus.duckdb', read_only=False)

In [69]:
table = f"CREATE TABLE BigBird(id INT, {', '.join([f'col_{j} FLOAT8' for j in range(4096)])});"
table

'CREATE TABLE BigBird(id INT, col_0 FLOAT8, col_1 FLOAT8, col_2 FLOAT8, col_3 FLOAT8, col_4 FLOAT8, col_5 FLOAT8, col_6 FLOAT8, col_7 FLOAT8, col_8 FLOAT8, col_9 FLOAT8, col_10 FLOAT8, col_11 FLOAT8, col_12 FLOAT8, col_13 FLOAT8, col_14 FLOAT8, col_15 FLOAT8, col_16 FLOAT8, col_17 FLOAT8, col_18 FLOAT8, col_19 FLOAT8, col_20 FLOAT8, col_21 FLOAT8, col_22 FLOAT8, col_23 FLOAT8, col_24 FLOAT8, col_25 FLOAT8, col_26 FLOAT8, col_27 FLOAT8, col_28 FLOAT8, col_29 FLOAT8, col_30 FLOAT8, col_31 FLOAT8, col_32 FLOAT8, col_33 FLOAT8, col_34 FLOAT8, col_35 FLOAT8, col_36 FLOAT8, col_37 FLOAT8, col_38 FLOAT8, col_39 FLOAT8, col_40 FLOAT8, col_41 FLOAT8, col_42 FLOAT8, col_43 FLOAT8, col_44 FLOAT8, col_45 FLOAT8, col_46 FLOAT8, col_47 FLOAT8, col_48 FLOAT8, col_49 FLOAT8, col_50 FLOAT8, col_51 FLOAT8, col_52 FLOAT8, col_53 FLOAT8, col_54 FLOAT8, col_55 FLOAT8, col_56 FLOAT8, col_57 FLOAT8, col_58 FLOAT8, col_59 FLOAT8, col_60 FLOAT8, col_61 FLOAT8, col_62 FLOAT8, col_63 FLOAT8, col_64 FLOAT8, col_6

In [72]:
con.execute(table)

<duckdb.DuckDBPyConnection at 0x7fdc7e594330>

In [None]:
concedendo_label = 1
negando_label = 0

In [57]:
data = con.execute("SELECT * FROM decisions_description").fetchnumpy()

In [58]:
data["ID"]

array([121184492, 122027196, 123312824, ..., 145089835, 145511648,
       145508379], dtype=int32)

In [None]:
data["content"]

In [None]:
data["summarize"].data

In [None]:
data["ID"].size

In [None]:
# Vamos usar numpy.vectorize para aplicar len (comprimento de string) a cada elemento do array
len_vectorized = np.vectorize(len)

# Aplicar a função vetorizada ao array de strings para obter um array de comprimentos
array_len = len_vectorized(data["content"])

# Encontrar o índice do maior comprimento
index_of_max = np.argmax(array_len)

# Usar o índice para encontrar a string mais longa
longest_str = data["content"][index_of_max]

len(longest_str)

In [None]:
tokenizer = BigBirdTokenizerFast.from_pretrained("google/bigbird-roberta-base") # large
model = BigBirdModel.from_pretrained("google/bigbird-roberta-base",attention_type="block_sparse")

text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=4096, chunk_overlap=10 #86683 4096
)

In [None]:
texts = text_splitter.split_text(longest_str)
print(len(texts))

inputs = tokenizer(texts[0], return_tensors="pt")
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

print(last_hidden_states.shape)

In [70]:
query_cosine_similarity = f"""
WITH Norms AS (
    SELECT 
        id,
        SQRT(
            {'+'.join([f'col_{j}*col_{j}' for j in range(4096)])}
            ) AS norm
    FROM 
        BigBird
),
DotProducts AS (
    SELECT 
        a.id AS idA,
        b.id AS idB,
        SUM(
            {'+'.join([f'a.col_{j}*b.col_{j}' for j in range(4096)])}
            ) AS dot_product
    FROM 
        BigBird a
    CROSS JOIN 
        BigBird b
    WHERE 
        a.id < b.id
    GROUP BY 
        a.id, b.id
)
SELECT 
    d.idA,
    d.idB,
    CASE 
        WHEN n1.norm = 0 OR n2.norm = 0 THEN 0
        ELSE d.dot_product / (n1.norm * n2.norm)
    END AS cosine_similarity
FROM 
    DotProducts d
JOIN 
    Norms n1 ON d.idA = n1.id
JOIN 
    Norms n2 ON d.idB = n2.id;
"""

In [73]:
data = con.execute("SELECT * FROM BigBird").fetchnumpy()
data

{'id': array([], dtype=int32),
 'col_0': array([], dtype=float64),
 'col_1': array([], dtype=float64),
 'col_2': array([], dtype=float64),
 'col_3': array([], dtype=float64),
 'col_4': array([], dtype=float64),
 'col_5': array([], dtype=float64),
 'col_6': array([], dtype=float64),
 'col_7': array([], dtype=float64),
 'col_8': array([], dtype=float64),
 'col_9': array([], dtype=float64),
 'col_10': array([], dtype=float64),
 'col_11': array([], dtype=float64),
 'col_12': array([], dtype=float64),
 'col_13': array([], dtype=float64),
 'col_14': array([], dtype=float64),
 'col_15': array([], dtype=float64),
 'col_16': array([], dtype=float64),
 'col_17': array([], dtype=float64),
 'col_18': array([], dtype=float64),
 'col_19': array([], dtype=float64),
 'col_20': array([], dtype=float64),
 'col_21': array([], dtype=float64),
 'col_22': array([], dtype=float64),
 'col_23': array([], dtype=float64),
 'col_24': array([], dtype=float64),
 'col_25': array([], dtype=float64),
 'col_26': array(

In [74]:
con.execute(query_cosine_similarity).fetchnumpy()

ParserException: Parser Error: Max expression depth limit of 1000 exceeded. Use "SET max_expression_depth TO x" to increase the maximum expression depth.

In [60]:
con.close()