In [75]:
import duckdb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import shutil
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np
import torch

In [2]:
# Conectar (ou criar) um banco de dados
con = duckdb.connect(database='jus.duckdb', read_only=False)

In [3]:
con.execute("""
CREATE TABLE metadadosPublicacao202202 AS SELECT * FROM read_json_auto('./data/202202-stj/metadadosPublicacao202202.json');
""")

<duckdb.DuckDBPyConnection at 0x7f196f99ff30>

In [4]:
# tabela de texto dos juiz
con.execute("""
  CREATE TABLE decisions_description (
    ID INT PRIMARY KEY,
    content TEXT,
  );
""")

<duckdb.DuckDBPyConnection at 0x7f196f99ff30>

In [None]:
# tabela de texto dos juiz
con.execute("""
  ALTER TABLE decisions_description ADD CONSTRAINT fk_decisions
  FOREIGN KEY (doc_id) REFERENCES decisions(seqDocumento);
""")

In [122]:
tokenizer_trad = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-mul-en",top_k=124)
m_trad = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-mul-en",top_k=124)

In [124]:
translated = m_trad.generate(**tokenizer_trad("PROCESSUAL CIVIL. OFENSA AO ART. 1.022 DO CPC/2015 NÃO CONFIGURADA. EXECUÇÃO FISCAL. MULTA. INFRAÇÃO ADMINISTRATIVA. MASSA FALIDA. INEXIGIBILIDADE.", return_tensors="pt", add_special_tokens=True))

for t in translated:
    print(tokenizer_trad.decode(t, skip_special_tokens=True) )

CIVIL PROCESSING TO ARTICLE 1.0022 OF CPC/2015 NOT CONFIGUARDED. FISCAL EXECUTION. MULTA. ADMINISTRATIVE INFRATION. MASS FAILED. INEXIGIBILITY.


In [126]:
def create_segments(text, tokenizer):
    # Split text into sentences
    sentences = text.splitlines()

    segments = []
    current_segment = np.array([], dtype=int)
    current_token_count = 0

    for sentence in sentences:
        tokens = tokenizer(sentence, return_tensors="pt", add_special_tokens=True).input_ids[0].numpy()
        token_count = tokens.shape[0]

        # If the sentence itself is larger than 512 tokens, we need to handle it separately
        if token_count >= 512:
            # Save current segment if not empty
            if current_segment.size > 0:
                segments.append(current_segment)
                current_segment = np.array([], dtype=int)
                current_token_count = 0

            # Split the large sentence and add to segments
            for i in range(0, token_count, 512):
                segments.append(tokens[i:i+512])
            continue

        # Add tokens to current segment or create new segment
        if current_token_count + token_count < 512:
            current_segment = np.concatenate((current_segment, tokens))
            current_token_count += token_count
        else:
            segments.append(current_segment)
            current_segment = tokens
            current_token_count = token_count

    if current_segment.size:
        segments.append(current_segment)

    # Filter out empty segments
    segments = [item for item in segments if item.size > 0]

    return segments


In [None]:
segments = create_segments("Quais são os benefícios de banana ?",tokenizer_trad)

In [127]:
for segment in segments:
    translated = m_trad.generate(torch.tensor([segment]))
    print(translated)
    for t in translated:
        print(tokenizer_trad.decode(t, skip_special_tokens=True) )


tensor([[64171,   312,  9839, 10312, 17645, 37290,   301, 34888,     4,     2,
             4, 36266,   823,   677,  3233,   312, 29717,   169, 35938,     4,
         30825,   649, 31589,     4, 30825,   649, 31589,     4, 30825,   649,
         31589,     4, 30825,   649, 31589,     4, 30825,   649, 31589,     4,
         30825,   649, 31589,     4, 30825,   649, 31589,     4, 30825,   649,
         31589,     4, 30825,   649, 31589,     4, 30825,   649, 31589,     4,
         30825,   649, 31589,     4, 30825,   649, 31589,     4, 30825,   649,
         31589,     4,     2,     4, 36266,   823,   677,     4, 30825,   372,
         21432,  3233,  2707,   312, 32566,   649,  3233, 20272,   792, 37485,
          3135,     4, 30825,   649, 31589,     4,     2,     4, 30825,   649,
         31589,     4,     2,     4, 30825,   649, 31589,     4,     2,     4,
         30825,   649, 31589,     4,     2,     4,     2,     4,     2,     4,
             2,     4,     2,     4,     2,     4,  

KeyboardInterrupt: 

In [130]:
with open("./data/202202-stj/20220201/94552609.txt", 'r') as f:
  content = f.read()
  segments = create_segments(content,tokenizer_trad)
  
  for i,segment in enumerate(segments):
      print(i,segment.size)
      translated = m_trad.generate(torch.tensor([segment]))
      print(translated)
      for t in translated:
          print(tokenizer_trad.decode(t, skip_special_tokens=True))

Token indices sequence length is longer than the specified maximum sequence length for this model (1582 > 512). Running this sequence through the model will result in indexing errors


0 400




tensor([[64171,   312,  9839, 10312, 17645, 37290,   301, 34888,     4,     2,
             4, 36266,   823,   677,  3233,   312, 29717,   169, 35938,     4,
         30825,   649, 31589,     4, 30825,   649, 31589,     4, 30825,   649,
         31589,     4, 30825,   649, 31589,     4, 30825,   649, 31589,     4,
         30825,   649, 31589,     4, 30825,   649, 31589,     4, 30825,   649,
         31589,     4, 30825,   649, 31589,     4, 30825,   649, 31589,     4,
         30825,   649, 31589,     4, 30825,   649, 31589,     4, 30825,   649,
         31589,     4,     2,     4, 36266,   823,   677,     4, 30825,   372,
         21432,  3233,  2707,   312, 32566,   649,  3233, 20272,   792, 37485,
          3135,     4, 30825,   649, 31589,     4,     2,     4, 30825,   649,
         31589,     4,     2,     4, 30825,   649, 31589,     4,     2,     4,
         30825,   649, 31589,     4,     2,     4,     2,     4,     2,     4,
             2,     4,     2,     4,     2,     4,  

In [89]:
# Caminho da pasta contendo os arquivos de texto
folder_path = './data/202202-stj/'

# Percorre a pasta e suas subpastas de forma recursiva
for subdir, dirs, files in os.walk(folder_path):
    for filename in files:
        if filename.endswith('.txt'):
            file_path = os.path.join(subdir, filename)
            # Extrai o ID do nome do arquivo
            file_id = int(filename.split('.')[0])
            with open(file_path, 'r') as f:
                content = f.read()
                segments = create_segments(content)
                print(segments[0].size)
                break
                for segment in segments:
                    translated = m_trad.generate(torch.tensor([segment]))
                    print(translated)
                    for t in translated:
                        print(tokenizer_trad.decode(t, skip_special_tokens=True) )

                #con.execute("INSERT INTO decisions_description (id,content) VALUES (?,?)", (file_id,content))
                print(file_id)
                break

403
4787
1927
3023
4748
5951
8608
369
2840
469
12499
419
4125
3975
3338
507
456
2939
346


In [None]:
con.execute("""
DELETE FROM metadadosPublicacao202202 WHERE teor NOT IN ('Concedendo', 'Negando') or teor IS NULL;
""")

In [None]:
data = con.execute("SELECT seqDocumento,teor FROM metadadosPublicacao202202").df()
data

In [None]:
# Drop rows with missing values and reset the index
data = data.dropna().reset_index(drop=True)
#35404 rows × 3 columns

In [None]:
teor_type = LabelEncoder()
data['teor_encoder'] = teor_type.fit_transform(data['teor'])
data

In [None]:
# Para saber a correspondência entre o valor codificado e o rótulo original:
for index, label in enumerate(teor_type.classes_):
    print(f"{index}: {label}")

In [None]:
# Para saber a correspondência entre o valor codificado e o rótulo original:
for index, label in enumerate(teor_type.classes_):
    print(f"{index}: {label}")
    con.execute("""
    UPDATE metadadosPublicacao202202
    SET teor = ?
    WHERE teor = ?;
    """,(index,label))

In [None]:
data = con.execute("SELECT seqDocumento,teor FROM metadadosPublicacao202202").df()
data

In [None]:
data.dtypes