In [1]:
import duckdb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import shutil
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Connect to the DuckDB database file in read-write mode (or create it).
con = duckdb.connect("jus.duckdb", read_only=False)

In [None]:
# Load the publication metadata JSON into a DuckDB table.
# IF NOT EXISTS keeps this cell re-runnable: 'jus.duckdb' persists on disk, so a plain
# CREATE TABLE fails with a "table already exists" error on every run after the first.
# When the table already exists it is left untouched, including any later DELETE/UPDATE changes.
con.execute("""
CREATE TABLE IF NOT EXISTS metadadosPublicacao202202 AS SELECT * FROM read_json_auto('./data/202202-stj/metadadosPublicacao202202.json');
""")

In [None]:
# Table for the judges' decision texts: one row per document id, with the text
# itself (`content`) and a `summarize` column.
# IF NOT EXISTS keeps this cell re-runnable against the persistent database file and
# never discards rows that were already inserted.
con.execute("""
  CREATE TABLE IF NOT EXISTS decisions_description (
    ID INT PRIMARY KEY,
    content TEXT,
    summarize TEXT
  );
""")

In [3]:
import argostranslate.package
import argostranslate.translate

In [4]:
# Refresh the Argos Translate package index, then download and install the
# Portuguese -> English package.
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()
# Take the first package translating pt -> en; next() raises StopIteration if there is none.
package_to_install = next(
    pkg
    for pkg in available_packages
    if pkg.from_code == "pt" and pkg.to_code == "en"
)
argostranslate.package.install_from_path(package_to_install.download())

# Smoke test: translate one short Portuguese sentence.
translatedText = argostranslate.translate.translate("Quais são os benefícios de banana ?", "pt", "en")
print(translatedText)

What are the benefits of banana?


In [None]:
def create_segments(text, tokenizer, max_length=512, pad_token_id=None):
    """Tokenize ``text`` line by line and pack the token ids into fixed-size segments.

    Each line (``str.splitlines``) is tokenized on its own with
    ``add_special_tokens=True``. Consecutive lines are packed greedily into one
    segment for as long as they fit. A line with ``max_length`` or more tokens
    closes the segment in progress and is cut into consecutive chunks of its own.
    Every returned segment is right-padded to exactly ``max_length`` ids.

    Args:
        text: Input string.
        tokenizer: Callable invoked as ``tokenizer(line, return_tensors="pt",
            add_special_tokens=True)``. The result must expose ``input_ids``
            whose first row provides ``.numpy()``.
        max_length: Segment size in tokens (default 512).
        pad_token_id: Id used for padding. When ``None``, the tokenizer's own
            ``pad_token_id`` is used if it defines one, otherwise 0.

    Returns:
        A list of 1-D integer NumPy arrays, each of length ``max_length``.
        Empty when ``text`` yields no tokens.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    # Pad with the tokenizer's real pad id when it has one; id 0 can be a regular
    # (or end-of-sequence) token in some vocabularies.
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, "pad_token_id", None)
        if pad_token_id is None:
            pad_token_id = 0

    segments = []
    pending = []        # token arrays of the segment currently being filled
    pending_count = 0   # total number of tokens held in `pending`

    for sentence in text.splitlines():
        tokens = tokenizer(sentence, return_tensors="pt", add_special_tokens=True).input_ids[0].numpy()
        token_count = tokens.shape[0]

        # A line that fills (or overflows) a whole segment is chunked separately.
        if token_count >= max_length:
            if pending_count:
                segments.append(np.concatenate(pending))
                pending, pending_count = [], 0
            for start in range(0, token_count, max_length):
                segments.append(tokens[start:start + max_length])
            continue

        # Close the current segment only when this line would overflow it; a line
        # that fills the segment exactly still fits.
        if pending_count + token_count > max_length:
            segments.append(np.concatenate(pending))
            pending, pending_count = [], 0

        pending.append(tokens)
        pending_count += token_count

    if pending_count:
        segments.append(np.concatenate(pending))

    # Right-pad every segment to max_length.
    return [
        np.concatenate([segment, np.full(max_length - segment.shape[0], pad_token_id, dtype=int)])
        for segment in segments
    ]



In [None]:
# Smoke test: segment one short Portuguese sentence.
# NOTE(review): `tokenizer_trad` is not created in any cell visible here; confirm it is loaded before this cell runs.
segments = create_segments(text="Quais são os benefícios de banana ?", tokenizer=tokenizer_trad)

In [None]:
# Feed each padded segment to the model's generate() and print both the raw result
# and its decoded text.
# NOTE(review): `m_trad` and `tokenizer_trad` are not created in any cell visible here;
# confirm they are loaded before this cell runs.
for segment in segments:
    # Wrap the 1-D id array in a list so generate() receives a batch holding one sequence.
    translated = m_trad.generate(torch.tensor([segment]))
    print(translated)  # raw output of generate()
    for t in translated:
        # Decode each generated sequence, dropping special tokens.
        print(tokenizer_trad.decode(t, skip_special_tokens=True) )


In [None]:
# Translate one sample decision file from Portuguese to English and print the result.
with open("./data/202202-stj/20220201/94552609.txt", 'r') as f:
    content = f.read()
# The file is closed at this point; translation only needs the text already read.
translatedText = argostranslate.translate.translate(content, "pt", "en")
print(translatedText)

In [None]:
def split_encoded_text(encoded_text, max_length=512):
    """Cut the first row of an encoded batch into consecutive chunks.

    Args:
        encoded_text: Mapping with ``"input_ids"`` and ``"attention_mask"``
            entries, each a 2-D tensor; only row 0 is read.
        max_length: Maximum number of positions per chunk (default 512).

    Returns:
        A list of dicts with keys ``"input_ids"`` and ``"attention_mask"``.
        Each value is the corresponding slice of row 0 with a leading
        dimension of size 1 restored. The list is empty when ``input_ids``
        has no columns.
    """
    token_ids = encoded_text['input_ids']
    total_length = token_ids.shape[1]

    # (start, stop) column bounds of every chunk; stop is clamped to the row length.
    bounds = (
        (start, min(start + max_length, total_length))
        for start in range(0, total_length, max_length)
    )
    return [
        {
            "input_ids": token_ids[0, start:stop].unsqueeze(0),
            "attention_mask": encoded_text["attention_mask"][0, start:stop].unsqueeze(0),
        }
        for start, stop in bounds
    ]

In [None]:
# Keep only rows whose `teor` is 'Concedendo' or 'Negando'; delete everything else.
# The explicit IS NULL test is required because `NULL NOT IN (...)` evaluates to NULL
# rather than TRUE, so rows with a missing teor would otherwise not be deleted.
# NOTE: this modifies the table stored in the database file in place.
con.execute("""
DELETE FROM metadadosPublicacao202202 WHERE teor NOT IN ('Concedendo', 'Negando') or teor IS NULL;
""")

In [None]:
# Integer class labels for the two outcomes kept in `teor`.
# NOTE(review): the LabelEncoder used further down numbers classes in sorted order, which would
# give 'Concedendo' -> 0 and 'Negando' -> 1, the reverse of these constants; confirm which
# mapping is intended.
concedendo_label, negando_label = 1, 0

In [None]:
# Fetch the document ids currently in the metadata table. The result is indexed by
# column name below, and each column is a NumPy array (see the .size use in the next cell).
data = con.execute("SELECT seqDocumento FROM metadadosPublicacao202202").fetchnumpy()
data["seqDocumento"]

In [None]:
# Number of document ids fetched.
data["seqDocumento"].size

In [None]:
# Folder containing the decision text files; it is walked recursively.
folder_path = './data/202202-stj/'

# IDs already stored in decisions_description. Skipping them before translating makes this
# cell resumable after an interruption, since translation is the expensive step.
# This replaces the former hard-coded `index > 4226` offset, which only skipped the right files
# if os.walk listed them in exactly the same order as the interrupted run; that order is not guaranteed.
translated_ids = {row[0] for row in con.execute("SELECT ID FROM decisions_description").fetchall()}

index = 0  # running count of files visited, printed as progress
for subdir, dirs, files in os.walk(folder_path):
    dirs.sort()  # visit sub-folders in a stable order
    for filename in sorted(files):
        index = index + 1
        if not filename.endswith('.txt'):
            continue

        # The document id is the part of the file name before the first dot.
        try:
            file_id = int(filename.split('.')[0])
        except ValueError:
            continue  # not named after a numeric document id

        if file_id in translated_ids:
            continue  # already translated and stored

        # Only handle documents that have exactly one row in the metadata.
        if np.where(data["seqDocumento"] == file_id)[0].size != 1:
            continue

        file_path = os.path.join(subdir, filename)
        # NOTE(review): open() uses the platform default encoding here; pass `encoding=`
        # explicitly once the encoding of these files is confirmed.
        with open(file_path, 'r') as f:
            content = f.read()

        # Translate after the file is closed so no handle stays open during the slow call.
        translatedText = argostranslate.translate.translate(content, "pt", "en")
        try:
            con.execute('INSERT INTO decisions_description (id,content) VALUES (?,?)', (file_id, translatedText))
        except Exception as e:
            # Safety net kept from the original cell: tolerate a duplicate id, re-raise anything else.
            if 'violates primary key constraint' in str(e):
                print("Duplicate key, ignoring.")
            else:
                raise
        translated_ids.add(file_id)
        print(index, file_id)

In [None]:
# Load ids and outcomes as a pandas DataFrame, drop rows with missing values and reset the index.
# The frame is read from the database here because, in notebook order, `data` is still the result
# of the earlier fetchnumpy() query: a dict of NumPy arrays with no .dropna() method and no 'teor'
# column, which the LabelEncoder cell below needs.
data = (
    con.execute("SELECT seqDocumento,teor FROM metadadosPublicacao202202")
    .df()
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Encode the `teor` values as integer class ids and store them in a new `teor_encoder` column.
teor_type = LabelEncoder()
teor_type.fit(data['teor'])
data['teor_encoder'] = teor_type.transform(data['teor'])
data

In [None]:
# Show which encoded integer corresponds to which original label.
for index, label in enumerate(teor_type.classes_):
    print(f"{index}: {label}")

In [None]:
# Print the encoded-value -> label mapping and write it back to the database:
# every row whose `teor` equals the original label gets the encoded integer instead.
# NOTE: this rewrites the stored table in place, replacing the textual labels.
for index, label in enumerate(teor_type.classes_):
    print(f"{index}: {label}")
    con.execute("""
    UPDATE metadadosPublicacao202202
    SET teor = ?
    WHERE teor = ?;
    """,(index,label))

In [None]:
# Read document ids and `teor` back from the database via .df() and display the result.
# Once the UPDATE cell above has run, `teor` holds the encoded values.
data = con.execute("SELECT seqDocumento,teor FROM metadadosPublicacao202202").df()
data

In [None]:
# Inspect the column dtypes of the reloaded data.
data.dtypes

In [None]:
# Close the DuckDB connection opened at the top of the notebook.
con.close()