In [1]:
import duckdb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import shutil
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Connect to the on-disk DuckDB database in read-write mode (the file is created if missing).
DB_FILE = 'jus.duckdb'
con = duckdb.connect(database=DB_FILE, read_only=False)

In [3]:
# Load the publication metadata JSON into a DuckDB table.
# CREATE OR REPLACE keeps this cell re-runnable: 'jus.duckdb' persists on disk, so a
# plain CREATE TABLE raises "table already exists" on every run after the first.
# Re-running also restores the raw rows, which matters because later cells in this
# notebook DELETE from and UPDATE this table in place.
con.execute("""
CREATE OR REPLACE TABLE metadadosPublicacao202202 AS SELECT * FROM read_json_auto('./data/202202-stj/metadadosPublicacao202202.json');
""")

<duckdb.DuckDBPyConnection at 0x7f196f99ff30>

In [4]:
# Table holding the text written by the judges, one row per document id.
# IF NOT EXISTS keeps the cell re-runnable against the persistent database file
# without discarding rows that may already have been inserted.
# The trailing comma after the last column was dropped so the DDL is standard SQL.
con.execute("""
  CREATE TABLE IF NOT EXISTS decisions_description (
    ID INT PRIMARY KEY,
    content TEXT
  );
""")

<duckdb.DuckDBPyConnection at 0x7f196f99ff30>

In [None]:
# Add a foreign key from the judges' text table to a `decisions` table.
# NOTE(review): this cell has no execution count, i.e. it was not run in the saved notebook.
# NOTE(review): `decisions_description` is created in this notebook with only the
# columns `ID` and `content` (no `doc_id`), and no `decisions` table is created in
# the visible cells — confirm the intended column and table names.
# NOTE(review): confirm that the DuckDB version in use accepts
# ALTER TABLE ... ADD CONSTRAINT ... FOREIGN KEY before relying on this statement.
con.execute("""
  ALTER TABLE decisions_description ADD CONSTRAINT fk_decisions
  FOREIGN KEY (doc_id) REFERENCES decisions(seqDocumento);
""")

In [2]:
# Load the tokenizer and the seq2seq model from the same pretrained checkpoint;
# the cells below use the pair to translate Portuguese text into English.
MT_CHECKPOINT = "Helsinki-NLP/opus-mt-mul-en"
tokenizer_trad = AutoTokenizer.from_pretrained(MT_CHECKPOINT)
m_trad = AutoModelForSeq2SeqLM.from_pretrained(MT_CHECKPOINT)

Downloading (…)okenizer_config.json: 100%|██████████| 44.0/44.0 [00:00<00:00, 212kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.40k/1.40k [00:00<00:00, 8.80MB/s]
Downloading (…)olve/main/source.spm: 100%|██████████| 707k/707k [00:00<00:00, 2.69MB/s]
Downloading (…)olve/main/target.spm: 100%|██████████| 791k/791k [00:00<00:00, 5.02MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.42M/1.42M [00:00<00:00, 4.12MB/s]
Downloading pytorch_model.bin: 100%|██████████| 310M/310M [00:35<00:00, 8.86MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 1.81MB/s]


In [32]:

# Translate one sample sentence and print the decoded text of every generated sequence.
sample_inputs = tokenizer_trad("Quais são os benefícios de banana ?", return_tensors="pt", add_special_tokens=True)
translated = m_trad.generate(**sample_inputs)

for output_ids in translated:
    decoded_text = tokenizer_trad.decode(output_ids, skip_special_tokens=True)
    print(decoded_text)



What are the benefits of bananas?


In [43]:
# Show the token ids (special tokens included) produced for the sample sentence.
sample_encoding = tokenizer_trad("Quais são os benefícios de banana ?", return_tensors="pt", add_special_tokens=True)
sample_encoding.input_ids[0]

tensor([ 3875,  5558,     4, 10369,  1524, 11165,  8621,  4731,     6,    43,
         7161,    60,     4,    12,     0])

In [45]:
def create_segments(text, max_tokens=512, tokenizer=None):
    """Group the lines of ``text`` into segments that stay under a token budget.

    The text is split on line breaks (not on sentence boundaries). Consecutive
    lines are packed into the same segment while the running token count stays
    strictly below ``max_tokens``. Token counts are measured per line and
    include the special tokens added by the tokenizer.

    Args:
        text: Raw document text.
        max_tokens: Exclusive upper bound on the summed per-line token count of
            one segment. Defaults to 512, the value that used to be hard-coded.
        tokenizer: Callable used to tokenize a single line. It is called as
            ``tokenizer(line, return_tensors="pt", add_special_tokens=True)``
            and must return an object whose ``input_ids[0]`` is the sequence of
            token ids. Defaults to the notebook-level ``tokenizer_trad``.

    Returns:
        A list of strings, each one the lines of a segment joined by a newline
        character. A text without any line yields an empty list.

    Note:
        A single line whose own token count reaches ``max_tokens`` is not split
        further; it is emitted alone as an oversized segment.
    """
    if tokenizer is None:
        # Resolved at call time so the notebook global can be (re)loaded later.
        tokenizer = tokenizer_trad

    segments = []
    current_lines = []
    current_token_count = 0

    for line in text.splitlines():
        token_ids = tokenizer(line, return_tensors="pt", add_special_tokens=True).input_ids[0]
        token_count = len(token_ids)

        if current_token_count + token_count < max_tokens:
            current_lines.append(line)
            current_token_count += token_count
            continue

        # Budget reached: close the running segment and start a new one with this
        # line. Nothing is flushed when no line has been collected yet (an
        # oversized first line), which previously produced a spurious '' segment.
        if current_lines:
            segments.append('\n'.join(current_lines))
        current_lines = [line]
        current_token_count = token_count

    if current_lines:
        segments.append('\n'.join(current_lines))

    return segments


In [40]:
# Smoke test: segment a short sentence and print the raw generated token ids of each segment.
segments = create_segments("Quais são os benefícios de banana ?")

for segment_text in segments:
    encoded_segment = tokenizer_trad(segment_text, return_tensors="pt")
    translated = m_trad.generate(**encoded_segment)
    print(translated)

tensor([[64171,   163,    50,     5,  5939,     8, 37743,     6,    12,     0]])


In [46]:
# Path of the folder that contains the text files
folder_path = './data/202202-stj/'

# Walk the folder and its subfolders recursively
for subdir, dirs, files in os.walk(folder_path):
    for filename in files:
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(subdir, filename)
        # The document id is the part of the file name before the first dot
        # (raises ValueError if that part is not an integer).
        file_id = int(filename.split('.')[0])

        # Explicit encoding so the Portuguese text is decoded the same way on every
        # platform instead of depending on the locale default.
        # NOTE(review): assumes the files are UTF-8 — confirm against the data set.
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # The file is already closed here; translation does not need the handle.
        segments = create_segments(content)

        for value in segments:
            translated = m_trad.generate(**tokenizer_trad(value, return_tensors="pt", add_special_tokens=True))
            print(translated)
        # TODO: persist the text once the pipeline is validated:
        #con.execute("INSERT INTO decisions_description (id,content) VALUES (?,?)", (file_id,content))
        print(file_id)
        # Debug limiter: stop after the first .txt file of the current directory.
        # This only leaves the inner loop; os.walk still visits the remaining directories.
        break





tensor([[64171, 32762, 35430, 16739,  1126, 42186, 24176,  4856, 37253,   301,
             2, 30965,  5854, 23949,  3233, 35633,  7418,   378, 23949,     2,
           312,  9839,   378,  3135, 11050, 23135,  7116,  8726,  3135,  1230,
         18122,   792, 34104,     2, 40832, 33335,  3135,   437, 34091, 34547,
         24176,  4856, 37253,   301,     2, 23187,   918,  5791,  9047, 30818,
          5257,  7032, 18010, 32005, 34888,     4, 30817, 11207, 37205,   800,
          6579, 22301,  2707,     4,  6827,   695,     2, 32426, 27601,  3135,
         21432,  3233, 30965, 34547,  1009,    62, 13341, 34547, 21432, 36509,
         33935,   918,  6579, 22301, 23135, 28265,     2,   499, 47129,  5819,
          5257,  2707,     4, 38369,  3135,  1230, 44599,   649,     2, 24465,
           372, 35676, 21432,  3233,  2707,   312, 32566,   649,  3233,    62,
         13341, 34547, 21432,     2,     0]])
tensor([[64171,  3277,    16,   470,     9,  1670,     8,    73,  2258,     3,
      

In [None]:
# Keep only the rows whose `teor` is 'Concedendo' or 'Negando'; every other row,
# including rows with a NULL `teor`, is deleted from the table in place.
keep_binary_teor_sql = """
DELETE FROM metadadosPublicacao202202 WHERE teor NOT IN ('Concedendo', 'Negando') or teor IS NULL;
"""
con.execute(keep_binary_teor_sql)

In [None]:
# Fetch the document id and its `teor` label into a DataFrame and display it.
teor_query = "SELECT seqDocumento,teor FROM metadadosPublicacao202202"
data = con.execute(teor_query).df()
data

In [None]:
# Drop rows with missing values and reset the index
data = data.dropna().reset_index(drop=True)
#35404 rows × 3 columns

In [None]:
# Encode the textual `teor` labels as integer codes in a new `teor_encoder` column.
teor_type = LabelEncoder()
teor_type.fit(data['teor'])
data['teor_encoder'] = teor_type.transform(data['teor'])
data

In [None]:
# Para saber a correspondência entre o valor codificado e o rótulo original:
for index, label in enumerate(teor_type.classes_):
    print(f"{index}: {label}")

In [None]:
# Print the encoded value → original label mapping and, for each pair, overwrite
# the textual `teor` in the table with its encoded value.
# NOTE(review): the bound value is a Python int while the rows being matched hold
# text labels — confirm how `teor` ends up stored after this UPDATE.
update_teor_sql = """
    UPDATE metadadosPublicacao202202
    SET teor = ?
    WHERE teor = ?;
    """
for index, label in enumerate(teor_type.classes_):
    print(f"{index}: {label}")
    update_params = (index, label)
    con.execute(update_teor_sql, update_params)

In [None]:
# Read the document id and `teor` columns back from the table and display them.
data = con.execute(
    "SELECT seqDocumento,teor FROM metadadosPublicacao202202"
).df()
data

In [None]:
# Inspect the dtype of each column of `data`.
data.dtypes