In [1]:
import duckdb
from sklearn.preprocessing import LabelEncoder
import os
import numpy as np
import torch
import argostranslate.package
import argostranslate.translate
import threading
from multiprocessing import cpu_count
from concurrent.futures import ThreadPoolExecutor
import subprocess
from textsum.summarize import Summarizer
import gc

In [2]:
# Report how many CPU cores multiprocessing detects on this machine.
num_cores = cpu_count()
print("Quantidade de núcleos: {}".format(num_cores))

Quantidade de núcleos: 8


In [3]:
# Connect to the DuckDB database file (or create it), opened read-write.
con = duckdb.connect('jus.duckdb', read_only=False)

In [4]:
# Download and install the Portuguese -> English Argos Translate package.
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()
package_to_install = next(
    (
        pkg
        for pkg in available_packages
        if pkg.from_code == "pt" and pkg.to_code == "en"
    ),
    None,  # default keeps next() from raising a bare StopIteration
)
if package_to_install is None:
    # Fail with an actionable message instead of an opaque StopIteration.
    raise RuntimeError("No Argos Translate package available for pt -> en")
argostranslate.package.install_from_path(package_to_install.download())

# Smoke test: translate a sample sentence to confirm the package works.
translatedText = argostranslate.translate.translate("Quais são os benefícios de banana ?", "pt", "en")
print(translatedText)

What are the benefits of banana?


In [5]:
# Keep only rows whose `teor` is 'Concedendo' or 'Negando'; every other row,
# including rows where `teor` is NULL, is deleted from the table.
# The explicit IS NULL test is needed because `NULL NOT IN (...)` evaluates to
# NULL rather than TRUE, so NULL rows would otherwise survive the DELETE.
# NOTE: this is destructive and modifies metadadosPublicacao202202 in place.
con.execute("""
DELETE FROM metadadosPublicacao202202 WHERE teor NOT IN ('Concedendo', 'Negando') or teor IS NULL;
""")

<duckdb.DuckDBPyConnection at 0x7fa894794b30>

In [6]:
# Load the ids of the remaining publications as a dict of NumPy arrays.
seq_query = "SELECT seqDocumento FROM metadadosPublicacao202202"
data = con.execute(seq_query).fetchnumpy()
# Display the id column as the cell output.
data["seqDocumento"]

array([144948780, 145620297, 139579332, ..., 142924179, 145440916,
       144826578])

In [7]:
# Number of document ids fetched (the id column is a 1-D array).
len(data["seqDocumento"])

35404

In [11]:
def process_file(file_info):
    """Translate one file from Portuguese to English and insert it into DuckDB.

    Args:
        file_info: ``(file_path, file_id)`` pair. ``file_path`` is the text
            file to read; ``file_id`` is converted with ``int`` and stored in
            the ``id`` column of ``decisions_description``.

    Raises:
        Exception: any insert error whose message does not contain
            'violates primary key constraint' is re-raised unchanged.
            Primary-key violations are reported on stdout and ignored.
    """
    file_path, file_id = file_info

    # Read the whole file and release the handle before the slow translation.
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm the source files actually use it.
    with open(file_path, 'r') as f:
        content = f.read()

    translatedText = argostranslate.translate.translate(content, "pt", "en")

    # Use a dedicated cursor per call instead of the shared connection object:
    # this function is also run from worker threads, and DuckDB documents one
    # cursor per thread for concurrent access.
    cursor = con.cursor()
    try:
        cursor.execute(
            'INSERT INTO decisions_description (id,content) VALUES (?,?)',
            (int(file_id), translatedText),
        )
        cursor.commit()
        print(file_id)
    except Exception as e:
        # Duplicate ids are expected on re-runs; anything else is a real error.
        if 'violates primary key constraint' in str(e):
            print("Duplicate key, ignoring.", file_id)
        else:
            raise
    finally:
        cursor.close()

In [9]:
# Walk the root directory and all of its subdirectories, collecting every
# .txt file as a (full path, numeric id parsed from the file name) pair.
txt_files = [
    (os.path.join(folder, name), int(name.split('.')[0]))
    for folder, _subdirs, names in os.walk('./data/202202-stj/', topdown=True)
    for name in names
    if name.endswith('.txt')
]

# Full paths of the files, in walk order.
file_paths = [path for path, _doc_id in txt_files]

# Numeric file names (document ids) as a set for fast membership tests.
file_names = {doc_id for _path, doc_id in txt_files}

In [12]:
# Map each document id to its file on disk. The previous code used the row
# position in data["seqDocumento"] to index file_paths, but file_paths is in
# os.walk order, so a document could be paired with an unrelated file (or the
# index could run past the end of the list).
path_by_id = {
    int(os.path.basename(path).split('.')[0]): path
    for path in file_paths
}

# Translate and store every document that has a matching file.
for doc in data["seqDocumento"]:
    doc_path = path_by_id.get(int(doc))
    if doc_path is not None:
        process_file((doc_path, doc))

144948780
145620297
139579332
141816757
144148942
145620328
141146541
146151036
145620304
145620331
145729863
145620309
144978248
Duplicate key, ignoring. 144978248
140420607
141047357
145049821
142079128
140023292
Duplicate key, ignoring. 140023292
146136272
145049828
143486995
141842062
146136290
146151023
145716661
145620339
145049802
145049781
146136286
144978242
144978249
143075799
145716678
146136277
146136294
144462665
145620310
142870194
143035920
146182571
145142820
Duplicate key, ignoring. 141279576
144998370
142345836
144978245
145620312
145729878
145029208
137913170
145580002
145620319
145620324
145620301
116453450
146136271
146151030
140920370
146136275
140395973
Duplicate key, ignoring. 140395973
Duplicate key, ignoring. 140395973
138549525
Duplicate key, ignoring. 138549525
Duplicate key, ignoring. 138549525
140396000
Duplicate key, ignoring. 140396000
Duplicate key, ignoring. 140396000
144827077
99776484
Duplicate key, ignoring. 141285645
144776712
144269312
142154399
D

KeyboardInterrupt: 

In [15]:
# Build the task list as (file path, document id) pairs.
# Paths are looked up by document id: indexing file_paths with the row
# position in data["seqDocumento"] would pair documents with unrelated files,
# because file_paths is in os.walk order.
path_by_id = {
    int(os.path.basename(path).split('.')[0]): path
    for path in file_paths
}

tasks = [
    (path_by_id[int(doc)], doc)
    for doc in data["seqDocumento"]
    if int(doc) in path_by_id
]

# Run the translations on a thread pool; list() drains the iterator so any
# exception raised inside a worker is re-raised here.
# NOTE(review): process_file reaches DuckDB through the module-level `con`
# from worker threads; confirm that usage is thread-safe.
with ThreadPoolExecutor() as executor:
    results = list(executor.map(process_file, tasks))

In [None]:
# Close the DuckDB connection opened at the top of the notebook.
con.close()