In [3]:
import duckdb
import os
import argostranslate.package
import argostranslate.translate
from tqdm import tqdm
import pandas as pd

In [4]:
# Open the on-disk DuckDB database in read-write mode; every later cell
# issues its SQL through this shared connection.
con = duckdb.connect(
    database='jus4.duckdb',
    read_only=False,
)

In [3]:
# Load the publication metadata JSON into a table.
# `jus4.duckdb` is a persistent file, so a plain CREATE TABLE fails on every
# run after the first one; IF NOT EXISTS keeps the cell re-runnable
# (Restart Kernel -> Run All) without touching an already-loaded table.
con.execute("CREATE TABLE IF NOT EXISTS metadadosPublicacao202202 AS SELECT * FROM read_json_auto('./data/202202-stj/metadadosPublicacao202202.json');")

<duckdb.duckdb.DuckDBPyConnection at 0x7f5005a47b70>

In [4]:
# Keep only the rows whose `teor` is 'Concedendo' or 'Negando'; rows with any
# other value, or with a NULL `teor`, are deleted from the metadata table.
delete_other_outcomes_sql = "DELETE FROM metadadosPublicacao202202 WHERE teor NOT IN ('Concedendo', 'Negando') or teor IS NULL;"
con.execute(delete_other_outcomes_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x7f5005a47b70>

In [5]:
# Table holding the text of the judges' decisions:
#   ID        - integer primary key of the row
#   content   - text column; `process_file` stores the translated text here
#   summarize - text column; not written by any cell visible in this notebook
# IF NOT EXISTS keeps the cell re-runnable against the persistent database
# file and preserves rows that were already inserted by an earlier run.
con.execute("""
  CREATE TABLE IF NOT EXISTS decisions_description (
    ID INT PRIMARY KEY,
    content TEXT,
    summarize TEXT
  );
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7f5005a47b70>

In [8]:
# Download and install the Portuguese -> English Argos Translate package.
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()

# Pick the first package translating pt -> en. A default of None is used so
# that a missing package produces a readable error instead of a bare
# StopIteration with no message.
package_to_install = next(
    (
        pkg
        for pkg in available_packages
        if pkg.from_code == "pt" and pkg.to_code == "en"
    ),
    None,
)
if package_to_install is None:
    raise RuntimeError(
        "No Argos Translate package for pt -> en was found in the package index"
    )

argostranslate.package.install_from_path(package_to_install.download())

In [9]:
# Walk the root directory and all of its subdirectories, collecting every
# `.txt` file as a (full path, numeric ID) pair. The ID is the part of the
# file name before the first dot, converted to int.
txt_entries = [
    (os.path.join(root, name), int(name.split('.')[0]))
    for root, _subdirs, names in os.walk('./data/202202-stj/', topdown=True)
    for name in names
    if name.endswith('.txt')
]

# Parallel lists: file_paths[i] is the file whose numeric ID is file_names[i].
file_paths = [path for path, _doc_id in txt_entries]   # full paths of the files
file_names = [doc_id for _path, doc_id in txt_entries]  # numeric file names only

In [20]:
# Load the citation counts from CSV and display the frame as the cell output.
jurisprudencias_csv = "count_jurisprudencias0-2000.csv"
jurisprudencias = pd.read_csv(jurisprudencias_csv)
jurisprudencias

Unnamed: 0,seqDocumento,ref,ref_numers,count
0,145522785,t no AREsp 1.621.900/MG,1,2818
1,145501377,t no AREsp. 1.504.312/AL,1,2818
2,145496750,Esp n. 1.740.911/DF,1,2818
3,145496750,1-02.2017.8.26.0576,1,2818
4,145501377,t no REsp 1.816.648/AL,1,2818
...,...,...,...,...
25500,145717236,HC no 0627902-36.2020.8.06.0000,627902,1
25501,146031577,t no AREsp 1816495/RS,1816495,1
25502,146038784,t no REsp 1758267/RN,1758267,1
25503,146038784,t no AgRg no REsp 1580246/RS,1580246,1


In [21]:
# Keep only the citations whose `ref_numers` string is longer than 3
# characters and whose `count` is greater than 3.
has_long_ref_number = jurisprudencias['ref_numers'].str.len() > 3
has_count_above_three = jurisprudencias['count'] > 3
df_filtered = jurisprudencias[has_long_ref_number & has_count_above_three]
df_filtered

Unnamed: 0,seqDocumento,ref,ref_numers,count
16733,146030689,t no AREsp 1039553/PR,1039553,25
16734,145494993,t no AREsp 1490629/SP,1490629,25
16741,145495283,t no AREsp 1490629/SP,1490629,25
16742,145495283,t no AREsp 1039553/PR,1039553,25
16747,146030104,t no AREsp 1490629/SP,1490629,25
...,...,...,...,...
22830,146032641,t no AREsp 1720732/PR,1720732,4
22833,143858788,g nos EDcl no AgRg no AREsp 1764481/PR,1764481,4
22835,145050021,g no REsp n. 143071/AM,143071,4
22836,145495303,t nos EREsp 1120356/RS,1120356,4


In [22]:
# Distinct document IDs referenced by the filtered citations, in first-seen
# order.
# NOTE(review): the right-hand side of this assignment was missing in the
# saved cell (a SyntaxError); it was reconstructed from the recorded output
# (a NumPy array of seqDocumento values without repeats) - please confirm.
seqDocumento_ord = df_filtered['seqDocumento'].unique()
seqDocumento_ord

array([146030689, 145494993, 145495283, 146030104, 146029836, 146030084,
       145495291, 146030615, 145495279, 146032641, 145494989, 146029798,
       146029800, 146029793, 146032633, 146032629, 145494923, 145495154,
       145495299, 146029857, 145495206, 146029851, 145495285, 145495293,
       146029845, 145495287, 145495277, 145494964, 145494931, 145494948,
       145495301, 145495140, 145494962, 146030561, 145259062, 146257181,
       145716619, 146031467, 145523395, 145550802, 144257048, 145523412,
       142471506, 146030238, 145522690, 146030424, 146030664, 143079424,
       145494911, 145089890, 146257171, 146066318, 146065525, 145520657,
       145519466, 146067178, 145519547, 146066783, 145519343, 145521041,
       145520158, 146066539, 146066572, 146065757, 145519035, 146066265,
       146066301, 145613114, 145613140, 145613134, 145613138, 145613128,
       145613126, 145613130, 145613142, 145613120, 145613118, 145613124,
       145613122, 145613136, 145613116, 145259043, 

In [23]:
def process_file(file_info):
    """Translate one decision text from Portuguese to English and store it.

    Parameters
    ----------
    file_info : tuple
        ``(file_path, file_id)``: the path of the text file to translate and
        the int-convertible ID used as the row's primary key in
        ``decisions_description``.

    Notes
    -----
    * An ID that is already stored is skipped *before* reading or translating
      the file, so re-running the notebook does not repeat the expensive
      translation only to hit a primary-key violation afterwards.
    * A primary-key violation raised by the INSERT itself is still ignored.

    Raises
    ------
    Exception
        Any insert error other than a primary-key violation is printed
        together with the ID and re-raised.
    """
    file_path, file_id = file_info
    doc_id = int(file_id)

    # Cheap existence check first; translation is the slow part.
    already_stored = con.execute(
        'SELECT 1 FROM decisions_description WHERE id = ?', (doc_id,)
    ).fetchone()
    if already_stored is not None:
        return

    # NOTE(review): the file is opened with the platform default encoding,
    # as before - confirm the source files match it.
    with open(file_path, 'r') as f:
        content = f.read()

    # The file is closed before translating; no handle is held meanwhile.
    translatedText = argostranslate.translate.translate(content, "pt", "en")
    try:
        con.execute(
            'INSERT INTO decisions_description (id,content) VALUES (?,?)',
            (doc_id, translatedText),
        )
        con.commit()
    except Exception as e:
        # A duplicate ID is tolerated; anything else is reported and re-raised.
        if 'violates primary key constraint' not in str(e):
            print(file_id, e)
            raise

In [24]:
# Map every document ID to the path of its own text file.
# The previous loop used the position of `doc` inside `seqDocumento_ord` to
# index `file_paths`, but `file_paths` is ordered like `file_names`, so each
# ID was paired with an unrelated file (or an IndexError was raised).
# If an ID appears more than once in `file_names`, the last path wins.
path_by_doc_id = dict(zip(file_names, file_paths))

for doc in tqdm(seqDocumento_ord, total=len(seqDocumento_ord)):
    # Only documents that actually have a text file on disk are processed.
    if doc in path_by_doc_id:
        process_file((path_by_doc_id[doc], doc))

100%|██████████| 264/264 [2:47:28<00:00, 38.06s/it]   


In [6]:
# Fetch every stored decision as a pandas DataFrame and display it.
decisions_df = con.execute("SELECT * FROM decisions_description").df()
decisions_df

Unnamed: 0,ID,content,summarize
0,145522785,EMENT\nADMINISTRATIVE. COMPETITION POLICY CONF...,
1,145501377,DECISION > It is treated of habeas corpus impe...,
2,145496750,DECISION<br> It is considered a special appeal...,
3,146037675,DECISION<br> It is considered an ordinary appe...,
4,145049719,DECISION<br> It is treated of habeas corpus im...,
...,...,...,...
267,145633556,DECISION is dealt with by the STATE OF SÃO PAU...,
268,143697533,DECISION<br> This is the case of the Court of ...,
269,145050021,DECISION<br> This is the case of a misconduct ...,
270,145717143,DECISION<br>1. Take care of agravo interposed ...,


In [None]:
# Close the DuckDB connection opened at the top of the notebook.
con.close()