# Carregar/Atualizar Parâmetros

Após preencher os dados de configuração e executar a célula, os dados das abas `substituicao_simples` e `substituicao_regex` da planilha ficarão armazenados nas variáveis `df_simples` e `df_regex` no formato de `DataFrames` do `pandas`.

In [7]:
#@title Configuração do Projeto
# Colab form fields; each `#@param` marker describes the input shown for it.
# PROJECT_PATH and PARAMS_PATH are appended to the Drive root to build the
# working directory, and SPREADSHEET_NAME is the workbook opened from there.
PROJECT_PATH = "/LIIA-3R/PROJETOS/ia-dispositivos-legais" #@param {type:"string"}
PARAMS_PATH = "/params" #@param {type:"string"}
SPREADSHEET_NAME = "cmlima_substituicoes.xlsx" #@param {type:"string"}
# Prefix of the JSON file names written when the DataFrames are saved.
USER_NAME = "cmlima" #@param {type:"string"}

from google.colab import drive
import pandas as pd

# Google Drive root as seen from the Colab VM. The backslash before the space
# is deliberate: in this notebook the path is only interpolated into shell
# commands and `%cd`, where the space has to be escaped. A raw string keeps the
# same value without relying on `\ ` being an unrecognised (and deprecated)
# string escape.
ROOT = r'/content/drive/My\ Drive'
# Folder that holds the parameter spreadsheet and the generated JSON files.
WORKING_PATH = ROOT + PROJECT_PATH + PARAMS_PATH

def path_exists(path):
  """Return True when the shell's `test -e` reports that `path` exists.

  `path` is interpolated unquoted into the shell command, so any spaces in it
  must already be backslash-escaped (as they are in ROOT).
  """
  # IPython `!` capture: `output` holds the lines printed by the command,
  # which echoes '1' on success and '0' otherwise.
  output = !test -e {path} && echo 1 || echo 0
  return output[0] == '1'

def file_exists(path):
  """Return True when `ls` succeeds on `path`.

  `path` is interpolated unquoted into the shell command. The output of `ls`
  itself is discarded; only its exit status decides the result.
  """
  # The subshell echoes '1' only if `ls` succeeded; otherwise the `||` branch
  # echoes '0'. `output` holds the lines printed by the whole command.
  output = !(ls {path} >> /dev/null 2>&1 && echo 1) || echo 0
  return output[0] == '1'

def mount_drive():
  """Mount Google Drive at /content/drive unless ROOT is already reachable."""
  if path_exists(ROOT):
    # Drive is already mounted in this session; nothing to do.
    return
  print('integrando Google Drive ao ambiente...')
  drive.mount('/content/drive')
  # Blank line separating the mount messages from whatever is printed next.
  print()

def read_excel(file, sheet_name):
  """Load one sheet of a workbook as a pandas DataFrame.

  Args:
    file: path of the workbook, checked with `file_exists` before reading.
    sheet_name: name of the sheet to load.

  Returns:
    The result of `pd.read_excel` for the requested sheet.

  Raises:
    Exception: if `file_exists` reports that the workbook is missing.
  """
  if file_exists(file):
    return pd.read_excel(file, sheet_name=sheet_name)
  raise Exception('Planilha não localizada.')

# Make sure Drive is mounted, then switch to the parameters folder so the
# spreadsheet can be opened by its bare file name.
mount_drive()
%cd {WORKING_PATH}
# Load each rule sheet and replace missing values with empty strings.
df_simples = read_excel(SPREADSHEET_NAME, 'substituicao_simples')
df_simples = df_simples.fillna('')
df_regex = read_excel(SPREADSHEET_NAME, 'substituicao_regex')
df_regex = df_regex.fillna('')
print('Script concluído.')


/content/drive/My Drive/LIIA-3R/PROJETOS/ia-dispositivos-legais/params
Script concluído.


# Exibir DataFrames em Formato HTML

In [0]:
from IPython.display import display, HTML

def render_as_table(name, pandas_data_frame):
  """Print `name` as a heading, then display the DataFrame as an HTML table."""
  heading = '\n' + name + ':'
  print(heading)
  table_markup = pandas_data_frame.to_html()
  display(HTML(table_markup))

# Show both rule sets loaded by the parameters cell.
render_as_table('substituicao_simples', df_simples)
render_as_table('substituicao_regex', df_regex)

# Exibir DataFrames em Formato JSON

In [0]:
import json

def render_as_json(name, pandas_data_frame):
  """Print `name` as a heading, then the DataFrame rows as indented JSON.

  The rows are serialized by pandas (one object per row), parsed back into
  Python objects and re-dumped with a 4-space indent and non-ASCII characters
  left unescaped.
  """
  heading = '\n' + name + ':'
  print(heading)
  records = json.loads(pandas_data_frame.to_json(orient='records'))
  pretty = json.dumps(records, indent=4, ensure_ascii=False)
  print(pretty)

# Print both rule sets loaded by the parameters cell.
render_as_json('substituicao_simples', df_simples)
render_as_json('substituicao_regex', df_regex)

# Salvar DataFrames em Formato JSON

In [4]:
def save_as_json(file_name, pandas_data_frame):
  """Write the DataFrame rows to `file_name` as a JSON array of objects.

  Args:
    file_name: destination path; an existing file is overwritten.
    pandas_data_frame: DataFrame to serialize, one JSON object per row.
  """
  # Serialize before opening so a failure here does not leave a truncated file.
  payload = pandas_data_frame.to_json(force_ascii=False, orient='records')
  # The payload keeps non-ASCII characters unescaped, so the encoding is fixed
  # to UTF-8 rather than left to the platform default.
  with open(file_name, 'w', encoding='utf-8') as file:
    file.write(payload)

# Write one JSON file per rule set into the parameters folder; the file names
# are prefixed with USER_NAME.
%cd {WORKING_PATH}
save_as_json(USER_NAME + '_substituicao_simples.json', df_simples)
save_as_json(USER_NAME + '_substituicao_regex.json', df_regex)
print('Script concluído.')

/content/drive/My Drive/LIIA-3R/PROJETOS/ia-dispositivos-legais/params
Script concluído.


# Processar Substituições

In [5]:
#@title Configuração { vertical-output: false }
# Colab form fields; each `#@param` marker describes the input shown for it.
# ALGORITHM_TYPE selects which substitution passes process_text applies
# ("none" applies neither).
ALGORITHM_TYPE = "both" #@param ["simple", "regex", "both", "none"]
# Folder, relative to the project folder, that is searched for input files.
INPUT_PATH = "/data" #@param {type:"string"}
# Single file to process; when empty, every .txt/.docx/.pdf file found in the
# input folder is processed.
INPUT_FILE = "" #@param {type:"string"}
# Folder, relative to the project folder, that receives the results.
OUTPUT_PATH = "/output" #@param {type:"string"}
# True writes each result to a .txt file; False prints it instead.
OUTPUT_TO_FILE = True #@param {type:"boolean"}

# Third-party readers for the .pdf and .docx inputs.
!pip install PyPDF2
!pip install python-docx

import json, re, os, PyPDF2, docx
from google.colab import files
from IPython.display import clear_output

# Clear what the cell has printed so far (presumably the pip install logs).
clear_output()

def simple_subs_algorithm(text_input, params_as_json):
  """Apply literal find-and-replace rules to `text_input`, in the given order.

  Args:
    text_input: text to transform.
    params_as_json: JSON array of objects with the keys 'buscar' (text to
      find) and 'substituir_por' (replacement text).

  Returns:
    The text after every rule has been applied.
  """
  for item in json.loads(params_as_json):
    search = item['buscar']
    if search == '':
      # Blank spreadsheet cells reach here as ''. Replacing the empty string
      # would insert the replacement between every character, so skip it.
      continue
    text_input = text_input.replace(search, item['substituir_por'])
  return text_input

def regex_subs_algorithm(text_input, params_as_json):
  """Apply regular-expression substitution rules to `text_input`, in order.

  Args:
    text_input: text to transform.
    params_as_json: JSON array of objects with the keys 'padrao' (regular
      expression) and 'substituir_por' (replacement, may use group references).

  Returns:
    The text after every rule has been applied.
  """
  for item in json.loads(params_as_json):
    pattern = item['padrao']
    if pattern == '':
      # Blank spreadsheet cells reach here as ''. An empty pattern matches at
      # every position and would scatter the replacement through the text.
      continue
    text_input = re.sub(pattern, item['substituir_por'], text_input)
  return text_input

def split_paragraphs(text_input):
  """Split text at '.', '!' or '?' and return the pieces, each ending in '.'.

  A terminator that is immediately followed by a digit (as in "5.1") does not
  split. The terminator itself is consumed by the split, so every returned
  piece is re-terminated with '.', whatever character originally ended it.

  Pieces that are empty or contain only whitespace are dropped; otherwise a
  trailing terminator, an ellipsis or empty input would yield entries
  consisting of a lone '.'.

  Args:
    text_input: text to split.

  Returns:
    List of pieces in their original order.
  """
  fragments = re.split(r'[.!?](?![0-9])', text_input)
  return [fragment + '.' for fragment in fragments if fragment.strip()]

def paragraphs_to_text(paragraphs):
  """Join the paragraphs into one string, separated by a dashed rule line."""
  divider = '\n--------------------------------------------------------------------------------------------\n'
  return divider.join(paragraphs)

def print_in_shell(paragraphs):
  """Print the paragraphs, joined by `paragraphs_to_text`, to standard output."""
  rendered = paragraphs_to_text(paragraphs)
  print(rendered)

def print_to_file(file_name, paragraphs):
  """Write the paragraphs, joined by `paragraphs_to_text`, to `file_name`.

  Args:
    file_name: destination path; an existing file is overwritten.
    paragraphs: list of strings to write.
  """
  rendered = paragraphs_to_text(paragraphs)
  # The processed text may contain accented characters, so the encoding is
  # fixed to UTF-8 rather than left to the platform default.
  with open(file_name, 'w', encoding='utf-8') as file:
    file.write(rendered)

def get_text_from_pdf(file_name):
  """Return the text of every page of a PDF, concatenated in page order.

  Extraction is best effort: a page whose text cannot be extracted contributes
  an empty string instead of aborting the whole document.

  Args:
    file_name: path of the PDF file.

  Returns:
    The extracted page texts joined without a separator.
  """
  content = []
  # Extraction happens inside the `with` block so the reader can still read
  # from the file, and the handle is closed afterwards even on error.
  with open(file_name, 'rb') as pdf_file:
    if hasattr(PyPDF2, 'PdfReader'):
      # Current PyPDF2 API. The legacy names used in the fallback below were
      # removed in PyPDF2 3.0, which an unpinned `pip install` brings in.
      pages = PyPDF2.PdfReader(pdf_file).pages
      extract_name = 'extract_text'
    else:
      legacy_reader = PyPDF2.PdfFileReader(pdf_file)
      pages = [legacy_reader.getPage(num) for num in range(legacy_reader.numPages)]
      extract_name = 'extractText'
    for page in pages:
      extract = getattr(page, extract_name)
      try:
        text = extract()
      except Exception:
        # Deliberately tolerant, but no longer a bare `except`, so that
        # KeyboardInterrupt and SystemExit still propagate.
        text = ''
      content.append(text)
  return ''.join(content)

def get_text_from_docx(file_name):
  """Return the text of a .docx file, one document paragraph per line."""
  document = docx.Document(file_name)
  return '\n'.join(paragraph.text for paragraph in document.paragraphs)

def get_text_from_txt(file_name):
  """Return the full contents of a text file.

  The file is decoded as UTF-8 explicitly, so the result does not depend on
  the platform's default encoding.

  Args:
    file_name: path of the text file.

  Returns:
    The file contents as a single string.
  """
  with open(file_name, 'r', encoding='utf-8') as file:
    return file.read()

def get_text_from_file(file_name):
  """Extract the text of a file, choosing the reader from its extension.

  Args:
    file_name: path ending in '.pdf', '.docx' or '.txt' (lower case, with at
      least one character before the dot).

  Returns:
    The text returned by the reader for that file type.

  Raises:
    Exception: if the name does not end in one of the supported extensions.
  """
  # Raw string: '\.' in a normal literal is an invalid (deprecated) escape.
  match = re.match(r'^.+\.(pdf|docx|txt)$', file_name)
  if match is None:
    raise Exception('Arquivo inválido.')
  extension = match.group(1)
  if extension == 'pdf':
    return get_text_from_pdf(file_name)
  if extension == 'docx':
    return get_text_from_docx(file_name)
  return get_text_from_txt(file_name)

def get_text_from_files():
  """Extract the text of every supported file in the current directory.

  Only entries whose name ends in '.txt', '.docx' or '.pdf' are read; the
  entries are taken in the order `os.listdir()` returns them.

  Returns:
    List of dicts with the keys 'name' (the file name) and 'content' (its
    extracted text).
  """
  # Raw string: '\.' in a normal literal is an invalid (deprecated) escape.
  supported = re.compile(r'^.+\.(txt|docx|pdf)$')
  return [
      {
          'name': file_name,
          'content': get_text_from_file(file_name)
      }
      for file_name in os.listdir()
      if supported.match(file_name)
  ]

def file_name_without_extension(file_name):
  """Return `file_name` with its last '.extension' suffix removed.

  Names without a dot, and names whose only dot is the first character, are
  returned unchanged. A name the pattern cannot match at all (for example an
  empty string) is also returned unchanged instead of raising.

  Args:
    file_name: file name to strip.

  Returns:
    The name up to, but not including, the final dot-separated suffix.
  """
  # Raw string: '\.' in a normal literal is an invalid (deprecated) escape.
  match = re.match(r'(.+?)(\.[^.]*$|$)', file_name)
  if match is None:
    return file_name
  return match.group(1)

def process_text(raw_text, file_name):
  """Run the configured substitutions on `raw_text` and emit the result.

  ALGORITHM_TYPE decides which passes run: the simple rules from `df_simples`
  first, then the regex rules from `df_regex`. The resulting text is split
  with `split_paragraphs` and, depending on OUTPUT_TO_FILE, either written to
  '<file_name without extension>.txt' in the current directory or printed.

  Args:
    raw_text: text to process.
    file_name: name of the source file, used only to derive the output name.
  """
  output_name = file_name_without_extension(file_name) + '.txt'

  if ALGORITHM_TYPE in ('simple', 'both'):
    simple_rules = df_simples.to_json(force_ascii=False, orient='records')
    raw_text = simple_subs_algorithm(raw_text, simple_rules)
  if ALGORITHM_TYPE in ('regex', 'both'):
    regex_rules = df_regex.to_json(force_ascii=False, orient='records')
    raw_text = regex_subs_algorithm(raw_text, regex_rules)

  paragraphs = split_paragraphs(raw_text)

  if not OUTPUT_TO_FILE:
    print_in_shell(paragraphs)
    return
  print_to_file(output_name, paragraphs)
  

# # #


# Input and output folders, both below the project folder on Drive.
DATA_PATH = ROOT + PROJECT_PATH + INPUT_PATH
RESULTS_PATH = ROOT + PROJECT_PATH + OUTPUT_PATH

# Input files are opened by bare name, so read them from inside the data folder.
%cd {DATA_PATH}

if len(INPUT_FILE) > 0:
  # Single-file mode: read the requested file, then switch to the results
  # folder because process_text writes its output to the current directory.
  raw_text = get_text_from_file(INPUT_FILE)
  %cd {RESULTS_PATH}
  process_text(raw_text, INPUT_FILE)
  print('arquivo ' + INPUT_FILE + ' processado.')
else:
  # Batch mode: read every supported file first, while still in the data
  # folder, and only then switch to the results folder to write the outputs.
  items = get_text_from_files()
  %cd {RESULTS_PATH}
  for item in items:
    process_text(item['content'], item['name'])
    print('arquivo ' + item['name'] + ' processado.')

# Return to the parameters folder used by the other cells.
%cd {WORKING_PATH}

print('Script concluído.')

/content/drive/My Drive/LIIA-3R/PROJETOS/ia-dispositivos-legais/data
/content/drive/My Drive/LIIA-3R/PROJETOS/ia-dispositivos-legais/output
arquivo sample_2.pdf processado.
arquivo sample_3.docx processado.
arquivo sample_1.txt processado.
/content/drive/My Drive/LIIA-3R/PROJETOS/ia-dispositivos-legais/params
Script concluído.
