In [7]:
import re
import pdfplumber
import json
import numpy as np

In [8]:
def open_json(path):
    """Load and return the JSON document stored at `path`.

    The file is decoded as UTF-8 explicitly so that non-ASCII text is read
    the same way on every platform instead of depending on the locale's
    default encoding.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever `json.load` produces for the file).
    """
    with open(path, 'r', encoding='utf-8') as openfile:
        return json.load(openfile)
    
def wirte_json_file(jsd,path):
    """Write `jsd` to `path` as indented, UTF-8 encoded JSON.

    NumPy integers, floats and arrays found inside `jsd` are converted to the
    matching built-in types. Non-ASCII characters are written verbatim
    (`ensure_ascii=False`), so the file is opened as UTF-8 explicitly rather
    than with the locale's default encoding.

    Args:
        jsd: JSON-serializable object (may contain NumPy scalars/arrays).
        path: Destination file path; an existing file is overwritten.

    Raises:
        TypeError: If `jsd` contains a value the encoder cannot serialize.
    """
    class NpEncoder(json.JSONEncoder):
        """JSON encoder that maps NumPy scalars and arrays to built-in types."""

        def default(self, obj):
            if isinstance(obj, np.integer):
                return int(obj)
            if isinstance(obj, np.floating):
                return float(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            return super().default(obj)

    # Serialize before opening the file so a failure does not truncate an
    # existing output file.
    text = json.dumps(jsd, cls=NpEncoder, indent=4, ensure_ascii=False)
    with open(path, "w", encoding="utf-8") as outfile:
        outfile.write(text)

In [11]:
def get_words(pdf_path):
    """Extract the words of the first page of a PDF with integer boxes.

    Duplicate characters are removed from the page (`tolerance=1`) before the
    words are extracted.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A list of dicts ``{'box': [x0, top, x1, bottom], 'text': str}``, one
        per extracted word, with every coordinate passed through `round`.
    """
    with pdfplumber.open(pdf_path) as pdf:
        first_page = pdf.pages[0].dedupe_chars(tolerance=1)
        return [
            {
                'box': [
                    round(raw['x0']),
                    round(raw['top']),
                    round(raw['x1']),
                    round(raw['bottom']),
                ],
                'text': raw['text'],
            }
            for raw in first_page.extract_words()
        ]
            
    

def get_max_rect(objects):
    """Return the bounding rectangle that encloses every object's box.

    Args:
        objects: Iterable of dicts whose 'box' entry unpacks into four values
            ``x0, y0, x1, y1``.

    Returns:
        ``[min x0, min y0, max x1, max y1]`` over all objects.

    Raises:
        ValueError: If `objects` is empty (there is nothing to take the
            minimum/maximum of).
    """
    corners = [(x0, y0, x1, y1) for x0, y0, x1, y1 in (obj['box'] for obj in objects)]
    left = min(corner[0] for corner in corners)
    top = min(corner[1] for corner in corners)
    right = max(corner[2] for corner in corners)
    bottom = max(corner[3] for corner in corners)
    return [left, top, right, bottom]

def is_date(text):
    """Tell whether `text` starts with three digit groups separated by slashes.

    Only the beginning of the string is checked (`re.match`), so trailing
    characters after the third digit group are accepted.
    """
    found = re.match(r"\d+\/\d+\/\d+", text)
    return found is not None

def get_segment(id, label, words):
    """Build the segment record for a group of words.

    Args:
        id: Identifier stored under 'id'.
        label: Segment label stored under 'label'.
        words: List of word dicts (each with 'box' and 'text').

    Returns:
        A dict with keys 'id', 'box' (bounding rectangle of all words),
        'label', 'text' (word texts joined by single spaces) and 'words'
        (the list that was passed in).
    """
    return {
        'id': id,
        'box': get_max_rect(words),
        'label': label,
        'text': " ".join(word['text'] for word in words),
        'words': words,
    }

def get_header(words):
    """Build the 'header' segment (id 0) from the words that open the page.

    Words are scanned in order until the first one that both looks like a
    date and starts left of x = 130; that word and everything after it are
    excluded. Of the scanned words, only those whose left edge is greater
    than 40 are kept.
    """
    kept = []
    for word in words:
        left = word['box'][0]
        if left < 130 and is_date(word['text']):
            break
        if left > 40:
            kept.append(word)
    return get_segment(0, 'header', kept)

def _leading_run(words, max_gap):
    """Return the leading words whose vertical step stays below `max_gap`.

    Each word's top coordinate (``box[1]``) is compared with the top of the
    word before it (the first word is compared with itself). The scan stops
    at the first word whose top is `max_gap` or more below its predecessor.

    Raises:
        IndexError: If `words` is empty.
    """
    run = []
    previous_top = words[0]['box'][1]
    for word in words:
        top = word['box'][1]
        if top - previous_top >= max_gap:
            break
        run.append(word)
        previous_top = top
    return run


def get_colegiado(words):
    """Build segment 2 ('other') from the leading words with a vertical step below 5.

    Args:
        words: Non-empty list of word dicts in reading order.

    Returns:
        The segment dict produced by `get_segment` for the collected words.
    """
    return get_segment(2, 'other', _leading_run(words, 5))


def get_title(words):
    """Build segment 3 ('title') from the leading words with a vertical step below 20.

    Args:
        words: Non-empty list of word dicts in reading order.

    Returns:
        The segment dict produced by `get_segment` for the collected words.
    """
    return get_segment(3, 'title', _leading_run(words, 20))


def get_parte(words):
    """Build segment 4 ('partes') from the leading words with a vertical step below 20.

    Args:
        words: Non-empty list of word dicts in reading order.

    Returns:
        The segment dict produced by `get_segment` for the collected words.
    """
    return get_segment(4, 'partes', _leading_run(words, 20))

def check_acordao(w1,w2,w3):
    """Tell whether three consecutive word texts start an "ACÓRDÃO" heading.

    Returns True when `w1` contains one of the accepted upper-case spellings
    of the word, or when the three texts are the single letters 'A', 'C' and
    'Ó'/'O' (the heading typed with its letters spaced out).
    """
    spellings = ('ACORDAO', 'ACÓRDAO', 'ACÓRDÃO', 'ACORDÃO')
    if any(spelling in w1 for spelling in spellings):
        return True
    return w1 == 'A' and w2 == 'C' and w3 in ('Ó', 'O')

def get_ementa_segs(words):
    """Build the ementa segments found at the start of `words`.

    If the first word contains 'EMENTA' or 'Ementa' and starts right of
    x = 250, it becomes a 'heading' segment. The words that follow, up to
    (not including) the first position where `check_acordao` recognises the
    start of the "ACÓRDÃO" heading, become one 'ementa' segment. Segment ids
    start at 5.

    The look-ahead used by `check_acordao` is padded with empty strings, so
    the scan can reach the last words of the page: when no marker is found
    all remaining words belong to the ementa instead of raising IndexError.

    Args:
        words: List of word dicts in reading order (may be empty).

    Returns:
        A list with zero, one or two segment dicts (heading and/or ementa).
    """
    esegs = []
    if not words:
        return esegs

    idx = 5
    first = words[0]
    if ('EMENTA' in first['text'] or 'Ementa' in first['text']) and first['box'][0] > 250:
        esegs.append(get_segment(idx, 'heading', [first]))
        words = words[1:]
        idx += 1

    # Two empty sentinels let the three-word window slide to the very last word.
    texts = [w['text'] for w in words] + ['', '']
    ementa_words = []
    for i, word in enumerate(words):
        if check_acordao(texts[i], texts[i + 1], texts[i + 2]):
            break
        ementa_words.append(word)

    if ementa_words:
        esegs.append(get_segment(idx, 'ementa', ementa_words))
    return esegs

def get_acordao(idx,words):
    """Build the 'heading' segment for the "ACÓRDÃO" title at the start of `words`.

    Leading single-character words are collected (the title typed with its
    letters spaced out). The first longer word ends the scan; it is included
    in the heading only when at most six single characters were collected
    before it.

    Args:
        idx: Identifier for the segment.
        words: List of word dicts in reading order.
    """
    heading = []
    letters = 0
    for word in words:
        if len(word['text']) != 1:
            if letters <= 6:
                heading.append(word)
            break
        heading.append(word)
        letters += 1
    return get_segment(idx, 'heading', heading)

def get_acordao_txt(idx,words):
    """Build the 'acordao' segment from the leading words of `words`.

    Words are taken in order while each word's top coordinate is less than
    27 below the top of the word before it; the first larger step ends the
    segment.

    Args:
        idx: Identifier for the segment.
        words: Non-empty list of word dicts in reading order.
    """
    previous_top = words[0]['box'][1]
    cut = len(words)
    for position, word in enumerate(words):
        top = word['box'][1]
        if top - previous_top >= 27:
            cut = position
            break
        previous_top = top
    return get_segment(idx, 'acordao', list(words[:cut]))

def get_last_other_seqs(idx,words):
    """Split `words` into vertically separated groups and emit all but the last.

    A new group starts whenever a word's top coordinate is 20 or more below
    the top of the word before it. Every group that is closed this way
    becomes an 'other' segment, with ids counting up from `idx`. The group
    still open when the words run out is not emitted; the caller in this
    notebook turns the words left over after these segments into the footer.

    Args:
        idx: Identifier for the first emitted segment.
        words: Non-empty list of word dicts in reading order.

    Returns:
        List of segment dicts, possibly empty.
    """
    segments = []
    next_id = idx
    group = []
    previous_top = words[0]['box'][1]
    for word in words:
        top = word['box'][1]
        same_group = top - previous_top < 20
        if same_group:
            group.append(word)
        elif group:
            segments.append(get_segment(next_id, 'other', group))
            next_id += 1
            # Fresh list: the emitted segment keeps referencing the old one.
            group = [word]
        previous_top = top
    return segments



In [9]:
# Load the list of documents to process; the processing loop reads an 'id'
# key from each entry to locate the matching PDF.
ids_itd_m = open_json('ids_idt_mini.json')

In [16]:
# Segment the first page of every listed PDF with the layout heuristics and
# collect one {"id": ..., "documento": [segments]} record per document.
# Each step consumes the words it claimed, so the order of the steps matters.
data = []
for idx in ids_itd_m:
    # Named `doc_id` rather than `id`: a module-level `id` would shadow the
    # builtin for every later cell of the notebook.
    doc_id = idx['id']
    pdf_idtm_path = f'./ITD_MINI/{doc_id}/{doc_id}.pdf'
    segs = []
    new_doc = {"id": doc_id}
    words = get_words(pdf_idtm_path)

    # Header (id 0); keep only the words that start below its bottom edge.
    header = get_header(words)
    header_y1 = header['box'][3]
    words = [w for w in words if w['box'][1] > header_y1]
    segs.append(header)

    # The first word after the header is taken as the date (id 1).
    date = words[0]
    dateseg = get_segment(1,'other',[date])
    segs.append(dateseg)
    words = words[1:]

    # Colegiado (id 2), title (id 3) and parties (id 4), in reading order.
    colegseg = get_colegiado(words)
    segs.append(colegseg)
    words = words[len(colegseg['words']):]

    titleseg = get_title(words)
    segs.append(titleseg)
    words = words[len(titleseg['words']):]

    parteseg = get_parte(words)
    segs.append(parteseg)
    words = words[len(parteseg['words']):]

    # Ementa heading/body (ids from 5). Either may be missing, so the last
    # used id is read from the segments collected so far instead of from
    # `ementa_segs[-1]`, which fails when that list is empty.
    ementa_segs = get_ementa_segs(words)
    ementa_qtd = sum(len(e['words']) for e in ementa_segs)
    segs+=ementa_segs
    last_id = segs[-1]['id']
    words = words[ementa_qtd:]

    # "ACÓRDÃO" heading followed by its text.
    acordao_seq = get_acordao(last_id+1,words)
    segs.append(acordao_seq)
    last_id+=1
    words = words[len(acordao_seq['words']):]

    acordaotxt_seq = get_acordao_txt(last_id+1,words)
    segs.append(acordaotxt_seq)
    last_id+=1
    words = words[len(acordaotxt_seq['words']):]

    # Trailing 'other' groups; whatever is left afterwards is the footer.
    last_seqs = get_last_other_seqs(last_id+1,words)
    if len(last_seqs) > 0:
        segs+=last_seqs
        last_seqs_qtd = sum(len(e['words']) for e in last_seqs)
        last_id = last_seqs[-1]['id']
        words = words[last_seqs_qtd:]
    footerseg = get_segment(last_id+1,'footer',words)
    segs.append(footerseg)

    new_doc["documento"] = segs
    data.append(new_doc)

In [17]:
# Persist the collected annotations (`data`) as JSON; the path is relative to
# the current working directory.
anotacao_path = 'heuristica_anotacao.json'
wirte_json_file(data,anotacao_path)
