In [1]:
import json, os, random, sys
import psutil

# Output directory for the generated annotation files, keyed by dataset split.
# Every split lives under 'anns/<split>/' (trailing slash included so a file
# name can be appended directly).
DONUT_ANN_PATH = {
    split_name: 'anns/' + split_name + '/'
    for split_name in ('val', 'test', 'train')
}

In [2]:
def tokens_to_string(cell):
    """Glue the entries of ``cell['tokens']`` together into one string.

    The tokens are concatenated in order with no separator.
    """
    pieces = cell['tokens']
    return "".join(pieces)

In [3]:
def get_span_type(coords, max_tup):
    """Describe on which sides a cell has neighbours inside its own span.

    Args:
        coords: ``(i, j)`` offset of the cell relative to the span's first cell.
        max_tup: ``(max_row, max_col)`` largest offsets that exist in the span.

    Returns:
        ``"span_type="`` followed by four ``"0"``/``"1"`` flags, in the order
        right, down, up, left.
    """
    row_off, col_off = coords
    last_row, last_col = max_tup

    def flag(condition):
        return "1" if condition else "0"

    has_up = row_off > 0
    # At offset 0 a neighbour below exists as soon as the span has any extra row.
    has_down = (row_off < last_row) if has_up else (last_row != 0)

    has_left = col_off > 0
    # Same rule along the column axis.
    has_right = (col_off < last_col) if has_left else (last_col != 0)

    return "span_type=" + flag(has_right) + flag(has_down) + flag(has_up) + flag(has_left)

In [4]:
def create_cell(table, i, j, header, content, content_holder, span_type = "span_type=0000"):
    """Store a freshly built cell record at ``table[i][j]``.

    Args:
        table: grid (list of row lists) that is modified in place.
        i, j: row and column index of the cell.
        header: value stored under ``"col_header"``.
        content: mapping with a ``'tokens'`` entry; its tokens are joined into
            the cell's ``"content"`` string.
        content_holder: value stored under ``"content_holder"``.
        span_type: span descriptor string; defaults to the "no neighbours" value.
    """
    table[i][j] = {
        "row": i,
        "col": j,
        "col_header": header,
        "row_header": False,  # row headers are never marked by this function
        "span_type": span_type,
        "content_holder": content_holder,
        "content": tokens_to_string(content),
    }


def create_cells(table, start_i, start_j, header, span_tup, content):
    """Fill every grid position covered by a spanning cell.

    Args:
        table: grid (list of row lists) that is modified in place.
        start_i, start_j: top-left position of the span.
        header: value stored under ``"col_header"`` in every created cell.
        span_tup: ``(rowspan, colspan)`` of the span.
        content: mapping with a ``'tokens'`` entry, copied into every cell.

    Returns:
        The first column index to the right of the span.
    """
    rowspan, colspan = span_tup
    end_i = start_i + rowspan
    end_j = start_j + colspan
    last_offsets = (rowspan - 1, colspan - 1)
    # Only the bottom-right cell of the span is flagged as the content holder.
    holder_pos = (end_i - 1, end_j - 1)

    for i in range(start_i, end_i):
        for j in range(start_j, end_j):
            is_holder = (i, j) == holder_pos
            span_type = get_span_type((i - start_i, j - start_j), last_offsets)
            create_cell(table, i, j, header, content, is_holder, span_type)

    return end_j

In [5]:
def decode_span(rowspan, colspan, tag):
    """Parse a span attribute token such as ``' rowspan="2"'``.

    Args:
        rowspan: unused; kept so existing callers keep working.
        colspan: unused; kept so existing callers keep working.
        tag: attribute token. It must start with ``" row"`` or ``" col"`` and
            carry its integer value between double quotes.

    Returns:
        ``("row", n)`` or ``("col", n)`` where ``n`` is the quoted integer.

    Raises:
        ValueError: if ``tag`` starts with neither prefix (or the quoted value
            is not an integer).
    """
    prefix = tag[:4]
    if prefix == " row":
        axis = "row"
    elif prefix == " col":
        axis = "col"
    else:
        # Previously this path hit an undefined name and surfaced as a
        # confusing NameError; report the offending token explicitly instead.
        raise ValueError(f"unrecognised span attribute token: {tag!r}")

    # The value is the text between the first pair of double quotes.
    return axis, int(tag.split('"')[1])


def crop_table(table, max_tuple):
    """Cut the working grid down to its used size and fill the gaps.

    Args:
        table: grid (list of row lists) whose unused positions are ``None``.
        max_tuple: ``(max_row, max_col)`` number of rows and columns to keep.

    Returns:
        A new list of ``max_row`` rows with ``max_col`` cells each. Existing
        cell dicts are reused as-is; every ``None`` position becomes an empty
        placeholder cell.
    """
    n_rows, n_cols = max_tuple

    cropped = []
    for i in range(n_rows):
        # Becomes True once a header cell has been seen earlier in this row;
        # placeholders created after that point inherit the header flag.
        seen_header = False
        out_row = []
        for j in range(n_cols):
            cell = table[i][j]
            if cell is None:
                cell = {
                    "row": i,
                    "col": j,
                    "col_header": seen_header,
                    "row_header": False,
                    "span_type": "span_type=0000",
                    "content_holder": True,
                    "content": ""
                }
            elif cell['col_header']:
                seen_header = True
            out_row.append(cell)
        cropped.append(out_row)

    return cropped


In [6]:
def json_to_ann(line):
    annotation = json.loads(line)
    
    span = False
    
    header = False
        
    max_row = 0
    max_col = 0
        
    rowspan = 1
    colspan = 1
        
    row_i = 0
    col_i = 0
    content_i = 0

    table = [[] for i in range(100)]
    for i in range(100):
        table[i] = [None for j in range(100)]
        
    
    ann_html = annotation['html']

    for html_tag in ann_html['structure']['tokens']:
        match html_tag:
            case "<td>":
                while(table[row_i][col_i] != None):
                    col_i += 1
                create_cell(table, row_i, col_i, header, ann_html['cells'][content_i], True)
                col_i += 1
            case "<td":
                pass
            case ">":
                while(table[row_i][col_i] != None):
                    col_i += 1
                col_i = create_cells(table, row_i, col_i, header, (rowspan, colspan), ann_html['cells'][content_i])
            case "</td>":
                content_i += 1
                rowspan = 1
                colspan = 1
            case "<thead>":
                header = True
            case "</thead>":
                header = False
            case "<tbody>":
                pass
            case "</tbody>":
                pass
            case "<tr>":
                col_i = 0
            case "</tr>":
                row_i += 1
                max_row = max(max_row, row_i)
                max_col = max(max_col, col_i)
            case _:
                span_coord, span_size = decode_span(rowspan, colspan, html_tag)
                
                if(span_coord == "row"):
                    rowspan = span_size
                else:
                    colspan = span_size
    
    return crop_table(table, (max_row, max_col)), annotation['filename'], annotation['split']

In [7]:
from tqdm.auto import tqdm

# Load every annotation record (one JSON document per line).
with open("PubTabNet_2.0.0.jsonl", encoding="utf-8") as f:
    lines = f.readlines()

# NOTE(review): the records are processed last-to-first; the reason is not
# visible in this notebook -- confirm before removing.
lines.reverse()

for line in tqdm(lines):
    table, file, split = json_to_ann(line)
    # Swap the image file extension for ".json" inside the split's directory.
    out_path = DONUT_ANN_PATH[split] + os.path.splitext(file)[0] + ".json"
    # ensure_ascii=False writes raw non-ASCII characters, so the output must
    # be opened as UTF-8 explicitly instead of relying on the platform default.
    with open(out_path, 'w', encoding="utf-8") as out:
        json.dump({'tables': [table]}, out, ensure_ascii=False, indent=4)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████████████████████████████| 509892/509892 [10:33<00:00, 804.97it/s]


In [8]:
# Display `table`, i.e. the grid returned by the most recent json_to_ann call,
# as a quick visual check of the generated cell records.
table

[[{'row': 0,
   'col': 0,
   'col_header': True,
   'row_header': False,
   'span_type': 'span_type=0000',
   'content_holder': True,
   'content': '<b>Variable</b>'},
  {'row': 0,
   'col': 1,
   'col_header': True,
   'row_header': False,
   'span_type': 'span_type=0000',
   'content_holder': True,
   'content': '<b>Hazard ratio</b>'},
  {'row': 0,
   'col': 2,
   'col_header': True,
   'row_header': False,
   'span_type': 'span_type=0000',
   'content_holder': True,
   'content': '<b>95 % CI</b>'},
  {'row': 0,
   'col': 3,
   'col_header': True,
   'row_header': False,
   'span_type': 'span_type=0000',
   'content_holder': True,
   'content': '<b><i>p</i> value*</b>'}],
 [{'row': 1,
   'col': 0,
   'col_header': False,
   'row_header': False,
   'span_type': 'span_type=0000',
   'content_holder': True,
   'content': 'Age (median)'},
  {'row': 1,
   'col': 1,
   'col_header': False,
   'row_header': False,
   'span_type': 'span_type=0000',
   'content_holder': True,
   'content': ''

In [None]:
# Re-read the annotation file and keep its records in reverse file order.
with open("PubTabNet_2.0.0.jsonl", encoding="utf-8") as f:
    lines = list(reversed(f.readlines()))

In [None]:
# Debug helper: convert records one by one and print the grid of the sixth
# record (index 5), then stop.
for i, line in enumerate(lines):
    table, file, split = json_to_ann(line)
    if i != 5:
        continue
    print(table)
    break

In [1]:
import os
from PIL import Image
# Collect the size of every training image and track the running maximum of
# each size component.
img_list = os.listdir("imgs/train")

max_size = [0,0]
sizes = []

for i, img in enumerate(img_list):
    # Open inside a context manager so the underlying file handle is released
    # right away instead of leaking one handle per image.
    with Image.open("imgs/train/"+img) as aux_img:
        img_size = aux_img.size
    max_size[0] = max(max_size[0], img_size[0])
    max_size[1] = max(max_size[1], img_size[1])
    sizes.append(img_size)
    # Progress report every 10000 images.
    if i%10000 == 0:
        print(max_size)

[251, 83]
[773, 683]
[773, 683]
[773, 683]
[773, 713]
[773, 713]
[773, 713]
[773, 713]
[773, 713]
[773, 713]
[773, 715]
[773, 715]
[773, 715]
[773, 716]
[773, 716]
[773, 724]
[773, 724]
[773, 724]
[773, 724]
[773, 724]
[773, 724]
[773, 724]
[773, 724]
[773, 724]
[773, 724]
[773, 724]
[1223, 724]
[1223, 724]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 733]
[1223, 742]
[1223, 742]
[1223, 742]
[1223, 742]
[1223, 742]
[1223, 742]


NameError: name 'np' is not defined

In [5]:
import numpy as np

# Turn the collected sizes into an array so column-wise statistics work.
sizes = np.array(sizes)
print(np.median(sizes, axis=0))

# First component of every size, sorted from largest to smallest.
lenghts = sorted((size[0] for size in sizes), reverse=True)

print(lenghts[:10000])

[486. 172.]
[1223, 773, 769, 765, 763, 762, 759, 759, 758, 753, 751, 751, 751, 750, 749, 749, 747, 747, 747, 745, 745, 744, 742, 741, 740, 740, 739, 739, 738, 736, 736, 735, 735, 733, 733, 731, 730, 730, 730, 730, 729, 729, 728, 728, 728, 728, 727, 727, 727, 726, 726, 726, 725, 725, 725, 724, 724, 724, 723, 723, 723, 723, 723, 722, 722, 722, 722, 722, 722, 722, 721, 721, 721, 721, 720, 720, 720, 720, 720, 720, 720, 719, 719, 719, 719, 719, 719, 719, 718, 718, 718, 718, 718, 718, 718, 718, 718, 718, 718, 717, 717, 717, 717, 717, 717, 717, 717, 717, 717, 716, 716, 716, 716, 716, 716, 715, 715, 715, 715, 715, 715, 715, 715, 715, 715, 715, 715, 715, 715, 714, 714, 714, 714, 714, 714, 714, 714, 713, 713, 713, 713, 713, 713, 713, 713, 713, 713, 713, 713, 713, 712, 712, 712, 712, 712, 712, 712, 712, 711, 711, 711, 711, 711, 711, 711, 711, 710, 710, 710, 710, 710, 710, 710, 710, 709, 709, 709, 709, 709, 709, 709, 709, 709, 709, 709, 709, 709, 709, 709, 709, 708, 708, 708, 708, 708, 708, 708, 7

In [12]:
# 99.9th percentile of column 0 of `sizes` (the first component of each
# image's `size`; presumably the width -- confirm against Pillow's convention).
np.percentile(sizes[:,0] , 99.9)

690.0

In [7]:
# 99.9th percentile of column 1 of `sizes` (the second component of each
# image's `size`; presumably the height -- confirm against Pillow's convention).
np.percentile(sizes[:,1] , 99.9)

651.0

In [None]:
# Column-wise maximum of `sizes`: the largest value seen for each size component.
np.max(sizes, axis=0)