In [17]:
import warnings
# Hide every warning whose message starts with "numpy.dtype size changed".
# NOTE(review): presumably triggered while importing compiled packages built
# against a different numpy version - confirm the filter is still needed.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

In [18]:
import sys
import os
import re
import pandas as pd
from bs4 import BeautifulSoup

### Load ground truth data

In [2]:
def get_xml_tables(path,filename):
    """Parse one XML file and return its ``<table>`` elements.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    filename : str
        Name of the XML file inside ``path``.

    Returns
    -------
    The result of ``find_all('table')`` on the parsed document: a list-like
    collection of every ``table`` tag, from which all ``instruction`` and
    ``bounding-box`` tags have been removed.
    """
    # Binary mode hands the raw bytes to BeautifulSoup so the parser can pick
    # the encoding from the document itself instead of the locale default.
    with open(os.path.join(path,filename), 'rb') as xml_file:
        xml = BeautifulSoup(xml_file,'lxml')

    # Remove unnecessary tags. One pass per tag name: the second search runs
    # on the already-pruned tree, so it never touches a destroyed element.
    for tag_name in ('instruction', 'bounding-box'):
        for tag in xml.find_all(tag_name):
            tag.decompose()

    # Return all tables as a list.
    return xml.find_all('table')

In [3]:
def get_all_documents_xml(datapath):
    """Load the tables of every ``*-str.xml`` file found directly in ``datapath``.

    Parameters
    ----------
    datapath : str
        Directory to scan (not recursive).

    Returns
    -------
    dict
        Maps the file name without its ``.xml`` extension (for example
        ``'eu-001-str'``) to the value returned by ``get_xml_tables`` for
        that file. Empty when no file name matches.
    """
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    pattern = re.compile(r".*-str\.xml")
    documents = {}
    # Sorted so the resulting dict order does not depend on os.listdir order.
    for filename in sorted(os.listdir(datapath)):
        # fullmatch rejects names that merely start with the pattern,
        # such as "eu-001-str.xml.bak".
        if not pattern.fullmatch(filename):
            continue
        # splitext keeps the whole stem even when it contains extra dots.
        name = os.path.splitext(filename)[0]
        documents[name] = get_xml_tables(datapath,filename)
    return documents

### Transform data

In [4]:
def get_cell_position(cell):
    """Return the grid span of ``cell`` as ``(start_col, start_row, end_col, end_row)``.

    Each value is read from the attribute of the same name (``start-col``,
    ``start-row``, ``end-col``, ``end-row``) and converted with ``int``.
    A missing start attribute counts as 0; a missing end attribute takes the
    value of the corresponding start.
    """
    def _read(attribute, fallback):
        # Use the attribute when the cell carries it, else the fallback.
        if cell.has_attr(attribute):
            return int(cell.attrs[attribute])
        return fallback

    first_col = _read('start-col', 0)
    first_row = _read('start-row', 0)
    last_col = _read('end-col', first_col)
    last_row = _read('end-row', first_row)
    return first_col, first_row, last_col, last_row

In [5]:
def get_row_and_columns_count(table):
    """Return ``(max_rows, max_cols)`` for ``table``.

    Despite the name these are the largest row index and the largest column
    index covered by any ``cell`` tag (start or end position), not counts.
    Both are 0 when the table has no cells.
    """
    spans = [get_cell_position(entry) for entry in table.find_all('cell')]
    # Seed each maximum with 0 so an empty table still yields (0, 0).
    highest_col = max([0] + [col for left, _, right, _ in spans for col in (left, right)])
    highest_row = max([0] + [row for _, top, _, bottom in spans for row in (top, bottom)])
    return highest_row, highest_col

In [44]:
def lookup_cell(table,row,column,max_rows,max_cols, type='simple'):
    """Find the cell of ``table`` located at ``(row, column)``.

    Parameters
    ----------
    table
        Parsed table element offering ``find`` and ``find_all``.
    row, column : int
        Grid position to look up.
    max_rows, max_cols
        Accepted but not used by this function.
    type : str
        ``'simple'``: the first cell whose ``start-row`` / ``start-col``
        attributes equal the position.
        ``'spanning'``: the first cell whose start..end span contains the
        position.
        Any other value: no lookup is performed.

    Returns
    -------
    The matching cell, or ``None`` when nothing matches or ``type`` is
    not recognised.
    """
    if type == 'simple':
        wanted = {'start-col': str(column), 'start-row': str(row)}
        return table.find('cell', attrs=wanted)

    if type != 'spanning':
        return None

    for candidate in table.find_all('cell'):
        first_col, first_row, last_col, last_row = get_cell_position(candidate)
        if first_col <= column <= last_col and first_row <= row <= last_row:
            return candidate
    return None

In [38]:
def xml_to_list(table):
    """Convert one parsed table into a rectangular list of rows of cell text.

    The grid has ``max_rows + 1`` rows and ``max_cols + 1`` columns, where the
    maxima come from ``get_row_and_columns_count``. Each position holds the
    text of the cell that *starts* there (``'simple'`` lookup), so a spanning
    cell appears only at its top-left position. Positions without a cell, and
    cells without a ``content`` child, hold ``''``.

    A table with no cells therefore yields ``[['']]``.
    """
    max_rows, max_cols = get_row_and_columns_count(table)
    output = []
    for r in range(max_rows+1):
        row = []
        for c in range(max_cols+1):
            cell = lookup_cell(table,r,c,max_rows,max_cols,type='simple')
            # A cell may lack a <content> child; treat it like a missing cell
            # instead of failing on ``None.string``.
            content = cell.content if cell is not None else None
            if content is None:
                entry = ''
            else:
                entry = content.string
                if entry is None:
                    # .string is None when <content> does not hold exactly one
                    # text node; fall back to all of its text.
                    entry = content.get_text()
            row.append(entry)
        output.append(row)
    return output

In [14]:
def batch_xml_to_list(tables):
    """Apply ``xml_to_list`` to every table and return the results as a list, in order."""
    return [xml_to_list(single_table) for single_table in tables]

In [9]:
def transform_all(documents_xml):
    """Return a new dict with the same keys whose values are ``batch_xml_to_list`` of the original values."""
    return {
        doc_name: batch_xml_to_list(doc_tables)
        for doc_name, doc_tables in documents_xml.items()
    }

### Output transformed data

In [58]:
def output_as_csv(documents, output_dir):
    """Write every table of every document to its own CSV file.

    Files are written to ``<output_dir>/<document key>/table<N>.csv`` where
    ``N`` is the table's index within the document. They contain neither a
    header row nor an index column.

    Parameters
    ----------
    documents : dict
        Maps a document name to a list of tables, each table being a list of
        rows accepted by ``pandas.DataFrame``.
    output_dir : str
        Base directory; created together with any missing parents.
    """
    for name, tables in documents.items():
        if not tables:
            # Nothing to write: do not leave an empty folder behind.
            continue
        folder = os.path.join(output_dir, name)
        # makedirs also creates missing parents and tolerates an existing
        # folder, so there is no exists-then-create race.
        os.makedirs(folder, exist_ok=True)
        for index, rows in enumerate(tables):
            filepath = os.path.join(folder, 'table' + str(index) + '.csv')
            pd.DataFrame(rows).to_csv(filepath, index=False, header=False)

### Scratch pad

In [10]:
# Directory scanned for "*-str.xml" files; the final cell also passes it to
# output_as_csv as the output directory.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
datapath = '/home/imran/work/datasets/icdar2013-competition-dataset-with-gt/competition-dataset-eu/'

In [45]:
# Parse every matching XML file under datapath and convert each of its tables
# to a list of rows; the result is keyed by document name.
documents = transform_all(get_all_documents_xml(datapath))

In [46]:
# Wrap the first table of document 'eu-001-str' in a DataFrame for inspection.
df = pd.DataFrame(documents['eu-001-str'][0])

In [51]:
# Display the frame to check the conversion by eye.
df

Unnamed: 0,0,1,2,3
0,,THRESHOLD FOR RELEASES,,
1,,to air kg/year,to water kg/year,to land kg/year
2,Carbon dioxide (CO2),100 million,-,-
3,Hydro-fluorocarbons (HFCs),100,-,-
4,Methane (CH4),100 000,-,-
5,Nitrous oxide (N2O),10 000,-,-
6,Perfluorocarbons (PFCs),100,-,-
7,Sulphur hexafluoride (SF6),50,-,-


In [59]:
# Write one CSV per table. The output directory is datapath itself, so the
# per-document folders are created inside the dataset directory.
output_as_csv(documents,datapath)