# Pipeline

In [6]:
# Importing other necessary packages
import concurrent # for parallel instances
import concurrent.futures # the executors live in this submodule, which must be imported explicitly
import functools # for creating partial functions
import glob
import os
import re
import shutil
import time # to measure time
from io import StringIO # to convert string to csv

import cv2 # image transformation
import numpy as np
import pandas as pd

# to add the path where to search for modules
import sys
sys.path.append('/home/hennes/Internship/table_scanner')

# Importing table_ocr modules 
from table_ocr import pdf_to_images
from table_ocr import extract_tables
from table_ocr import extract_cells
from table_ocr import ocr_image
from table_ocr import ocr_to_csv

## Creating Pipeline

In [7]:
## setting folders and pdfs

folder = "/home/hennes/Internship/pdfs" # should be folder containing pdfs of election
save_folder = '/home/hennes/Internship/constituencies/' # folder into which csvs should be saved
old = '/home/hennes/Internship/old_files/' # folder into which old files are moved
allpdf = [pdf for pdf in glob.glob(folder+'/*') if pdf.endswith(".pdf")] # list with all pdfs from folder

# names (without extension) of everything already present in the save folder;
# built once as a set so the save folder is not re-scanned for every pdf
already_done = {os.path.basename(file).split('.')[0] for file in glob.glob(save_folder+'*')}

# exclude pdfs for which there is already a csv in save folder
# (a pdf is identified by its file name up to the first '_' or '.')
pdflist = sorted(pdf for pdf in allpdf
                 if os.path.basename(pdf).split('_')[0].split('.')[0] not in already_done)

# add pdf file names in case they should be manually excluded
exclude = ('AC069.pdf', 'AC100.pdf')
pdflist = [x for x in pdflist if not x.endswith(exclude)]

In [3]:
# move old files and folders into old_files folder
# Plain loops are used because the moves are pure side effects; no result list
# is needed (and the cell no longer displays a meaningless `[]`).

# sub-directories first; the default keeps next() from raising StopIteration
# when `folder` does not exist
for entry in next(os.walk(folder), (folder, [], []))[1]:
    if not entry.endswith('.pdf'):
        shutil.move(folder+'/'+entry, '/home/hennes/Internship/old_files/b')

# then whatever else is left in the folder that is not a pdf
for entry in glob.glob(folder+'/*'):
    if not entry.endswith('.pdf'):
        shutil.move(entry, '/home/hennes/Internship/old_files/b')

[]

In [8]:
# OCR pipeline for a single pdf: pdf -> page images -> tables -> cells -> OCR text -> csv

def _run_in_pool(func, items):
    """Apply `func` to every element of `items` using 4 worker processes.

    Blocks until all submitted tasks have finished. The results are
    deliberately not consumed, so an exception raised inside a worker is not
    re-raised here and the pipeline carries on with whatever was produced.
    """
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        executor.map(func, items)


def _archive_leftovers():
    """Move everything in `folder` that is not a pdf into the old_files folder."""
    archive = '/home/hennes/Internship/old_files/b'

    # sub-directories first; the default keeps next() from raising
    # StopIteration when `folder` does not exist
    for entry in next(os.walk(folder), (folder, [], []))[1]:
        if not entry.endswith('.pdf'):
            shutil.move(folder+'/'+entry, archive)

    # then whatever else is left in the folder that is not a pdf
    for entry in glob.glob(folder+'/*'):
        if not entry.endswith('.pdf'):
            shutil.move(entry, archive)


def _save_constituency_csv(skiprows):
    """Turn the OCR output of all pages in `folder` into one csv in `save_folder`.

    Parameters
    ----------
    skiprows : list of int
        Row numbers dropped at the top of every page. The header lines are not
        turned into cells properly and have fewer columns than the rest.

    Raises
    ------
    ValueError
        If `folder` contains no page entries at all.
    Exception
        Whatever `ocr_to_csv.main`, `pd.read_csv` or `to_csv` raise.

    The csv is written only after every page has been parsed, so a failure on
    a later page never leaves a partial csv behind. A partial csv would make
    the pdf look finished on the next run.
    """
    # names of the per-page entries: everything in `folder` that is neither pdf nor png
    pages = sorted(os.path.split(path)[1] for path in glob.glob(folder+'/*')
                   if not path.endswith(('.pdf', '.png')))
    if not pages:
        raise ValueError(f'no page folders found in {folder}')

    # a fresh list on every call, so a retry cannot duplicate pages that were
    # already parsed during a previous attempt
    frames = []
    for page in pages:
        # alphabetically ordered OCR text files of this page
        ocr_files = sorted(glob.glob(f'{folder}/{page}/cells/ocr_data/*.txt'))
        output = ocr_to_csv.main(ocr_files)
        print(f'working on {page}')
        frames.append(pd.read_csv(StringIO(output), header=None, skiprows=skiprows))

    # the first five characters of the first page name identify the constituency
    constituency_name = pages[0][0:5]
    pd.concat(frames).to_csv(save_folder+constituency_name+'.csv')
    print(f'Saved {constituency_name} to folder.')


def ocr_pipeline(pdf):
    """Run the complete table-OCR pipeline for one pdf and save the result as csv.

    Parameters
    ----------
    pdf : str
        Path of the pdf to process. Intermediate files are looked up in the
        module-level `folder`, and the csv is written to `save_folder`.

    Returns
    -------
    None
        Always. The calling loop treats a truthy return value as a request to
        stop, which this function never issues: a pdf that cannot be converted
        is reported and skipped.

    Notes
    -----
    Reading the OCR output is attempted twice: first ignoring two header rows,
    then three, because the most likely error is a header that spans three
    rows instead of two. Whatever the outcome, the intermediate files are
    moved to the old_files folder afterwards. An error raised by that move is
    not caught and propagates to the caller.
    """
    pdf_name = os.path.basename(pdf)

    # extract pages from pdf and save as images
    pdf_to_images.pdf_to_images(pdf)

    # define list of images thus created
    imglist = [img for img in glob.glob(folder+'/*') if img.endswith('.png')]
    print(f"created images of {pdf_name}")

    # rotate and correct images for skew
    _run_in_pool(pdf_to_images.preprocess_img, imglist)
    print(f"preprocessed images of {pdf_name}")

    # crop images to table and save in new folder
    # (each image is wrapped in a list because extract_tables.main needs a list as argument)
    _run_in_pool(extract_tables.main, [[img] for img in imglist])
    print(f"extracted tables of {pdf_name}")

    # list of all images of tables in the newly created subfolders
    tablelist = sorted(path for path in glob.glob(folder+'/*/*') if path.endswith('.png'))

    # extract individual cell images
    _run_in_pool(extract_cells.main, tablelist)
    print(f"extracted cells of {pdf_name}")

    # all images of cells within the directories containing cell images
    cell_images = [img
                   for cellfolder in glob.glob(folder+'/*/*/')
                   for img in glob.glob(cellfolder+'*')]

    # Specify that there should be no multiple threads.
    # This is important because multiple processes are already in use.
    os.environ['OMP_THREAD_LIMIT'] = '1'
    p_ocr_image = functools.partial(ocr_image.main, None)

    # perform OCR on each image; one pool for all cells instead of a new pool
    # per directory
    _run_in_pool(p_ocr_image, cell_images)
    print(f"completed ocr of {pdf_name}")

    try:
        try:
            _save_constituency_csv([0, 1])
        # Most likely error is that the header lines (which are not turned into
        # cells properly) are three instead of two rows. So the csvs are read
        # again, this time ignoring the first three instead of just two rows.
        except Exception as e:
            print(e)
            print('will try again ignoring one more line.')
            try:
                _save_constituency_csv([0, 1, 2])
            # If this still does not work, give up on this pdf; it has to be
            # corrected manually.
            except Exception as e:
                print(e)
                print(f'There is a problem with {pdf_name}. Continuing with next pdf.')
    finally:
        # move old files and folders into old_files folder, whatever happened above
        _archive_leftovers()

In [9]:
# Process the pending pdfs one after the other.
for x in pdflist:
    banner = f' \nWORKING ON {x}\n '
    print(banner)

    # a truthy return value is the pipeline's request to abandon the remaining pdfs
    stop = ocr_pipeline(x)
    if stop:
        break

 
WORKING ON /home/hennes/Internship/pdfs/AC012.pdf
 
created images of AC012.pdf
preprocessed images of AC012.pdf
extracted tables of AC012.pdf
tried /home/hennes/Internship/pdfs/AC012-000 with 5.
tried /home/hennes/Internship/pdfs/AC012-003 with 5.
tried /home/hennes/Internship/pdfs/AC012-001 with 5.
tried /home/hennes/Internship/pdfs/AC012-002 with 5.
tried /home/hennes/Internship/pdfs/AC012-004 with 5.
tried /home/hennes/Internship/pdfs/AC012-003 with 6.
tried /home/hennes/Internship/pdfs/AC012-001 with 6.
tried /home/hennes/Internship/pdfs/AC012-005 with 5.
tried /home/hennes/Internship/pdfs/AC012-000 with 6.
tried /home/hennes/Internship/pdfs/AC012-006 with 5.
tried /home/hennes/Internship/pdfs/AC012-001 with 7.
tried /home/hennes/Internship/pdfs/AC012-007 with 5.
tried /home/hennes/Internship/pdfs/AC012-008 with 5.
tried /home/hennes/Internship/pdfs/AC012-006 with 6.
tried /home/hennes/Internship/pdfs/AC012-009 with 5.
tried /home/hennes/Internship/pdfs/AC012-010 with 5.
tried /

tried /home/hennes/Internship/pdfs/AC016-004 with 8.
tried /home/hennes/Internship/pdfs/AC016-001 with 8.
tried /home/hennes/Internship/pdfs/AC016-006 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC016-006. Manually check this
tried /home/hennes/Internship/pdfs/AC016-005 with 9.
tried /home/hennes/Internship/pdfs/AC016-004 with 9.
tried /home/hennes/Internship/pdfs/AC016-001 with 9.
tried /home/hennes/Internship/pdfs/AC016-004 with 10.
tried /home/hennes/Internship/pdfs/AC016-007 with 5.
tried /home/hennes/Internship/pdfs/AC016-005 with 10.
tried /home/hennes/Internship/pdfs/AC016-001 with 10.
tried /home/hennes/Internship/pdfs/AC016-004 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC016-004. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC016-004. Manually check this
tried /home/hennes/Internship/pdfs/AC016-007 with 6.
tried /home/hennes/Internship/pdfs/AC016

tried /home/hennes/Internship/pdfs/AC019-009 with 6.
tried /home/hennes/Internship/pdfs/AC019-009 with 7.
tried /home/hennes/Internship/pdfs/AC019-009 with 8.
tried /home/hennes/Internship/pdfs/AC019-009 with 9.
tried /home/hennes/Internship/pdfs/AC019-009 with 10.
tried /home/hennes/Internship/pdfs/AC019-009 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC019-009. Manually check this
extracted cells of AC019.pdf
completed ocr of AC019.pdf
working on AC019-000
No columns to parse from file
will try again ignoring one more line.
working on AC019-000
No columns to parse from file
There is a problem with AC019.pdf. Continuing with next pdf.
 
WORKING ON /home/hennes/Internship/pdfs/AC020.pdf
 
created images of AC020.pdf
preprocessed images of AC020.pdf
extracted tables of AC020.pdf
tried /home/hennes/Internship/pdfs/AC020-000 with 5.
tried /home/hennes/Internship/pdfs/AC020-001 with 5.
tried /home/hennes/Internship/pdfs/AC020-002 with 5.
tried /hom

tried /home/hennes/Internship/pdfs/AC021-004 with 9.
tried /home/hennes/Internship/pdfs/AC021-007 with 7.
tried /home/hennes/Internship/pdfs/AC021-006 with 7.
tried /home/hennes/Internship/pdfs/AC021-005 with 8.
tried /home/hennes/Internship/pdfs/AC021-004 with 10.
tried /home/hennes/Internship/pdfs/AC021-007 with 8.
tried /home/hennes/Internship/pdfs/AC021-006 with 8.
tried /home/hennes/Internship/pdfs/AC021-007 with 9.
tried /home/hennes/Internship/pdfs/AC021-007 with 10.
tried /home/hennes/Internship/pdfs/AC021-004 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC021-004. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC021-004. Manually check this
tried /home/hennes/Internship/pdfs/AC021-005 with 9.
tried /home/hennes/Internship/pdfs/AC021-006 with 9.
tried /home/hennes/Internship/pdfs/AC021-007 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC021-007. Manually

tried /home/hennes/Internship/pdfs/AC023-013 with 7.
tried /home/hennes/Internship/pdfs/AC023-015 with 6.
tried /home/hennes/Internship/pdfs/AC023-016 with 6.
tried /home/hennes/Internship/pdfs/AC023-013 with 8.
tried /home/hennes/Internship/pdfs/AC023-015 with 7.
tried /home/hennes/Internship/pdfs/AC023-016 with 7.
tried /home/hennes/Internship/pdfs/AC023-013 with 9.
tried /home/hennes/Internship/pdfs/AC023-016 with 8.
tried /home/hennes/Internship/pdfs/AC023-013 with 10.
tried /home/hennes/Internship/pdfs/AC023-016 with 9.
tried /home/hennes/Internship/pdfs/AC023-013 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC023-013. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC023-013. Manually check this
tried /home/hennes/Internship/pdfs/AC023-016 with 10.
tried /home/hennes/Internship/pdfs/AC023-016 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC023-016. Manually

tried /home/hennes/Internship/pdfs/AC026-002 with 7.
tried /home/hennes/Internship/pdfs/AC026-000 with 8.
tried /home/hennes/Internship/pdfs/AC026-004 with 6.
tried /home/hennes/Internship/pdfs/AC026-001 with 7.
tried /home/hennes/Internship/pdfs/AC026-000 with 9.
tried /home/hennes/Internship/pdfs/AC026-002 with 8.
tried /home/hennes/Internship/pdfs/AC026-005 with 5.
tried /home/hennes/Internship/pdfs/AC026-000 with 10.
tried /home/hennes/Internship/pdfs/AC026-004 with 7.
tried /home/hennes/Internship/pdfs/AC026-002 with 9.
tried /home/hennes/Internship/pdfs/AC026-000 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC026-000. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC026-000. Manually check this
tried /home/hennes/Internship/pdfs/AC026-006 with 5.
tried /home/hennes/Internship/pdfs/AC026-007 with 5.
tried /home/hennes/Internship/pdfs/AC026-008 with 5.
tried /home/hennes/Internship/pdfs/AC026-010 

tried /home/hennes/Internship/pdfs/AC028-004 with 10.
tried /home/hennes/Internship/pdfs/AC028-006 with 9.
tried /home/hennes/Internship/pdfs/AC028-007 with 9.
tried /home/hennes/Internship/pdfs/AC028-004 with 11.
There are rows that are too short in table /home/hennes/Internship/pdfs/AC028-004. Manually check this
tried /home/hennes/Internship/pdfs/AC028-005 with 10.
tried /home/hennes/Internship/pdfs/AC028-008 with 5.
tried /home/hennes/Internship/pdfs/AC028-006 with 10.
tried /home/hennes/Internship/pdfs/AC028-007 with 10.
tried /home/hennes/Internship/pdfs/AC028-008 with 5.
tried /home/hennes/Internship/pdfs/AC028-008 with 6.
tried /home/hennes/Internship/pdfs/AC028-005 with 11.
There are rows that are too short in table /home/hennes/Internship/pdfs/AC028-005. Manually check this
tried /home/hennes/Internship/pdfs/AC028-006 with 11.
There are rows that are too short in table /home/hennes/Internship/pdfs/AC028-006. Manually check this
tried /home/hennes/Internship/pdfs/AC028-007 wit

tried /home/hennes/Internship/pdfs/AC031-000 with 5.
tried /home/hennes/Internship/pdfs/AC031-003 with 5.
tried /home/hennes/Internship/pdfs/AC031-002 with 5.
tried /home/hennes/Internship/pdfs/AC031-001 with 5.
tried /home/hennes/Internship/pdfs/AC031-000 with 6.
tried /home/hennes/Internship/pdfs/AC031-002 with 6.
tried /home/hennes/Internship/pdfs/AC031-004 with 5.
tried /home/hennes/Internship/pdfs/AC031-005 with 5.
tried /home/hennes/Internship/pdfs/AC031-000 with 7.
tried /home/hennes/Internship/pdfs/AC031-004 with 6.
tried /home/hennes/Internship/pdfs/AC031-006 with 5.
tried /home/hennes/Internship/pdfs/AC031-005 with 6.
tried /home/hennes/Internship/pdfs/AC031-008 with 5.
tried /home/hennes/Internship/pdfs/AC031-007 with 5.
tried /home/hennes/Internship/pdfs/AC031-008 with 5.
tried /home/hennes/Internship/pdfs/AC031-006 with 6.
tried /home/hennes/Internship/pdfs/AC031-006 with 7.
tried /home/hennes/Internship/pdfs/AC031-006 with 8.
extracted cells of AC031.pdf
completed ocr of 

tried /home/hennes/Internship/pdfs/AC032-013 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC032-013. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC032-013. Manually check this
tried /home/hennes/Internship/pdfs/AC032-012 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC032-012. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC032-012. Manually check this
tried /home/hennes/Internship/pdfs/AC032-011 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC032-011. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC032-011. Manually check this
extracted cells of AC032.pdf
completed ocr of AC032.pdf
working on AC032-000
Error tokenizing data. C error: Expected 27 fields in line 4, saw 28

will try again ignoring one more line.
working on AC032-00

There are rows that are too short in table /home/hennes/Internship/pdfs/AC034-013. Manually check this
tried /home/hennes/Internship/pdfs/AC034-015 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC034-015. Manually check this
tried /home/hennes/Internship/pdfs/AC034-012 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC034-012. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC034-012. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC034-015. Manually check this
tried /home/hennes/Internship/pdfs/AC034-016 with 7.
tried /home/hennes/Internship/pdfs/AC034-016 with 8.
tried /home/hennes/Internship/pdfs/AC034-016 with 9.
tried /home/hennes/Internship/pdfs/AC034-016 with 10.
tried /home/hennes/Internship/pdfs/AC034-016 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC034-016. Manually che

tried /home/hennes/Internship/pdfs/AC036-007 with 7.
tried /home/hennes/Internship/pdfs/AC036-006 with 9.
tried /home/hennes/Internship/pdfs/AC036-003 with 9.
tried /home/hennes/Internship/pdfs/AC036-001 with 9.
tried /home/hennes/Internship/pdfs/AC036-007 with 8.
tried /home/hennes/Internship/pdfs/AC036-006 with 10.
tried /home/hennes/Internship/pdfs/AC036-003 with 10.
tried /home/hennes/Internship/pdfs/AC036-007 with 9.
tried /home/hennes/Internship/pdfs/AC036-006 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC036-006. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC036-006. Manually check this
tried /home/hennes/Internship/pdfs/AC036-001 with 10.
tried /home/hennes/Internship/pdfs/AC036-008 with 5.
tried /home/hennes/Internship/pdfs/AC036-003 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC036-003. Manually check this
tried /home/hennes/Internship/pdfs/AC036

There are rows that are too short in table /home/hennes/Internship/pdfs/AC038-000. Manually check this
tried /home/hennes/Internship/pdfs/AC038-001 with 10.
tried /home/hennes/Internship/pdfs/AC038-002 with 11.
There are rows that are too short in table /home/hennes/Internship/pdfs/AC038-002. Manually check this
tried /home/hennes/Internship/pdfs/AC038-004 with 11.
There are rows that are too short in table /home/hennes/Internship/pdfs/AC038-004. Manually check this
tried /home/hennes/Internship/pdfs/AC038-006 with 5.
tried /home/hennes/Internship/pdfs/AC038-001 with 11.
There are rows that are too short in table /home/hennes/Internship/pdfs/AC038-001. Manually check this
tried /home/hennes/Internship/pdfs/AC038-005 with 5.
tried /home/hennes/Internship/pdfs/AC038-006 with 6.
tried /home/hennes/Internship/pdfs/AC038-005 with 6.
tried /home/hennes/Internship/pdfs/AC038-006 with 7.
tried /home/hennes/Internship/pdfs/AC038-006 with 8.
tried /home/hennes/Internship/pdfs/AC038-005 with 7.
t

tried /home/hennes/Internship/pdfs/AC041-004 with 9.
tried /home/hennes/Internship/pdfs/AC041-005 with 10.
tried /home/hennes/Internship/pdfs/AC041-006 with 9.
tried /home/hennes/Internship/pdfs/AC041-005 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC041-005. Manually check this
tried /home/hennes/Internship/pdfs/AC041-004 with 10.
tried /home/hennes/Internship/pdfs/AC041-006 with 10.
tried /home/hennes/Internship/pdfs/AC041-004 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC041-004. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC041-004. Manually check this
tried /home/hennes/Internship/pdfs/AC041-006 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC041-006. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC041-006. Manually check this
extracted cells of AC041.pdf
completed o

tried /home/hennes/Internship/pdfs/AC044-005 with 5.
tried /home/hennes/Internship/pdfs/AC044-004 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC044-004. Manually check this
tried /home/hennes/Internship/pdfs/AC044-006 with 5.
tried /home/hennes/Internship/pdfs/AC044-007 with 5.
tried /home/hennes/Internship/pdfs/AC044-005 with 6.
tried /home/hennes/Internship/pdfs/AC044-008 with 5.
tried /home/hennes/Internship/pdfs/AC044-006 with 6.
tried /home/hennes/Internship/pdfs/AC044-005 with 7.
tried /home/hennes/Internship/pdfs/AC044-007 with 6.
tried /home/hennes/Internship/pdfs/AC044-008 with 6.
tried /home/hennes/Internship/pdfs/AC044-006 with 7.
tried /home/hennes/Internship/pdfs/AC044-005 with 8.
tried /home/hennes/Internship/pdfs/AC044-007 with 7.
tried /home/hennes/Internship/pdfs/AC044-008 with 7.
tried /home/hennes/Internship/pdfs/AC044-006 with 8.
tried /home/hennes/Internship/pdfs/AC044-005 with 9.
tried /home/hennes/Internship/pdfs/AC044-00

tried /home/hennes/Internship/pdfs/AC047-005 with 7.
tried /home/hennes/Internship/pdfs/AC047-006 with 7.
tried /home/hennes/Internship/pdfs/AC047-004 with 9.
tried /home/hennes/Internship/pdfs/AC047-005 with 8.
tried /home/hennes/Internship/pdfs/AC047-007 with 7.
tried /home/hennes/Internship/pdfs/AC047-006 with 8.
tried /home/hennes/Internship/pdfs/AC047-004 with 10.
tried /home/hennes/Internship/pdfs/AC047-007 with 8.
tried /home/hennes/Internship/pdfs/AC047-006 with 9.
tried /home/hennes/Internship/pdfs/AC047-005 with 9.
tried /home/hennes/Internship/pdfs/AC047-004 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC047-004. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC047-004. Manually check this
tried /home/hennes/Internship/pdfs/AC047-008 with 5.
tried /home/hennes/Internship/pdfs/AC047-007 with 9.
tried /home/hennes/Internship/pdfs/AC047-006 with 10.
tried /home/hennes/Internship/pdfs/AC047-005

extracted cells of AC048.pdf
completed ocr of AC048.pdf
working on AC048-000
Error tokenizing data. C error: Expected 15 fields in line 4, saw 16

will try again ignoring one more line.
working on AC048-000
working on AC048-001
working on AC048-002
working on AC048-003
working on AC048-004
working on AC048-005
working on AC048-006
working on AC048-007
working on AC048-008
working on AC048-009
Error tokenizing data. C error: Expected 8 fields in line 5, saw 15

There is a problem with AC048.pdf. Continuing with next pdf.
 
WORKING ON /home/hennes/Internship/pdfs/AC049.pdf
 
created images of AC049.pdf
preprocessed images of AC049.pdf
extracted tables of AC049.pdf
tried /home/hennes/Internship/pdfs/AC049-000 with 5.
tried /home/hennes/Internship/pdfs/AC049-003 with 5.
tried /home/hennes/Internship/pdfs/AC049-002 with 5.
tried /home/hennes/Internship/pdfs/AC049-001 with 5.
tried /home/hennes/Internship/pdfs/AC049-000 with 6.
tried /home/hennes/Internship/pdfs/AC049-003 with 6.
tried /home

tried /home/hennes/Internship/pdfs/AC050-006 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC050-006. Manually check this
tried /home/hennes/Internship/pdfs/AC050-007 with 8.
tried /home/hennes/Internship/pdfs/AC050-008 with 7.
tried /home/hennes/Internship/pdfs/AC050-007 with 9.
tried /home/hennes/Internship/pdfs/AC050-008 with 8.
tried /home/hennes/Internship/pdfs/AC050-007 with 10.
tried /home/hennes/Internship/pdfs/AC050-008 with 9.
tried /home/hennes/Internship/pdfs/AC050-007 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC050-007. Manually check this
tried /home/hennes/Internship/pdfs/AC050-008 with 10.
tried /home/hennes/Internship/pdfs/AC050-008 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC050-008. Manually check this
extracted cells of AC050.pdf
completed ocr of AC050.pdf
working on AC050-000
Error tokenizing data. C error: Expected 13 fields in line 4, sa

tried /home/hennes/Internship/pdfs/AC052-003 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC052-003. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC052-003. Manually check this
tried /home/hennes/Internship/pdfs/AC052-004 with 5.
tried /home/hennes/Internship/pdfs/AC052-006 with 5.
tried /home/hennes/Internship/pdfs/AC052-001 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC052-001. Manually check this
There are rows that are too short in table /home/hennes/Internship/pdfs/AC052-001. Manually check this
tried /home/hennes/Internship/pdfs/AC052-006 with 6.
tried /home/hennes/Internship/pdfs/AC052-006 with 7.
tried /home/hennes/Internship/pdfs/AC052-004 with 6.
tried /home/hennes/Internship/pdfs/AC052-006 with 8.
tried /home/hennes/Internship/pdfs/AC052-005 with 5.
tried /home/hennes/Internship/pdfs/AC052-006 with 9.
tried /home/hennes/Internship/pdfs/AC052-006 w

tried /home/hennes/Internship/pdfs/AC054-001 with 7.
tried /home/hennes/Internship/pdfs/AC054-000 with 8.
tried /home/hennes/Internship/pdfs/AC054-004 with 5.
tried /home/hennes/Internship/pdfs/AC054-003 with 8.
tried /home/hennes/Internship/pdfs/AC054-000 with 9.
tried /home/hennes/Internship/pdfs/AC054-001 with 8.
tried /home/hennes/Internship/pdfs/AC054-004 with 6.
tried /home/hennes/Internship/pdfs/AC054-003 with 9.
tried /home/hennes/Internship/pdfs/AC054-000 with 10.
tried /home/hennes/Internship/pdfs/AC054-001 with 9.
tried /home/hennes/Internship/pdfs/AC054-004 with 7.
tried /home/hennes/Internship/pdfs/AC054-000 with 11.
There are still several column lengths in table /home/hennes/Internship/pdfs/AC054-000. Manually check this
tried /home/hennes/Internship/pdfs/AC054-003 with 10.
tried /home/hennes/Internship/pdfs/AC054-001 with 10.
tried /home/hennes/Internship/pdfs/AC054-004 with 8.
tried /home/hennes/Internship/pdfs/AC054-003 with 11.
There are still several column lengths 

tried /home/hennes/Internship/pdfs/AC055-047 with 10.
tried /home/hennes/Internship/pdfs/AC055-047 with 11.
There are rows that are too short in table /home/hennes/Internship/pdfs/AC055-047. Manually check this
extracted cells of AC055.pdf
completed ocr of AC055.pdf
working on AC055-000
No columns to parse from file
will try again ignoring one more line.
working on AC055-000
No columns to parse from file
There is a problem with AC055.pdf. Continuing with next pdf.
 
WORKING ON /home/hennes/Internship/pdfs/AC057.pdf
 
created images of AC057.pdf
preprocessed images of AC057.pdf
extracted tables of AC057.pdf
tried /home/hennes/Internship/pdfs/AC057-000 with 5.
tried /home/hennes/Internship/pdfs/AC057-002 with 5.
tried /home/hennes/Internship/pdfs/AC057-001 with 5.
tried /home/hennes/Internship/pdfs/AC057-003 with 5.
tried /home/hennes/Internship/pdfs/AC057-004 with 5.
tried /home/hennes/Internship/pdfs/AC057-005 with 5.
tried /home/hennes/Internship/pdfs/AC057-007 with 5.
tried /home/hen

tried /home/hennes/Internship/pdfs/AC064-004 with 5.
tried /home/hennes/Internship/pdfs/AC064-005 with 5.
tried /home/hennes/Internship/pdfs/AC064-008 with 5.
tried /home/hennes/Internship/pdfs/AC064-007 with 5.
tried /home/hennes/Internship/pdfs/AC064-006 with 5.
tried /home/hennes/Internship/pdfs/AC064-010 with 5.
tried /home/hennes/Internship/pdfs/AC064-009 with 5.
extracted cells of AC064.pdf
completed ocr of AC064.pdf
working on AC064-000
working on AC064-001
working on AC064-002
working on AC064-003
working on AC064-004
working on AC064-005
working on AC064-006
working on AC064-007
working on AC064-008
working on AC064-009
working on AC064-010
Saved AC064 to folder.
 
WORKING ON /home/hennes/Internship/pdfs/AC065.pdf
 
created images of AC065.pdf
preprocessed images of AC065.pdf
extracted tables of AC065.pdf
tried /home/hennes/Internship/pdfs/AC065-000 with 5.
tried /home/hennes/Internship/pdfs/AC065-002 with 5.
tried /home/hennes/Internship/pdfs/AC065-001 with 5.
tried /home/hen

tried /home/hennes/Internship/pdfs/AC070-015 with 5.
tried /home/hennes/Internship/pdfs/AC070-014 with 5.
tried /home/hennes/Internship/pdfs/AC070-015 with 6.
tried /home/hennes/Internship/pdfs/AC070-015 with 7.
tried /home/hennes/Internship/pdfs/AC070-015 with 8.
tried /home/hennes/Internship/pdfs/AC070-015 with 9.
tried /home/hennes/Internship/pdfs/AC070-015 with 10.
tried /home/hennes/Internship/pdfs/AC070-015 with 11.
There are rows that are too short in table /home/hennes/Internship/pdfs/AC070-015. Manually check this
extracted cells of AC070.pdf
completed ocr of AC070.pdf
working on AC070-000
working on AC070-001
working on AC070-002
working on AC070-003
working on AC070-004
working on AC070-005
working on AC070-006
working on AC070-007
working on AC070-008
working on AC070-009
working on AC070-010
working on AC070-011
working on AC070-012
working on AC070-013
working on AC070-014
working on AC070-015
Saved AC070 to folder.
 
WORKING ON /home/hennes/Internship/pdfs/AC071.pdf
 
cr

completed ocr of AC072.pdf
working on AC072-000
working on AC072-001
working on AC072-002
working on AC072-003
working on AC072-004
working on AC072-005
working on AC072-006
working on AC072-007
working on AC072-008
working on AC072-009
working on AC072-010
working on AC072-011
working on AC072-012
working on AC072-013
working on AC072-014
working on AC072-015
Saved AC072 to folder.
 
WORKING ON /home/hennes/Internship/pdfs/AC073.pdf
 
created images of AC073.pdf
preprocessed images of AC073.pdf
extracted tables of AC073.pdf
tried /home/hennes/Internship/pdfs/AC073-000 with 5.
tried /home/hennes/Internship/pdfs/AC073-002 with 5.
tried /home/hennes/Internship/pdfs/AC073-001 with 5.
tried /home/hennes/Internship/pdfs/AC073-003 with 5.
tried /home/hennes/Internship/pdfs/AC073-004 with 5.
tried /home/hennes/Internship/pdfs/AC073-005 with 5.
tried /home/hennes/Internship/pdfs/AC073-006 with 5.
tried /home/hennes/Internship/pdfs/AC073-007 with 5.
tried /home/hennes/Internship/pdfs/AC073-009 

tried /home/hennes/Internship/pdfs/AC077-003 with 5.
tried /home/hennes/Internship/pdfs/AC077-002 with 5.
tried /home/hennes/Internship/pdfs/AC077-000 with 6.
tried /home/hennes/Internship/pdfs/AC077-004 with 5.
tried /home/hennes/Internship/pdfs/AC077-003 with 6.
tried /home/hennes/Internship/pdfs/AC077-005 with 5.
tried /home/hennes/Internship/pdfs/AC077-000 with 7.
tried /home/hennes/Internship/pdfs/AC077-006 with 5.
tried /home/hennes/Internship/pdfs/AC077-000 with 8.
tried /home/hennes/Internship/pdfs/AC077-003 with 7.
tried /home/hennes/Internship/pdfs/AC077-006 with 6.
tried /home/hennes/Internship/pdfs/AC077-005 with 6.
tried /home/hennes/Internship/pdfs/AC077-000 with 9.
tried /home/hennes/Internship/pdfs/AC077-008 with 5.
tried /home/hennes/Internship/pdfs/AC077-007 with 5.
tried /home/hennes/Internship/pdfs/AC077-010 with 5.
tried /home/hennes/Internship/pdfs/AC077-009 with 5.
tried /home/hennes/Internship/pdfs/AC077-003 with 8.
tried /home/hennes/Internship/pdfs/AC077-000 w

KeyboardInterrupt: 