# Pipeline

In [1]:
# Importing other necessary packages
import glob
import pandas as pd
import numpy as np
import cv2 # image transformation
import os
import re
import concurrent # for parallel instances
import functools # for creating partial functions
import shutil

from io import StringIO # to convert string to csv
import time # to measure time

# to add the path where to search for modules
import sys
sys.path.append('/home/hennes/Internship/table_scanner')

# Importing table_ocr modules 
from table_ocr import pdf_to_images
from table_ocr import extract_tables
from table_ocr import extract_cells
from table_ocr import ocr_image
from table_ocr import ocr_to_csv

## Creating Pipeline

In [2]:
## setting folders and pdfs

folder = "/home/hennes/Internship/pdfs" # should be folder containing pdfs of election
save_folder = '/home/hennes/Internship/constituencies/' # folder into which csvs should be saved
allpdf = [pdf for pdf in glob.glob(folder+'/*') if pdf.endswith(".pdf")] # list with all pdfs from folder

# exclude pdfs for which there is already a csv in save folder
pdflist = sorted([pdf for pdf in allpdf if pdf.split('/')[-1].split('_')[0].split('.')[0] not in
           [file.split('/')[-1].split('.')[0] for file in glob.glob(save_folder+'*')]])

# add pdf file names in case they should be manually excluded
exclude = ('AC069.pdf', 'AC100.pdf', 'AC005.pdf')
pdflist = [x for x in pdflist if not x.endswith(exclude)]

In [3]:
# delete all non-pdf files and folders
[shutil.rmtree(folder+'/'+e) for e in next(os.walk(folder))[1]]
[os.remove(e) for e in glob.glob(folder+'/*') if not e.endswith('.pdf')]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [4]:
# Importing images
def ocr_pipeline(pdf):
    
    stop = None
    
    # extract pages from pdf and save as images 
    pdf_to_images.pdf_to_images(pdf)

    # define list of images thus created
    imglist = [img for img in glob.glob(folder+'/*') if img.endswith('.png')]
    print(f"created images of {pdf.split('/')[-1]}")
    
    # rotate and correct images for skew
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        executor.map(pdf_to_images.preprocess_img, imglist)
    print(f"preprocessed images of {pdf.split('/')[-1]}")

    # define imglist as list of lists because next function needs list as argument
    imglist = [[img] for img in imglist]

    # crop images to table and save in new folder
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        executor.map(extract_tables.main, imglist)
    print(f"extracted tables of {pdf.split('/')[-1]}")

    # define list of all images of tables in newly created subfolders
    dirlist = [directory for directory in glob.glob(folder+'/*/*') if directory.endswith('.png')]

    # Extract individual cell images
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        executor.map(extract_cells.main, dirlist)
    print(f"extracted cells of {pdf.split('/')[-1]}")

        
    # define list of directories containing cell images
    dirlist = [directory for directory in glob.glob(folder+'/*/*/')]
    
    # define list of images of cells within directories
    celllists = [glob.glob(cellfolder+'*') for cellfolder in dirlist]
    
    # Specify that there should be no multiple threads.
    # This is important because I am already using multiple processors.
    os.environ['OMP_THREAD_LIMIT'] = '1'
    p_ocr_image = functools.partial(ocr_image.main, None)
    
    # perform OCR on each image
    for image_list in celllists:
        with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
            executor.map(p_ocr_image, image_list)
    print(f"completed ocr of {pdf.split('/')[-1]}")

    gathered_data = []

    try:
        # get the names for the individual pages
        pages = sorted([filename for directory, filename in
                 [os.path.split(x) for x in glob.glob(folder+'/*') if not x.endswith(('.pdf', '.png'))]])

        # create list of alphabetically ordered lists of ocred files
        ocrlists = [sorted(y) for y in
                    [glob.glob(f'{folder}/{x}/cells/ocr_data/*.txt') for x in pages]]
        zippie = zip(pages, ocrlists)
        
        # for each pair of page and ocred cells, create a csv
        for y, x in zippie:
            output = ocr_to_csv.main(x)
            csv = StringIO(output)
            print(f'working on {y}')

            # Turning csv into dataframe
            # Skipping the first two rows because they have fewer columns than rest
            # Also useful for chaining of tables later
            df = pd.read_csv(csv,  header = None, skiprows=[0, 1])
            gathered_data.append(df)
            df = pd.concat(gathered_data)

            constituency_name = pages[0][0:5]
            df.to_csv(save_folder+constituency_name+'.csv')

        print(f'Saved {constituency_name} to folder.')

        # delete folders and images created in this iteration of loop
        [shutil.rmtree(folder+'/'+e) for e in next(os.walk(folder))[1]]
        [os.remove(e) for e in glob.glob(folder+'/*') if not e.endswith('.pdf')]
    
    # If there is an error, print error message.
    # Most likely eror is that header lines (which are not turned into cells properly)
    # are three instead of two rows. So we try reading the csvs again, this
    # time ignoring the first three instead of just two rows.
    except Exception as e:
        print(e)
        print('will try again ignoring one more line.')
            # get the names for the individual pages

        try:
            pages = sorted([filename for directory, filename in
                     [os.path.split(x) for x in glob.glob(folder+'/*') if not x.endswith(('.pdf', '.png'))]])

            # create list of alphabetically ordered lists of ocred files

            ocrlists = [sorted(y) for y in
                        [glob.glob(f'{folder}/{x}/cells/ocr_data/*.txt') for x in pages]]
            zippie = zip(pages, ocrlists)

            for y, x in zippie:
                output = ocr_to_csv.main(x)
                csv = StringIO(output)
                print(f'working on {y}')
                # Turning csv into dataframe
                # Skipping the first two rows because they have fewer columns than rest
                # Also useful for chaining of tables later

                df = pd.read_csv(csv,  header = None, skiprows=[0, 1, 2])
                gathered_data.append(df)
                df = pd.concat(gathered_data)

                constituency_name = pages[0][0:5]
                df.to_csv(save_folder+constituency_name+'.csv')

            print(f'Saved {constituency_name} to folder.')

            [shutil.rmtree(folder+'/'+e) for e in next(os.walk(folder))[1]]
            [os.remove(e) for e in glob.glob(folder+'/*') if not e.endswith('.pdf')]
        
        # If this still does not work, stop the program and try to manually correct mistakes.
        except Exception as e:
            print(e)
            stop = True
    return stop

In [5]:
for x in pdflist:
    stop = ocr_pipeline(x)
    if stop:
        break

created images of AC006.pdf
preprocessed images of AC006.pdf
extracted tables of AC006.pdf
extracted cells of AC006.pdf
completed ocr of AC006.pdf
working on AC006-000
working on AC006-001
working on AC006-002
working on AC006-003
working on AC006-004
working on AC006-005
working on AC006-006
Error tokenizing data. C error: Expected 15 fields in line 7, saw 16

will try again ignoring one more line.
working on AC006-000
working on AC006-001
working on AC006-002
working on AC006-003
working on AC006-004
working on AC006-005
working on AC006-006
Error tokenizing data. C error: Expected 15 fields in line 7, saw 16

