# Pipeline

### Steps

1. Import packages and define folders
2. Choose which files to use as input
3. Instantiate functions
4. Run Program

In [1]:
# Importing other necessary packages
import glob
import pandas as pd
import numpy as np
import cv2 # image transformation
import os
import re
import concurrent # for parallel instances
import functools # for creating partial functions
import shutil
from pathlib import Path

from io import StringIO # to convert string to csv
import time # to measure time

# to add the path where to search for modules
import sys
sys.path.append('/home/hennes/Internship/table_scanner')

# Importing table_ocr modules 
from table_ocr import pdf_to_images
from table_ocr import extract_tables
from table_ocr import extract_cells
from table_ocr import ocr_image
from table_ocr import ocr_to_csv

## setting folders and pdfs
folder = "/home/hennes/Internship/pdfs/" # should be folder containing pdfs of election
save_folder = '/home/hennes/Internship/constituencies/' # folder into which csvs should be saved
saved = [os.path.splitext(csv)[0] for csv in next(os.walk(save_folder))[2]]
old = '/home/hennes/Internship/old_files/' # folder into which old files are moved
old_files = [folder for folder in next(os.walk(old))[1]]
allpdf = [pdf for pdf in glob.glob(folder+'*') if pdf.endswith(".pdf")] # list with all pdfs from folder

## Choosing files (Choose one of the following options)

### input = pdfs

all pdfs that do not have a corresponding file in output folder

In [42]:
# exclude pdfs for which there is already a csv in save folder
pdflist = sorted([pdf for pdf in allpdf if pdf.split('/')[-1].split('_')[0].split('.')[0] not in
           [file.split('/')[-1].split('.')[0] for file in glob.glob(save_folder+'*')]])

pdfs as per excel sheet

In [9]:
df = pd.read_excel('/home/hennes/Downloads/Book.xlsx')

# get relevant pdf numbers

worklist = df[df['Comments'] == 'redo']['Constituency number'].tolist()

# give appropriate filename endings to items

for idx, item in enumerate(worklist):
    if len(str(item)) == 1:
        worklist[idx] = f'AC00{item}.pdf'
    if len(str(item)) == 2:
        worklist[idx] = f'AC0{item}.pdf'
    if len(str(item)) == 3:
        worklist[idx] = f'AC{item}.pdf'
        
worklist = tuple(worklist)
pdflist = [pdf for pdf in allpdf if pdf.endswith(worklist)]

### input = files

all ACs that do not have a corresponding file in output folder

In [27]:
# creates list of lists with each list containing the folders of one AC constituency
filelist = []
for e in set(sorted([folder.split('-')[1] for folder in next(os.walk(old))[1] if folder.split('-')[1] not in saved])):
    filelist.append(sorted([folder for folder in old_files if folder.split('-')[1] in e]))
filelist = sorted(filelist)

All ACs as per excel sheet

In [2]:
filelist = []
for e in set(sorted([folder.split('-')[1] for folder in next(os.walk(old))[1]])):
    filelist.append(sorted([folder for folder in old_files if folder.split('-')[1] in e]))
filelist = sorted(filelist)

In [13]:
# first get list of lists with all files

filelist = []
for e in set(sorted([folder.split('-')[1] for folder in next(os.walk(old))[1]])):
    filelist.append(sorted([folder for folder in old_files if folder.split('-')[1] in e]))
filelist = sorted(filelist)


# then filter as per excel
df = pd.read_excel('/home/hennes/Downloads/Book.xlsx')

# get relevant pdf numbers

worklist = df[df['Comments'] == 'redo OCR']['Constituency number'].tolist()

# give appropriate filename endings to items

for idx, item in enumerate(worklist):
    if len(str(item)) == 1:
        worklist[idx] = f'AC00{item}'
    if len(str(item)) == 2:
        worklist[idx] = f'AC0{item}'
    if len(str(item)) == 3:
        worklist[idx] = f'AC{item}'
        
worklist = tuple(worklist)

filelist = [l for l in filelist if l[0].split('-')[1] in worklist]

IndexError: list index out of range

## Running Program

Cleaning eventual files still in pdf folder.

In [8]:
move_files(folder, old)

For when input is folders from old folder (tables already extracted):

In [45]:
for l in filelist:
    [shutil.copytree(old+x, folder+x) for x in l]
    pdf = f'/{l[0].split("-")[1]}'
    print(f' \nWORKING ON {pdf.split("/")[-1]}\n ')
    stop = ocr_pipeline(pdf, from_cell=True, dilate=None)
    if stop:
        break

 
WORKING ON 12
 
extracted cells of 12
completed ocr of 12
working on AC017-02
working on AC017-03
working on AC017-07
working on AC017-08
working on AC017-09
working on AC017-10
working on AC017-11
working on AC017-12
working on AC017-13
working on AC017-14
working on AC017-15
Saved 02 to folder.


TypeError: 'NoneType' object is not iterable

For when input is image files from old folder:

In [4]:
pdflist = [e.split('.')[0] for e in worklist]

NameError: name 'worklist' is not defined

In [5]:
for x in pdflist:
    imglist = [img for img in next(os.walk(old))[2] if img.startswith(os.path.split(x)[1].split('.')[0])]
    [shutil.copy(old+x, folder) for x in imglist]
    pdf = f'{folder}/{x}.pdf'
    print(f' \nWORKING ON {x}\n ')
    stop = ocr_pipeline(pdf, preprocess = False, image_conversion = False)
    if stop:
        break

 
WORKING ON /home/hennes/Internship/pdfs/AC177.pdf
 
extracted tables of AC177.pdf.pdf
extracted cells of AC177.pdf.pdf
completed ocr of AC177.pdf.pdf
working on AC177-01
working on AC177-02
working on AC177-03
working on AC177-04
working on AC177-05
working on AC177-06
working on AC177-07
working on AC177-08
Error tokenizing data. C error: Expected 14 fields in line 9, saw 15

will try again ignoring one more line.
working on AC177-01
working on AC177-02
working on AC177-03
working on AC177-04
working on AC177-05
working on AC177-06
working on AC177-07
working on AC177-08
Saved AC177 to folder.


TypeError: expected str, bytes or os.PathLike object, not NoneType

For when input files are pdf:

In [6]:
# no options
pdflist = ['/home/hennes/Internship/pdfs/AC229.pdf', None]

In [9]:
for x in pdflist:
    print(f' \nWORKING ON {x}\n ')
    stop = ocr_pipeline(x, thresh = True, no_noise = True)
    if stop:
        break

 
WORKING ON /home/hennes/Internship/pdfs/AC229.pdf
 
created images of AC229.pdf
preprocessed images of AC229.pdf
Extraction error: /home/hennes/Internship/pdfs/AC229-15.png.
extracted tables of AC229.pdf
extracted cells of AC229.pdf
completed ocr of AC229.pdf
working on AC229-01
working on AC229-02
working on AC229-03
working on AC229-04
working on AC229-05
working on AC229-06
working on AC229-07
working on AC229-08
working on AC229-09
working on AC229-10
working on AC229-11
working on AC229-12
working on AC229-13
working on AC229-14
Saved AC229 to folder.
 
WORKING ON None
 


TypeError: expected str, bytes or os.PathLike object, not NoneType

## Defining Functions

In [2]:
def move_files(folder, old):
    # if folder already exist, delete it. Then move folders to old directory.
    [shutil.rmtree(old+'/'+e)
     for e in next(os.walk(folder))[1] if Path(old+'/'+e).is_dir()]

    [shutil.move(folder+''+e, old)
     for e in next(os.walk(folder))[1] if not e.endswith('.pdf')]

    # if files already exist, delete them. Then move files to old directory.
    [os.remove(old+'/'+e)
     for e in next(os.walk(folder))[2] if not e.endswith('.pdf') and Path(old+'/'+e).is_file()]

    [shutil.move(e, old)
     for e in glob.glob(folder+'*') if not e.endswith('.pdf')]

def ocr_pipeline(pdf, thresh=None, no_noise=None, preprocess=True, dilate=True, image_conversion=True,
                 AC = False, from_cell = False):
    '''Function which binds together the functions necessary to turn one pdf of several pages of tables
    into a csv file which will be stored under the same name in the specified folder.
    The options are:
    
    image_conversion - whether images should be converted from pdf. If value is None, then converted 
                       images should already be supplied in the folder.
    
    thresh           - whether otsu thresholding should be applied to cell images before performing ocr.
                       Useful if the images contain a lot of grey, in which case not thresholding likely
                       results in many artifacts wrongly identified as numbers.
    
    no_noise         - whether noise reduction techniques should be used before ocr. It should be avoided in
                       images with very thin/small font. Helpful in case of cropping error (in that case, put TRUE).
                
    preprocess       - whether preprocessing should be used before table extraction. Useful to avoid if
                       preprocessing results in wrongly rotated images. Can only be used if function is
                       used on already extracted images.
                 
    dilate           - whether image should be dilated before cell extraction. Useful for thin/frail cell
                       lines, but results in numbers becoming so thick that they are recognised as cell 
                       walls themselves in images with tightly written fonts.
    
    AC               - whether the folders created with table_extraction should be named after the AC con-
                       stituency, which is extracted from the images using OCR. This is necessary for the 
                       national elections, which use a different constituency system. Pipeline stops after
                       the renaming.
                       
    from_cell        - "True" = workflow starts from cell extraction. Only works if input is folders from 
                       old folder.
                       "False" = workflow starts from image conversion. Input are pdf files.'''
    
    stop = None
    
    if from_cell == False:
        if image_conversion == True:
            # extract pages from pdf and save as images 
            pdf_to_images.pdf_to_images(pdf)
            print(f"created images of {pdf.split('/')[-1]}")

        # define list of images thus created
        imglist = [img for img in glob.glob(folder+'*') if img.endswith('.png')]

        if preprocess == True:
            # rotate and correct images for skew
            with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
                executor.map(pdf_to_images.preprocess_img, imglist)
            print(f"preprocessed images of {pdf.split('/')[-1]}")

        # crop images to table and save in new folder
        if AC == False:
            # define imglist as list of lists because next function needs list as argument
            imglist = [[img] for img in imglist]

            # Create partial function for table extraction in which AC is negative
            p_extract_tables = functools.partial(extract_tables.main, AC)

            with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
                executor.map(p_extract_tables, imglist)
            print(f"extracted tables of {pdf.split('/')[-1]}")

        elif AC == True:
            # sort imglist
            imglist = sorted(imglist)

            extract_tables.main(AC, imglist)
            print(f"extracted tables of {pdf.split('/')[-1]} and renamed them.")
            move_files(folder, old)
            return stop

    # define list of all images of tables in newly created subfolders
    dirlist = sorted([directory for directory in glob.glob(folder+'*/*') if directory.endswith('.png')])
    
    # If one of the images is smaller than 50 KB, table extraction probably did not work.
    # In that case, stop and continue with next pdf.
    
    if any(e for e in dirlist if os.path.getsize(e)/1000 < 50):
        print(f'problem tables: {[e.split("/")[-2] for e in dirlist if os.path.getsize(e)/1000 < 50]}')
        print(f'Table extraction did not work correctly. Continuing with next pdf.')
        move_files(folder, old)
        return stop
        
    # Create partial function for cell extraction in which dilation is specified
    p_extract_cells = functools.partial(extract_cells.main, dilate)

    # Extract individual cell images
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        executor.map(p_extract_cells, dirlist)
    print(f"extracted cells of {pdf.split('/')[-1]}")

    # define list of directories containing cell images
    dirlist = [directory for directory in glob.glob(folder+'*/*/')]

    # define list of images of cells within directories
    celllists = [glob.glob(cellfolder+'*') for cellfolder in dirlist]

    # Specify that there should be no multiple threads.
    # This is important because I am already using multiple processors.
    # And create partial function to pre-specify thresh and no_noise options.
    os.environ['OMP_THREAD_LIMIT'] = '1'
    p_ocr_image = functools.partial(ocr_image.main, None, thresh, no_noise)

    # perform OCR on each image
    for image_list in celllists:
        with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
            executor.map(p_ocr_image, image_list)
    print(f"completed ocr of {pdf.split('/')[-1]}")

    try:
        gathered_data = []
        # get the names for the individual pages
        pages = sorted([filename for directory, filename in
                 [os.path.split(x) for x in glob.glob(folder+'*') if not x.endswith(('.pdf', '.png'))]])

        # create list of alphabetically ordered lists of ocred files
        ocrlists = [sorted(y) for y in
                    [glob.glob(f'{folder}/{x}/cells/ocr_data/*.txt') for x in pages]]
        zippie = zip(pages, ocrlists)

        # for each pair of page and ocred cells, create a csv
        for y, x in zippie:
            output = ocr_to_csv.main(x)
            csv = StringIO(output)
            print(f'working on {y}')

            # Turning csv into dataframe
            # Skipping the first two rows because they have fewer columns than rest
            # Also useful for chaining of tables later
            df = pd.read_csv(csv,  header = None, skiprows=[0, 1])
            gathered_data.append(df)
            df = pd.concat(gathered_data)

            # give df a name and save it
            if from_cell == False:
                constituency_name = pages[0][0:5]
            else:
                constituency_name = pages[0].split('-')[1]
            df.to_csv(save_folder+constituency_name+'.csv')

        print(f'Saved {constituency_name} to folder.')

        # move old files and folders into old_files folder
        move_files(folder, old)

    # If there is an error, print error message.
    # Most likely eror is that header lines (which are not turned into cells properly)
    # are three instead of two rows. So we try reading the csvs again, this
    # time ignoring the first three instead of just two rows.
    except Exception as e:
        print(e)
        print('will try again ignoring one more line.')
            # get the names for the individual pages

        try:
            gathered_data = []
            pages = sorted([filename for directory, filename in
                     [os.path.split(x) for x in glob.glob(folder+'*') if not x.endswith(('.pdf', '.png'))]])

            # create list of alphabetically ordered lists of ocred files

            ocrlists = [sorted(y) for y in
                        [glob.glob(f'{folder}/{x}/cells/ocr_data/*.txt') for x in pages]]
            zippie = zip(pages, ocrlists)

            # for each pair of directory and files, create csv file
            for y, x in zippie:
                output = ocr_to_csv.main(x)
                csv = StringIO(output)
                print(f'working on {y}')

                # Turning csv files into single dataframe
                # Skipping the first two rows because they have fewer columns than rest
                # Also useful for chaining of tables later
                col_names = [str(e) for e in list(range(0,25))]
                df = pd.read_csv(csv,  header = None, skiprows=[0, 1], names=col_names)
                gathered_data.append(df)
                df = pd.concat(gathered_data)

                # give df a name and save it
                if from_cell == False:
                    constituency_name = pages[0][0:5]
                else:
                    constituency_name = pages[0].split('-')[1]
                df.to_csv(save_folder+constituency_name+'.csv')

            print(f'Saved {constituency_name} to folder.')

            move_files(folder, old)

        # If this still does not work, stop the program and try to manually correct mistakes.
        except Exception as e:
            print(e)
            print(f'There is a problem with {pdf.split("/")[-1]}. Continuing with next pdf.')
            # move old files and folders into old_files folder
            move_files(folder, old)