# Pipeline

In [28]:
# Importing other necessary packages
import glob
import pandas as pd
import numpy as np
import cv2 # image transformation
import os
import re
import concurrent # for parallel instances
import functools # for creating partial functions
import shutil

from io import StringIO # to convert string to csv
import time # to measure time

# to add the path where to search for modules
import sys
sys.path.append('/home/hennes/Internship/table_scanner')

# Importing table_ocr modules 
from table_ocr import pdf_to_images
from table_ocr import extract_tables
from table_ocr import extract_cells
from table_ocr import ocr_image
from table_ocr import ocr_to_csv

## Creating Pipeline

In [57]:
## Creating Pipeline

folder = "/home/hennes/Internship/pdfs"
save_folder = '/home/hennes/Internship/constituencies/'
allpdf = [pdf for pdf in glob.glob(folder+'/*') if pdf.endswith(".pdf")]
pdflist = sorted([pdf for pdf in allpdf if pdf.split('/')[-1].split('_')[0].split('.')[0] not in
           [file.split('/')[-1].split('.')[0] for file in glob.glob(save_folder+'*')]])

# only collect pdfs that have not yet been turned into csv

In [58]:
# delete all non-pdf files and folders
[shutil.rmtree(folder+'/'+e) for e in next(os.walk(folder))[1]]
[os.remove(e) for e in glob.glob(folder+'/*') if not e.endswith('.pdf')]

[]

In [59]:
exclude = ['AC69_Form20.pdf', 'AC100_Form20.pdf']
pdflist = [x for x in pdflist if not x.endswith('AC69_Form20.pdf')]

In [61]:
for x in pdflist:
    ocr_pipeline(x)

created images of AC003.pdf
preprocessed images of AC003.pdf
extracted tables of AC003.pdf
extracted cells of AC003.pdf
completed ocr of AC003.pdf
Error tokenizing data. C error: Expected 13 fields in line 4, saw 14

will try again ignoring one more line.
Error tokenizing data. C error: Expected 9 fields in line 6, saw 10

created images of AC004.pdf
preprocessed images of AC004.pdf
extracted tables of AC004.pdf
extracted cells of AC004.pdf


KeyboardInterrupt: 

In [2]:
# Importing images
def ocr_pipeline(pdf):
    pdf_to_images.pdf_to_images(pdf)

    imglist = [img for img in glob.glob(folder+'/*') if img.endswith('.png')]
    print(f"created images of {pdf.split('/')[-1]}")
    # Preprocessing Table

    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        executor.map(pdf_to_images.preprocess_img, imglist)
    print(f"preprocessed images of {pdf.split('/')[-1]}")

    # Extracting Table Image from PDF Page image
    imglist = [[img] for img in imglist]

    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        executor.map(extract_tables.main, imglist)
    print(f"extracted tables of {pdf.split('/')[-1]}")

        
    # list of individual tables in subfolders 

    dirlist = [directory for directory in glob.glob(folder+'/*/*') if directory.endswith('.png')]

    # Extract individual cell images

    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        executor.map(extract_cells.main, dirlist)
    print(f"extracted cells of {pdf.split('/')[-1]}")

        
    # cells folder is first in list of two objects in individual image folders.
    # That is why this code works (but only if executed after cells were extracted).

    dirlist = [directory for directory in glob.glob(folder+'/*/*/')]
    celllists = [glob.glob(cellfolder+'*') for cellfolder in dirlist]

    # perform OCR on each image

    os.environ['OMP_THREAD_LIMIT'] = '1'
    p_ocr_image = functools.partial(ocr_image.main, None)
    
    for image_list in celllists:
        # perform OCR on each image
        with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
            executor.map(p_ocr_image, image_list)
    print(f"completed ocr of {pdf.split('/')[-1]}")

    gathered_data = []

    try:
        # get the names for the individual pages

        pages = sorted([filename for directory, filename in
                 [os.path.split(x) for x in glob.glob(folder+'/*') if not x.endswith(('.pdf', '.png'))]])

        # create list of alphabetically ordered lists of ocred files

        ocrlists = [sorted(y) for y in
                    [glob.glob(f'{folder}/{x}/cells/ocr_data/*.txt') for x in pages]]
        zippie = zip(pages, ocrlists)

        for y, x in zippie:
            output = ocr_to_csv.main(x)
            csv = StringIO(output)
            print(f'working on {y}')

            # Turning csv into dataframe
            # Skipping the first two rows because they have fewer columns than rest
            # Also useful for chaining of tables later

            df = pd.read_csv(csv,  header = None, skiprows=[0, 1])
            gathered_data.append(df)
            df = pd.concat(gathered_data)

            constituency_name = pages[0][0:5]
            df.to_csv(save_folder+constituency_name+'.csv')

        print(f'Saved {constituency_name} to folder.')

        [shutil.rmtree(folder+'/'+e) for e in next(os.walk(folder))[1]]
        [os.remove(e) for e in glob.glob(folder+'/*') if not e.endswith('.pdf')]
    except Exception as e:
        print(e)
        print('will try again ignoring one more line.')
            # get the names for the individual pages

        try:
            pages = sorted([filename for directory, filename in
                     [os.path.split(x) for x in glob.glob(folder+'/*') if not x.endswith(('.pdf', '.png'))]])

            # create list of alphabetically ordered lists of ocred files

            ocrlists = [sorted(y) for y in
                        [glob.glob(f'{folder}/{x}/cells/ocr_data/*.txt') for x in pages]]
            zippie = zip(pages, ocrlists)

            for y, x in zippie:
                output = ocr_to_csv.main(x)
                csv = StringIO(output)
                print(f'working on {y}')
                # Turning csv into dataframe
                # Skipping the first two rows because they have fewer columns than rest
                # Also useful for chaining of tables later

                df = pd.read_csv(csv,  header = None, skiprows=[0, 1, 2])
                gathered_data.append(df)
                df = pd.concat(gathered_data)

                constituency_name = pages[0][0:5]
                df.to_csv(save_folder+constituency_name+'.csv')

            print(f'Saved {constituency_name} to folder.')

            [shutil.rmtree(folder+'/'+e) for e in next(os.walk(folder))[1]]
            [os.remove(e) for e in glob.glob(folder+'/*') if not e.endswith('.pdf')]
        except Exception as e:
            print(e)
            exit()