In [7]:
import os
import pandas as pd
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
import json
import concurrent.futures
from tqdm.notebook import tqdm

In [8]:
def extract_text_from_page(i, page):
    """OCR one page image and normalise its quotation marks.

    Calls ``pytesseract.image_to_string`` on ``page`` with ``lang='nld'``,
    then rewrites every ``,,`` and every ``”`` to a plain double quote.

    Returns:
        The tuple ``(i, text)``, so the caller can tell which page the
        text belongs to.
    """
    ocr_text = pytesseract.image_to_string(page, lang='nld')

    # Substitutions are applied in a fixed order: ',,' first, then '”'.
    for needle in (',,', '”'):
        ocr_text = ocr_text.replace(needle, '"')

    return i, ocr_text

In [11]:
def extract_text_from_pdf(row):
    """OCR every page of the PDF described by ``row``.

    The PDF is read from ``database/<issue>/<id>/<id>.pdf``. If a
    ``text.json`` already exists in the same directory, the document is
    treated as processed and no conversion or OCR is performed.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the
            ``'issue'`` and ``'id'`` values used to build the paths.

    Returns:
        A list of ``(page_index, text)`` tuples in page order, or an empty
        list when the document was skipped.
    """
    doc_dir = os.path.join('database', row['issue'], row['id'])
    file_path = os.path.join(doc_dir, f"{row['id']}.pdf")
    outfile = os.path.join(doc_dir, "text.json")

    # Skip PDFs that have already been processed. The previous check was
    # inverted: it ran OCR only when text.json existed and returned nothing
    # for unprocessed documents.
    if os.path.exists(outfile):
        return []

    doc = convert_from_path(file_path)

    # OCR the pages concurrently on a thread pool.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(extract_text_from_page, i, page)
            for i, page in enumerate(doc)
        ]
        # Collect in submission order so the results are already sorted by
        # page index; any worker exception is re-raised here.
        return [future.result() for future in futures]

In [12]:
def dump_text(row, results):
    """Write OCR results for ``row`` to ``database/<issue>/<id>/text.json``.

    The file maps each page index to its text. Because JSON object keys
    are strings, the integer indices are stored as ``"0"``, ``"1"``, ...

    Nothing is written when ``results`` is empty: an empty ``text.json``
    would mark the document as processed even though no text was
    extracted. An existing ``text.json`` is never overwritten.

    Args:
        row: Mapping-like object providing the ``'issue'`` and ``'id'``
            values used to build the output path.
        results: Iterable of ``(page_index, text)`` pairs.
    """
    # No OCR output (e.g. the document was skipped): leave no marker file.
    if not results:
        return

    outfile = os.path.join('database', row['issue'], row['id'], "text.json")
    if os.path.exists(outfile):
        return

    # Convert the (index, text) pairs to a dictionary.
    data_dict = {element[0]: element[1] for element in results}

    with open(outfile, 'w', encoding='utf-8') as fp:
        json.dump(data_dict, fp)

In [13]:
def task(row):
    """Process one row: extract the PDF's text, then hand it to ``dump_text``."""
    dump_text(row, extract_text_from_pdf(row))

In [14]:
# Load scraped_data.csv; each row is later passed to task(), which reads
# its 'issue' and 'id' fields to locate the PDF on disk.
df = pd.read_csv('scraped_data.csv')

In [15]:
# Run the OCR task for every row of df, with a progress bar. A plain loop
# is used because task() is called only for its side effects; there is no
# result list worth building.
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    task(row)

  0%|          | 0/3281 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Second batch: load scraped_data2.csv and run the same OCR task on every
# row. A plain loop is used because task() is called only for its side
# effects.
df2 = pd.read_csv('scraped_data2.csv')
for _, row in tqdm(df2.iterrows(), total=df2.shape[0]):
    task(row)

  0%|          | 0/197 [00:00<?, ?it/s]