# Load png files

In [1]:
import os

png_files = [f for f in os.listdir('pdf_images') if f.endswith('.png')]
print(f"Found {len(png_files)} PNG files in pdf_images directory.")

Found 71 PNG files in pdf_images directory.


# OCR

In [None]:
import re
def info_from_file(filename):    
    page_number = filename.split('.pdf')[-1].split('_')[-1].split('.')[0]
    year = re.findall(r'[0-9]+ *- *[0-9]+', filename)[0]
    level = re.findall(r'[Ll] *- *[0-9]', filename)[0].split('-')[1]
    term = re.findall(r'[Tt] *- *[0-9]', filename)[0].split('-')[1]

    year = [int(x) for x in year.split('-')]
    level = int(level)
    term = int(term)
    filename = filename.split('.pdf')[0].split('civil,')[1]
    #print(filename)
    dept = re.sub('[0-9]+ *- *[0-9]+','', filename)
    #print(dept)
    dept = re.sub('[Ll] *- *[0-9]','', dept)
    #print(dept)
    dept = re.sub('[Tt] *- *[0-9]','', dept)
    #print(dept)
    dept = re.findall('[A-Za-z]+',dept)[0]
    return {
        'page' : page_number,
        'session' : year,
        'level' : level,
        'term' : term,
        'department' : dept
    }


print(info_from_file('civil,L-4,T-1,CE, 2018-2019.pdf,page_21.png'))

L-4,T-1,CE, 2018-2019
L-4,T-1,CE, 
,T-1,CE, 
,,CE, 
{'page': '21', 'session': [2018, 2019], 'level': 4, 'term': 1, 'department': 'CE'}


In [None]:
import keras_ocr
import numpy as np
from PIL import Image

# Initialize the pipeline
pipeline = keras_ocr.pipeline.Pipeline()

# Create a list to store OCR results
ocr_results = []

# Process images in batches to avoid memory issues
batch_size = 2
for i in range(0, len(png_files), batch_size):
    batch = png_files[i:i + batch_size]
    
    # Load images from the pdf_images directory
    images = [keras_ocr.tools.read(os.path.join('pdf_images', img)) for img in batch]
    
    # Perform OCR on the batch
    predictions = pipeline.recognize(images)
    
    # Store results
    for idx, pred in enumerate(predictions):
        filename = batch[idx]
        paper_info = info_from_file(filename)
        print(f"Processing {filename} (page {paper_info['page']})")
        text_predictions = [(text, box.tolist()) for text, box in pred]
        ocr_results.append({
            'page' : paper_info['page'],
            'session' : paper_info['session'],
            'level' : paper_info['level'],
            'term' : paper_info['term'],
            'department' : paper_info['department'],
            'text': text_predictions
        })
    
    print(f"Processed {min(i + batch_size, len(png_files))} of {len(png_files)} images")




2025-04-06 16:19:57.111682: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-06 16:19:57.234011: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/hamim-mahmud/Projects/tf_organizer/lib/python3.10/site-packages/cv2/../../lib64:
2025-04-06 16:19:57.234031: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2025-04-06 16:19:57.257598: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS 

Looking for /home/hamim-mahmud/.keras-ocr/craft_mlt_25k.h5


2025-04-06 16:20:00.801275: E tensorflow/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2025-04-06 16:20:00.801294: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (hamim-mahmud-Lenovo-Legion-5-15IMH05): /proc/driver/nvidia/version does not exist
2025-04-06 16:20:00.801513: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Looking for /home/hamim-mahmud/.keras-ocr/crnn_kurapan.h5


2025-04-06 16:20:03.732254: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1303379968 exceeds 10% of free system memory.
2025-04-06 16:20:03.992175: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1303379968 exceeds 10% of free system memory.
2025-04-06 16:20:04.385915: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1303379968 exceeds 10% of free system memory.
2025-04-06 16:20:05.894021: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1303379968 exceeds 10% of free system memory.


L-4,T-1,CE, 2018-2019
L-4,T-1,CE, 
,T-1,CE, 
,,CE, 
Processing civil,L-4,T-1,CE, 2018-2019.pdf,page_21.png (page 21)
L-4,T-1,CE, 2018-2019
L-4,T-1,CE, 
,T-1,CE, 
,,CE, 
Processing civil,L-4,T-1,CE, 2018-2019.pdf,page_23.png (page 23)
Processed 2 of 71 images


2025-04-06 16:21:05.517313: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 2078277632 exceeds 10% of free system memory.


L-4T-1, CE, 2020-2021
L-4T-1, CE, 
T-1, CE, 
, CE, 
Processing civil,L-4T-1, CE, 2020-2021.pdf,page_11.png (page 11)
2013-2014 (L-4, T-1)-CE
 (L-4, T-1)-CE
 (, T-1)-CE
 (, )-CE
Processing civil,2013-2014 (L-4, T-1)-CE.pdf,page_18.png (page 18)
Processed 4 of 71 images
2013-2014 (L-4, T-1)-CE
 (L-4, T-1)-CE
 (, T-1)-CE
 (, )-CE
Processing civil,2013-2014 (L-4, T-1)-CE.pdf,page_16.png (page 16)
L-4T-1, CE, 2020-2021
L-4T-1, CE, 
T-1, CE, 
, CE, 
Processing civil,L-4T-1, CE, 2020-2021.pdf,page_16.png (page 16)
Processed 6 of 71 images
2014-2015 (L-2,T-1)-CE
 (L-2,T-1)-CE
 (,T-1)-CE
 (,)-CE
Processing civil,2014-2015 (L-2,T-1)-CE.pdf,page_6.png (page 6)
L-4,T-1,CE, 2018-2019
L-4,T-1,CE, 
,T-1,CE, 
,,CE, 
Processing civil,L-4,T-1,CE, 2018-2019.pdf,page_22.png (page 22)
Processed 8 of 71 images
L-4T-1, CE, 2020-2021
L-4T-1, CE, 
T-1, CE, 
, CE, 
Processing civil,L-4T-1, CE, 2020-2021.pdf,page_19.png (page 19)
2013-2014 (L-4, T-1)-CE
 (L-4, T-1)-CE
 (, T-1)-CE
 (, )-CE
Processing civil,2013-2

In [None]:
print("OCR results:")
for result in ocr_results:
    print(f"Filename: {result['filename']}")
    print("Detected text:")
    for x in result['text']:
        print(f"  {x[0]} \t\t-> {x[1]}")

OCR results:
Filename: civil,L-4,T-1,CE, 2018-2019.pdf,page_21.png
Detected text:
  291012019 		-> [[1349.4140625, 181.7578125], [1522.91015625, 181.7578125], [1522.91015625, 217.55859375], [1349.4140625, 217.55859375]]
  latice 		-> [[217.55859375, 184.51171875], [402.0703125, 184.51171875], [402.0703125, 220.3125], [217.55859375, 220.3125]]
  date 		-> [[1247.51953125, 184.51171875], [1324.62890625, 184.51171875], [1324.62890625, 214.8046875], [1247.51953125, 214.8046875]]
  of 		-> [[735.29296875, 250.60546875], [776.6015625, 250.60546875], [776.6015625, 280.8984375], [735.29296875, 280.8984375]]
  engineering 		-> [[784.86328125, 250.60546875], [991.40625, 250.60546875], [991.40625, 280.8984375], [784.86328125, 280.8984375]]
  and 		-> [[1002.421875, 250.60546875], [1065.76171875, 250.60546875], [1065.76171875, 278.14453125], [1002.421875, 278.14453125]]
  technology 		-> [[1074.0234375, 250.60546875], [1294.3359375, 250.60546875], [1294.3359375, 280.8984375], [1074.0234375, 280.89