In [None]:
!pip install keras-ocr
!pip install pdf2image
!apt-get install poppler-utils
!pip install jiwer

In [None]:
#Calculate the CER and WER
import os
import keras_ocr
from pdf2image import convert_from_path
import numpy as np
import re
import cv2
from jiwer import wer, cer

def preprocess_image(image, size=(1280, 720)):
    """
    Preprocess the image by resizing, converting to grayscale, and applying Gaussian blur.
    Then convert it back to RGB as keras-ocr expects a three-channel image.
    """
    image = image.resize(size)
    gray_image = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY)
    blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)
    preprocessed_image = cv2.cvtColor(blurred_image, cv2.COLOR_GRAY2RGB)
    return preprocessed_image

def clean_text(text):
    """
    Normalize the text by removing invalid XML characters and other non-alphanumeric characters.
    """
    invalid_xml_chars = re.compile(u'[\u0000-\u0008\u000B-\u000C\u000E-\u001F]')
    text = invalid_xml_chars.sub('', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

def process_documents(directory):
    # Setup the keras-ocr pipeline
    pipeline = keras_ocr.pipeline.Pipeline()

    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(directory, filename)
            ground_truth_path = pdf_path.replace('.pdf', '.txt')

            if not os.path.exists(ground_truth_path):
                print(f"Missing ground truth text file for {filename}")
                continue

            images = convert_from_path(pdf_path)
            all_text = []

            for image in images:
                preprocessed_image = preprocess_image(image)
                prediction_groups = pipeline.recognize([np.array(preprocessed_image)])
                for predictions in prediction_groups:
                    text = ' '.join([word[0] for word in predictions])
                    clean_text_data = clean_text(text)
                    all_text.append(clean_text_data)

            ocr_output_text = '\n'.join(all_text)

            with open(ground_truth_path, 'r', encoding='utf-8') as gt_file:
                ground_truth_text = gt_file.read()
                ground_truth_text = clean_text(ground_truth_text)

            calculated_wer = wer(ground_truth_text, ocr_output_text)
            calculated_cer = cer(ground_truth_text, ocr_output_text)

            print(f"\n{filename} - WER: {calculated_wer:.2f}")
            print(f"{filename} - CER: {calculated_cer:.2f}")

directory_path = 'large/'
process_documents(directory_path)


In [None]:
#Extracting metadata based on predefined keywords
from pdf2image import convert_from_path
import xml.etree.ElementTree as ET
import re
import os
import keras_ocr
import numpy as np

def clean_text(text):
    """
    Remove invalid XML characters from the text.
    """
    invalid_xml_chars = re.compile(u'[\u0000-\u0008\u000B-\u000C\u000E-\u001F]')
    return invalid_xml_chars.sub('', text)

def group_text_by_lines(prediction_groups):
    """
    Group text by lines including their bounding boxes and confidences.
    """
    lines = []
    for word, box in prediction_groups[0]:
        y_coords = [point[1] for point in box]
        lines.append((word, box, 0.95))  # keras-ocr doesn't provide a confidence, so we use a dummy value
    return lines

def extract_first_occurrence(lines, keywords, text_limits):
    """
    Extracts the first occurrence of each keyword and ignores subsequent mentions.
    """
    pairs = {}
    current_key = None
    current_text = []
    current_limit = None
    found_keywords = set()

    normalized_keywords = {keyword.lower(): keyword for keyword in keywords}

    for text, _, _ in lines:
        normalized_text = text.lower().strip()

        if any(keyword == normalized_text for keyword in normalized_keywords) and normalized_text not in found_keywords:
            if current_key:
                formatted_text = ' '.join(current_text).strip()
                if current_limit is not None:
                    formatted_text = formatted_text[:current_limit]
                pairs[current_key] = formatted_text
            current_key = normalized_keywords.get(normalized_text, text.strip())
            current_text = []
            current_limit = text_limits.get(current_key, None)
            found_keywords.add(normalized_text)  # Mark this keyword as found
        elif current_key:
            current_text.append(text.strip())

    if current_key:
        formatted_text = ' '.join(current_text).strip()
        if current_limit is not None:
            formatted_text = formatted_text[:current_limit]
        pairs[current_key] = formatted_text

    return pairs

def write_to_xml(lines, xml_path):
    root = ET.Element("root")
    for text, _, _ in lines:
        element = ET.SubElement(root, "line")
        element.text = text
    tree = ET.ElementTree(root)
    tree.write(xml_path)

def read_from_xml(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    return [(line.text, None, None) for line in root]

def main():
    # Initialize the keras-ocr pipeline
    pipeline = keras_ocr.pipeline.Pipeline()

    pdf_path = '4.pdf'
    xml_path = os.path.splitext(pdf_path)[0] + '.xml'
    images = convert_from_path(pdf_path)

    keywords = ["Fastighetsagare", "Namn", "Postnr och ort", "Fastighetsbeteckning", "Anlaggare av ledning"]
    text_limits = {"Anlaggare av ledning": 41}  # For last keyword

    all_lines = []
    for i, image in enumerate(images):
        image_np = np.array(image)
        if len(image_np.shape) == 2:  # Convert grayscale to RGB
            image_np = np.stack([image_np]*3, axis=-1)

        # Perform OCR
        prediction_groups = pipeline.recognize([image_np])
        lines = group_text_by_lines(prediction_groups)
        all_lines.extend(lines)

    # Write OCR results to XML
    write_to_xml(all_lines, xml_path)

    # Read from XML
    lines_from_xml = read_from_xml(xml_path)
    info = extract_first_occurrence(lines_from_xml, keywords, text_limits)

    for key in keywords:
        value = info.get(key, "")
        print(f"{key}: {value}")  # Print each keyword and its first extracted value

if __name__ == "__main__":
    main()


Looking for /root/.keras-ocr/craft_mlt_25k.h5
Looking for /root/.keras-ocr/crnn_kurapan.h5
Fastighetsagare: 
Namn: l arabere f adress nals so l s postnr och ort 412 nonsto
Postnr och ort: 
Fastighetsbeteckning: anlaggare ledning norsjo kommun av storgatan 67 norsjo 935 81 2120002858 organisationsnr med underjordisk ledning for data och telekommunikation enligt detta avtal alla avses kablar och ledningar vilka overfors signaler for bild data eller i t genom ex annat underjordisk ledning ingar sadana andamal erforderliga for dess anordningar brunnar som skarvlador tillbehor tomror skap och andra fastighetsagaren till pa fastighet for anlaggaren ratt markutrymme angiven anl aggning ger bibehallande underhall ande underjordiska for och nyttji ledningar data och av telekommunikation disponera det behovs utfora for arbete enligt den samt utrymme att som strackning redovisas pa bifogad karta som vid byte anlaggningsagare overgar anderatten automatiskt till den nytti av nye anlaggningsagaren a

In [None]:
from pdf2image import convert_from_path
import xml.etree.ElementTree as ET
import re
import os
import keras_ocr
import numpy as np

def clean_text(text):
    """
    Remove invalid XML characters from the text.
    """
    invalid_xml_chars = re.compile(u'[\u0000-\u0008\u000B-\u000C\u000E-\u001F]')
    return invalid_xml_chars.sub('', text)

def group_text_by_lines(prediction_groups):
    """
    Group text by lines including their bounding boxes and confidences.
    """
    lines = []
    for word, box in prediction_groups[0]:
        y_coords = [point[1] for point in box]
        lines.append((word, box, 0.95))  # Assuming a dummy confidence value
    return lines

def write_to_xml(lines, xml_file):
    """
    Write the extracted lines to an XML file with bounding box and confidence.
    """
    root = ET.Element("Document")
    for text, bbox, confidence in lines:
        item = ET.SubElement(root, "Line")
        item.set('bbox', str(bbox))
        item.set('confidence', str(confidence))
        item.text = clean_text(text)
    tree = ET.ElementTree(root)
    tree.write(xml_file, encoding="utf-8", xml_declaration=True)

def read_and_search_keywords(xml_file, keywords):
    """
    Read the XML file and search for the first occurrence of each keyword.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()
    results = {}
    found_keywords = set()

    text_lines = [elem.text or "" for elem in root.findall('Line')]

    # Concatenate all texts into a single content block to mimic continuous reading
    full_text = " ".join(text_lines).lower()
    keyword_positions = {kw.lower(): full_text.find(kw.lower()) for kw in keywords if full_text.find(kw.lower()) != -1}
    # Sort keywords by their first occurrence position
    sorted_keywords = sorted(keyword_positions.items(), key=lambda x: x[1])

    for i, (kw, pos) in enumerate(sorted_keywords):
        # Find end of the text slice for this keyword
        if i < len(sorted_keywords) - 1:
            end_pos = sorted_keywords[i + 1][1]
        else:
            end_pos = len(full_text)
        # Extract the text segment associated with the keyword
        segment = full_text[pos:end_pos].strip()
        results[kw] = segment

    return results

def main():
    # Initialize the keras-ocr pipeline
    pipeline = keras_ocr.pipeline.Pipeline()

    pdf_path = '2.pdf'
    xml_path = os.path.splitext(pdf_path)[0] + '.xml'
    images = convert_from_path(pdf_path)

    all_lines = []
    for image in images:
        image_np = np.array(image)
        if len(image_np.shape) == 2:  # Convert grayscale to RGB if needed
            image_np = np.stack([image_np]*3, axis=-1)

        # Perform OCR
        prediction_groups = pipeline.recognize([image_np])
        lines = group_text_by_lines(prediction_groups)
        all_lines.extend(lines)

    # Write the OCR results to XML
    write_to_xml(all_lines, xml_path)
    print(f"XML file has been created: {xml_path}")

    # Keywords to search in the XML
    keywords = ["Fastighetsagare", "Fastighetsbeteckning", "anlaggare", "ort, datum", "organisationsnumen" "Anlaggare av ledning"]

    # Read the XML and search for keywords
    search_results = read_and_search_keywords(xml_path, keywords)

    # Print the search results
    for kw, content in search_results.items():
        print(f"{kw.capitalize()}: {content}")

if __name__ == "__main__":
    main()


Looking for /root/.keras-ocr/craft_mlt_25k.h5
Looking for /root/.keras-ocr/crnn_kurapan.h5
XML file has been created: 2.xml
Fastighetsagare: fastighetsagare rislidens byaforening norsjo 935 81 risliden
Fastighetsbeteckning: fastighetsbeteckning 241
Anlaggare: anlaggare nod norsj o kommun av storgatan 67 norsjo 935 81 organisationsnumen 2120002858 nyttjanderatten fastighetsagaren till lokalutrymme pa fastighet for installera anlaggaren ratt angiven att ger tillhorande bibehalla teknisk nodj for drift itanl aggning jamte och apparatur av kanalisation ledningsdragning laget redovisas pa bifogad och karta overgar nyttja anderatten automatiskt till den anlaggnings vid byte anlaggningsagare nye av agaren medfora forandring flyttning anlaggning den nar planerar arbete kan eller ska part som av god eller andring omfattning andra tid kontaktas l fraga flyttning storre parten 1 om av sarskild ffas arbetets samordning tidplan for skall overenskommelse trat och om genomforandet om fastighetsagaren

In [None]:
#Calculate CER, WER just for one PDF
import os
import keras_ocr
from pdf2image import convert_from_path
import numpy as np
import re
import cv2
from jiwer import wer, cer

def preprocess_image(image, size=(1280, 720)):
    """
    Preprocess the image by resizing, converting to grayscale, and applying Gaussian blur.
    Then convert it back to RGB as keras-ocr expects a three-channel image.
    """
    image = image.resize(size)
    gray_image = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY)
    blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)
    preprocessed_image = cv2.cvtColor(blurred_image, cv2.COLOR_GRAY2RGB)
    return preprocessed_image

def clean_text(text):
    """
    Normalize the text by removing invalid XML characters and other non-alphanumeric characters.
    """
    invalid_xml_chars = re.compile(u'[\u0000-\u0008\u000B-\u000C\u000E-\u001F]')
    text = invalid_xml_chars.sub('', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

def process_single_document(pdf_path, ground_truth_path):
    # Setup the keras-ocr pipeline
    pipeline = keras_ocr.pipeline.Pipeline()

    if not os.path.exists(ground_truth_path):
        print(f"Missing ground truth text file for {pdf_path}")
        return

    images = convert_from_path(pdf_path)
    all_text = []

    for image in images:
        preprocessed_image = preprocess_image(image)
        prediction_groups = pipeline.recognize([np.array(preprocessed_image)])
        for predictions in prediction_groups:
            text = ' '.join([word[0] for word in predictions])
            clean_text_data = clean_text(text)
            all_text.append(clean_text_data)

    ocr_output_text = '\n'.join(all_text)

    with open(ground_truth_path, 'r', encoding='utf-8') as gt_file:
        ground_truth_text = gt_file.read()
        ground_truth_text = clean_text(ground_truth_text)

    calculated_wer = wer(ground_truth_text, ocr_output_text)
    calculated_cer = cer(ground_truth_text, ocr_output_text)

    print(f"\nOCR Text: {ocr_output_text}")
    print(f"\nGround Truth Text: {ground_truth_text}")
    print(f"\nWER: {calculated_wer:.2f}")
    print(f"CER: {calculated_cer:.2f}")

# Example file paths
pdf_path = '2EDEPZ4VHTLPTWSZR6FAVUJ3B2ZVSIPS.pdf'
ground_truth_path = '2EDEPZ4VHTLPTWSZR6FAVUJ3B2ZVSIPS.txt'
process_single_document(pdf_path, ground_truth_path)


Looking for /root/.keras-ocr/craft_mlt_25k.h5
Looking for /root/.keras-ocr/crnn_kurapan.h5

OCR Text: adoes czaood ued raooalanes cranaaain starsee larate coaod codcooco oonae coa  cerons t loo soto rston alod ted uds eoole caaei sototans manoes e oroane srirazes rns oooed ceo rantine co orae cee lalees cottao caduo dlannas cooe cocas caooed conod cured cro nntsn oold lolns uoloed ood efeinn slasle udes somslons wtoles oanna only dy cored oaoes cadad ce to clacle nantoe csln dad soed tacoros talsantl srnaizan clod cood seloo acaoien coes aaatan ded en nes tansnes raasen scod uno et cioed on slagnss ce dadleed doted de coo oolans iadid de uanes olae onoatn cooslor coeed ood saiens od co

Ground Truth Text: 6172009 US religious freedom watchdog barred from India  Agence France Presse

   httpwwwgooglecomhostednewsafparticleALeqM5g6pjdmBNAEyFXqsYygocqBt5wQ  

        WASHINGTON AFP  The US government watchdog on religious freedom abroad
criticized India for refusing to grant its represent

In [None]:
#convert pdf to txt file
!pip install pdfminer.six
import os
from pdfminer.high_level import extract_text

def convert_pdf_to_txt(pdf_path, txt_path):
    """
    Convert a PDF file to a text file using pdfminer.six.
    """
    try:
        text = extract_text(pdf_path)
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(text)
        print(f"Converted {pdf_path} to {txt_path}")
    except Exception as e:
        print(f"Error converting {pdf_path}: {str(e)}")

def convert_pdfs_in_directory(directory):
    """
    Convert all PDF files to text files.
    """
    for filename in os.listdir(directory):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(directory, filename)
            txt_path = os.path.join(directory, filename.replace('.pdf', '.txt'))
            convert_pdf_to_txt(pdf_path, txt_path)

def main():
    directory_path = 'text/'
    convert_pdfs_in_directory(directory_path)

if __name__ == "__main__":
    main()


Converted text/7MHCNBM2ZR4PD2BSWVFLG77BRN3HS3AD.pdf to text/7MHCNBM2ZR4PD2BSWVFLG77BRN3HS3AD.txt
Converted text/27UIROOYZ4IE3FKKNAPCKEQXWBKKN7XM.pdf to text/27UIROOYZ4IE3FKKNAPCKEQXWBKKN7XM.txt
Converted text/7LAHAWE6SGPL25FWUGMRKXCMTD3CWEBO.pdf to text/7LAHAWE6SGPL25FWUGMRKXCMTD3CWEBO.txt
Converted text/7CIFOV7XZXS6EHUFYZF6H3NCN637PI2M.pdf to text/7CIFOV7XZXS6EHUFYZF6H3NCN637PI2M.txt
Converted text/7GYQLFBVGRTYWE73NKCNCODTJVURFULB.pdf to text/7GYQLFBVGRTYWE73NKCNCODTJVURFULB.txt
Converted text/73KIZM2KPVDJHMKWM5X36DBUKAOOCWZ4.pdf to text/73KIZM2KPVDJHMKWM5X36DBUKAOOCWZ4.txt
Converted text/6C5QST2YJX7GA4BOHPZK66SWOSEATYGO.pdf to text/6C5QST2YJX7GA4BOHPZK66SWOSEATYGO.txt
Converted text/6X3XULQL4IDEVAAER5F2M3RJGJS2REAH.pdf to text/6X3XULQL4IDEVAAER5F2M3RJGJS2REAH.txt
Converted text/7PZU4ANBX7CFDAKIK25A2U6GR5OMGEZX.pdf to text/7PZU4ANBX7CFDAKIK25A2U6GR5OMGEZX.txt
Converted text/35DKLURQRIKJHCGH3IP5TO2BTAOV4CBN.pdf to text/35DKLURQRIKJHCGH3IP5TO2BTAOV4CBN.txt
Converted text/6GHFGNIZJZWEVRN

In [None]:
#Calculate the time duration
from pdf2image import convert_from_path
import xml.etree.ElementTree as ET
import re
import os
import keras_ocr
import numpy as np
import time

def clean_text(text):
    """
    Remove invalid XML characters from the text.
    """
    invalid_xml_chars = re.compile(u'[\u0000-\u0008\u000B-\u000C\u000E-\u001F]')
    return invalid_xml_chars.sub('', text)

def group_text_by_lines(prediction_groups):
    """
    Group text by lines including their bounding boxes and confidences.
    """
    lines = []
    for word, box in prediction_groups[0]:
        y_coords = [point[1] for point in box]
        lines.append((word, box, 0.95))  # keras-ocr doesn't provide a confidence, so we use a dummy value
    return lines

def extract_specific_info_from_lines(lines, keywords, text_limits):
    """
    Extracts specific information based on predefined keywords.
    """
    pairs = {}
    current_key = None
    current_text = []
    current_limit = None
    normalized_keywords = {keyword.lower(): keyword for keyword in keywords}

    for text, _, _ in lines:
        normalized_text = text.lower().strip()
        if any(normalized_text.startswith(keyword) for keyword in normalized_keywords):
            if current_key:
                formatted_text = ' '.join(current_text).strip()
                if current_limit is not None:
                    formatted_text = formatted_text[:current_limit]
                pairs[current_key] = formatted_text
            current_key = normalized_keywords.get(normalized_text, text.strip())
            current_text = []
            current_limit = text_limits.get(current_key, None)
        elif current_key:
            current_text.append(text.strip())

    if current_key:
        formatted_text = ' '.join(current_text).strip()
        if current_limit is not None:
            formatted_text = formatted_text[:current_limit]
        pairs[current_key] = formatted_text

    return pairs

def main():
    start_time = time.time()

    # Initialize the keras-ocr pipeline
    pipeline = keras_ocr.pipeline.Pipeline()

    pdf_path = '4.pdf'
    images = convert_from_path(pdf_path)
    keywords = ["Fastighetsagare", "Namn", "Postnr och ort", "Fastighetsbeteckning", "Anlaggare av ledning"]
    text_limits = {"Anlaggare av ledning": 41}

    for i, image in enumerate(images):
        image_processing_start = time.time()  # Start timing the processing for each image

        image_np = np.array(image)
        if len(image_np.shape) == 2:  # Convert grayscale to RGB
            image_np = np.stack([image_np]*3, axis=-1)

        # Perform OCR
        prediction_groups = pipeline.recognize([image_np])
        lines = group_text_by_lines(prediction_groups)
        info = extract_specific_info_from_lines(lines, keywords, text_limits)

        for key in keywords:
            value = info.get(key, "")
            print(f"{key}: {value}")

        image_processing_end = time.time()
        print(f"Processed image {i+1} in {image_processing_end - image_processing_start} seconds")

    end_time = time.time()
    print(f"Total OCR processing time: {end_time - start_time} seconds")

if __name__ == "__main__":
    main()


Looking for /root/.keras-ocr/craft_mlt_25k.h5
Downloading /root/.keras-ocr/craft_mlt_25k.h5


Instructions for updating:
Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.


Looking for /root/.keras-ocr/crnn_kurapan.h5
Downloading /root/.keras-ocr/crnn_kurapan.h5
Fastighetsagare: 
Namn: l arabere f adress nals so l s postnr och ort 412 nonsto
Postnr och ort: 
Fastighetsbeteckning: anlaggare ledning norsjo kommun av storgatan 67 norsjo 935 81 2120002858 organisationsnr med underjordisk ledning for data och telekommunikation enligt detta avtal alla avses kablar och ledningar vilka overfors signaler for bild data eller i t genom ex annat underjordisk ledning ingar sadana andamal erforderliga for dess anordningar brunnar som skarvlador tillbehor tomror skap och andra
Anlaggare av ledning: 
Processed image 1 in 16.055208921432495 seconds
Fastighetsagare: 
Namn: 
Postnr och ort: 
Fastighetsbeteckning: 
Anlaggare av ledning: 
Processed image 2 in 1.6911015510559082 seconds
Total OCR processing time: 24.213207960128784 seconds
