In [9]:
### This part runs OCR text recognition on scanned images of letters
### and stores the recognized text of each image as a .txt file

import os
from PIL import Image
import easyocr


def process_images(image_directory, output_directory):
    """
    Process all images in the specified directory using OCR and save the extracted text to text files.

    For every file whose name ends in one of the accepted extensions, the
    second element of each OCR result entry is taken as recognized text, the
    pieces are joined with newlines and written as UTF-8 to
    ``<output_directory>/<file name without extension>.txt``.

    Files whose output already exists are skipped *before* they are read or
    sent through OCR, and the OCR reader is only created once the first file
    actually needs it, so a run in which everything is skipped does no OCR
    work at all.

    Args:
        image_directory (str): Path to the directory containing the image files.
        output_directory (str): Path to the directory where the output text files will be saved.
    """
    # NOTE(review): '.pdf' is accepted exactly as before, but the raw bytes of a
    # PDF are handed to the OCR reader unchanged -- confirm that this is
    # supported; otherwise such files simply end up in the
    # "Error processing" branch below.
    image_extensions = ('.jpg', '.jpeg', '.png', '.pdf')

    # create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # created on first use: there is no point in building the reader when
    # every file turns out to be skipped
    reader = None

    # iterate through all files in the directory
    for filename in os.listdir(image_directory):
        # check if file is an image (case-insensitive, so 'SCAN.JPG' counts too)
        if not filename.lower().endswith(image_extensions):
            continue

        # define output path to save extracted text
        output_path = os.path.join(output_directory, f'{os.path.splitext(filename)[0]}.txt')

        # check if output file already exists -- done first so that no file is
        # read or OCR'd only to have its result thrown away
        if os.path.exists(output_path):
            print(f"Skipping {filename}. Output file {output_path} already exists.")
            continue

        # make the full path to the image file
        image_path = os.path.join(image_directory, filename)

        # read the raw bytes of the image for OCR
        try:
            with open(image_path, 'rb') as image_file:
                image_bytes = image_file.read()
        except Exception as e:
            print(f"Error reading {filename}: {e}")
            continue

        # initialize the OCR reader on first use; a failure here is not
        # swallowed, it propagates to the caller as it did before
        if reader is None:
            reader = easyocr.Reader(['de'])

        # extract text from image using easyocr
        try:
            result = reader.readtext(image_bytes)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue

        # extracted text: one recognized fragment per line
        text = '\n'.join(entry[1] for entry in result)

        # save extracted text to a text file
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(text)

        # print a confirmation message about the text extraction
        print(f'Text from {filename} has been saved in: {output_path}.')

# project root: the parent of the current working directory
current_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# images are read from <root>/text_extractor/data_images,
# OCR text files are written to <root>/raw_data/german_ocr_text
image_directory, output_directory = (
    os.path.join(current_directory, *parts)
    for parts in (('text_extractor', 'data_images'), ('raw_data', 'german_ocr_text'))
)

# run the OCR pass over the image directory
process_images(image_directory=image_directory, output_directory=output_directory)


Skipping Ist-oder-soll.jpeg. Output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text/Ist-oder-soll.txt already exists.
Skipping 1000014342.jpg. Output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text/1000014342.txt already exists.
Skipping letter_school_hamburg.png. Output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text/letter_school_hamburg.txt already exists.
Skipping 1000014343.jpg. Output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text/1000014343.txt already exists.
Skipping letter_arbeitsamt_internet.jpeg. Output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text/letter_arbeitsamt_internet.txt already exists.


In [10]:
### clean the OCR text: re-join hyphenated words, collapse line breaks and extra whitespace, strip bracketed text and the letter header/footer

import os
import re

def clean_extracted_text(text):
    """
    Normalize the raw OCR text of a letter into one line of cleaned body text.

    Steps, in this order:
      1. re-join words that were hyphenated across a line break,
      2. collapse every run of whitespace (newlines included) to one space,
      3. drop text enclosed in ``{...}``, ``(...)`` or ``[...]``,
      4. drop everything up to and including the first ``"Datum:"``,
      5. drop everything before the first "Sehr geehrt" (case-insensitive),
      6. drop "Berliner Sparkasse" and everything after it,
      7. replace ``;`` with ``,`` and ``:`` with ``.``,
      8. collapse whitespace once more and strip both ends.

    Args:
        text (str): Raw OCR output.

    Returns:
        str: The cleaned text; an empty string for empty or whitespace-only input.
    """
    # remove hyphens at line breaks ("Mittag-\nessen" -> "Mittagessen")
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)

    # newlines and repeated blanks become single spaces; doing this before the
    # bracket removal lets a bracket pair that spans several lines still match
    text = re.sub(r'\s+', ' ', text).strip()

    # remove text inside curly or other brackets
    text = re.sub(r'\{.*?\}', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\[.*?\]', '', text)

    # remove text up to and including the first "Datum:"; this has to happen
    # while the colon is still in the text, i.e. before step 7
    datum_index = text.find("Datum:")
    if datum_index != -1:
        text = text[datum_index + len("Datum:"):]

    # remove text before the salutation "Sehr geehrt..."
    salutation = re.search(r'Sehr geehrt', text, flags=re.IGNORECASE)
    if salutation:
        text = text[salutation.start():]

    # find "Berliner Sparkasse" and remove it together with everything after it
    berliner_sparkasse_index = text.find("Berliner Sparkasse")
    if berliner_sparkasse_index != -1:
        text = text[:berliner_sparkasse_index]

    # punctuation normalization; a plain character swap, so doing it after the
    # slicing above yields the same characters as doing it before
    text = text.replace(';', ',').replace(':', '.')

    # bracket removal and slicing can leave double or trailing spaces behind
    return re.sub(r'\s+', ' ', text).strip()

def clean_and_save_files(input_directory, output_directory):
    """
    Clean every ``.txt`` file of ``input_directory`` and store the result in ``output_directory``.

    Each ``<name>.txt`` is read as UTF-8, passed through ``clean_extracted_text``
    and written as ``<name>_cldn.txt``. A file whose output already exists is
    skipped and the existing output is left untouched. Files with any other
    extension are ignored.

    Args:
        input_directory (str): Directory containing the extracted text files.
        output_directory (str): Directory for the cleaned files; created if missing.
    """
    os.makedirs(output_directory, exist_ok=True)

    extension = '.txt'
    for filename in os.listdir(input_directory):
        if not filename.endswith(extension):
            continue

        input_file_path = os.path.join(input_directory, filename)

        # swap only the trailing extension; str.replace would also rewrite a
        # '.txt' that happens to occur earlier in the file name
        cleaned_name = filename[:-len(extension)] + '_cldn.txt'
        output_file_path = os.path.join(output_directory, cleaned_name)

        if os.path.exists(output_file_path):
            print(f"skipping {filename}. output file {output_file_path} already exists.")
            continue

        with open(input_file_path, 'r', encoding='utf-8') as file:
            extracted_text = file.read()

        cleaned_text = clean_extracted_text(extracted_text)

        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_text)

        print(f'cleaned text has been saved to: {output_file_path}')

# project root: the parent of the current working directory
current_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# raw OCR text is read from <root>/raw_data/german_ocr_text
input_directory = os.path.join(current_directory, 'raw_data', 'german_ocr_text')

# cleaned files go to the sibling folder with the "_cleaned" suffix
output_directory = input_directory + '_cleaned'

# run the cleaning pass over every extracted text file
clean_and_save_files(input_directory=input_directory, output_directory=output_directory)


skipping 1000014343.txt. output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text_cleaned/1000014343_cldn.txt already exists.
skipping 1000014342.txt. output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text_cleaned/1000014342_cldn.txt already exists.
skipping letter_school_hamburg.txt. output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text_cleaned/letter_school_hamburg_cldn.txt already exists.
skipping Ist-oder-soll.txt. output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text_cleaned/Ist-oder-soll_cldn.txt already exists.
skipping letter_arbeitsamt_internet.txt. output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text_cleaned/letter_arbeitsamt_internet_cldn.txt already exists.


In [16]:
### trying a longer summarization of the german text

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os

# load the "Shahm/bart-german" checkpoint twice over:
# first its tokenizer, then the seq2seq model itself
tokenizer, model = (
    loader.from_pretrained("Shahm/bart-german")
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)

# function to generate summary with adjusted parameters
def generate_summary_longer(text: str) -> str:
    """
    Summarize ``text`` with the module-level ``tokenizer`` and ``model``.

    The text is tokenized (truncated to at most 1024 tokens) and the token ids
    are handed to ``model.generate``; the first generated sequence is decoded
    without special tokens and returned.

    Args:
        text (str): The text to summarize.

    Returns:
        str: The decoded summary.
    """
    # turn the text into model input -- without this step the generation
    # call receives no input and the summary cannot depend on ``text``
    inputs = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=1024)

    # generate summary
    # NOTE(review): min_length=200 presumably forces long output even for very
    # short letters -- confirm this is intended.
    summary_ids = model.generate(
        inputs,
        max_length=575,
        min_length=200,
        length_penalty=2.0,
        num_beams=2,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# resolve the folders: cleaned OCR text is read, German summaries are written
current_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
input_directory = os.path.join(current_directory, 'raw_data', 'german_ocr_text_cleaned')
output_directory = os.path.join(current_directory, 'raw_data', 'german_summary')

# make sure the summary folder exists before anything is written into it
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# summarize every text file of the input folder, one output file per input file
for filename in os.listdir(input_directory):
    # anything that is not a .txt file is ignored
    if not filename.endswith('.txt'):
        continue

    # the source file and its "<name>_sum.txt" counterpart in the summary folder
    input_file_path = os.path.join(input_directory, filename)
    output_file_path = os.path.join(output_directory, os.path.splitext(filename)[0] + '_sum.txt')

    # an already existing summary is kept as it is
    if os.path.exists(output_file_path):
        print(f'skipping {filename}. output file {output_file_path} already exists.')
        continue

    # load the cleaned letter text
    with open(input_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # let the model summarize it
    summary = generate_summary_longer(text)

    # store the summary next to the others
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(summary)

    print(f'summary has been generated and saved to: {output_file_path}')


summary has been generated and saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_summary/letter_arbeitsamt_internet_cldn_sum.txt
skipping 1000014342_cldn.txt. output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_summary/1000014342_cldn_sum.txt already exists.
skipping 1000014343_cldn.txt. output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_summary/1000014343_cldn_sum.txt already exists.
skipping Ist-oder-soll_cldn.txt. output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_summary/Ist-oder-soll_cldn_sum.txt already exists.
skipping letter_school_hamburg_cldn.txt. output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_summary/letter_school_hamburg_cldn_sum.txt already exists.


In [None]:
### translate german summarized text to english

from transformers import MarianMTModel, MarianTokenizer
import os

# load the "Helsinki-NLP/opus-mt-de-en" checkpoint:
# first the Marian tokenizer, then the Marian translation model
tokenizer_translate, model_translate = (
    loader.from_pretrained("Helsinki-NLP/opus-mt-de-en")
    for loader in (MarianTokenizer, MarianMTModel)
)

# function to translate German text to English
def translate_to_english(german_text):
    """
    Translate ``german_text`` with the module-level ``tokenizer_translate`` / ``model_translate``.

    The text is tokenized in one piece (padding and truncation enabled), the
    model generates output for it, and the first decoded sequence is returned
    with special tokens removed.

    NOTE(review): with ``truncation=True`` an input longer than the tokenizer's
    limit is presumably cut off silently -- confirm for long summaries.

    Args:
        german_text (str): The German text to translate.

    Returns:
        str: The first decoded translation.
    """
    # text -> model input
    encoded = tokenizer_translate(
        german_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    # model input -> generated token sequences -> decoded strings
    decoded = tokenizer_translate.batch_decode(
        model_translate.generate(**encoded),
        skip_special_tokens=True,
    )

    # a single input text was passed in, so only the first entry is returned
    return decoded[0]

# resolve the folders: German summaries are read, English translations are written
current_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
input_directory_german = os.path.join(current_directory, 'raw_data', 'german_summary')
output_directory_english = os.path.join(current_directory, 'raw_data', 'english_summary')

# make sure the folder for the translations exists
if not os.path.exists(output_directory_english):
    os.makedirs(output_directory_english)

# translate every summary file; the English file keeps the name of the German one
for filename in os.listdir(input_directory_german):
    # anything that is not a .txt file is ignored
    if not filename.endswith('.txt'):
        continue

    # load the German summary
    with open(os.path.join(input_directory_german, filename), 'r', encoding='utf-8') as file:
        german_summary = file.read()

    # German -> English
    english_summary = translate_to_english(german_summary)

    # same file name, English output folder
    output_file_path_english = os.path.join(output_directory_english, filename)

    # store the translation (an existing file of that name is overwritten)
    with open(output_file_path_english, 'w', encoding='utf-8') as output_file:
        output_file.write(english_summary)

    print(f'translated summary has been saved to: {output_file_path_english}')


translated summary has been saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/english_summary/1000014343_cldn_sum.txt
translated summary has been saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/english_summary/1000014342_cldn_sum.txt
translated summary has been saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/english_summary/letter_school_hamburg_cldn_sum.txt
translated summary has been saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/english_summary/Ist-oder-soll_cldn_sum.txt


In [None]:
### playground to test several parameters in order to get the best summarization


from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the "Shahm/bart-german" checkpoint: tokenizer first, then the seq2seq model
tokenizer, model = (
    loader.from_pretrained("Shahm/bart-german")
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)

# Sample input for the playground: the German body text of a letter to parents
# about school lunch prices. The text is used verbatim; it apparently stems from
# OCR (note spellings such as "Mittaqessen" / "hamburq") and is deliberately not
# corrected here.
original_text = """
Sehr geehrte Eltern, mit diesem Schreiben möchten wir Sie darüber informieren, dass auf Basis der allgemeinen Preisentwicklung der von den Eltern zu zahlende Höchstpreis für das Mittagessen zum 1. August 2023 um 20 Cent auf 4,35 Euro angehoben wird. Angesichts der nach wie vor hohen Lebensmittelpreise deckt dieser Beitrag derzeit nicht die tatsächlichen Kosten für ein Mittagessen. Die Caterer können daher ab dem 01.08.2023 bis zu 4,80 Euro Mittagessen abrechnen Die Differenz zwischen dem von den Eltern zu zahlenden Maximalpreis von 4,35 Euro und der neuen Preisobergrenze für die Caterer übernimmt die Schulbehörde und rechnet die Differenz direkt mit den Caterern ab. Das gilt für alle Mittagessen, auch die der vollzahlenden Schülerinnen und Schüler. Damit setzt der Hamburger Senat konsequent die seit 2020 begonnene Linie durch, im Sinne der Familien, Kinder und Jugendlichen ein schulisches Mittagessen zu vertretbaren Preisen zu sichern und den an den Hamburger Schulen tätigen Cateringunternehmen angemessene Preise zu ermöglichen. Insgesamt übernehmen die Freie und Hansestadt Hamburg und der Bund deutlich mehr als 50 Prozent der Kosten aller schulischen Mittagessen, um für alle Schülerinnen und Schüler an Hamburgs Schulen ein gesundes Mittagessen zu gewährleisten. Mehr Informationen hierzu finden Sie bei Bedarf unter Mittaqessen_für_die_Hamburqer_Schulen hamburq de.
"""

# Generate summary with longer max_length
def generate_summary_longer(text: str) -> str:
    """
    Summarize ``text`` with the module-level ``tokenizer`` and ``model``.

    The text is tokenized (truncated to at most 1024 tokens), the token ids are
    passed to ``model.generate`` with the generation settings below, and the
    first generated sequence is decoded without special tokens.

    Args:
        text (str): The text to summarize.

    Returns:
        str: The decoded summary.
    """
    # generation settings under test in this playground
    generation_options = {
        "max_length": 575,
        "min_length": 200,
        "length_penalty": 2.0,
        "num_beams": 2,
        "early_stopping": True,
    }

    # text -> token ids -> generated sequences
    token_ids = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=1024)
    generated = model.generate(token_ids, **generation_options)

    # only the first generated sequence is turned back into text
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Summarize the sample letter text with the settings defined in generate_summary_longer
summary_longer = generate_summary_longer(original_text)

# Show the resulting summary as plain text in the cell output
print(summary_longer)


Die Differenz zwischen dem von den Eltern zu zahlenden Maximalpreis von 4,35 Euro und der neuen Preisobergrenze übernimmt die Schulbehörde. Das gilt für alle Mittagessen, auch die der vollzahlendsten Schülerinnen und Schülern. Damit setzt der Hamburger Senat konsequent die seit 2020 begonnene Linie durch. Mehr Informationen hierzu finden Sie bei Bedarf unter Mittaqessen_... "Hamburqer_Schulen hamburq de". Ein Überblick überraschende Hamburger Hamburger Schulen, dass Cateringunternehmen angemessene Preise zu vertretbaren Preisen zu sichern. Die Kosten.
