In [8]:
### This cell runs OCR (easyocr, German) on every letter image in a directory
### and stores the recognized text of each image as a .txt file.

import os
from PIL import Image
import easyocr
import io

def process_images(image_directory, output_directory):
    """
    Run OCR on every image in a directory and save the recognized text as .txt files.

    For each ``.jpg`` / ``.jpeg`` / ``.png`` file (extension matched
    case-insensitively) a text file with the same base name is written to
    ``output_directory``. Images whose text file already exists are skipped
    *before* any OCR work is done, so re-running the cell is cheap.

    Args:
        image_directory (str): Path to the directory containing the image files.
        output_directory (str): Path to the directory where the output text files
            will be saved. It is created if it does not exist yet.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')

    # make sure the destination exists; otherwise the first write would fail
    os.makedirs(output_directory, exist_ok=True)

    # the OCR reader is expensive to build (model loading), so it is only
    # created once the first image that actually needs processing is found
    reader = None

    # iterate through all files in the directory
    for filename in os.listdir(image_directory):
        # only image files are of interest; accept upper-case extensions too
        if not filename.lower().endswith(image_extensions):
            continue

        # define output path to save extracted text
        output_path = os.path.join(output_directory, f'{os.path.splitext(filename)[0]}.txt')

        # skip early: no point running OCR when the result is already on disk
        if os.path.exists(output_path):
            print(f"Skipping {filename}. Output file {output_path} already exists.")
            continue

        if reader is None:
            reader = easyocr.Reader(['de'])

        # read the raw image bytes for OCR
        image_path = os.path.join(image_directory, filename)
        with open(image_path, 'rb') as image_file:
            image_bytes = image_file.read()

        # extract text from image using easyocr
        result = reader.readtext(image_bytes)

        # the second element of each result entry is joined, one entry per line
        text = '\n'.join(entry[1] for entry in result)

        # save extracted text to a text file
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(text)

        # print a confirmation message about the text extraction
        print(f'Text from {filename} has been saved in: {output_path}.')

# directory containing the scanned letter images
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere
image_directory = '/Users/carstenvolland/code/katia-si/better-letter/text_extractor/data_images/'

# directory where one .txt file per image is written
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere
output_directory = '/Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text'

# process all images in the specified directory
process_images(image_directory, output_directory)


Skipping Ist-oder-soll.jpeg. Output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text/Ist-oder-soll.txt already exists.
Text from letter_school_hamburg.png has been saved in: /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text/letter_school_hamburg.txt.
Skipping letter_arbeitsamt_internet.jpeg. Output file /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text/letter_arbeitsamt_internet.txt already exists.


In [10]:
### function to clean the data so it looks like real sentences


import os
import re

def clean_extracted_text(text):
    """
    Flatten OCR output into a single line of running text.

    Words split across lines with a trailing hyphen are glued back together,
    every remaining newline becomes a space, and any run of whitespace is
    collapsed to one space. Leading and trailing whitespace is removed.

    Args:
        text (str): Raw text, typically with one OCR fragment per line.

    Returns:
        str: The cleaned, single-line text.
    """
    # "Versiche-\nrung" -> "Versicherung"
    dehyphenated = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)
    # line breaks that are left simply separate words
    single_line = dehyphenated.replace('\n', ' ')
    # squeeze whitespace runs and trim the ends
    return re.sub(r'\s+', ' ', single_line).strip()

def clean_and_save_files(input_directory, output_directory):
    """
    Clean every ``.txt`` file in a directory and write the result to another directory.

    Each input file is read as UTF-8, passed through ``clean_extracted_text``
    and written to ``output_directory`` under the same base name with a
    ``_cldn.txt`` suffix. A confirmation line is printed per file.

    Args:
        input_directory (str): Directory containing the raw OCR text files.
        output_directory (str): Directory for the cleaned files; created if missing.
    """
    # create the output directory if it doesn't exist (no check-then-create race)
    os.makedirs(output_directory, exist_ok=True)

    # iterate through all files in the input directory
    for filename in os.listdir(input_directory):
        # only text files are processed
        if not filename.endswith('.txt'):
            continue

        # read the content of the input file
        input_file_path = os.path.join(input_directory, filename)
        with open(input_file_path, 'r', encoding='utf-8') as file:
            extracted_text = file.read()

        # clean the extracted text
        cleaned_text = clean_extracted_text(extracted_text)

        # swap only the final extension; str.replace would also rewrite any
        # earlier '.txt' occurring inside the file name
        base_name = os.path.splitext(filename)[0]
        output_file_path = os.path.join(output_directory, f'{base_name}_cldn.txt')

        # write the cleaned text to the output file
        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_text)

        print(f'Cleaned text has been saved to: {output_file_path}')

# input directory containing the extracted (raw OCR) text files
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere
input_directory = '/Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text'

# output directory where cleaned text files will be saved
# (this rebinds the notebook-global `output_directory` set by the OCR cell)
output_directory = '/Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text_cleaned'

# clean and save text files from the input directory
clean_and_save_files(input_directory, output_directory)


Cleaned text has been saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text_cleaned/letter_school_hamburg_cldn.txt
Cleaned text has been saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text_cleaned/Ist-oder-soll_cldn.txt
Cleaned text has been saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text_cleaned/letter_arbeitsamt_internet_cldn.txt


In [11]:
### translate the cleaned letter text from German to English

import os
from transformers import MarianMTModel, MarianTokenizer

# load the pre-trained German -> English Marian model and its tokenizer
# NOTE: `model_name`, `tokenizer` and `model` are notebook globals that
# translate_german_to_english reads at call time. The summarization cell
# further down rebinds the same three names, so this cell must be re-run
# before translating again after that cell has executed.
model_name = "Helsinki-NLP/opus-mt-de-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# function to translate German text to English
def translate_german_to_english(text, batch_size=8):
    """
    Translate German text to English using the module-level ``tokenizer`` and ``model``.

    The text is split into sentences and translated in batches. Feeding a
    whole letter as one sequence would silently drop everything beyond the
    512-token limit, because the tokenizer truncates the input.

    Args:
        text (str): German text to translate.
        batch_size (int): Number of sentences sent through the model at once.

    Returns:
        str: The English translation, sentences joined by single spaces.
            An empty or whitespace-only input yields an empty string.
    """
    import re  # local import: this cell does not import `re` itself

    # nothing to translate; avoid asking the model to generate from an empty input
    if not text or not text.strip():
        return ""

    # split after sentence-ending punctuation followed by whitespace; fragments
    # caused by abbreviations are still translated, just as shorter pieces
    sentences = [part for part in re.split(r'(?<=[.!?])\s+', text.strip()) if part]

    translated_parts = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]

        # tokenize the batch; truncation stays on as a safety net for a single
        # over-long sentence
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)

        # perform translation
        translated = model.generate(**inputs)

        # decode every sequence of the batch, not just the first one
        translated_parts.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))

    return ' '.join(translated_parts)

# define input and output directories
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere
input_directory = '/Users/carstenvolland/code/katia-si/better-letter/raw_data/german_ocr_text_cleaned'
output_directory = '/Users/carstenvolland/code/katia-si/better-letter/raw_data/english_translation'

# create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# translate and save each file in the input directory
for filename in os.listdir(input_directory):
    # check if the file is a text file
    if filename.endswith('.txt'):
        # construct the full path to the input file
        input_file_path = os.path.join(input_directory, filename)

        # read the content of the input file
        with open(input_file_path, 'r', encoding='utf-8') as file:
            german_text = file.read()

        # translate german text to english
        english_translation = translate_german_to_english(german_text)

        # construct the full path to the output file; only the final extension
        # is swapped (str.replace would also rewrite '.txt' inside the name),
        # matching how the summary cell builds its file names
        output_file_path = os.path.join(output_directory, f'{os.path.splitext(filename)[0]}_en.txt')

        # write english translation to the output file
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(english_translation)

        print(f"Translated text has been saved to: {output_file_path}")




Translated text has been saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/english_translation/letter_arbeitsamt_internet_cldn_en.txt
Translated text has been saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/english_translation/Ist-oder-soll_cldn_en.txt
Translated text has been saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/english_translation/letter_school_hamburg_cldn_en.txt


In [12]:
### summarizes the translated text data


from transformers import BartForConditionalGeneration, BartTokenizer
import os

# load the pre-trained summarization model and its tokenizer
# NOTE: this rebinds the notebook globals `model_name`, `tokenizer` and `model`
# that the translation cell also uses; after this cell has run,
# translate_german_to_english would pick up these objects instead.
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# function to generate summary
def generate_summary(text):
    """
    Summarize ``text`` with the module-level ``tokenizer`` and ``model``.

    The input is tokenized as a one-element batch and truncated to 1024
    tokens. Generation uses beam search with 4 beams and a summary length
    between 30 and 200 tokens.

    Args:
        text (str): The text to summarize.

    Returns:
        str: The decoded summary with special tokens removed.
    """
    encoded = tokenizer(
        [text],
        max_length=1024,
        return_tensors='pt',
        truncation=True,
    )
    generated = model.generate(
        encoded['input_ids'],
        num_beams=4,
        min_length=30,
        max_length=200,
        early_stopping=True,
    )
    # the batch holds exactly one sequence: the summary of `text`
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# source folder (English translations) and destination folder (summaries)
input_directory = '/Users/carstenvolland/code/katia-si/better-letter/raw_data/english_translation'
output_directory = '/Users/carstenvolland/code/katia-si/better-letter/raw_data/english_summary'

# make sure the destination folder is there before anything is written
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# walk the source folder and summarize every text file found
for filename in os.listdir(input_directory):
    # anything that is not a .txt file is ignored
    if not filename.endswith('.txt'):
        continue

    # load the translated letter
    input_file_path = os.path.join(input_directory, filename)
    with open(input_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # condense it
    summary = generate_summary(text)

    # same base name as the input, with a "_sum" suffix
    output_file_path = os.path.join(output_directory, f'{os.path.splitext(filename)[0]}_sum.txt')

    # persist the summary
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(summary)

    print(f"Summary has been generated and saved to: {output_file_path}")


Summary has been generated and saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/english_summary/letter_school_hamburg_cldn_en_sum.txt
Summary has been generated and saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/english_summary/letter_arbeitsamt_internet_cldn_en_sum.txt
Summary has been generated and saved to: /Users/carstenvolland/code/katia-si/better-letter/raw_data/english_summary/Ist-oder-soll_cldn_en_sum.txt
