In [12]:
import re
import json
import os
import spacy

In [13]:
nlp = spacy.load("de_core_news_lg")

from spacy.language import Language
from spacy.pipeline import Sentencizer

In [14]:
from pre_postprocess_utils import *
import importlib
#importlib.reload()

## 1) Preprocess Text Files:
   1) Remove parentheses and their content.
   2) Expand pre-defined abbreviations & acronyms.
   3) Replace specific disallowed characters
   4) Normalize special characters
      1) only allow german letters (a–z, äöüß), digits (0–9), and basic punctuation like .?!:„“
      2) Disallowed examples: $, §, <, >, , (comma)

In [15]:
def preprocess_files(input_folder_path, output_path, abbr_json_path):
    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)
    # Load abbreviation dictionary
    abbr_map = load_abbreviation_map(abbr_json_path)

    # Process each text file in the input directory
    for filename in os.listdir(input_folder_path):
        if filename.endswith(".txt"):
            input_path = os.path.join(input_folder_path, filename)
            output_filename = filename.replace(".txt", "_preprocessed.txt")
            output_txt_path = os.path.join(output_path, output_filename)
    

            # Read the original text
            with open(input_path, "r", encoding="utf-8") as file:
                text = file.read()

            # Apply transformations
            text = remove_parentheses(text)
            text = expand_abbreviations(text, abbr_map)
            text = normalize_characters(text)  
            text = character_substitution(text)

            # Write the simplified text
            with open(output_txt_path, "w", encoding="utf-8") as file:
                file.write(text)

            #print(f"Processed: {filename} → {output_filename}")

In [16]:
abbr_json_path = "wiki_abkuerzungen.json"
input_folder_path = "preprocessed_texts/apa-rst/0_original"
output_path = "preprocessed_texts/apa-rst/1_preprocessed"

# Example usage
preprocess_files(input_folder_path, output_path, abbr_json_path)

## 2) Parsing and Tokenization using spaCy

In [17]:
# Add a rule-based sentencizer BEFORE the parser to split on newlines too
# Making sure that if headlines are present, they are separated and treated as single sentences when single/double newline is present
@Language.component("newline_sentencizer")
def newline_sentencizer(doc):
    # Split at each newline or double newline
    start = 0
    for i, token in enumerate(doc):
        if token.text == "\n" or token.is_space and "\n\n" in token.text:
            span = doc[start: i]
            if span:
                span[0].is_sent_start = True
            start = i + 1
    return doc

#Add newline sentencizer
nlp.add_pipe("newline_sentencizer", before="parser")

<function __main__.newline_sentencizer(doc)>

In [18]:
def parsing_and_tokenization(input_folder_path, output_path):

    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)

    # Process each text file in the input directory
    for filename in os.listdir(input_folder_path):
        if filename.endswith("_preprocessed.txt"):
            input_path = os.path.join(input_folder_path, filename)
            output_filename = filename.replace("_preprocessed.txt", "_parsed.txt")
            output_txt_path = os.path.join(output_path, output_filename)

            # Read the preprocessed text
            with open(input_path, "r", encoding="utf-8") as file:
                text = file.read()

            # Pre-segment into lines (headlines and paragraphs) & parse
            lines = [line.strip() for line in text.split("\n") if line.strip()]
            docs = [nlp(line) for line in lines]

            with open(output_txt_path, "w", encoding="utf-8") as out:
                for doc in docs:
                    for sent in doc.sents:
                        out.write("<s>\n")
                        for token in sent:
                            if token.is_space:
                                continue

                            out.write(
                                f"{token.text}\t"
                                f"{token.lemma_}\t"
                                f"{token.pos_}\t"
                                f"{token.dep_}\t"
                                f"{token.head.text}\t"
                                f"{token.morph}\t"
                                f"{token.is_stop}\t"
                                f"{token.ent_type_ or '-'}\n"
                            )
                        out.write("</s>\n\n")
                    
            # Print the processed filename
            print(f"Parsed: {filename} → {output_filename}")

In [19]:
#Checking the nlp pipeline order
print(nlp.pipe_names)

['tok2vec', 'tagger', 'morphologizer', 'newline_sentencizer', 'parser', 'lemmatizer', 'attribute_ruler', 'ner']


In [20]:
input_folder_path = "preprocessed_texts/apa-rst/1_preprocessed"
output_path = "preprocessed_texts/apa-rst/2_parse_tokenize"

In [21]:
parsing_and_tokenization(input_folder_path, output_path)

Parsed: sent_1-29-11-21-or_preprocessed.txt → sent_1-29-11-21-or_parsed.txt
Parsed: sent_5-freitag-28-1-22-or_preprocessed.txt → sent_5-freitag-28-1-22-or_parsed.txt
Parsed: sent_5-dienstag-8-2-22-or_preprocessed.txt → sent_5-dienstag-8-2-22-or_parsed.txt
Parsed: sent_1-21-2-18-or_preprocessed.txt → sent_1-21-2-18-or_parsed.txt
Parsed: sent_5-18-1-22-or_preprocessed.txt → sent_5-18-1-22-or_parsed.txt
Parsed: sent_2-29-11-21-or_preprocessed.txt → sent_2-29-11-21-or_parsed.txt
Parsed: sent_4-21-2-18-or_preprocessed.txt → sent_4-21-2-18-or_parsed.txt
Parsed: sent_3-freitag-28-1-22-or_preprocessed.txt → sent_3-freitag-28-1-22-or_parsed.txt
Parsed: sent_3-dienstag-8-2-22-or_preprocessed.txt → sent_3-dienstag-8-2-22-or_parsed.txt
Parsed: sent_4-29-11-21-or_preprocessed.txt → sent_4-29-11-21-or_parsed.txt
Parsed: sent_1-18-1-22-or_preprocessed.txt → sent_1-18-1-22-or_parsed.txt
Parsed: sent_5-21-2-18-or_preprocessed.txt → sent_5-21-2-18-or_parsed.txt
Parsed: sent_4-18-1-22-or_preprocessed.txt

In [22]:
# # old version which worked
# def parsing_and_tokenization(input_folder_path, output_path):

#     # Ensure output directory exists
#     os.makedirs(output_path, exist_ok=True)

#     # Process each text file in the input directory
#     for filename in os.listdir(input_folder_path):
#         if filename.endswith("_preprocessed.txt"):
#             input_path = os.path.join(input_folder_path, filename)
#             output_filename = filename.replace("_preprocessed.txt", "_parsed.txt")
#             output_txt_path = os.path.join(output_path, output_filename)

#             # Read the preprocessed text
#             with open(input_path, "r", encoding="utf-8") as file:
#                 text = file.read()

#             # Parse and tokenize the text
#             # doc = nlp(text)

#             # Pre-segment into lines (headlines and paragraphs) & parse
#             lines = [line.strip() for line in text.split("\n") if line.strip()]
#             docs = [nlp(line) for line in lines]

#             with open(output_txt_path, "w", encoding="utf-8") as out:
#                 for doc in docs:
#                     out.write("<s>\n")

#                     for token in doc:
#                         if token.pos_ == "SPACE":
#                             continue
                    
#                         norm_text, lemma, pos, dep, head = postprocess_token(token)
#                         morph = str(token.morph)
#                         is_stop = str(token.is_stop)
#                         ent_type = token.ent_type_ if token.ent_type_ else "-"

#                         out.write(f"{norm_text}\t{lemma}\t{pos}\t{dep}\t{head}\t{morph}\t{is_stop}\t{ent_type}\n")

#                     out.write("</s>\n\n")
                    
#             # Print the processed filename
#             print(f"Parsed: {filename} → {output_filename}")