In [3]:
import os
import re
import stanza

# Estonian Stanza pipeline restricted to the tokenizer and lemmatizer.
nlp = stanza.Pipeline(lang="et", processors="tokenize,lemma")

# Directory that is scanned for input files, and the directory that receives
# the lemmatized copies (written under the same file names).
input_directory = "raw_text/"
output_directory = "raw_text_lemma/"

# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_directory, exist_ok=True)

def clean_text(text):
    """Remove ``_``, ``=`` and ``+`` characters that sit between two word characters.

    Only occurrences with a word character (``\\w``) immediately on both
    sides are dropped; the same symbols at the start or end of the string,
    or next to whitespace/punctuation, are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with the matching characters removed.
    """
    # NOTE(review): presumably these symbols are boundary markers emitted
    # inside lemmas by the lemmatizer — confirm against the model output.
    marker_between_word_chars = r'(?<=\w)[_=+](?=\w)'
    return re.sub(marker_between_word_chars, '', text)


# Lemmatize every ".txt" file found directly in input_directory and write the
# cleaned, space-joined lemmas to a file of the same name in output_directory.
# Using os.scandir() as a context manager closes the directory handle even if
# processing one of the files raises.
with os.scandir(input_directory) as entries:
    # os.scandir() yields entries in arbitrary order; sorting by name keeps
    # the processing order (and the progress log) reproducible between runs.
    for entry in sorted(entries, key=lambda e: e.name):
        if not (entry.is_file() and entry.name.endswith(".txt")):
            continue

        print("Processing file:", entry.name)

        output_file_path = os.path.join(output_directory, entry.name)

        with open(entry.path, "r", encoding="utf-8") as input_file:
            text = input_file.read()

        doc = nlp(text)
        # NOTE(review): fall back to the surface form when a word carries no
        # lemma; a None lemma would otherwise make str.join raise TypeError.
        # Assumes Stanza can leave Word.lemma unset for some tokens — confirm.
        lemmatized_text = " ".join(
            word.lemma or word.text
            for sent in doc.sentences
            for word in sent.words
        )

        cleaned_text = clean_text(lemmatized_text)

        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.write(cleaned_text)

        print("Finished processing file:", entry.name)


2024-05-04 19:57:25 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 4.83MB/s]
2024-05-04 19:57:25 INFO: Loading these models for language: et (Estonian):
| Processor | Package      |
----------------------------
| tokenize  | edt          |
| lemma     | edt_nocharlm |

2024-05-04 19:57:25 INFO: Using device: cuda
2024-05-04 19:57:25 INFO: Loading: tokenize
2024-05-04 19:57:26 INFO: Loading: lemma
2024-05-04 19:57:26 INFO: Done loading processors!


Processing file: t10352.txt
Finished processing file: t10352.txt
Processing file: t105779.txt
Finished processing file: t105779.txt
Processing file: t105808.txt
Finished processing file: t105808.txt
Processing file: t106205.txt
Finished processing file: t106205.txt
Processing file: t106285.txt
Finished processing file: t106285.txt
Processing file: t106306.txt
Finished processing file: t106306.txt
Processing file: t106434.txt
Finished processing file: t106434.txt
Processing file: t106764.txt
Finished processing file: t106764.txt
Processing file: t10801.txt
Finished processing file: t10801.txt
Processing file: t10878.txt
Finished processing file: t10878.txt
Processing file: t109127.txt
Finished processing file: t109127.txt
Processing file: t10948.txt
Finished processing file: t10948.txt
Processing file: t110581.txt
Finished processing file: t110581.txt
Processing file: t112542.txt
Finished processing file: t112542.txt
Processing file: t1134.txt
Finished processing file: t1134.txt
Process