In [None]:
import os
import re
import stanza

# Initialize Stanza pipeline for Estonian
nlp = stanza.Pipeline(lang="et")

input_directory = "raw_text/"
output_directory = "raw_text_lemma"

# Create output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Function to handle None values safely and remove specific characters
def clean_text(value):
    value = value if value is not None else ""
    return re.sub(r"[_=+]", "", value)  # Remove unwanted characters

# Process each file in the input directory
for entry in os.scandir(input_directory):
    if entry.is_file() and entry.name.endswith(".txt"):
        print("Processing file:", entry.name)
        # Define the output file path
        output_file_path = os.path.join(output_directory, entry.name)
        with open(entry.path, "r", encoding="utf-8") as input_file:
            text = input_file.read()
            text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
            doc = nlp(text)
            lemmatized_text = []
            # Extract and clean lemmas for each word in each sentence
            for sent in doc.sentences:
                for word in sent.words:
                    cleaned_lemma = clean_text(word.lemma)
                    lemmatized_text.append(cleaned_lemma)
            # Join all cleaned lemmas with a space and write to the output file
            with open(output_file_path, "w", encoding="utf-8") as output_file:
                output_file.write(" ".join(lemmatized_text) + "\n")

print("Lemmatization complete. Files written to", output_directory)


In [25]:
import os
import re
import csv
from torch import cuda
from sentence_transformers import SentenceTransformer
from flair.embeddings import TransformerDocumentEmbeddings
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer

def load_and_preprocess_stopwords(file_path):
    with open(file_path, 'r', encoding='UTF-8') as file:
        stopwords = [re.sub(r'\W+', '', line.strip().lower()) for line in file]
    return stopwords

def custom_tokenizer(doc):
    return re.split(r'[\s,.!?;:()]+', doc)

def read_documents(folder_path):
    docs = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                docs.append((filename, file.read()))
    return docs

def load_model(model_info):
    model_type, model_path = model_info
    if model_type == 'sentence_transformer':
        model = SentenceTransformer(model_path)
    elif model_type == 'flair_transformer':
        model = TransformerDocumentEmbeddings(model_path)
    return KeyBERT(model=model)

def run_models(docs, model, model_name, output_base, ngram_ranges, diversities, lowercase):
    stopwords = load_and_preprocess_stopwords('estonian-stopwords.txt')
    for ngram_range in ngram_ranges:
        vectorizer = CountVectorizer(tokenizer=custom_tokenizer, ngram_range=ngram_range, stop_words=stopwords, token_pattern=None, lowercase=lowercase)
        for diversity in diversities:
            output_dir_path = os.path.join(output_base, f"{model_name}", f"ngram_{ngram_range[0]}_{ngram_range[1]}", f"diversity_{int(diversity*10)}")
            os.makedirs(output_dir_path, exist_ok=True)
            for filename, doc in docs:
                output_path = os.path.join(output_dir_path, f"{filename[:-4]}.csv")
                keywords = model.extract_keywords(doc, use_mmr=True, diversity=diversity, vectorizer=vectorizer, nr_candidates=200, top_n=200)
                with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
                    writer = csv.writer(csvfile, delimiter=';')
                    writer.writerow(['keyphrase', 'score'])
                    for keyphrase, score in keywords:
                        writer.writerow([keyphrase, score])
            print(f"Finished processing {model_name} at ngram range {ngram_range} and diversity {diversity} with nr_candidates=200 and top_n=200 and lowercase={lowercase}")
    del model  # Free up memory
    if cuda.is_available():
        cuda.empty_cache()

def main():
    base_folders = {
        'raw_text': 'models/raw_text_data',
        'raw_text_lemma': 'models/raw_text_lemma_data',
    }
    lcf_folders = {
        'raw_text': 'models/raw_text_data_LCF',
        'raw_text_lemma': 'models/raw_text_lemma_data_LCF'
    }
    models_info = {
        'LaBSE': ('sentence_transformer', 'sentence-transformers/LaBSE'),
        'multi_e5': ('sentence_transformer', 'intfloat/multilingual-e5-large-instruct'),
        'MiniLM_multi': ('sentence_transformer', 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'),
        'MiniLM-L12_multi': ('sentence_transformer', 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'),
        'distilbertMulti': ('flair_transformer', 'distilbert/distilbert-base-multilingual-cased'),
        'bertMulti': ('flair_transformer', 'google-bert/bert-base-multilingual-cased'),
        'xlm-roberta': ('flair_transformer', 'FacebookAI/xlm-roberta-base'),
        'EstBERT': ('flair_transformer', 'tartuNLP/EstBERT'),
        'est-roberta': ('flair_transformer', 'EMBEDDIA/est-roberta')
    }
    ngram_ranges = [(1, 1), (2, 2), (3, 3)]
    diversities = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

    for folder_key in base_folders:
        folder_path = 'raw_text' if 'lemma' not in folder_key else 'raw_text_lemma'
        docs = read_documents(folder_path)
        for model_name, model_info in models_info.items():
            model = load_model(model_info)
            # Process normally
            run_models(docs, model, model_name, base_folders[folder_key], ngram_ranges, diversities, lowercase=True)
            # Process with lowercase=False
            run_models(docs, model, model_name, lcf_folders[folder_key], ngram_ranges, diversities, lowercase=False)

if __name__ == '__main__':
    main()


Finished processing est-roberta at ngram range (1, 1) and diversity 0 with nr_candidates=200 and top_n=200 and lowercase=True
Finished processing est-roberta at ngram range (1, 1) and diversity 1 with nr_candidates=200 and top_n=200 and lowercase=True
Finished processing est-roberta at ngram range (2, 2) and diversity 0 with nr_candidates=200 and top_n=200 and lowercase=True
Finished processing est-roberta at ngram range (2, 2) and diversity 1 with nr_candidates=200 and top_n=200 and lowercase=True
Finished processing est-roberta at ngram range (1, 1) and diversity 0 with nr_candidates=200 and top_n=200 and lowercase=False
Finished processing est-roberta at ngram range (1, 1) and diversity 1 with nr_candidates=200 and top_n=200 and lowercase=False
Finished processing est-roberta at ngram range (2, 2) and diversity 0 with nr_candidates=200 and top_n=200 and lowercase=False
Finished processing est-roberta at ngram range (2, 2) and diversity 1 with nr_candidates=200 and top_n=200 and lowe

In [5]:
import os
import re
import csv

import torch
from torch import cuda
from transformers import pipeline
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer

def load_and_preprocess_stopwords(file_path):
    with open(file_path, 'r', encoding='UTF-8') as file:
        stopwords = [re.sub(r'\W+', '', line.strip().lower()) for line in file]
    return stopwords

def custom_tokenizer(doc):
    return re.split(r'[\s,.!?;:()]+', doc)

def read_documents(folder_path):
    docs = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                docs.append((filename, file.read()))
    return docs



def load_model(model_info):
    device = 0 if torch.cuda.is_available() else -1  # GPU device index (usually 0), -1 for CPU
    model_name = model_info
    hf_model = pipeline("feature-extraction", model=model_name, device=device)
    return KeyBERT(model=hf_model)


def run_models(docs, model, model_name, output_base, ngram_ranges, diversities, lowercase):
    stopwords = load_and_preprocess_stopwords('estonian-stopwords.txt')
    for ngram_range in ngram_ranges:
        vectorizer = CountVectorizer(tokenizer=custom_tokenizer, ngram_range=ngram_range, stop_words=stopwords, token_pattern=None, lowercase=lowercase)
        for diversity in diversities:
            output_dir_path = os.path.join(output_base, f"{model_name}", f"ngram_{ngram_range[0]}_{ngram_range[1]}", f"diversity_{int(diversity*10)}")
            os.makedirs(output_dir_path, exist_ok=True)
            for filename, doc in docs:
                output_path = os.path.join(output_dir_path, f"{filename[:-4]}.csv")
                keywords = model.extract_keywords(doc, use_mmr=True, diversity=diversity, vectorizer=vectorizer, nr_candidates=200, top_n=200)
                with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
                    writer = csv.writer(csvfile, delimiter=';')
                    writer.writerow(['keyphrase', 'score'])
                    for keyphrase, score in keywords:
                        writer.writerow([keyphrase, score])
            print(f"Finished processing {model_name} at ngram range {ngram_range} and diversity {diversity} with nr_candidates=200 and top_n=200 and lowercase={lowercase}")
    del model  # Free up memory
    if cuda.is_available():
        cuda.empty_cache()

def main():
    base_folders = {
        'raw_text': 'models/raw_text_data',
        'raw_text_lemma': 'models/raw_text_lemma_data',
    }
    lcf_folders = {
        'raw_text': 'models/raw_text_data_LCF',
        'raw_text_lemma': 'models/raw_text_lemma_data_LCF'
    }
    models_info = [
        'distilbert-base-cased',
        'bert-base-multilingual-cased',
        'xlm-roberta-base',
        'sentence-transformers/LaBSE'
    ]
    ngram_ranges = [(1, 1), (2, 2), (3, 3)]
    diversities = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

    for folder_key in base_folders:
        folder_path = base_folders[folder_key]
        docs = read_documents(folder_path)
        for model_name in models_info:
            model = load_model(model_name)
            # Process normally
            run_models(docs, model, model_name, folder_path, ngram_ranges, diversities, lowercase=True)
            # Process with lowercase=False
            run_models(docs, model, model_name, lcf_folders[folder_key], ngram_ranges, diversities, lowercase=False)

if __name__ == '__main__':
    main()


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'modelsx/raw_text_data'