In [None]:
import os
import re
import csv
from torch import cuda
from sentence_transformers import SentenceTransformer
from flair.embeddings import TransformerDocumentEmbeddings
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer


def load_and_preprocess_stopwords(file_path):
    """Read a stop-word file and return one normalised entry per line.

    Each line is stripped of surrounding whitespace, lower-cased, and has
    every run of non-word characters removed. Lines are never skipped, so a
    blank line contributes an empty string to the result.

    Args:
        file_path: Path to a UTF-8 text file with one stop word per line.

    Returns:
        A list of normalised stop words, in file order.
    """
    non_word_run = re.compile(r"\W+")
    cleaned = []
    with open(file_path, "r", encoding="UTF-8") as handle:
        for raw_line in handle:
            normalised = raw_line.strip().lower()
            cleaned.append(non_word_run.sub("", normalised))
    return cleaned


def custom_tokenizer(doc):
    """Split text into tokens on whitespace and basic punctuation.

    Separators are runs of whitespace or any of ``, . ! ? ; : ( )``.
    ``re.split`` yields an empty string when the text starts or ends with a
    separator (or is empty); those are dropped so they cannot turn into
    empty tokens or padded n-grams in the vectorizer.

    Args:
        doc: The text to tokenise.

    Returns:
        A list of non-empty token strings, in order of appearance.
    """
    return [token for token in re.split(r"[\s,.!?;:()]+", doc) if token]


def read_documents(folder_path):
    """Load every ``.txt`` file found directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of ``(filename, text)`` tuples, in the order returned by
        ``os.listdir``; files are decoded as UTF-8.
    """
    collected = []
    for entry in os.listdir(folder_path):
        # Anything that is not a .txt file is ignored.
        if not entry.endswith(".txt"):
            continue
        full_path = os.path.join(folder_path, entry)
        with open(full_path, "r", encoding="utf-8") as handle:
            collected.append((entry, handle.read()))
    return collected


def load_model(model_info):
    """Build a KeyBERT extractor around the embedding backend in model_info.

    Args:
        model_info: A ``(model_type, model_path)`` pair. ``model_type`` must
            be ``"sentence_transformer"`` or ``"flair_transformer"``;
            ``model_path`` is passed to the matching embedding class.

    Returns:
        A ``KeyBERT`` instance constructed with the loaded embedding model.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    model_type, model_path = model_info
    if model_type == "sentence_transformer":
        model = SentenceTransformer(model_path)
    elif model_type == "flair_transformer":
        # NOTE(review): the device is hard-coded to "cuda" - confirm a GPU is
        # always available where this runs.
        model = TransformerDocumentEmbeddings(model_path, device="cuda")
    else:
        # Fail with a clear message instead of an UnboundLocalError on `model`.
        raise ValueError(
            f"Unknown model type {model_type!r}; expected "
            "'sentence_transformer' or 'flair_transformer'"
        )
    return KeyBERT(model=model)


def run_models(
    docs,
    model,
    model_name,
    output_base,
    ngram_ranges,
    diversities,
    lowercase,
    batch_size=5,
):
    """Extract keyphrases for every document over a grid of settings.

    For each n-gram range and each diversity value, keyphrases are extracted
    with MMR from every document and written to
    ``<output_base>/<model_name>/ngram_<lo>_<hi>/diversity_<int(d*10)>/<doc>.csv``
    as ``;``-separated rows under a ``keyphrase;score`` header.

    Args:
        docs: List of ``(filename, text)`` tuples; the last four characters
            of each filename are replaced by ``.csv`` for the output name.
        model: Object exposing ``extract_keywords`` (a KeyBERT instance in
            this file).
        model_name: Name used for the model's output sub-folder and in the
            progress message.
        output_base: Root directory for all output.
        ngram_ranges: Iterable of ``(min_n, max_n)`` pairs.
        diversities: Iterable of MMR diversity values.
        lowercase: Passed to ``CountVectorizer(lowercase=...)``.
        batch_size: Number of documents extracted before their results are
            written to disk.
    """
    stop_list = load_and_preprocess_stopwords("estonian-stopwords.txt")

    for ngram_range in ngram_ranges:
        # One vectorizer per n-gram range, shared by every diversity value.
        count_vectorizer = CountVectorizer(
            tokenizer=custom_tokenizer,
            ngram_range=ngram_range,
            stop_words=stop_list,
            token_pattern=None,
            lowercase=lowercase,
        )

        for diversity in diversities:
            target_dir = os.path.join(
                output_base,
                f"{model_name}",
                f"ngram_{ngram_range[0]}_{ngram_range[1]}",
                f"diversity_{int(diversity*10)}",
            )
            os.makedirs(target_dir, exist_ok=True)

            for start in range(0, len(docs), batch_size):
                chunk = docs[start : start + batch_size]
                names = [name for name, _ in chunk]
                texts = [body for _, body in chunk]

                # Extract the whole chunk first, then write its files.
                extracted = []
                for body in texts:
                    extracted.append(
                        model.extract_keywords(
                            body,
                            use_mmr=True,
                            diversity=diversity,
                            vectorizer=count_vectorizer,
                            nr_candidates=200,
                            top_n=200,
                        )
                    )

                for name, keywords in zip(names, extracted):
                    # name[:-4] drops the ".txt" suffix.
                    csv_path = os.path.join(target_dir, f"{name[:-4]}.csv")
                    with open(csv_path, "w", newline="", encoding="utf-8") as out:
                        rows = csv.writer(out, delimiter=";")
                        rows.writerow(["keyphrase", "score"])
                        for phrase, relevance in keywords:
                            rows.writerow([phrase, relevance])

            print(
                f"Finished processing {model_name} at ngram range {ngram_range} and diversity {diversity} with nr_candidates=200 and top_n=200 and lowercase={lowercase}"
            )

    # This only drops the local reference; a caller that still holds the model
    # keeps it alive.
    del model
    if cuda.is_available():
        cuda.empty_cache()


def main():
    """Run keyphrase extraction for every corpus, model and casing setting.

    Documents are read from ``raw_text`` or ``raw_text_lemma`` depending on
    the corpus key. Each model is loaded once per corpus and run twice:
    with ``lowercase=True`` into the base output folder, then with
    ``lowercase=False`` into the matching ``_LCF`` folder.
    """
    base_folders = {
        "raw_text": "models/raw_text_data",
        "raw_text_lemma": "models/raw_text_lemma_data",
    }
    lcf_folders = {
        "raw_text": "models/raw_text_data_LCF",
        "raw_text_lemma": "models/raw_text_lemma_data_LCF",
    }
    models_info = {
        "LaBSE": ("sentence_transformer", "sentence-transformers/LaBSE"),
        "multi_e5": ("sentence_transformer", "intfloat/multilingual-e5-large-instruct"),
        "MPNet": (
            "sentence_transformer",
            "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        ),
        "MiniLM-L12_multi": (
            "sentence_transformer",
            "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        ),
        "distilbertMulti": (
            "flair_transformer",
            "distilbert/distilbert-base-multilingual-cased",
        ),
        "bertMulti": ("flair_transformer", "google-bert/bert-base-multilingual-cased"),
        "xlm-roberta": ("flair_transformer", "FacebookAI/xlm-roberta-base"),
        "EstBERT": ("flair_transformer", "tartuNLP/EstBERT"),
        "est-roberta": ("flair_transformer", "EMBEDDIA/est-roberta"),
    }
    ngram_ranges = [(1, 1), (2, 2), (3, 3)]
    diversities = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

    # Lower-cased run first, then the case-preserving run into the LCF folders.
    passes = ((base_folders, True), (lcf_folders, False))

    for folder_key in base_folders:
        if "lemma" in folder_key:
            folder_path = "raw_text_lemma"
        else:
            folder_path = "raw_text"
        docs = read_documents(folder_path)

        for model_name, model_info in models_info.items():
            model = load_model(model_info)
            for output_roots, use_lowercase in passes:
                run_models(
                    docs,
                    model,
                    model_name,
                    output_roots[folder_key],
                    ngram_ranges,
                    diversities,
                    lowercase=use_lowercase,
                )


# Start the extraction run only when this code executes as the main module.
if __name__ == "__main__":
    main()


In [6]:
# Run this cell only to verify the file count, after every model and all 4 output subfolders have finished. The expected count assumes 9 models; if you used a different number, divide the expected total by 9 and multiply it by your model count.
import os


def count_files_in_directory(directory):
    """Return the number of files under a directory, including subfolders.

    Args:
        directory: Root of the tree to walk.

    Returns:
        Total file count over the whole tree; 0 when ``os.walk`` yields
        nothing for the path.
    """
    return sum(len(file_names) for _, _, file_names in os.walk(directory))


def verify_folder(root_dir, expected_files_per_folder, depth):
    """Descend ``depth`` directory levels and check file counts at the bottom.

    At ``depth == 0`` the files under ``root_dir`` are counted recursively
    and an error is printed if the count differs from the expectation.
    Above that, each immediate subdirectory is visited with ``depth - 1``;
    files sitting directly in these intermediate folders are not counted.

    Args:
        root_dir: Directory to verify.
        expected_files_per_folder: File count required in each leaf folder.
        depth: Number of directory levels still to descend.

    Returns:
        Total number of files found in all leaf folders reached.
    """
    if depth == 0:
        found = count_files_in_directory(root_dir)
        if found != expected_files_per_folder:
            print(
                f"Error: {root_dir} has {found} files, expected {expected_files_per_folder}."
            )
        return found

    if not os.path.isdir(root_dir):
        print(f"Missing folder: {root_dir}")
        return 0

    child_dirs = []
    for entry in os.listdir(root_dir):
        candidate = os.path.join(root_dir, entry)
        if os.path.isdir(candidate):
            child_dirs.append(candidate)

    if not child_dirs:
        print(f"Error: No subdirectories found in {root_dir}")

    return sum(
        verify_folder(child, expected_files_per_folder, depth - 1)
        for child in child_dirs
    )


def main():
    """Check the output tree under ``models`` and print found vs expected totals.

    Leaf folders are looked for four levels below ``models`` and each is
    expected to hold 180 files. If ``models`` does not exist, a message is
    printed and nothing else is checked.
    """
    base_dir = "models"
    if not os.path.exists(base_dir):
        print(f"Base directory {base_dir} does not exist.")
        return

    levels_below_base = 4
    files_per_leaf = 180
    found_total = verify_folder(base_dir, files_per_leaf, levels_below_base)
    print(f"Total found files: {found_total}")

    # NOTE(review): the factors appear to be 4 output roots, 9 models,
    # 3 n-gram ranges, 11 diversity values and 180 files per leaf folder -
    # confirm against the extraction settings actually used.
    expected_total = 4 * 9 * 3 * 11 * 180
    print(f"Total expected files: {expected_total}")


# Run the file-count verification only when this code executes as the main module.
if __name__ == "__main__":
    main()


Total found files: 213840
Total expected files: 213840
