In [14]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [43]:
with open('t887L.txt', 'r', encoding='utf-8') as file:
    # Read the contents of the file
    doc = file.read()

In [47]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=sentence_model)

In [48]:
from keybert import KeyBERT
from flair.embeddings import TransformerDocumentEmbeddings

roberta = TransformerDocumentEmbeddings('roberta-base')
kw_model = KeyBERT(model=roberta)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [2]:
import csv
import os
from flair.embeddings import TransformerDocumentEmbeddings
from keybert import KeyBERT
import re

def load_and_preprocess_stopwords(file_path):
    with open(file_path, 'r', encoding='UTF-8') as file:
        stopwords = [re.sub(r'\W+', '', line.strip().lower()) for line in file]
    return stopwords

def extract_keywords_and_write(text_path, csv_path, output_path, stopwords, model_name, ngram):
    # Read the text file
    with open(text_path, 'r', encoding="UTF-8") as file:
        text_content = file.read()

    # Count rows in the corresponding CSV file
    with open(csv_path, 'r', newline='', encoding='UTF-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header row
        row_count = sum(1 for row in reader)  # Count rows excluding the header

    # Load the model and extract keywords
    doc_embeddings = TransformerDocumentEmbeddings(model_name)
    kw_model = KeyBERT(model=doc_embeddings)
    keywords = kw_model.extract_keywords(text_content, keyphrase_ngram_range=(ngram, ngram), stop_words=estonian_stopwords, nr_candidates=row_count, top_n=row_count)

    # Write keywords to a new CSV file in the output directory
    with open(output_path, 'w', newline='', encoding="UTF-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(("word", "score"))  # Write header
        writer.writerows(keywords)  # Write keywords and scores

# Load and preprocess Estonian stopwords
estonian_stopwords = load_and_preprocess_stopwords('estonian-stopwords.txt')

# Define directories
txt_dir = 'raw_text'
csv_dir = 'pre_processed_text_data'
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

# Model configurations
models = ['google/mt5-base', 'facebook/mbart-large-50', 'tartuNLP/EstBERT']
ngrams = [1, 2, 3]

# Process each text file in the txt_dir for each model and ngram setting
for model in models:
    model_path = os.path.join(models_dir, model.split('/')[0])  # Model directory
    for ngram in ngrams:
        ngram_path = os.path.join(model_path, f"ngram {ngram}")
        os.makedirs(ngram_path, exist_ok=True)

        for txt_filename in os.listdir(txt_dir):
            if txt_filename.endswith('.txt'):
                base_filename = os.path.splitext(txt_filename)[0]
                csv_filename = f"{base_filename}.csv"
                txt_file_path = os.path.join(txt_dir, txt_filename)
                csv_file_path = os.path.join(csv_dir, csv_filename)
                output_file_path = os.path.join(ngram_path, csv_filename)

                if os.path.exists(csv_file_path):
                    extract_keywords_and_write(txt_file_path, csv_file_path, output_file_path, estonian_stopwords, model, ngram)


  _torch_pytree._register_pytree_node(


In [1]:
import os
import re
import stanza

# Initialize Stanza pipeline for Estonian
nlp = stanza.Pipeline(lang="et")

input_directory = "raw_text/"
output_directory = "raw_text_lemma"

# Create output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Function to handle None values safely and remove specific characters
def clean_text(value):
    value = value if value is not None else ""
    return re.sub(r"[_=+]", "", value)  # Remove unwanted characters

# Process each file in the input directory
for entry in os.scandir(input_directory):
    if entry.is_file() and entry.name.endswith(".txt"):
        print("Processing file:", entry.name)
        # Define the output file path
        output_file_path = os.path.join(output_directory, entry.name)
        with open(entry.path, "r", encoding="utf-8") as input_file:
            text = input_file.read()
            text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
            doc = nlp(text)
            lemmatized_text = []
            # Extract and clean lemmas for each word in each sentence
            for sent in doc.sentences:
                for word in sent.words:
                    cleaned_lemma = clean_text(word.lemma)
                    lemmatized_text.append(cleaned_lemma)
            # Join all cleaned lemmas with a space and write to the output file
            with open(output_file_path, "w", encoding="utf-8") as output_file:
                output_file.write(" ".join(lemmatized_text) + "\n")

print("Lemmatization complete. Files written to", output_directory)


  from .autonotebook import tqdm as notebook_tqdm
2024-04-11 03:40:39 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 5.71MB/s]
2024-04-11 03:40:40 INFO: Loading these models for language: et (Estonian):
| Processor | Package      |
----------------------------
| tokenize  | edt          |
| pos       | edt_nocharlm |
| lemma     | edt_nocharlm |
| depparse  | edt_nocharlm |

2024-04-11 03:40:40 INFO: Using device: cuda
2024-04-11 03:40:40 INFO: Loading: tokenize
  _torch_pytree._register_pytree_node(
2024-04-11 03:40:41 INFO: Loading: pos
2024-04-11 03:40:42 INFO: Loading: lemma
2024-04-11 03:40:42 INFO: Loading: depparse
2024-04-11 03:40:42 INFO: Done loading processors!


Processing file: t10352.txt
Processing file: t105779.txt
Processing file: t105808.txt
Processing file: t106205.txt
Processing file: t106285.txt
Processing file: t106306.txt
Processing file: t106434.txt
Processing file: t106764.txt
Processing file: t10801.txt
Processing file: t108326.txt
Processing file: t10878.txt
Processing file: t109127.txt
Processing file: t10948.txt
Processing file: t110581.txt
Processing file: t112542.txt
Processing file: t113308.txt
Processing file: t1134.txt
Processing file: t114676.txt
Processing file: t115737.txt
Processing file: t116111.txt
Processing file: t116480.txt
Processing file: t119185.txt
Processing file: t120234.txt
Processing file: t120240.txt
Processing file: t121275.txt
Processing file: t123180.txt
Processing file: t123952.txt
Processing file: t124382.txt
Processing file: t126269.txt
Processing file: t126305.txt
Processing file: t126576.txt
Processing file: t12676.txt
Processing file: t127878.txt
Processing file: t128045.txt
Processing file: t129

In [5]:
import csv
import os
from flair.embeddings import TransformerDocumentEmbeddings
from keybert import KeyBERT
import re

def load_and_preprocess_stopwords(file_path):
    with open(file_path, 'r', encoding='UTF-8') as file:
        stopwords = [re.sub(r'\W+', '', line.strip().lower()) for line in file]
    return stopwords

def extract_keywords_and_write(text_path, csv_path, output_path, stopwords, model_name, ngram, is_lemma):
    # Read the text file
    with open(text_path, 'r', encoding="UTF-8") as file:
        text_content = file.read()

    # Count rows in the corresponding CSV file
    with open(csv_path, 'r', newline='', encoding='UTF-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header row
        row_count = sum(1 for row in reader)  # Count rows excluding the header

    # Load the model and extract keywords
    doc_embeddings = TransformerDocumentEmbeddings(model_name)
    kw_model = KeyBERT(model=doc_embeddings)
    keywords = kw_model.extract_keywords(text_content, keyphrase_ngram_range=(ngram, ngram), stop_words=stopwords, nr_candidates=row_count, top_n=row_count)

    # Write keywords to a new CSV file in the output directory
    with open(output_path, 'w', newline='', encoding="UTF-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=';')
        if is_lemma:
            writer.writerow(("lemma", "score"))  # Header for lemmatized data
        else:
            writer.writerow(("word", "score"))  # Header for non-lemmatized data
        writer.writerows(keywords)  # Write keywords and scores

# Load and preprocess Estonian stopwords
estonian_stopwords = load_and_preprocess_stopwords('estonian-stopwords.txt')

# Define directories and configurations
txt_dirs = {'word': 'raw_text', 'lemma': 'raw_text_lemma'}
csv_dir = 'pre_processed_text_data'
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

models = ['google/mt5-base', 'facebook/mbart-large-50', 'tartuNLP/EstBERT']
ngrams = [1, 2, 3]

# Process each text file in both txt_dirs for each model and ngram setting
for txt_type, txt_dir in txt_dirs.items():
    for model in models:
        model_short_name = model.split('/')[0]
        model_path = os.path.join(models_dir, model_short_name)  # Model directory
        for ngram in ngrams:
            ngram_path = os.path.join(model_path, txt_type, f"ngram {ngram}")
            os.makedirs(ngram_path, exist_ok=True)

            for txt_filename in os.listdir(txt_dir):
                if txt_filename.endswith('.txt'):
                    base_filename = os.path.splitext(txt_filename)[0]
                    csv_filename = f"{base_filename}.csv"
                    txt_file_path = os.path.join(txt_dir, txt_filename)
                    csv_file_path = os.path.join(csv_dir, csv_filename)
                    output_file_path = os.path.join(ngram_path, csv_filename)

                    if os.path.exists(csv_file_path):
                        extract_keywords_and_write(txt_file_path, csv_file_path, output_file_path, estonian_stopwords, model, ngram, txt_type == 'lemma')




KeyboardInterrupt: 

In [31]:
from sentence_transformers import SentenceTransformer
def load_and_preprocess_stopwords(file_path):
    with open(file_path, 'r', encoding='UTF-8') as file:
        stopwords = [re.sub(r'\W+', '', line.strip().lower()) for line in file]
    return stopwords
estonian_stopwords = load_and_preprocess_stopwords('estonian-stopwords.txt')
sentence_model = SentenceTransformer("sentence-transformers/LaBSE")
kw_model = KeyBERT(model=sentence_model)
kw_model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words=estonian_stopwords,use_mmr=True, diversity=0.7,nr_candidates=20, top_n=20)

[('inglisekeelne lugu tegelikult', 0.4562),
 ('raadio kaks kuulaja', 0.3459),
 ('ilmuv plaat planeerima', 0.2865),
 ('seen käima joogajutt', 0.2623),
 ('kandma pealkiri šokolaad', 0.262),
 ('vesi agent emmy', 0.2229),
 ('merili marten ansambel', 0.2218),
 ('etapp selg korralik', 0.2174),
 ('laul ooperimetoodika pull', 0.21),
 ('eesti naitis evenessons', 0.2096),
 ('surm vaatama kommenteerima', 0.2031),
 ('laskma lennutatud moskva', 0.192),
 ('läbuvärk töö isik', 0.1916),
 ('põhimõte lehekülg usaldama', 0.1911),
 ('rockilikum alternatiivne aasta', 0.1789),
 ('siduma kergem määramatus', 0.1588),
 ('tahtma tahtma tahtma', 0.1476),
 ('uus hoog 50', 0.1156),
 ('jutt nafta world', 0.0772),
 ('pointaksandasam jaam filosoofiline', 0.0517)]

In [33]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=estonian_stopwords,use_maxsum=True, nr_candidates=20, top_n=20)

[('line', 0.3162),
 ('olenema', 0.3199),
 ('kuulama', 0.3234),
 ('kuulma', 0.3257),
 ('ansambel', 0.3276),
 ('sõnum', 0.3299),
 ('võimalik', 0.3321),
 ('track', 0.3323),
 ('sõitma', 0.3327),
 ('vesi', 0.335),
 ('rääkima', 0.3352),
 ('laulma', 0.3425),
 ('bändama', 0.3445),
 ('raadio', 0.3452),
 ('inglisekeelne', 0.3463),
 ('hooaeg', 0.3496),
 ('inglise', 0.3516),
 ('toimuma', 0.3628),
 ('lauljatar', 0.3642),
 ('käima', 0.3669)]

In [34]:
import csv
import os
import re
from flair.embeddings import TransformerDocumentEmbeddings
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT

def load_and_preprocess_stopwords(file_path):
    with open(file_path, 'r', encoding='UTF-8') as file:
        stopwords = [re.sub(r'\W+', '', line.strip().lower()) for line in file]
    return stopwords

def extract_keywords_and_write(text_path, csv_path, output_path, stopwords, model, ngram, extraction_method, diversity=None):
    # Read the text file
    with open(text_path, 'r', encoding="UTF-8") as file:
        text_content = file.read()

    # Count rows in the corresponding CSV file
    with open(csv_path, 'r', newline='', encoding='UTF-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header row
        row_count = sum(1 for row in reader)  # Count rows excluding the header

    # Load the model and extract keywords
    if isinstance(model, str):
        doc_embeddings = TransformerDocumentEmbeddings(model)
    else:
        doc_embeddings = model  # SentenceTransformer model

    kw_model = KeyBERT(model=doc_embeddings)

    if extraction_method == 'mmr':
        keywords = kw_model.extract_keywords(text_content, keyphrase_ngram_range=(ngram, ngram),
                                             stop_words=stopwords, use_mmr=True, diversity=diversity,
                                             nr_candidates=row_count, top_n=row_count)
    elif extraction_method == 'maxsum':
        keywords = kw_model.extract_keywords(text_content, keyphrase_ngram_range=(ngram, ngram),
                                             stop_words=stopwords, use_maxsum=True,
                                             nr_candidates=row_count, top_n=row_count)
    else:  # Vanilla extraction
        keywords = kw_model.extract_keywords(text_content, keyphrase_ngram_range=(ngram, ngram),
                                             stop_words=stopwords, nr_candidates=row_count, top_n=row_count)

    # Write keywords to a new CSV file in the output directory
    with open(output_path, 'w', newline='', encoding="UTF-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=';')
        writer.writerow(("word", "score"))
        writer.writerows(keywords)

# Load and preprocess Estonian stopwords
estonian_stopwords = load_and_preprocess_stopwords('estonian-stopwords.txt')

# Define directories and configurations
txt_dirs = ['raw_text', 'raw_text_lemma']
csv_dir = 'pre_processed_text_data'
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

models = [
    'google/mt5-base',
    'facebook/mbart-large-50',
    'tartuNLP/EstBERT',
    SentenceTransformer("sentence-transformers/LaBSE")
]
ngrams = [1, 2, 3]
extraction_methods = [
    ('mmr', 0.7),
    ('mmr', 0.5),
    ('maxsum', None),
    ('vanilla', None)  # Vanilla keyword extraction
]

# Process each text file in both txt_dirs for each model, ngram, and extraction method
for txt_dir in txt_dirs:
    for model in models:
        model_name = model if isinstance(model, str) else model.__class__.__name__
        model_path = os.path.join(models_dir, model_name)  # Model directory
        os.makedirs(model_path, exist_ok=True)
        
        for ngram in ngrams:
            for method, diversity in extraction_methods:
                method_path = method if diversity is None else f"{method}_{diversity}"
                output_path = os.path.join(model_path, txt_dir, f"ngram_{ngram}", method_path)
                os.makedirs(output_path, exist_ok=True)

                for txt_filename in os.listdir(txt_dir):
                    if txt_filename.endswith('.txt'):
                        base_filename = os.path.splitext(txt_filename)[0]
                        csv_filename = f"{base_filename}.csv"
                        txt_file_path = os.path.join(txt_dir, txt_filename)
                        csv_file_path = os.path.join(csv_dir, csv_filename)
                        output_file_path = os.path.join(output_path, csv_filename)

                        if os.path.exists(csv_file_path):
                            extract_keywords_and_write(txt_file_path, csv_file_path, output_file_path,
                                                       estonian_stopwords, model, ngram, method, diversity)




KeyboardInterrupt: 

In [65]:
from keybert import KeyBERT
import re

def custom_tokenizer(text):
    # Tokenize on spaces and punctuation but keep hyphenated words together
    tokens = re.split(r'[\s,.!?;:()]+', text)
    return [token for token in tokens if token]

# Sample text
text = "high-quality products are in-demand in today's fast-paced market."

# Instantiate KeyBERT
kw_model = KeyBERT()

# Extract keywords using the custom tokenization
keywords = kw_model.extract_keywords(text, vectorizer=CountVectorizer(tokenizer=custom_tokenizer))
print(keywords)KeyphraseCountVectorizer()

[('eksima', 0.4286),
 ('olenema', 0.3889),
 ('ooperimetoodika', 0.3773),
 ('küsima', 0.3648),
 ('lubama', 0.3551)]

In [68]:
import csv
import os
import re
import warnings
from flair.embeddings import TransformerDocumentEmbeddings
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer

# Suppress specific user warnings about tokenizer fallback issues
warnings.filterwarnings("ignore", message="The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers.*")

def load_and_preprocess_stopwords(file_path):
    with open(file_path, 'r', encoding='UTF-8') as file:
        stopwords = [re.sub(r'\W+', '', line.strip().lower()) for line in file]
    return stopwords

def custom_tokenizer(text):
    # Tokenize on spaces and punctuation but keep hyphenated words together
    tokens = re.split(r'[\s,.!?;:()]+', text)
    return [token for token in tokens if token]

def extract_keywords_and_write(text_path, csv_path, output_path, stopwords, model, ngram, extraction_method, diversity=None):
    # Read the text file
    with open(text_path, 'r', encoding="UTF-8") as file:
        text_content = file.read()

    # Count rows in the corresponding CSV file
    with open(csv_path, 'r', newline='', encoding='UTF-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header row
        row_count = sum(1 for row in reader)  # Count rows excluding the header

    # Configure the vectorizer with the custom tokenizer
    vectorizer = CountVectorizer(tokenizer=custom_tokenizer, stop_words=stopwords)

    # Load the model and extract keywords
    if isinstance(model, str):
        doc_embeddings = TransformerDocumentEmbeddings(model)
    else:
        doc_embeddings = model  # SentenceTransformer model

    kw_model = KeyBERT(model=doc_embeddings)

    if extraction_method == 'mmr':
        keywords = kw_model.extract_keywords(text_content, vectorizer=vectorizer, keyphrase_ngram_range=(ngram, ngram),
                                             use_mmr=True, diversity=diversity,
                                             nr_candidates=row_count, top_n=row_count)
    elif extraction_method == 'maxsum':
        keywords = kw_model.extract_keywords(text_content, vectorizer=vectorizer, keyphrase_ngram_range=(ngram, ngram),
                                             use_maxsum=True,
                                             nr_candidates=row_count, top_n=row_count)
    else:  # Vanilla extraction
        keywords = kw_model.extract_keywords(text_content, vectorizer=vectorizer, keyphrase_ngram_range=(ngram, ngram),
                                             nr_candidates=row_count, top_n=row_count)

    # Write keywords to a new CSV file in the output directory
    with open(output_path, 'w', newline='', encoding="UTF-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=';')
        writer.writerow(("word", "score"))
        writer.writerows(keywords)

# Load stopwords for both regular and lemmatized text
regular_stopwords = load_and_preprocess_stopwords('estonian-stopwords.txt')
lemma_stopwords = load_and_preprocess_stopwords('estonian-stopwords-lemmas.txt')

# Define directories and configurations
txt_dirs = ['raw_text', 'raw_text_lemma']
csv_dir = 'pre_processed_text_data'
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

models = [
    'google/mt5-base',
    'facebook/mbart-large-50',
    'tartuNLP/EstBERT',
    SentenceTransformer("sentence-transformers/LaBSE")
]
ngrams = [1, 2, 3]
extraction_methods = [
    ('mmr', 0.7),
    ('mmr', 0.5),
    ('maxsum', None),
    ('vanilla', None)  # Vanilla keyword extraction
]

# Iterate over all configurations
for txt_dir in txt_dirs:
    for model in models:
        model_name = model if isinstance(model, str) else model.__class__.__name__
        model_path = os.path.join(models_dir, model_name)
        
        stopwords = lemma_stopwords if 'lemma' in txt_dir else regular_stopwords
        
        for ngram in ngrams:
            for method, diversity in extraction_methods:
                method_path = method if diversity is None else f"{method}_{diversity}"
                output_path = os.path.join(model_path, txt_dir, f"ngram_{ngram}", method_path, "with_vectorizer")
                os.makedirs(output_path, exist_ok=True)

                for txt_filename in os.listdir(txt_dir):
                    if txt_filename.endswith('.txt'):
                        base_filename = os.path.splitext(txt_filename)[0]
                        csv_filename = f"{base_filename}.csv"
                        txt_file_path = os.path.join(txt_dir, txt_filename)
                        csv_file_path = os.path.join(csv_dir, csv_filename)
                        output_file_path = os.path.join(output_path, csv_filename)

                        if os.path.exists(csv_file_path):
                            extract_keywords_and_write(txt_file_path, csv_file_path, output_file_path,
                                                       stopwords, model, ngram, method, diversity)




In [69]:
import warnings
# Suppress specific user warnings about the tokenizer
warnings.filterwarnings("ignore", message="The parameter 'token_pattern' will not be used since 'tokenizer' is not None")


In [72]:
import csv
import os
import re
import warnings
from flair.embeddings import TransformerDocumentEmbeddings
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer

# Suppress specific warnings
warnings.filterwarnings("ignore", message="The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers.*")
warnings.filterwarnings("ignore", message="The parameter 'token_pattern' will not be used since 'tokenizer' is not None")

def load_and_preprocess_stopwords(file_path):
    with open(file_path, 'r', encoding='UTF-8') as file:
        stopwords = [re.sub(r'\W+', '', line.strip().lower()) for line in file]
    return stopwords

def custom_tokenizer(text):
    # Tokenize on spaces and punctuation but keep hyphenated words together
    tokens = re.split(r'[\s,.!?;:()]+', text)
    return [token for token in tokens if token]

def extract_keywords_and_write(text_path, csv_path, output_path, stopwords, model_name, ngram, extraction_method, diversity=None):
    try:
        # Read the text file
        with open(text_path, 'r', encoding="UTF-8") as file:
            text_content = file.read()

        # Count rows in the corresponding CSV file but cap at 400
        with open(csv_path, 'r', newline='', encoding='UTF-8') as csvfile:
            reader = csv.reader(csvfile)
            next(reader, None)  # Skip the header row
            row_count = min(400, sum(1 for row in reader))  # Count rows excluding the header, max 400

        # Configure the vectorizer with the custom tokenizer
        vectorizer = CountVectorizer(tokenizer=custom_tokenizer, stop_words=stopwords)

        # Load the model and extract keywords
        doc_embeddings = TransformerDocumentEmbeddings(model_name)

        kw_model = KeyBERT(model=doc_embeddings)

        if extraction_method.startswith('mmr'):
            keywords = kw_model.extract_keywords(text_content, vectorizer=vectorizer, keyphrase_ngram_range=(ngram, ngram),
                                                 use_mmr=True, diversity=diversity,
                                                 nr_candidates=row_count, top_n=row_count)
        elif extraction_method == 'maxsum':
            keywords = kw_model.extract_keywords(text_content, vectorizer=vectorizer, keyphrase_ngram_range=(ngram, ngram),
                                                 use_maxsum=True,
                                                 nr_candidates=row_count, top_n=row_count)

        # Write keywords to a new CSV file in the output directory
        with open(output_path, 'w', newline='', encoding="UTF-8") as csvfile:
            writer = csv.writer(csvfile, delimiter=';')
            writer.writerow(("word", "score"))
            writer.writerows(keywords)
    except Exception as e:
        print(f"Error processing file {text_path}: {e}")

# Model name validation and configuration
model = 'tartuNLP/EstBERT'  # Ensure this is a valid model name

# Load stopwords for both regular and lemmatized text
regular_stopwords = load_and_preprocess_stopwords('estonian-stopwords.txt')
lemma_stopwords = load_and_preprocess_stopwords('estonian-stopwords-lemmas.txt')

# Define directories and configurations
txt_dirs = ['raw_text', 'raw_text_lemma']
csv_dir = 'pre_processed_text_data'
models_dir = 'models/tartuNLP_EstBERT'
os.makedirs(models_dir, exist_ok=True)

ngrams = [1, 2, 3]
extraction_methods = [
    ('mmr', 0.7),
    ('mmr', 0.9),
    ('mmr', 0.2),  # Adding low diversity setting
    ('maxsum', None)
]

# Iterate over configurations
for txt_dir in txt_dirs:
    path_dir = os.path.join(models_dir, txt_dir)
    os.makedirs(path_dir, exist_ok=True)
    
    for ngram in ngrams:
        for method, diversity in extraction_methods:
            method_path = f"{method}_{diversity}" if diversity else method
            output_path = os.path.join(path_dir, f"ngram_{ngram}", method_path)
            os.makedirs(output_path, exist_ok=True)

            for txt_filename in os.listdir(txt_dir):
                if txt_filename.endswith('.txt'):
                    base_filename = os.path.splitext(txt_filename)[0]
                    csv_filename = f"{base_filename}.csv"
                    txt_file_path = os.path.join(txt_dir, txt_filename)
                    csv_file_path = os.path.join(csv_dir, csv_filename)
                    output_file_path = os.path.join(output_path, csv_filename)

                    if os.path.exists(csv_file_path):
                        extract_keywords_and_write(txt_file_path, csv_file_path, output_file_path,
                                                   stopwords, model, ngram, method, diversity)


Error processing file raw_text\t152649.txt: 'NoneType' object is not iterable
Error processing file raw_text\t161609.txt: 'NoneType' object is not iterable
Error processing file raw_text\t162792.txt: 'NoneType' object is not iterable
Error processing file raw_text\t164900.txt: 'NoneType' object is not iterable
Error processing file raw_text\t887.txt: 'NoneType' object is not iterable
Error processing file raw_text\t896.txt: 'NoneType' object is not iterable
Error processing file raw_text\t903.txt: 'NoneType' object is not iterable
Error processing file raw_text\t152649.txt: 'NoneType' object is not iterable
Error processing file raw_text\t161609.txt: 'NoneType' object is not iterable
Error processing file raw_text\t162792.txt: 'NoneType' object is not iterable
Error processing file raw_text\t164900.txt: 'NoneType' object is not iterable
Error processing file raw_text\t887.txt: 'NoneType' object is not iterable
Error processing file raw_text\t896.txt: 'NoneType' object is not iterable
E