In [None]:
import os
import re
import csv
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from flair.embeddings import TransformerDocumentEmbeddings
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer

def load_and_preprocess_stopwords(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        stopwords = [re.sub(r'\W+', '', line.strip().lower()) for line in file]
    return stopwords

def custom_tokenizer(doc):
    tokens = re.split(r'[\s,.!?;:()]+', doc)
    return [token for token in tokens if token]

def get_model_embeddings(model_name):
    if model_name in ['google/mt5-base', 'facebook/mbart-large-50']:
        # Keep using the slow tokenizer for these models
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    return TransformerDocumentEmbeddings(model_name, tokenizer=tokenizer)

def extract_keywords_for_folder(folder_path, stopwords_file, output_base_folder, ngram_range, diversity, lemma_or_word):
    stopwords = load_and_preprocess_stopwords(stopwords_file)
    vectorizer = CountVectorizer(tokenizer=custom_tokenizer, ngram_range=ngram_range, stop_words=stopwords, token_pattern=None)
    diversity_label = f'diversity{int(diversity * 10)}'  # Correct diversity naming

    models = [
        ('EstBERT', 'tartuNLP/EstBERT'),
        ('LaBSE', 'sentence-transformers/LaBSE'),
        ('mBART', 'facebook/mbart-large-50'),
        ('mT5', 'google/mt5-base')
    ]

    for model_name, model_path in models:
        model = get_model_embeddings(model_path)
        model_output_folder = os.path.join(output_base_folder, model_name, lemma_or_word, f'ngram{ngram_range[0]}')
        os.makedirs(model_output_folder, exist_ok=True)
        kw_model = KeyBERT(model=model)

        for file_name in os.listdir(folder_path):
            if file_name.endswith('.txt'):
                file_path = os.path.join(folder_path, file_name)
                with open(file_path, 'r', encoding='utf-8') as file:
                    doc = file.read()

                keywords = kw_model.extract_keywords(doc, use_mmr=True, diversity=diversity, vectorizer=vectorizer, nr_candidates=200, top_n=200)
                csv_output_path = os.path.join(model_output_folder, file_name.replace('.txt', '.csv'))

                with open(csv_output_path, 'w', newline='', encoding='utf-8') as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow(['word', 'score'])
                    writer.writerows(keywords)

if __name__ == '__main__':
    base_folder = 'raw_text'
    lemma_folder = 'raw_text_lemma'
    models_output_folder = 'models'
    ngram_ranges = [(1, 1), (2, 2), (3, 3)]
    diversities = [0.7]

    for diversity in diversities:
        for ngram_range in ngram_ranges:
            output_folder = os.path.join(models_output_folder, f'diversity{int(diversity * 10)}')
            extract_keywords_for_folder(base_folder, 'estonian-stopwords.txt', output_folder, ngram_range, diversity, 'word')
            extract_keywords_for_folder(lemma_folder, 'estonian-stopwords-lemmas.txt', output_folder, ngram_range, diversity, 'lemma')


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
import os
import re
import csv
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from flair.embeddings import TransformerDocumentEmbeddings
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer

def load_and_preprocess_stopwords(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        stopwords = [re.sub(r'\W+', '', line.strip().lower()) for line in file]
    return stopwords

def custom_tokenizer(doc):
    tokens = re.split(r'[\s,.!?;:()]+', doc)
    return [token for token in tokens if token]

def get_model_embeddings(model_name):
    if model_name in ['google/mt5-base', 'facebook/mbart-large-50']:
        # Keep using the slow tokenizer for these models
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    return TransformerDocumentEmbeddings(model_name, tokenizer=tokenizer)

def extract_keywords_for_folder(folder_path, stopwords_file, output_base_folder, ngram_range, diversity, lemma_or_word):
    stopwords = load_and_preprocess_stopwords(stopwords_file)
    vectorizer = CountVectorizer(tokenizer=custom_tokenizer, ngram_range=ngram_range, stop_words=stopwords, token_pattern=None)
    diversity_label = f'diversity{int(diversity * 10)}'  # Correct diversity naming

    models = [
        ('EstBERT', 'tartuNLP/EstBERT'),
        ('LaBSE', 'sentence-transformers/LaBSE'),
        ('mBART', 'facebook/mbart-large-50'),
        ('mT5', 'google/mt5-base')
    ]

    for model_name, model_path in models:
        model = get_model_embeddings(model_path)
        model_output_folder = os.path.join(output_base_folder, model_name, lemma_or_word, f'ngram{ngram_range[0]}')
        os.makedirs(model_output_folder, exist_ok=True)
        kw_model = KeyBERT(model=model)

        for file_name in os.listdir(folder_path):
            if file_name.endswith('.txt'):
                file_path = os.path.join(folder_path, file_name)
                with open(file_path, 'r', encoding='utf-8') as file:
                    doc = file.read()

                keywords = kw_model.extract_keywords(doc, use_mmr=True, diversity=diversity, vectorizer=vectorizer, nr_candidates=200, top_n=200)
                csv_output_path = os.path.join(model_output_folder, file_name.replace('.txt', '.csv'))

                with open(csv_output_path, 'w', newline='', encoding='utf-8') as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow(['word', 'score'])
                    writer.writerows(keywords)

if __name__ == '__main__':
    base_folder = 'raw_text'
    lemma_folder = 'raw_text_lemma'
    models_output_folder = 'models'
    ngram_ranges = [(1, 1), (2, 2), (3, 3)]
    diversities = [0.7]

    for diversity in diversities:
        for ngram_range in ngram_ranges:
            output_folder = os.path.join(models_output_folder, f'diversity{int(diversity * 10)}')
            extract_keywords_for_folder(base_folder, 'estonian-stopwords.txt', output_folder, ngram_range, diversity, 'word')
            extract_keywords_for_folder(lemma_folder, 'estonian-stopwords-lemmas.txt', output_folder, ngram_range, diversity, 'lemma')


In [1]:
import os
import re
import csv
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from flair.embeddings import TransformerDocumentEmbeddings
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [89]:
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from flair.embeddings import TransformerDocumentEmbeddings
with open('t896.txt', 'r', encoding='utf-8') as file:
    # Read the contents of the file
    doc = file.read()
def load_and_preprocess_stopwords(file_path):
    with open(file_path, 'r', encoding='UTF-8') as file:
        stopwords = [re.sub(r'\W+', '', line.strip().lower()) for line in file]
    return stopwords

def custom_tokenizer(doc):
    # Tokenize on spaces and punctuation but keep hyphenated words together
    tokens = re.split(r'[\s,.!?;:()]+', doc)
    return [token for token in tokens if token]

regular_stopwords = load_and_preprocess_stopwords('estonian-stopwords.txt')
vectorizerx = CountVectorizer(tokenizer=custom_tokenizer,ngram_range=(3,3), stop_words=regular_stopwords, token_pattern=None)

MiniLM = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
MiniLM_model = KeyBERT(model=MiniLM)
LaBSE = SentenceTransformer('sentence-transformers/LaBSE')
LaBSE_model = KeyBERT(model=LaBSE)
e5= SentenceTransformer('intfloat/multilingual-e5-large-instruct')
e5_model = KeyBERT(model=e5)

roberta_est= TransformerDocumentEmbeddings('FacebookAI/xlm-roberta-base')
roberta_est_model = KeyBERT(model=roberta_est)

estBERT= TransformerDocumentEmbeddings('tartuNLP/EstBERT')
estBERT_model = KeyBERT(model=estBERT)

keywords = LaBSE_model.extract_keywords(doc, use_mmr=True, diversity=0.7, vectorizer=vectorizerx,nr_candidates=50, top_n=50)


keywords = LaBSE_model.extract_keywords(doc,nr_candidates=50,keyphrase_ngram_range=(1, 1), top_n=50)
print(keywords)

[('tavalise sportlasena teisisõnu', 0.4042), ('kõlab võib-olla klišee', 0.3689), ('alariksi kutse lihtne', 0.3114), ('peaasi mõnusat adrenaliini', 0.3069), ('tallinnas jõuluturniir sporditähtedele', 0.2999), ('allar levandi kuulus', 0.2894), ('mai lõpus kanaaridele', 0.2838), ('klubi hurra liikmed', 0.28), ('täis vaata raadio', 0.2645), ('11 aastat vanem', 0.2557), ('kahes stuudios ain-alar', 0.2545), ('laagrisse lanzarote saarel', 0.2541), ('havaile nii-öelda kvalifikatsiooninumbreid', 0.2529), ('ain-alar juhanson triatleet', 0.2467), ('jaanuar kodukuud ongi', 0.246), ('ain-alar õnnestus kinni', 0.2358), ('materjalid kunagine anstrunki', 0.2345), ('sissejuhatuseks täpsustus sõbrad-tuttavad', 0.228), ('ringkonna väga-väga traagiline', 0.2232), ('rokk-kontserdid täpselt meestele', 0.2227), ('hetk laiali vastus', 0.2216), ('suusatamisest lõpetan otepää', 0.2078), ('paned keha valmis', 0.2059), ('austria prantsusmaa saksamaa', 0.196), ('saateautos satub sisend', 0.1909), ('tegija võistlus

In [91]:
import os
import re
import csv
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from flair.embeddings import TransformerDocumentEmbeddings
from sklearn.feature_extraction.text import CountVectorizer

def load_and_preprocess_stopwords(file_path):
    with open(file_path, 'r', encoding='UTF-8') as file:
        stopwords = [re.sub(r'\W+', '', line.strip().lower()) for line in file]
    return stopwords

def custom_tokenizer(doc):
    return re.split(r'[\s,.!?;:()]+', doc)

def read_documents(folder_path):
    docs = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                docs.append((filename, file.read()))
    return docs

def run_models(docs, models, stopwords, output_base, diversities, ngram_ranges):
    for diversity in diversities:
        for ngram_range in ngram_ranges:
            vectorizer = CountVectorizer(tokenizer=custom_tokenizer, ngram_range=ngram_range, stop_words=stopwords, token_pattern=None)
            for model_name, model in models.items():
                output_dir = os.path.join(output_base, f"diversity_{diversity}", model_name, f"ngram_{ngram_range[0]}")
                os.makedirs(output_dir, exist_ok=True)
                for filename, doc in docs:
                    output_path = os.path.join(output_dir, f"{filename[:-4]}.csv")
                    keywords = model.extract_keywords(doc, use_mmr=True, diversity=diversity, vectorizer=vectorizer, nr_candidates=50, top_n=50)
                    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
                        writer = csv.writer(csvfile, delimiter=';')
                        writer.writerow(['keyphrase', 'score'])
                        for keyphrase, score in keywords:
                            writer.writerow([keyphrase, score])

def main():
    folder_path = 'raw_text'
    stopwords = load_and_preprocess_stopwords('estonian-stopwords.txt')
    models = {
        'LaBSE': KeyBERT(model=SentenceTransformer('sentence-transformers/LaBSE')),
        'MiniLM': KeyBERT(model=SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')),
        'e5': KeyBERT(model=SentenceTransformer('intfloat/multilingual-e5-large-instruct')),
        'roberta_est': KeyBERT(model=TransformerDocumentEmbeddings('FacebookAI/xlm-roberta-base')),
        'estBERT': KeyBERT(model=TransformerDocumentEmbeddings('tartuNLP/EstBERT'))
    }
    docs = read_documents(folder_path)
    diversities = [0, 0.3, 0.7, 1]
    ngram_ranges = [(1, 1), (2, 2), (3, 3)]
    run_models(docs, models, stopwords, 'models', diversities, ngram_ranges)

if __name__ == '__main__':
    main()


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.44 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 19.15 GiB is allocated by PyTorch, and 2.27 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)