In [2]:
import os
import re
import pandas as pd
import nltk
import networkx as nx

# Download the NLTK "punkt" resource (the call reports "already up-to-date"
# when it is present locally).
# NOTE(review): presumably required by the nltk.word_tokenize call used in
# preprocess_text - confirm the resource name against the installed NLTK version.
nltk.download("punkt")


def load_stopwords(file_path):
    """Read a stopword list with one entry per line into a set.

    Args:
        file_path: Path to a UTF-8 encoded text file, one stopword per line.

    Returns:
        set[str]: The entries with surrounding whitespace removed. Blank
        lines are skipped, so an empty file yields an empty set.

    Raises:
        OSError: If the file cannot be opened.
    """
    # "utf-8-sig" decodes plain UTF-8 too, but additionally drops a leading
    # byte-order mark so the first entry is not stored as "\ufeff<word>".
    with open(file_path, "r", encoding="utf-8-sig") as file:
        stripped_lines = (line.strip() for line in file)
        return {entry for entry in stripped_lines if entry}


def preprocess_text(text, stopwords_set):
    """Tokenize ``text`` and drop stopwords and symbol-only tokens.

    Args:
        text: Raw document text.
        stopwords_set: Collection of stopwords. Each token is lowercased
            before the membership test, so entries only match when they are
            stored in lowercase.

    Returns:
        list[str]: The remaining tokens in document order, with their
        original casing.
    """
    # Remove everything that is not a word character, whitespace or a hyphen,
    # so hyphenated compounds stay in one piece.
    cleaned = re.sub(r"[^\w\s-]", "", text)
    kept = []
    for token in nltk.word_tokenize(cleaned):
        # Because hyphens survive the regex, a free-standing dash ("a - b")
        # comes back as its own token; it carries no letter or digit and
        # must not become a keyword candidate.
        if not any(ch.isalnum() for ch in token):
            continue
        if token.lower() in stopwords_set:
            continue
        kept.append(token)
    return kept


def build_graph(words, window_size=2):
    """Build an undirected co-occurrence graph over ``words``.

    Every distinct word becomes a node, and two words are linked when they
    occur within ``window_size`` consecutive positions of each other.

    Args:
        words: List of tokens in document order.
        window_size: Number of consecutive tokens that form one
            co-occurrence window. Defaults to 2 (only adjacent tokens are
            linked).

    Returns:
        networkx.Graph: Unweighted graph of the tokens. Texts shorter than
        the window are still linked as far as their tokens reach.

    Raises:
        ValueError: If ``window_size`` is smaller than 2.
    """
    if window_size < 2:
        raise ValueError("window_size must be at least 2")
    gr = nx.Graph()
    # dict.fromkeys de-duplicates while keeping first-occurrence order. A
    # plain set() would insert the nodes in hash order, which changes from
    # run to run for strings and makes everything derived from the node
    # order non-reproducible.
    gr.add_nodes_from(dict.fromkeys(words))
    for i, word in enumerate(words):
        # Linking each token to the following window_size - 1 tokens gives
        # the same edges as pairing all tokens of every sliding window, but
        # without adding each pair several times.
        for neighbour in words[i + 1 : i + window_size]:
            gr.add_edge(word, neighbour)
    return gr


def extract_keywords(graph, top_n=180):
    """Score the graph's nodes with PageRank and return the best ones.

    Args:
        graph: networkx graph whose nodes are the candidate keywords.
        top_n: Maximum number of pairs to return. Defaults to 180; ``None``
            returns every node.

    Returns:
        list[tuple]: ``(keyword, score)`` pairs ordered by descending score.
        Pairs with equal scores are ordered by keyword, so the result does
        not depend on the order in which nodes were added to the graph.
    """
    ranks = nx.pagerank(graph)
    # Negating the score sorts from highest to lowest, while the keyword
    # acts as a deterministic tie-breaker for the top_n cut.
    ranked_keywords = sorted(ranks.items(), key=lambda item: (-item[1], item[0]))
    return ranked_keywords[:top_n]


def save_keywords_to_csv(keywords_data, folder, base_filename):
    """Write keyword/score pairs to ``<folder>/<base_filename>.csv``.

    The file is semicolon-separated, has a ``Keyword;Score`` header row and
    no index column.

    Args:
        keywords_data: Iterable of ``(keyword, score)`` pairs.
        folder: Directory that receives the file; created when missing.
        base_filename: File name without the ``.csv`` extension.
    """
    # Create the target directory on demand so the function does not depend
    # on the caller having prepared it. An empty folder means "current
    # directory", which os.makedirs would reject.
    if folder:
        os.makedirs(folder, exist_ok=True)
    df = pd.DataFrame(keywords_data, columns=["Keyword", "Score"])
    file_path = os.path.join(folder, f"{base_filename}.csv")
    df.to_csv(file_path, sep=";", index=False, header=True)


# Stopword list used to filter tokens; the relative path is resolved against
# the current working directory.
stopwords = load_stopwords("estonian-stopwords.txt")

raw_text_dir = "raw_text"
words_dir = os.path.join("scores", "TextRank", "words")
os.makedirs(words_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes the run
# reproducible.
for filename in sorted(os.listdir(raw_text_dir)):
    file_path = os.path.join(raw_text_dir, filename)
    # Skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, "r", encoding="utf-8") as file:
        raw_text = file.read()

    # One CSV per input document, named after the document without extension.
    processed_words = preprocess_text(raw_text, stopwords)
    text_graph = build_graph(processed_words)
    keywords_raw = extract_keywords(text_graph)
    base_filename = os.path.splitext(filename)[0]
    save_keywords_to_csv(keywords_raw, words_dir, base_filename)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\herma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
import os
import re
import pandas as pd
import nltk
import networkx as nx

# Download the NLTK "punkt" resource (the call reports "already up-to-date"
# when it is present locally).
# NOTE(review): presumably required by the nltk.word_tokenize call used in
# preprocess_text - confirm the resource name against the installed NLTK version.
nltk.download("punkt")


def load_stopwords(file_path):
    """Read a stopword list with one entry per line into a set.

    Args:
        file_path: Path to a UTF-8 encoded text file, one stopword per line.

    Returns:
        set[str]: The entries with surrounding whitespace removed. Blank
        lines are skipped, so an empty file yields an empty set.

    Raises:
        OSError: If the file cannot be opened.
    """
    # "utf-8-sig" decodes plain UTF-8 too, but additionally drops a leading
    # byte-order mark so the first entry is not stored as "\ufeff<word>".
    with open(file_path, "r", encoding="utf-8-sig") as file:
        stripped_lines = (line.strip() for line in file)
        return {entry for entry in stripped_lines if entry}


def preprocess_text(text, stopwords_set):
    """Lowercase and tokenize ``text``, dropping stopwords and symbol-only tokens.

    Args:
        text: Raw document text.
        stopwords_set: Collection of stopwords. Tokens are lowercased before
            the membership test, so entries only match when they are stored
            in lowercase.

    Returns:
        list[str]: The remaining lowercase tokens in document order.
    """
    # Remove everything that is not a word character, whitespace or a hyphen,
    # so hyphenated compounds stay in one piece.
    cleaned = re.sub(r"[^\w\s-]", "", text)
    kept = []
    for token in nltk.word_tokenize(cleaned.lower()):
        # Because hyphens survive the regex, a free-standing dash ("a - b")
        # comes back as its own token; it carries no letter or digit and
        # must not become a keyword candidate.
        if not any(ch.isalnum() for ch in token):
            continue
        if token in stopwords_set:
            continue
        kept.append(token)
    return kept


def build_graph(words, window_size=2):
    """Build an undirected co-occurrence graph over ``words``.

    Every distinct word becomes a node, and two words are linked when they
    occur within ``window_size`` consecutive positions of each other.

    Args:
        words: List of tokens in document order.
        window_size: Number of consecutive tokens that form one
            co-occurrence window. Defaults to 2 (only adjacent tokens are
            linked).

    Returns:
        networkx.Graph: Unweighted graph of the tokens. Texts shorter than
        the window are still linked as far as their tokens reach.

    Raises:
        ValueError: If ``window_size`` is smaller than 2.
    """
    if window_size < 2:
        raise ValueError("window_size must be at least 2")
    gr = nx.Graph()
    # dict.fromkeys de-duplicates while keeping first-occurrence order. A
    # plain set() would insert the nodes in hash order, which changes from
    # run to run for strings and makes everything derived from the node
    # order non-reproducible.
    gr.add_nodes_from(dict.fromkeys(words))
    for i, word in enumerate(words):
        # Linking each token to the following window_size - 1 tokens gives
        # the same edges as pairing all tokens of every sliding window, but
        # without adding each pair several times.
        for neighbour in words[i + 1 : i + window_size]:
            gr.add_edge(word, neighbour)
    return gr


def extract_keywords(graph, top_n=180):
    """Score the graph's nodes with PageRank and return the best ones.

    Args:
        graph: networkx graph whose nodes are the candidate keywords.
        top_n: Maximum number of pairs to return. Defaults to 180; ``None``
            returns every node.

    Returns:
        list[tuple]: ``(keyword, score)`` pairs ordered by descending score.
        Pairs with equal scores are ordered by keyword, so the result does
        not depend on the order in which nodes were added to the graph.
    """
    ranks = nx.pagerank(graph)
    # Negating the score sorts from highest to lowest, while the keyword
    # acts as a deterministic tie-breaker for the top_n cut.
    ranked_keywords = sorted(ranks.items(), key=lambda item: (-item[1], item[0]))
    return ranked_keywords[:top_n]


def save_keywords_to_csv(keywords_data, folder, base_filename):
    """Write keyword/score pairs to ``<folder>/<base_filename>.csv``.

    The file is semicolon-separated, has a ``Keyword;Score`` header row and
    no index column.

    Args:
        keywords_data: Iterable of ``(keyword, score)`` pairs.
        folder: Directory that receives the file; created when missing.
        base_filename: File name without the ``.csv`` extension.
    """
    # Create the target directory on demand so the function does not depend
    # on the caller having prepared it. An empty folder means "current
    # directory", which os.makedirs would reject.
    if folder:
        os.makedirs(folder, exist_ok=True)
    df = pd.DataFrame(keywords_data, columns=["Keyword", "Score"])
    file_path = os.path.join(folder, f"{base_filename}.csv")
    df.to_csv(file_path, sep=";", index=False, header=True)


# Stopword list used to filter tokens; the relative path is resolved against
# the current working directory.
stopwords = load_stopwords("estonian-stopwords.txt")


raw_text_dir = "raw_text"
words_dir = os.path.join("scores", "TextRank", "words_LCF")
os.makedirs(words_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes the run
# reproducible.
for filename in sorted(os.listdir(raw_text_dir)):
    file_path = os.path.join(raw_text_dir, filename)
    # Skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, "r", encoding="utf-8") as file:
        raw_text = file.read()

    # One CSV per input document, named after the document without extension.
    processed_words = preprocess_text(raw_text, stopwords)
    text_graph = build_graph(processed_words)
    keywords_raw = extract_keywords(text_graph)
    base_filename = os.path.splitext(filename)[0]
    save_keywords_to_csv(keywords_raw, words_dir, base_filename)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\herma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
