In [1]:
import numpy as np
import networkx as nx
import os
import csv
from collections import defaultdict

# Read the stopword list (one entry per line) into a set for fast membership tests.
with open('estonian-stopwords.txt', mode='r', encoding='utf-8') as stopword_file:
    stopword_text = stopword_file.read()
stopwords = set(stopword_text.splitlines())

# Translation table deleting '_', '=' and '+' from word forms and lemmas.
# NOTE(review): presumably these are boundary markers emitted by the upstream
# morphological analyser -- confirm against the pre-processing step.
_MORPH_MARKERS = str.maketrans('', '', '_=+')


def load_and_preprocess_data(filepath, stopwords):
    """Read one pre-processed token file and return its stopword-free sentences.

    The file is read as ``;``-separated rows, one token per row. The columns
    used are: 0 = token index within the sentence (a value of ``'1'`` starts a
    new sentence), 2 = surface form, 3 = lemma, 4 = UPOS tag, 9 = NER tag.
    Tokens tagged ``PUNCT`` or ``NUM`` are ignored. Every remaining token is
    identified by the key ``"<lemma>_<upos>"``.

    Args:
        filepath: Path of the ``;``-separated token file (UTF-8).
        stopwords: Container of lower-case stopwords; only membership tests
            are performed on it.

    Returns:
        A ``(sentences, word_details)`` tuple. ``sentences`` is a list of
        non-empty lists of token keys with stopwords removed. ``word_details``
        maps each surviving key to a dict with the keys ``'original_word'``,
        ``'lemma'``, ``'upos'`` and ``'ner tag'``. When a key occurs several
        times, the details of its last occurrence are kept.

    Raises:
        IndexError: If a non-blank row has too few columns.
    """
    sentences = []
    word_details = {}
    current_sentence = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no token.
                continue
            parts = stripped.split(';')
            if parts[0] == '1' and current_sentence:  # Sentence boundary
                sentences.append(current_sentence)
                current_sentence = []
            lemma = parts[3].translate(_MORPH_MARKERS)
            upos = parts[4]
            if upos in ('PUNCT', 'NUM'):
                continue
            word_key = f"{lemma}_{upos}"
            current_sentence.append(word_key)
            word_details[word_key] = {
                'original_word': parts[2].translate(_MORPH_MARKERS),
                'lemma': lemma,
                'upos': upos,
                'ner tag': parts[9],
            }
    if current_sentence:
        sentences.append(current_sentence)

    # Keep a token only when neither its surface form nor its lemma is a
    # stopword, so inflected forms of a listed lemma are removed as well.
    filtered_word_details = {}
    for word_key, details in word_details.items():
        word = details['original_word'].lower()
        lemma = details['lemma']
        if word in stopwords or lemma.lower() in stopwords:
            continue
        # A lemma without capitals gets its surface form lower-cased; a
        # capitalised lemma keeps the original casing of the surface form.
        if not any(char.isupper() for char in lemma):
            details['original_word'] = word
        filtered_word_details[word_key] = details

    # Rebuild the sentences from surviving keys, dropping emptied sentences.
    filtered_sentences = []
    for sentence in sentences:
        kept = [word_key for word_key in sentence if word_key in filtered_word_details]
        if kept:
            filtered_sentences.append(kept)

    return filtered_sentences, filtered_word_details

def _build_cooccurrence_matrix(sentences, vocab, window_size):
    """Count how often each pair of token keys occurs within ``window_size`` positions."""
    size = len(vocab)
    matrix = np.zeros((size, size), dtype=float)
    for sentence in sentences:
        indices = [vocab[word_key] for word_key in sentence]
        for i, row in enumerate(indices):
            start = max(i - window_size, 0)
            stop = min(i + window_size + 1, len(indices))
            for j in range(start, stop):
                # Only the position itself is excluded; the same key repeated
                # inside the window still adds to the diagonal.
                if i != j:
                    matrix[row, indices[j]] += 1
    return matrix


def calculate_pagerank_scores(pre_processed_dir, stopwords,
                              scores_dir='scores/TextRankScores', window_size=2):
    """Score the tokens of every ``.csv`` file in a directory with PageRank.

    For each file, the tokens returned by ``load_and_preprocess_data`` are
    turned into a co-occurrence matrix, the matrix is converted to a networkx
    graph, and ``nx.pagerank`` is run on it. The result is written as a
    ``;``-separated CSV with the columns ``word``, ``lemma``, ``upos``,
    ``ner tag`` and ``score``, sorted by descending score, under the same
    base name in ``scores_dir``.

    Args:
        pre_processed_dir: Directory holding the pre-processed ``.csv`` files.
        stopwords: Stopword container passed to ``load_and_preprocess_data``.
        scores_dir: Output directory; created if it does not exist.
        window_size: Number of positions on each side of a token that count
            as co-occurring with it.

    Returns:
        None. One output file is written and one progress line is printed
        per input file.
    """
    os.makedirs(scores_dir, exist_ok=True)

    # Sorted so that files are processed in a reproducible order.
    for filename in sorted(os.listdir(pre_processed_dir)):
        if not filename.endswith('.csv'):
            continue
        filepath = os.path.join(pre_processed_dir, filename)
        sentences, word_details = load_and_preprocess_data(filepath, stopwords)

        # Index keys in order of first appearance. Iterating a set of strings
        # would give a different node numbering on every interpreter run, and
        # with it a different order for equally scored rows in the output.
        ordered_keys = dict.fromkeys(
            word_key for sentence in sentences for word_key in sentence
        )
        vocab = {word_key: index for index, word_key in enumerate(ordered_keys)}

        co_occurrence_matrix = _build_cooccurrence_matrix(sentences, vocab, window_size)

        graph = nx.from_numpy_array(co_occurrence_matrix)
        scores = nx.pagerank(graph)

        output_data = []
        for word_key, index in vocab.items():
            details = word_details[word_key]
            output_data.append([
                details['original_word'],
                details['lemma'],
                details['upos'],
                details['ner tag'],
                scores[index],
            ])

        # Stable sort: rows with equal scores keep first-appearance order.
        output_data.sort(key=lambda row: row[-1], reverse=True)

        output_filename = f"{os.path.splitext(os.path.basename(filepath))[0]}.csv"
        output_filepath = os.path.join(scores_dir, output_filename)
        with open(output_filepath, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile, delimiter=';')
            writer.writerow(['word', 'lemma', 'upos', 'ner tag', 'score'])
            writer.writerows(output_data)

        print(f"Processed {filename} -> {output_filepath}")


# Score every pre-processed file using the stopword set loaded above.
calculate_pagerank_scores(
    pre_processed_dir='pre_processed_text_data',
    stopwords=stopwords,
)


Processed t152649.csv -> scores/TextRankScores\t152649.csv
Processed t155284.csv -> scores/TextRankScores\t155284.csv
Processed t157341.csv -> scores/TextRankScores\t157341.csv
Processed t157958.csv -> scores/TextRankScores\t157958.csv
Processed t158936.csv -> scores/TextRankScores\t158936.csv
Processed t161609.csv -> scores/TextRankScores\t161609.csv
Processed t162505.csv -> scores/TextRankScores\t162505.csv
Processed t162792.csv -> scores/TextRankScores\t162792.csv
Processed t164900.csv -> scores/TextRankScores\t164900.csv
Processed t166260.csv -> scores/TextRankScores\t166260.csv
Processed t887.csv -> scores/TextRankScores\t887.csv
Processed t896.csv -> scores/TextRankScores\t896.csv
Processed t903.csv -> scores/TextRankScores\t903.csv
