In [1]:
# Explicit %pip magic: does not rely on IPython's automagic being enabled and
# installs into the environment of the running kernel.
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Explicit %pip magic: does not rely on IPython's automagic being enabled and
# installs into the environment of the running kernel.
%pip install fuzzywuzzy python-Levenshtein

Note: you may need to restart the kernel to use updated packages.


In [21]:
# CREATE KEYWORDS AND KEYPHRASES SEPARATION
import os
import pandas as pd
from openpyxl import load_workbook


def ensure_directory_exists(path):
    """Create ``path`` (and any missing parent directories) if it does not exist.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True makes the call safe when the directory already exists and
    # avoids a race between checking for the directory and creating it.
    os.makedirs(path, exist_ok=True)


def extract_id_from_filename(filename):
    """Return the text between the first "(" and the next ")" in ``filename``.

    Args:
        filename: File name expected to carry an ID in parentheses,
            e.g. ``"scores (M1).xlsx"``.

    Returns:
        The text between the parentheses, or ``None`` when there is no "("
        followed by a ")". An empty pair ``"()"`` yields ``""``.
    """
    start = filename.find("(")
    if start == -1:
        return None
    # Search for the closing parenthesis only after the opening one, so a
    # stray ")" earlier in the name cannot produce a bogus empty ID.
    end = filename.find(")", start + 1)
    if end == -1:
        return None
    return filename[start + 1 : end]


def update_csv(data_frame, csv_path):
    """Write ``data_frame`` to ``csv_path`` as CSV, leaving out the index column."""
    data_frame.to_csv(path_or_buf=csv_path, index=False)


def generate_ngrams(words, max_size=3):
    """Build every contiguous word sequence of length 2 up to ``max_size``.

    Shorter n-grams come first; within one length they follow their position
    in ``words``. Each n-gram is its words joined by a single space.
    """
    return [
        " ".join(words[begin : begin + length])
        for length in range(2, max_size + 1)
        for begin in range(len(words) - length + 1)
    ]


def process_cell_content(cell_content, csv_path_keywords, csv_path_keyphrases):
    """Add the keywords and keyphrases found in one cell to their CSV files.

    The cell text is split on whitespace. Every word is a keyword; every 2- and
    3-word n-gram plus the full multi-word text is a keyphrase. Entries that
    are already present in a CSV file are not added again, and new entries are
    appended in the order they occur in the cell.

    Args:
        cell_content: Text of one spreadsheet cell.
        csv_path_keywords: CSV file with a single ``keyword`` column; created
            if it does not exist.
        csv_path_keyphrases: CSV file with a single ``keyphrase`` column;
            created if it does not exist.
    """

    def load_column(csv_path, column):
        # Read every value as plain text. With pandas' defaults an entry such
        # as "NA" or "null" comes back as NaN and an all-digit column comes
        # back as numbers, which defeats the duplicate check below and
        # corrupts the entry on the next write.
        if os.path.exists(csv_path):
            return pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        return pd.DataFrame(columns=[column])

    def append_new(data_frame, column, candidates):
        # Collect all unseen values first and concatenate once instead of
        # growing the DataFrame one row at a time.
        seen = set(data_frame[column])
        fresh = []
        for value in candidates:
            if value not in seen:
                seen.add(value)
                fresh.append(value)
        if not fresh:
            return data_frame
        return pd.concat(
            [data_frame, pd.DataFrame({column: fresh})], ignore_index=True
        )

    keywords_data = load_column(csv_path_keywords, "keyword")
    keyphrases_data = load_column(csv_path_keyphrases, "keyphrase")

    # str.split() without arguments never yields empty or padded tokens.
    words = cell_content.split()
    all_phrases = generate_ngrams(words) if len(words) > 1 else []
    if len(words) > 1:
        all_phrases.append(" ".join(words))

    keyphrases_data = append_new(keyphrases_data, "keyphrase", all_phrases)
    keywords_data = append_new(keywords_data, "keyword", words)

    update_csv(keywords_data, csv_path_keywords)
    update_csv(keyphrases_data, csv_path_keyphrases)


def process_excel_file(file_path, source_id):
    """Extract keywords and keyphrases from every ``.txt`` column of a workbook.

    Each column of the active sheet whose first cell is a string ending in
    ".txt" is handled as one document: every other string cell in that column
    is passed to ``process_cell_content``, which accumulates the results in
    ``filol_scores/keywords/philologist_<source_id>/<name>.csv`` and
    ``filol_scores/keyphrases/philologist_<source_id>/<name>.csv``.

    Args:
        file_path: Path of the ``.xlsx`` workbook to read.
        source_id: ID inserted into the ``philologist_<source_id>`` output
            directory names.
    """
    suffix = ".txt"
    wb = load_workbook(filename=file_path, data_only=True)
    sheet = wb.active

    base_dir = "filol_scores"
    keywords_dir = os.path.join(base_dir, "keywords", f"philologist_{source_id}")
    keyphrases_dir = os.path.join(base_dir, "keyphrases", f"philologist_{source_id}")
    ensure_directory_exists(keywords_dir)
    ensure_directory_exists(keyphrases_dir)

    for col in sheet.iter_cols(
        min_row=1, max_col=sheet.max_column, min_col=1, values_only=True
    ):
        filename_cell = col[0]
        if not (isinstance(filename_cell, str) and filename_cell.endswith(suffix)):
            continue

        # Slice the suffix off. str.rstrip(".txt") would strip ANY trailing
        # ".", "t" or "x" characters and turn e.g. "text.txt" into "te".
        base_filename = filename_cell[: -len(suffix)]
        csv_path_keywords = os.path.join(keywords_dir, f"{base_filename}.csv")
        csv_path_keyphrases = os.path.join(keyphrases_dir, f"{base_filename}.csv")

        for cell in col[1:]:
            # Empty cells (None) and non-text values are skipped.
            if isinstance(cell, str):
                process_cell_content(cell, csv_path_keywords, csv_path_keyphrases)


if __name__ == "__main__":
    # Run the extraction for every .xlsx workbook in the input folder whose
    # file name carries an ID in parentheses.
    base_path = "filol_scores/originalFiles"
    for file in os.listdir(base_path):
        if not file.endswith(".xlsx"):
            continue
        file_path = os.path.join(base_path, file)
        source_id = extract_id_from_filename(file)
        if not source_id:
            continue
        process_excel_file(file_path, source_id)


In [25]:
# SCRIPT TO CREATE LEMMATIZED VERSIONS
import os
import pandas as pd
from openpyxl import load_workbook
import stanza

# Stanza pipeline for Estonian ("et") with tokenization and lemmatization,
# built once here and used by lemmatize_content().
nlp = stanza.Pipeline(
    lang="et",
    processors="tokenize,lemma",
)


def ensure_directory_exists(path):
    """Create ``path`` (and any missing parent directories) if it does not exist.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True makes the call safe when the directory already exists and
    # avoids a race between checking for the directory and creating it.
    os.makedirs(path, exist_ok=True)


def extract_id_from_filename(filename):
    """Return the text between the first "(" and the next ")" in ``filename``.

    Args:
        filename: File name expected to carry an ID in parentheses,
            e.g. ``"scores (M1).xlsx"``.

    Returns:
        The text between the parentheses, or ``None`` when there is no "("
        followed by a ")". An empty pair ``"()"`` yields ``""``.
    """
    start = filename.find("(")
    if start == -1:
        return None
    # Search for the closing parenthesis only after the opening one, so a
    # stray ")" earlier in the name cannot produce a bogus empty ID.
    end = filename.find(")", start + 1)
    if end == -1:
        return None
    return filename[start + 1 : end]


def update_csv(data_frame, csv_path):
    """Write ``data_frame`` to ``csv_path`` as CSV, leaving out the index column."""
    data_frame.to_csv(path_or_buf=csv_path, index=False)


def lemmatize_content(text):
    """Lemmatize ``text`` with the module-level Stanza pipeline ``nlp``.

    Returns:
        A flat list holding the lemma of every word of every sentence, in
        document order.
    """
    doc = nlp(text)
    lemmas = []
    for sentence in doc.sentences:
        lemmas.extend(word.lemma for word in sentence.words)
    return lemmas


def generate_ngrams(words, max_size=3):
    """Collect all space-joined runs of 2 to ``max_size`` consecutive words.

    The result lists all 2-word runs first, then 3-word runs, and so on; runs
    of equal length appear in the order of their starting position.
    """
    word_count = len(words)
    collected = []
    for width in range(2, max_size + 1):
        last_start = word_count - width
        collected.extend(
            " ".join(words[offset : offset + width])
            for offset in range(last_start + 1)
        )
    return collected


def process_cell_content(cell_content, csv_path_keywords, csv_path_keyphrases):
    """Add the lemmatized keywords and keyphrases of one cell to their CSV files.

    The cell text is lemmatized with ``lemmatize_content``. Every lemma is a
    keyword; every 2- and 3-lemma n-gram plus the full multi-lemma sequence is
    a keyphrase. Entries that are already present in a CSV file are not added
    again, and new entries are appended in the order they occur in the cell.

    Args:
        cell_content: Text of one spreadsheet cell.
        csv_path_keywords: CSV file with a single ``keyword`` column; created
            if it does not exist.
        csv_path_keyphrases: CSV file with a single ``keyphrase`` column;
            created if it does not exist.
    """

    def load_column(csv_path, column):
        # Read every value as plain text. With pandas' defaults an entry such
        # as "NA" or "null" comes back as NaN and an all-digit column comes
        # back as numbers, which defeats the duplicate check below and
        # corrupts the entry on the next write.
        if os.path.exists(csv_path):
            return pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        return pd.DataFrame(columns=[column])

    def append_new(data_frame, column, candidates):
        # Collect all unseen values first and concatenate once instead of
        # growing the DataFrame one row at a time.
        seen = set(data_frame[column])
        fresh = []
        for value in candidates:
            if value not in seen:
                seen.add(value)
                fresh.append(value)
        if not fresh:
            return data_frame
        return pd.concat(
            [data_frame, pd.DataFrame({column: fresh})], ignore_index=True
        )

    keywords_data = load_column(csv_path_keywords, "keyword")
    keyphrases_data = load_column(csv_path_keyphrases, "keyphrase")

    lemmatized_words = lemmatize_content(cell_content)
    ngrams = generate_ngrams(lemmatized_words)
    if len(lemmatized_words) > 1:
        ngrams.append(" ".join(lemmatized_words))

    keyphrases_data = append_new(keyphrases_data, "keyphrase", ngrams)
    keywords_data = append_new(keywords_data, "keyword", lemmatized_words)

    update_csv(keywords_data, csv_path_keywords)
    update_csv(keyphrases_data, csv_path_keyphrases)


def process_excel_file(file_path, source_id):
    """Extract lemmatized keywords/keyphrases from every ``.txt`` column.

    Each column of the active sheet whose first cell is a string ending in
    ".txt" is handled as one document: every other string cell in that column
    is passed to ``process_cell_content``, which accumulates the results in
    ``filol_scores/keywords_lemma/philologist_<source_id>/<name>.csv`` and
    ``filol_scores/keyphrases_lemma/philologist_<source_id>/<name>.csv``.

    Args:
        file_path: Path of the ``.xlsx`` workbook to read.
        source_id: ID inserted into the ``philologist_<source_id>`` output
            directory names.
    """
    suffix = ".txt"
    wb = load_workbook(filename=file_path, data_only=True)
    sheet = wb.active

    base_dir = "filol_scores"
    keywords_dir = os.path.join(base_dir, "keywords_lemma", f"philologist_{source_id}")
    keyphrases_dir = os.path.join(
        base_dir, "keyphrases_lemma", f"philologist_{source_id}"
    )
    ensure_directory_exists(keywords_dir)
    ensure_directory_exists(keyphrases_dir)

    for col in sheet.iter_cols(
        min_row=1, max_col=sheet.max_column, min_col=1, values_only=True
    ):
        filename_cell = col[0]
        if not (isinstance(filename_cell, str) and filename_cell.endswith(suffix)):
            continue

        # Slice the suffix off. str.rstrip(".txt") would strip ANY trailing
        # ".", "t" or "x" characters and turn e.g. "text.txt" into "te".
        base_filename = filename_cell[: -len(suffix)]
        csv_path_keywords = os.path.join(keywords_dir, f"{base_filename}.csv")
        csv_path_keyphrases = os.path.join(keyphrases_dir, f"{base_filename}.csv")

        for cell in col[1:]:
            # Empty cells (None) and non-text values are skipped.
            if isinstance(cell, str):
                process_cell_content(cell, csv_path_keywords, csv_path_keyphrases)


if __name__ == "__main__":
    # Run the lemmatized extraction for every .xlsx workbook in the input
    # folder whose file name carries an ID in parentheses.
    base_path = "filol_scores/originalFiles"
    for file in os.listdir(base_path):
        if not file.endswith(".xlsx"):
            continue
        file_path = os.path.join(base_path, file)
        source_id = extract_id_from_filename(file)
        if not source_id:
            continue
        process_excel_file(file_path, source_id)


2024-05-04 15:57:58 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 5.51MB/s]
2024-05-04 15:57:58 INFO: Loading these models for language: et (Estonian):
| Processor | Package      |
----------------------------
| tokenize  | edt          |
| lemma     | edt_nocharlm |

2024-05-04 15:57:58 INFO: Using device: cuda
2024-05-04 15:57:58 INFO: Loading: tokenize
2024-05-04 15:57:58 INFO: Loading: lemma
2024-05-04 15:57:58 INFO: Done loading processors!


In [26]:
# CLEAN "_", "+" AND "=" CHARACTERS LEFT BY LEMMATIZATION
import os
import pandas as pd


def clean_text(text):
    """Return ``text`` with every "_", "+" and "=" character removed."""
    return text.translate(str.maketrans("", "", "_+="))


def process_files(directory):
    """Clean the text columns of every CSV file under ``directory``.

    Walks ``directory`` recursively. Each ``.csv`` file is loaded, its
    ``keyword`` and/or ``keyphrase`` column (whichever exists) is passed
    through ``clean_text``, and the file is rewritten in place.

    Args:
        directory: Root directory to search for CSV files.
    """
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            if not filename.endswith(".csv"):
                continue
            path = os.path.join(root, filename)
            # Load every value as a plain string. With pandas' defaults,
            # entries such as "NA" or "null" are parsed as NaN (a float) and
            # clean_text would fail on them with AttributeError.
            df = pd.read_csv(path, dtype=str, keep_default_na=False)
            for column in ("keyword", "keyphrase"):
                if column in df.columns:
                    df[column] = df[column].apply(clean_text)
            df.to_csv(path, index=False)


if __name__ == "__main__":
    # Clean both lemmatized output trees in place.
    base_directories = [
        "filol_scores/keywords_lemma",
        "filol_scores/keyphrases_lemma",
    ]
    for base_dir in base_directories:
        process_files(base_dir)


In [28]:
import os
import csv


def read_keywords(filename):
    """Load the first column of a CSV file as a set of normalized keywords.

    The first row is treated as a header and skipped. Each remaining non-empty
    row contributes its first field, stripped of surrounding whitespace and
    lower-cased.

    Args:
        filename: Path of a UTF-8 encoded CSV file.

    Returns:
        Set of normalized keywords; empty when the file has no data rows,
        including a completely empty file.
    """
    with open(filename, newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        # The None default keeps a zero-byte file from raising StopIteration.
        next(reader, None)
        return {row[0].strip().lower() for row in reader if row}


def overlap_similarity(set1, set2):
    """Overlap coefficient: shared items divided by the size of the smaller set.

    Returns 0.0 when either set is empty.
    """
    if set1 and set2:
        shared_count = len(set1.intersection(set2))
        return shared_count / min(len(set1), len(set2))
    return 0.0


def jaccard_similarity(set1, set2):
    """Jaccard index: size of the intersection divided by size of the union.

    Two empty sets are defined to have similarity 1.0.
    """
    combined = set1.union(set2)
    if len(combined) == 0:
        return 1.0
    shared = set1.intersection(set2)
    return len(shared) / len(combined)


def main():
    """Compare the keyword sets of philologists M1 and M2 file by file.

    For every ``.csv`` file present in both
    ``filol_scores/keywords/philologist_M1`` and
    ``filol_scores/keywords/philologist_M2`` this computes the overlap and
    Jaccard similarities, buckets the file by its number of shared keywords,
    and tracks which philologist listed more keywords. A summary is printed;
    percentages and averages are relative to the number of files compared.
    """
    base_dir = "filol_scores"
    m1_dir = os.path.join(base_dir, "keywords/philologist_M1")
    m2_dir = os.path.join(base_dir, "keywords/philologist_M2")

    similarities = {"Overlap": [], "Jaccard": []}
    # Files without any shared keyword fall into none of these buckets.
    categories = {f"{i} words": 0 for i in range(1, 11)}
    categories[">10 words"] = 0

    m1_more_words = 0
    m2_more_words = 0
    same_word_count = 0

    total_keywords_m1 = 0
    total_keywords_m2 = 0

    # sorted() makes the processing order, and thus the float sums, reproducible.
    for filename in sorted(os.listdir(m1_dir)):
        if not filename.endswith(".csv"):
            continue
        m1_path = os.path.join(m1_dir, filename)
        m2_path = os.path.join(m2_dir, filename)
        # Only regular files present on both sides are comparable; this also
        # skips same-named directories that open() could not read.
        if not (os.path.isfile(m1_path) and os.path.isfile(m2_path)):
            continue

        m1_keywords = read_keywords(m1_path)
        m2_keywords = read_keywords(m2_path)
        similarities["Overlap"].append(overlap_similarity(m1_keywords, m2_keywords))
        similarities["Jaccard"].append(jaccard_similarity(m1_keywords, m2_keywords))

        num_common = len(m1_keywords.intersection(m2_keywords))
        if num_common > 10:
            categories[">10 words"] += 1
        elif num_common >= 1:
            categories[f"{num_common} words"] += 1

        if len(m1_keywords) > len(m2_keywords):
            m1_more_words += 1
        elif len(m2_keywords) > len(m1_keywords):
            m2_more_words += 1
        else:
            same_word_count += 1

        total_keywords_m1 += len(m1_keywords)
        total_keywords_m2 += len(m2_keywords)

    # Use the number of files actually compared rather than a fixed constant,
    # so percentages and averages stay correct when the data set changes.
    total_files = len(similarities["Overlap"])
    if total_files == 0:
        print("No comparable files found.")
        return

    print("File Count by Common Words Category:")
    for category, count in categories.items():
        percentage = (count / total_files) * 100
        print(f"  {category}: {count} files, {percentage:.2f}%")

    print(f"Philologist M1 has more words in {m1_more_words} files.")
    print(f"Philologist M2 has more words in {m2_more_words} files.")
    print(
        f"Both philologists have the same number of words in {same_word_count} files."
    )

    avg_keywords_m1 = total_keywords_m1 / total_files
    avg_keywords_m2 = total_keywords_m2 / total_files
    print(f"Average number of keywords for Philologist M1: {avg_keywords_m1:.2f}")
    print(f"Average number of keywords for Philologist M2: {avg_keywords_m2:.2f}")

    avg_overlap = sum(similarities["Overlap"]) / total_files
    avg_jaccard = sum(similarities["Jaccard"]) / total_files
    print(f"Average Overlap Similarity across all files: {avg_overlap:.4f}")
    print(f"Average Jaccard Similarity across all files: {avg_jaccard:.4f}")


if __name__ == "__main__":
    main()


File Count by Common Words Category:
  1 words: 1 files, 0.56%
  2 words: 18 files, 10.00%
  3 words: 27 files, 15.00%
  4 words: 26 files, 14.44%
  5 words: 36 files, 20.00%
  6 words: 23 files, 12.78%
  7 words: 17 files, 9.44%
  8 words: 7 files, 3.89%
  9 words: 10 files, 5.56%
  10 words: 6 files, 3.33%
  >10 words: 8 files, 4.44%
Philologist M1 has more words in 77 files.
Philologist M2 has more words in 70 files.
Both philologists have the same number of words in 33 files.
Average number of keywords for Philologist M1: 13.24
Average number of keywords for Philologist M2: 13.07
Average Overlap Similarity across all files: 0.4372
Average Jaccard Similarity across all files: 0.2732


In [24]:
import os
import csv


def read_keywords(filename):
    """Load the first column of a CSV file as a set of normalized keywords.

    The first row is treated as a header and skipped. Each remaining non-empty
    row contributes its first field, stripped of surrounding whitespace and
    lower-cased.

    Args:
        filename: Path of a UTF-8 encoded CSV file.

    Returns:
        Set of normalized keywords; empty when the file has no data rows,
        including a completely empty file.
    """
    with open(filename, newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header; the None default keeps a zero-byte file from
        # raising StopIteration.
        next(reader, None)
        return {row[0].strip().lower() for row in reader if row}


def jaccard_similarity(set1, set2):
    """Jaccard index of two sets: ``|intersection| / |union|``.

    When both sets are empty the similarity is defined as 1.0.
    """
    union_size = len(set1.union(set2))
    shared_size = len(set1.intersection(set2))
    return shared_size / union_size if union_size else 1.0


def main():
    """Print the mean Jaccard similarity of the lemmatized M1/M2 keyword sets.

    Every ``.csv`` file present in both
    ``filol_scores/keywords_lemma/philologist_M1`` and
    ``filol_scores/keywords_lemma/philologist_M2`` is compared; the average of
    the per-file similarities is printed.
    """
    base_dir = "filol_scores"
    m1_dir = os.path.join(base_dir, "keywords_lemma/philologist_M1")
    m2_dir = os.path.join(base_dir, "keywords_lemma/philologist_M2")

    similarities = []

    # sorted() makes the processing order, and thus the float sum, reproducible.
    for filename in sorted(os.listdir(m1_dir)):
        if not filename.endswith(".csv"):
            continue
        m1_path = os.path.join(m1_dir, filename)
        m2_path = os.path.join(m2_dir, filename)
        # Only regular files present on both sides are comparable; this also
        # skips same-named directories that open() could not read.
        if not (os.path.isfile(m1_path) and os.path.isfile(m2_path)):
            continue

        m1_keywords = read_keywords(m1_path)
        m2_keywords = read_keywords(m2_path)
        similarities.append(jaccard_similarity(m1_keywords, m2_keywords))

    if similarities:
        average_similarity = sum(similarities) / len(similarities)
        print(f"Average Jaccard Similarity across all files: {average_similarity:.4f}")
    else:
        print("No comparable files found.")


if __name__ == "__main__":
    main()


Average Jaccard Similarity across all files: 0.2970
