In [6]:
# CREATES THE LOWERCASED VERSION OF THE REFERENCE CORPUS (LCT = LOWER CASE TRUE)
import pandas as pd
import os


input_dir = "SimpleMaths/corpus/reference"
output_dir = "SimpleMaths/corpus/reference_LCT"

# Both output sub-folders must exist before any CSV is written into them.
for subfolder in ("lemmas", "words"):
    os.makedirs(os.path.join(output_dir, subfolder), exist_ok=True)


def process_csv(file_path, output_subdir):
    """Write a lower-cased, de-duplicated copy of one reference frequency list.

    The CSV at ``input_dir/file_path`` is read, its first column is lower-cased,
    rows that now share the same first-column value are merged by summing the
    second column, and the result is sorted by that sum in descending order.
    Only these two columns are written to
    ``output_dir/output_subdir/<basename of file_path>``.

    Args:
        file_path: Path of the source CSV relative to the module-level ``input_dir``.
        output_subdir: Sub-folder of the module-level ``output_dir`` to write into.
    """
    # keep_default_na=False: in a word list, tokens such as "null", "nan" or "NA"
    # are real entries. With pandas' default NA parsing they become NaN and the
    # groupby below would silently drop them.
    data = pd.read_csv(os.path.join(input_dir, file_path), keep_default_na=False)
    key_column, count_column = data.columns[0], data.columns[1]

    # astype(str) keeps .str usable even if every token happens to look numeric.
    data[key_column] = data[key_column].astype(str).str.lower()
    data = data.groupby(key_column, as_index=False).agg({count_column: "sum"})
    data = data.sort_values(by=count_column, ascending=False)
    data.to_csv(
        os.path.join(output_dir, output_subdir, os.path.basename(file_path)),
        index=False,
    )


# Lemma list first, then the word-form list.
for csv_name, subdir in (("lemmas/lemmas.csv", "lemmas"), ("words/words.csv", "words")):
    process_csv(csv_name, subdir)

print(
    "CSV files have been processed and saved with summed counts for identical entries, sorted by count."
)


CSV files have been processed and saved with summed counts for identical entries, sorted by count.


In [2]:
# CREATES PRE_PROCESSED_TEXT_DATA_FOLDER
import os
import re
import stanza

# Estonian Stanza pipeline used to annotate every raw text file.
nlp = stanza.Pipeline(lang="et", use_gpu=True)

# Column labels of the per-token table. The writing loop emits feat_labels[1:-1]
# as the CSV header, so "file" and "sent text" are not written to the files.
feat_labels = [
    "file",
    "sent id",
    "word id",
    "word",
    "lemma",
    "upos",
    "deprel",
    "head",
    "head upos",
    "head id",
    "ner tag",
    "sent text",
]
input_directory = "raw_text/"
output_directory = "pre_processed_text_data"
# exist_ok=True avoids the check-then-create race and matches the first cell.
os.makedirs(output_directory, exist_ok=True)


def handle_none(value):
    """Substitute the string 'None' for a missing value; pass anything else through."""
    if value is None:
        return "None"
    return value


import csv  # needed here: rows are written with csv.writer (see below)

# Annotate every .txt file in input_directory with Stanza and write one
# semicolon-separated row per word to <output_directory>/<name>.csv.
for entry in os.scandir(input_directory):
    if not (entry.is_file() and entry.name.endswith(".txt")):
        continue

    print("Processing file:", entry.name)
    output_file_path = os.path.join(output_directory, entry.name[:-4] + ".csv")

    # Read and close the input before the (slow) parse instead of holding it open.
    with open(entry.path, "r", encoding="utf-8") as input_file:
        text = input_file.read()
    # Collapse every whitespace run (including newlines) into one space.
    text = re.sub(r"\s+", " ", text).strip()
    doc = nlp(text)

    # csv.writer quotes fields that contain the delimiter or a quote character.
    # Joining with ";" by hand wrote ';' and '"' tokens raw, which shifted or
    # merged columns when the files were later read with csv.DictReader.
    with open(output_file_path, "w", encoding="utf-8", newline="") as output:
        writer = csv.writer(output, delimiter=";", lineterminator="\n")
        writer.writerow(feat_labels[1:-1])
        for sentence_id, sent in enumerate(doc.sentences, start=1):
            for word in sent.words:
                # word.head is a 1-based index into sent.words; 0 marks the root.
                head_word = sent.words[word.head - 1] if word.head > 0 else None
                writer.writerow(
                    [
                        str(sentence_id),
                        str(word.id),
                        word.text,
                        handle_none(word.lemma),
                        word.upos,
                        word.deprel,
                        head_word.text if head_word is not None else "root",
                        head_word.upos if head_word is not None else "_",
                        str(word.head),
                        "O",  # constant placeholder for the "ner tag" column
                    ]
                )

print("Processing complete. Data written to the output directory.")


2024-05-04 19:53:26 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 4.58MB/s]
2024-05-04 19:53:28 INFO: Loading these models for language: et (Estonian):
| Processor | Package      |
----------------------------
| tokenize  | edt          |
| pos       | edt_nocharlm |
| lemma     | edt_nocharlm |
| depparse  | edt_nocharlm |

2024-05-04 19:53:28 INFO: Using device: cuda
2024-05-04 19:53:28 INFO: Loading: tokenize
2024-05-04 19:53:28 INFO: Loading: pos
2024-05-04 19:53:28 INFO: Loading: lemma
2024-05-04 19:53:28 INFO: Loading: depparse
2024-05-04 19:53:28 INFO: Done loading processors!


Processing file: t10352.txt
Processing file: t105779.txt
Processing file: t105808.txt
Processing file: t106205.txt
Processing file: t106285.txt
Processing file: t106306.txt
Processing file: t106434.txt
Processing file: t106764.txt
Processing file: t10801.txt
Processing file: t10878.txt
Processing file: t109127.txt
Processing file: t10948.txt
Processing file: t110581.txt
Processing file: t112542.txt
Processing file: t1134.txt
Processing file: t114676.txt
Processing file: t115737.txt
Processing file: t116111.txt
Processing file: t116480.txt
Processing file: t120240.txt
Processing file: t121275.txt
Processing file: t126269.txt
Processing file: t12676.txt
Processing file: t127878.txt
Processing file: t128045.txt
Processing file: t129828.txt
Processing file: t131838.txt
Processing file: t134839.txt
Processing file: t135605.txt
Processing file: t136064.txt
Processing file: t136798.txt
Processing file: t137720.txt
Processing file: t138410.txt
Processing file: t138675.txt
Processing file: t139

In [3]:
# REMOVE THE = _ + CHARACTERS FROM THE PRE-PROCESSED CSV FILES (CLEAN STRINGS)
# Folder whose .csv files are cleaned in place by the loop further down in this cell.
output_directory = "pre_processed_text_data"


def clean_line(line):
    """Return ``line`` with every '+', '=' and '_' character removed."""
    for marker in "+=_":
        line = line.replace(marker, "")
    return line


# Rewrite every .csv file in output_directory in place, passing each line
# (header included) through clean_line.
for csv_entry in os.scandir(output_directory):
    if not csv_entry.is_file() or not csv_entry.name.endswith(".csv"):
        continue

    with open(csv_entry.path, "r", encoding="utf-8") as source:
        cleaned_text = "".join(map(clean_line, source))

    with open(csv_entry.path, "w", encoding="utf-8") as target:
        target.write(cleaned_text)

print("Cleaning complete. All CSV files have been processed.")


Cleaning complete. All CSV files have been processed.


In [3]:
# TAG THE STOPWORDS
import os
import csv


def load_stopwords(file):
    """Load a stop-word list (one word per line) as a set of lower-cased strings.

    Surrounding whitespace is stripped from every line. Blank lines are skipped
    so the empty string never ends up in the set, and the file is read as
    ``utf-8-sig`` so a leading byte-order mark does not stick to the first word.

    Args:
        file: Path of the UTF-8 text file to read.

    Returns:
        set[str]: The distinct, lower-cased, non-empty entries of the file.
    """
    with open(file, "r", encoding="utf-8-sig") as f:
        normalised = (line.strip().lower() for line in f)
        return {word for word in normalised if word}


def process_files(
    source_directory, destination_directory, stopwords, stopwords_lemmas, column
):
    """Build a per-file frequency list for ``column`` and tag stop words.

    For every ``.csv`` file in ``source_directory`` (semicolon-separated, with a
    header row) the rows are counted per (value, upos, ner tag). A value that
    starts with an upper-case letter is lower-cased when the row's lemma is
    lower-case. The result is written to ``destination_directory`` under the
    same file name with the columns ``column``, ``upos``, ``ner tag``,
    ``count`` and ``stopword``.

    Args:
        source_directory: Folder with the per-token CSV files.
        destination_directory: Folder for the frequency lists (created if missing).
        stopwords: Collection of lower-cased stop-word forms.
        stopwords_lemmas: Collection of lower-cased stop-word lemmas.
        column: Name of the column to count, e.g. ``"word"`` or ``"lemma"``.

    Raises:
        KeyError: If a file lacks ``column``, ``upos`` or ``ner tag``.
    """
    os.makedirs(destination_directory, exist_ok=True)

    for filename in os.listdir(source_directory):
        if not filename.endswith(".csv"):
            continue

        counts = {}
        with open(
            os.path.join(source_directory, filename), "r", encoding="utf-8", newline=""
        ) as file:
            reader = csv.DictReader(file, delimiter=";")
            for row in reader:
                # DictReader fills missing trailing fields with None, and tokens
                # made only of stripped characters ("=", "+", "_") are empty
                # strings, so neither value may be indexed or used unguarded.
                original_value = row[column] or ""
                lemma = row.get("lemma") or ""

                # [:1] instead of [0]: an empty value must not raise IndexError.
                if lemma.islower() and original_value[:1].isupper():
                    processed_value = original_value.lower()
                else:
                    processed_value = original_value

                # Form used for the stop-word lookup: markers removed, lower-cased.
                lookup_value = (
                    processed_value.replace("_", "").replace("=", "").lower()
                )

                key = (processed_value, row["upos"], row["ner tag"], lookup_value)
                counts[key] = counts.get(key, 0) + 1

        with open(
            os.path.join(destination_directory, filename),
            "w",
            encoding="utf-8",
            newline="",
        ) as outfile:
            writer = csv.writer(outfile, delimiter=";", lineterminator="\n")
            writer.writerow([column, "upos", "ner tag", "count", "stopword"])
            for (value, upos, ner_tag, lookup_value), count in counts.items():
                is_stopword = (
                    "yes"
                    if lookup_value in stopwords or lookup_value in stopwords_lemmas
                    else "no"
                )
                writer.writerow([value, upos, ner_tag, count, is_stopword])


source_directory = "pre_processed_text_data"
destination_directory_words = "SimpleMaths/corpus/focus/words"
destination_directory_lemmas = "SimpleMaths/corpus/focus/lemmas"

stopwords = load_stopwords("estonian-stopwords.txt")
stopwords_lemmas = load_stopwords("estonian-stopwords-lemmas.txt")

# Build the focus-corpus frequency lists. Without these calls the cell only
# defines helpers, and the focus/words and focus/lemmas folders read by the
# keyness cells are never produced on a fresh run.
process_files(
    source_directory, destination_directory_words, stopwords, stopwords_lemmas, "word"
)
process_files(
    source_directory, destination_directory_lemmas, stopwords, stopwords_lemmas, "lemma"
)


In [4]:
# CREATE KEYNESSDATA FOR CALCULATION WITH SIMPLEMATHS NOT LOWERCASED
import pandas as pd
from pathlib import Path
from tqdm import tqdm


def process_corpus_data(base_dir, data_type):
    """Attach reference-corpus counts to every focus frequency list (case kept).

    Reads ``<base_dir>/corpus/reference/<data_type>/<data_type>.csv``
    (comma-separated, with a ``count`` column) and every ``*.csv`` file in
    ``<base_dir>/corpus/focus/<data_type>`` (semicolon-separated, with a
    ``count`` column). The characters ``_``, ``+`` and ``=`` are removed from
    the key column on both sides before matching. Each focus file is written to
    ``<base_dir>/keynessData/<data_type>`` with ``count`` renamed to
    ``fc_count`` and the columns ``rfc_count``, ``fcTotalCount`` and
    ``rfcTotalCount`` added.

    Args:
        base_dir: Root folder of the SimpleMaths data.
        data_type: ``"lemmas"`` selects the ``lemma`` column; any other value
            selects the ``word`` column.
    """
    base_dir_path = Path(base_dir)

    column_name = "lemma" if data_type == "lemmas" else "word"

    # Tokens such as "null", "nan" or "None" are real entries in a word list;
    # with default NA parsing they turn into NaN and lose their counts.
    read_options = {"keep_default_na": False, "dtype": {column_name: str}}
    # One regex character class. The earlier chained ``.replace("+", "")`` was
    # Series.replace, which only matches whole cell values, so "+" inside a
    # token was never removed.
    marker_pattern = r"[_+=]"

    reference_path = (
        base_dir_path / "corpus" / "reference" / data_type / f"{data_type}.csv"
    )
    print(f"Loading reference data from: {reference_path}")
    reference_df = pd.read_csv(reference_path, sep=",", **read_options)

    reference_df[column_name] = reference_df[column_name].str.replace(
        marker_pattern, "", regex=True
    )

    # Entries that collapse to the same key after marker removal are summed
    # rather than letting the last one overwrite the others.
    reference_dict = reference_df.groupby(column_name)["count"].sum().to_dict()

    rfcTotalCount = reference_df["count"].sum()

    keyness_dir = base_dir_path / "keynessData" / data_type
    keyness_dir.mkdir(parents=True, exist_ok=True)

    focus_dir = base_dir_path / "corpus" / "focus" / data_type
    focus_files = list(focus_dir.glob("*.csv"))

    for file_path in tqdm(focus_files, desc=f"Processing {data_type} files"):
        focus_df = pd.read_csv(file_path, sep=";", encoding="utf-8", **read_options)

        focus_df[column_name] = focus_df[column_name].str.replace(
            marker_pattern, "", regex=True
        )

        fcTotalCount = focus_df["count"].sum()
        # Keys absent from the reference corpus get a reference count of 0.
        focus_df["rfc_count"] = (
            focus_df[column_name].map(reference_dict).fillna(0).astype(int)
        )

        focus_df["fcTotalCount"] = fcTotalCount
        focus_df["rfcTotalCount"] = rfcTotalCount
        focus_df = focus_df.rename(columns={"count": "fc_count"})

        output_file_path = keyness_dir / file_path.name
        focus_df.to_csv(output_file_path, index=False, sep=";")


base_dir = "SimpleMaths"
# Lemma lists first, then word-form lists.
for data_type in ("lemmas", "words"):
    process_corpus_data(base_dir, data_type)


Loading reference data from: SimpleMaths\corpus\reference\lemmas\lemmas.csv


Processing lemmas files: 100%|███████████████████████████████████████████████████████| 180/180 [01:13<00:00,  2.46it/s]


Loading reference data from: SimpleMaths\corpus\reference\words\words.csv


Processing words files: 100%|████████████████████████████████████████████████████████| 180/180 [12:00<00:00,  4.00s/it]


In [5]:
# CREATE KEYNESSDATA FOR CALCULATION WITH SIMPLEMATHS, LOWERCASED (LOWERCASES THE FOCUS CORPUS DATA AND USES THE LCT REFERENCE CORPUS)


import pandas as pd
from pathlib import Path
from tqdm import tqdm


def process_corpus_data(base_dir, data_type):
    """Attach reference-corpus counts to every focus frequency list (lower-cased).

    Reads ``<base_dir>/corpus/reference_LCT/<data_type>/<data_type>.csv``
    (comma-separated, with a ``count`` column) and every ``*.csv`` file in
    ``<base_dir>/corpus/focus/<data_type>`` (semicolon-separated, with a
    ``count`` column). The key column is lower-cased on both sides and the
    characters ``_``, ``+`` and ``=`` are removed before matching. Each focus
    file is written to ``<base_dir>/keynessData_LCT/<data_type>`` with
    ``count`` renamed to ``fc_count`` and the columns ``rfc_count``,
    ``fcTotalCount`` and ``rfcTotalCount`` added.

    Args:
        base_dir: Root folder of the SimpleMaths data.
        data_type: ``"lemmas"`` selects the ``lemma`` column; any other value
            selects the ``word`` column.
    """
    base_dir_path = Path(base_dir)

    column_name = "lemma" if data_type == "lemmas" else "word"

    # Tokens such as "null", "nan" or "None" are real entries in a word list;
    # with default NA parsing they turn into NaN and lose their counts.
    read_options = {"keep_default_na": False, "dtype": {column_name: str}}
    # One regex character class. The earlier chained ``.replace("+", "")`` was
    # Series.replace, which only matches whole cell values, so "+" inside a
    # token was never removed.
    marker_pattern = r"[_+=]"

    reference_path = (
        base_dir_path / "corpus" / "reference_LCT" / data_type / f"{data_type}.csv"
    )
    print(f"Loading reference data from: {reference_path}")
    reference_df = pd.read_csv(reference_path, sep=",", **read_options)

    reference_df[column_name] = reference_df[column_name].str.replace(
        marker_pattern, "", regex=True
    )

    # Lower-casing and marker removal can make several entries share one key;
    # their counts are summed rather than letting the last one win.
    reference_dict = (
        reference_df.groupby(reference_df[column_name].str.lower())["count"]
        .sum()
        .to_dict()
    )

    rfcTotalCount = reference_df["count"].sum()

    keyness_dir = base_dir_path / "keynessData_LCT" / data_type
    keyness_dir.mkdir(parents=True, exist_ok=True)

    focus_dir = base_dir_path / "corpus" / "focus" / data_type
    focus_files = list(focus_dir.glob("*.csv"))

    for file_path in tqdm(focus_files, desc=f"Processing {data_type} files"):
        focus_df = pd.read_csv(file_path, sep=";", encoding="utf-8", **read_options)

        focus_df[column_name] = (
            focus_df[column_name]
            .str.lower()
            .str.replace(marker_pattern, "", regex=True)
        )

        fcTotalCount = focus_df["count"].sum()
        # Keys absent from the reference corpus get a reference count of 0.
        focus_df["rfc_count"] = (
            focus_df[column_name].map(reference_dict).fillna(0).astype(int)
        )

        focus_df["fcTotalCount"] = fcTotalCount
        focus_df["rfcTotalCount"] = rfcTotalCount
        focus_df = focus_df.rename(columns={"count": "fc_count"})

        output_file_path = keyness_dir / file_path.name
        focus_df.to_csv(output_file_path, index=False, sep=";")


base_dir = "SimpleMaths"
# Lemma lists first, then word-form lists.
for data_type in ("lemmas", "words"):
    process_corpus_data(base_dir, data_type)


Loading reference data from: SimpleMaths\corpus\reference_LCT\lemmas\lemmas.csv


Processing lemmas files: 100%|███████████████████████████████████████████████████████| 180/180 [01:11<00:00,  2.52it/s]


Loading reference data from: SimpleMaths\corpus\reference_LCT\words\words.csv


Processing words files: 100%|████████████████████████████████████████████████████████| 180/180 [10:33<00:00,  3.52s/it]


In [2]:
# CALCULATE SIMPLEMATHS VALUE TO THE CSV IN KEYNESSDATA
import os
import pandas as pd


def calculate_simpleMathsScore(df):
    """Add a ``simpleMathsScore`` column to ``df`` in place and return ``df``.

    The score is ``(fc_per_million + 1) / (rfc_per_million + 1)``, where each
    per-million value is ``count * 1000000 / total``. Zero values in
    ``rfc_count`` are replaced by 1 first, and that replacement stays in the
    returned frame.

    Args:
        df: DataFrame with the columns ``fc_count``, ``rfc_count``,
            ``fcTotalCount`` and ``rfcTotalCount``.

    Returns:
        The same DataFrame object, modified.
    """
    scale = 1000000
    df["rfc_count"] = df["rfc_count"].replace(0, 1)

    # Temporary per-million columns: name -> (hit count column, corpus size column).
    helper_columns = {
        "fc_per_million_hits": ("fc_count", "fcTotalCount"),
        "rfc_per_million_hits": ("rfc_count", "rfcTotalCount"),
    }
    for helper_name, (hits_column, total_column) in helper_columns.items():
        df[helper_name] = (df[hits_column] * scale) / df[total_column]

    df["simpleMathsScore"] = (df["fc_per_million_hits"] + 1) / (
        df["rfc_per_million_hits"] + 1
    )

    df.drop(columns=list(helper_columns), inplace=True)

    return df


lemmas_dir = "SimpleMaths/keynessData/lemmas"  ### ADJUST HERE FOR LCT OR REGULAR
words_dir = "SimpleMaths/keynessData/words"

# Score every CSV in the lemmas folder first, then every CSV in the words
# folder; each file is overwritten in place.
for keyness_dir in (lemmas_dir, words_dir):
    for filename in os.listdir(keyness_dir):
        if not filename.endswith(".csv"):
            continue
        file_path = os.path.join(keyness_dir, filename)
        df = pd.read_csv(file_path, delimiter=";")
        updated_df = calculate_simpleMathsScore(df)
        updated_df.to_csv(file_path, index=False, sep=";")

print("All files have been updated with simpleMathsScore.")


All files have been updated with simpleMathsScore.


In [1]:
# CALCULATE SIMPLEMATHS FOR LOWER CASE TRUE DATA
import os
import pandas as pd


def calculate_simpleMathsScore(df):
    """Compute the Simple Maths keyness score for every row of ``df``.

    ``df`` is modified in place and returned. Zero values in ``rfc_count`` are
    first replaced by 1 (the replacement is kept). Both counts are then scaled
    to hits per million using ``fcTotalCount`` / ``rfcTotalCount``, and the
    score is stored in ``simpleMathsScore`` as
    ``(focus per million + 1) / (reference per million + 1)``.

    Args:
        df: DataFrame with the columns ``fc_count``, ``rfc_count``,
            ``fcTotalCount`` and ``rfcTotalCount``.

    Returns:
        The same DataFrame object, modified.
    """
    million = 1000000
    df["rfc_count"] = df["rfc_count"].replace(0, 1)

    df["fc_per_million_hits"] = (df["fc_count"] * million) / df["fcTotalCount"]
    df["rfc_per_million_hits"] = (df["rfc_count"] * million) / df["rfcTotalCount"]

    numerator = df["fc_per_million_hits"] + 1
    denominator = df["rfc_per_million_hits"] + 1
    df["simpleMathsScore"] = numerator / denominator

    # The per-million columns are intermediate values only.
    for temporary in ("fc_per_million_hits", "rfc_per_million_hits"):
        del df[temporary]
    return df


lemmas_dir = "SimpleMaths/keynessData_LCT/lemmas"  ### ADJUST HERE FOR LCT OR REGULAR
words_dir = "SimpleMaths/keynessData_LCT/words"

# Lemmas folder first, then words folder; every CSV is scored and overwritten
# in place.
for keyness_dir in (lemmas_dir, words_dir):
    csv_names = [name for name in os.listdir(keyness_dir) if name.endswith(".csv")]
    for filename in csv_names:
        file_path = os.path.join(keyness_dir, filename)
        df = pd.read_csv(file_path, delimiter=";")
        updated_df = calculate_simpleMathsScore(df)
        updated_df.to_csv(file_path, index=False, sep=";")

print("All files have been updated with simpleMathsScore.")


All files have been updated with simpleMathsScore.


In [26]:
# EXPORT LOWER CASE TRUE AS LEMMA AND WORDS; EXPORT LOWER CASE FALSE AS LEMMA_LCF AND WORDS_LCF
import os
import pandas as pd


def ensure_output_dirs(base_dir, suffix):
    """Create ``lemmas<suffix>`` and ``words<suffix>`` under ``base_dir``.

    Existing folders are left untouched.

    Returns:
        tuple[str, str]: The lemmas folder path followed by the words folder path.
    """
    lemmas_output_dir, words_output_dir = (
        os.path.join(base_dir, kind + suffix) for kind in ("lemmas", "words")
    )
    for folder in (lemmas_output_dir, words_output_dir):
        os.makedirs(folder, exist_ok=True)
    return lemmas_output_dir, words_output_dir


def process_files(directory, output_directory, fc_count_threshold):
    """Filter the scored keyness files and export them sorted by score.

    For every ``.csv`` file in ``directory`` (semicolon-separated) the rows
    tagged ``PUNCT`` or ``NUM``, rows with ``fc_count`` below
    ``fc_count_threshold`` and rows whose ``stopword`` value is ``"yes"`` are
    dropped. The remaining rows are reduced to the key column (``lemma`` if
    present, otherwise ``word``) plus the count and score columns, sorted by
    ``simpleMathsScore`` in descending order and written to
    ``output_directory`` under the same file name.

    Args:
        directory: Folder with the scored keyness CSV files.
        output_directory: Folder for the exported files (created if missing).
        fc_count_threshold: Minimum ``fc_count`` a row needs to be kept.
    """
    os.makedirs(output_directory, exist_ok=True)

    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue

        filepath = os.path.join(directory, filename)
        # keep_default_na=False: words such as "null", "nan" or "NA" must stay
        # strings; default NA parsing would export them as empty cells.
        df = pd.read_csv(filepath, sep=";", encoding="utf-8", keep_default_na=False)

        keep_rows = (
            (~df["upos"].isin(["PUNCT", "NUM"]))
            & (df["fc_count"] >= fc_count_threshold)
            & (df["stopword"] != "yes")
        )
        cols_to_keep = [
            "lemma" if "lemma" in df.columns else "word",
            "fc_count",
            "stopword",
            "rfc_count",
            "fcTotalCount",
            "rfcTotalCount",
            "simpleMathsScore",
        ]
        exported = df.loc[keep_rows, cols_to_keep].sort_values(
            by="simpleMathsScore", ascending=False
        )
        output_filepath = os.path.join(output_directory, filename)
        exported.to_csv(output_filepath, sep=";", index=False, encoding="utf-8")


scores_base_dir = "scores/SimpleMaths"

# Lower-case-false (LCF) inputs are exported to the "_LCF"-suffixed folders.
lemmas_lcf_dir = "SimpleMaths/keynessData/lemmas"
words_lcf_dir = "SimpleMaths/keynessData/words"
scores_lemmas_lcf_dir, scores_words_lcf_dir = ensure_output_dirs(
    scores_base_dir, "_LCF"
)

# Lower-case-true (LCT) inputs are exported to the unsuffixed folders.
lemmas_lct_dir = "SimpleMaths/keynessData_LCT/lemmas"
words_lct_dir = "SimpleMaths/keynessData_LCT/words"
scores_lemmas_lct_dir, scores_words_lct_dir = ensure_output_dirs(scores_base_dir, "")

fc_count_threshold = 1

# (input folder, output folder) pairs, run in order: LCF first, then LCT.
export_jobs = [
    (lemmas_lcf_dir, scores_lemmas_lcf_dir),
    (words_lcf_dir, scores_words_lcf_dir),
    (lemmas_lct_dir, scores_lemmas_lct_dir),
    (words_lct_dir, scores_words_lct_dir),
]
for keyness_input_dir, scores_output_dir in export_jobs:
    process_files(keyness_input_dir, scores_output_dir, fc_count_threshold)
