In [3]:
#CREATE THE UNIFIED CUSTOM FOLDER FOR MEASURING KEYBERT F1 ACCURACY
import os
import pandas as pd

def create_unified_data():
    base_path = os.getcwd() 
    models_path = os.path.join(base_path, 'models')
    unified_path = os.path.join(models_path, 'unified_data_custom')
    os.makedirs(unified_path, exist_ok=True)
    
    subdirs = ['raw_text_lemma_data_LCF', 'raw_text_lemma_data', 'raw_text_data_LCF', 'raw_text_data']
    ngrams = ['ngram_1_1', 'ngram_2_2', 'ngram_3_3']
    diversities = [f"diversity_{i}" for i in range(11)]
    models = ['EstBERT', 'est-roberta', 'LaBSE', 'bertMulti', 'distilbertMulti', 'MiniLM_multi', 'MiniLM-L12_multi', 'multi_e5', 'xml_roberta']
    m1_path = os.path.join(base_path, 'filol_scores', 'keywords', 'philologist_M1')

    for subdir in subdirs:
        for ngram in ngrams:
            for diversity in diversities:
                unified_sub_path = os.path.join(unified_path, subdir, ngram, diversity)
                os.makedirs(unified_sub_path, exist_ok=True)
                model_paths = [os.path.join(models_path, subdir, model, ngram, diversity) for model in models]
                
                example_files = os.listdir(model_paths[0])
                for file_name in example_files:
                    m1_file_path = os.path.join(m1_path, file_name)
                    if os.path.exists(m1_file_path):
                        m1_df = pd.read_csv(m1_file_path, delimiter=';', header=0)
                        expected_words = len(m1_df) 

                        data_frames = {}
                        for model, path in zip(models, model_paths):
                            file_path = os.path.join(path, file_name)
                            if os.path.exists(file_path):
                                df = pd.read_csv(file_path, delimiter=';', header=0)
                                data_frames[model] = df.iloc[:expected_words, 0]

                        combined_df = pd.DataFrame(data_frames)
                        combined_df.to_csv(os.path.join(unified_sub_path, file_name), index=False, sep=';')


create_unified_data() 


In [36]:
import os
import pandas as pd


def preprocess_keywords(keywords):
    """Preprocess a series of keywords: lowercasing and stripping spaces."""
    return keywords.str.lower().str.strip()


def calculate_f1_scores(model_base_dir, human_base_dir, output_base_dir):
    model_scores = {}
    count_files = 0

    for diversity in range(11):
        model_dir = os.path.join(model_base_dir, f"diversity_{diversity}")
        files = [f for f in os.listdir(model_dir) if f.endswith(".csv")]

        for file_name in files:
            model_file_path = os.path.join(model_dir, file_name)
            human_file_path = os.path.join(human_base_dir, file_name)

            if os.path.exists(human_file_path):
                model_keywords = pd.read_csv(model_file_path, delimiter=";")
                human_keywords = pd.read_csv(human_file_path)

                human_set = set(preprocess_keywords(human_keywords["keyword"].dropna()))
                for column in model_keywords:
                    model_set = set(
                        preprocess_keywords(model_keywords[column].dropna())
                    )
                    tp = len(model_set & human_set)
                    fp = len(model_set - human_set)
                    fn = len(human_set - model_set)

                    precision = tp / (tp + fp) if tp + fp > 0 else 0
                    recall = tp / (tp + fn) if tp + fn > 0 else 0
                    f1_score = (
                        2 * precision * recall / (precision + recall)
                        if precision + recall > 0
                        else 0
                    )

                    if column not in model_scores:
                        model_scores[column] = {"total_f1": 0, "count": 0}
                    model_scores[column]["total_f1"] += f1_score
                    model_scores[column]["count"] += 1

                count_files += 1

    scores_df = pd.DataFrame(
        [{**{"model": model}, **data} for model, data in model_scores.items()]
    )
    scores_df["average_f1"] = scores_df["total_f1"] / scores_df["count"]

    output_dir = os.path.join(output_base_dir, "f1_keybert_scores_average_results")
    os.makedirs(output_dir, exist_ok=True)

    output_file_name = model_base_dir.split("/")[-2] + ".csv"
    output_file_path = os.path.join(output_dir, output_file_name)
    scores_df.to_csv(output_file_path, index=False)

    return scores_df, count_files


configurations = [
    (
        "models/unified_data_custom/raw_text_lemma_data/ngram_1_1",
        "filol_scores/keywords_lemma/philologist_M1",
        "analytical_data/F1-score-KeyBERT/M1",
    ),
    (
        "models/unified_data_custom/raw_text_lemma_data_LCF/ngram_1_1",
        "filol_scores/keywords_lemma/philologist_M1",
        "analytical_data/F1-score-KeyBERT/M1",
    ),
    (
        "models/unified_data_custom/raw_text_data/ngram_1_1",
        "filol_scores/keywords/philologist_M1",
        "analytical_data/F1-score-KeyBERT/M1",
    ),
    (
        "models/unified_data_custom/raw_text_data_LCF/ngram_1_1",
        "filol_scores/keywords/philologist_M1",
        "analytical_data/F1-score-KeyBERT/M1",
    ),
    (
        "models/unified_data_custom/raw_text_lemma_data/ngram_1_1",
        "filol_scores/keywords_lemma/philologist_M2",
        "analytical_data/F1-score-KeyBERT/M2",
    ),
    (
        "models/unified_data_custom/raw_text_lemma_data_LCF/ngram_1_1",
        "filol_scores/keywords_lemma/philologist_M2",
        "analytical_data/F1-score-KeyBERT/M2",
    ),
    (
        "models/unified_data_custom/raw_text_data/ngram_1_1",
        "filol_scores/keywords/philologist_M2",
        "analytical_data/F1-score-KeyBERT/M2",
    ),
    (
        "models/unified_data_custom/raw_text_data_LCF/ngram_1_1",
        "filol_scores/keywords/philologist_M2",
        "analytical_data/F1-score-KeyBERT/M2",
    ),
]

for model_dir, human_dir, output_base_dir in configurations:
    scores_df, file_count = calculate_f1_scores(model_dir, human_dir, output_base_dir)
    print(
        f"Processed {file_count} files for {human_dir}. Results saved to {output_base_dir} as file {model_dir.split('/')[-1]}"
    )


Processed 1980 files for filol_scores/keywords_lemma/philologist_M1. Results saved to analytical_data/F1-score-KeyBERT/M1 as file ngram_1_1
Processed 1980 files for filol_scores/keywords_lemma/philologist_M1. Results saved to analytical_data/F1-score-KeyBERT/M1 as file ngram_1_1
Processed 1980 files for filol_scores/keywords/philologist_M1. Results saved to analytical_data/F1-score-KeyBERT/M1 as file ngram_1_1
Processed 1980 files for filol_scores/keywords/philologist_M1. Results saved to analytical_data/F1-score-KeyBERT/M1 as file ngram_1_1
Processed 1980 files for filol_scores/keywords_lemma/philologist_M2. Results saved to analytical_data/F1-score-KeyBERT/M2 as file ngram_1_1
Processed 1980 files for filol_scores/keywords_lemma/philologist_M2. Results saved to analytical_data/F1-score-KeyBERT/M2 as file ngram_1_1
Processed 1980 files for filol_scores/keywords/philologist_M2. Results saved to analytical_data/F1-score-KeyBERT/M2 as file ngram_1_1
Processed 1980 files for filol_scores/

In [37]:
import os
import pandas as pd


def preprocess_keywords(keywords):
    """Preprocess a series of keywords: lowercasing and stripping spaces."""
    return keywords.str.lower().str.strip()


def calculate_f1_scores(model_base_dir, human_base_dir, output_base_dir):
    model_scores = {}
    count_files = 0
    model_diversity_scores = {}

    for diversity in range(11):
        model_dir = os.path.join(model_base_dir, f"diversity_{diversity}")
        files = [f for f in os.listdir(model_dir) if f.endswith(".csv")]

        for file_name in files:
            model_file_path = os.path.join(model_dir, file_name)
            human_file_path = os.path.join(human_base_dir, file_name)

            if os.path.exists(human_file_path):
                model_keywords = pd.read_csv(model_file_path, delimiter=";")
                human_keywords = pd.read_csv(human_file_path)

                human_set = set(preprocess_keywords(human_keywords["keyword"].dropna()))
                for column in model_keywords:
                    model_set = set(
                        preprocess_keywords(model_keywords[column].dropna())
                    )
                    tp = len(model_set & human_set)
                    fp = len(model_set - human_set)
                    fn = len(human_set - model_set)

                    precision = tp / (tp + fp) if tp + fp > 0 else 0
                    recall = tp / (tp + fn) if tp + fn > 0 else 0
                    f1_score = (
                        2 * precision * recall / (precision + recall)
                        if precision + recall > 0
                        else 0
                    )

                    key = (diversity, column)
                    if key not in model_diversity_scores:
                        model_diversity_scores[key] = []
                    model_diversity_scores[key].append(f1_score)

                count_files += 1

    csv_data = []
    for (diversity, model), scores in model_diversity_scores.items():
        average_score = sum(scores) / len(scores)
        csv_data.append([diversity, model, average_score])

    df = pd.DataFrame(csv_data, columns=["Diversity Number", "Model name", "F1 score"])

    output_dir = os.path.join(output_base_dir, "f1_keybert_scores_per_diversity")
    os.makedirs(output_dir, exist_ok=True)
    output_file_name = model_base_dir.split("/")[-2] + ".csv"
    output_file_path = os.path.join(output_dir, output_file_name)
    df.to_csv(output_file_path, index=False)

    return df, count_files


configurations = [
    (
        "models/unified_data_custom/raw_text_lemma_data/ngram_1_1",
        "filol_scores/keywords_lemma/philologist_M1",
        "analytical_data/F1-score-KeyBERT/M1",
    ),
    (
        "models/unified_data_custom/raw_text_lemma_data_LCF/ngram_1_1",
        "filol_scores/keywords_lemma/philologist_M1",
        "analytical_data/F1-score-KeyBERT/M1",
    ),
    (
        "models/unified_data_custom/raw_text_data/ngram_1_1",
        "filol_scores/keywords/philologist_M1",
        "analytical_data/F1-score-KeyBERT/M1",
    ),
    (
        "models/unified_data_custom/raw_text_data_LCF/ngram_1_1",
        "filol_scores/keywords/philologist_M1",
        "analytical_data/F1-score-KeyBERT/M1",
    ),
    (
        "models/unified_data_custom/raw_text_lemma_data/ngram_1_1",
        "filol_scores/keywords_lemma/philologist_M2",
        "analytical_data/F1-score-KeyBERT/M2",
    ),
    (
        "models/unified_data_custom/raw_text_lemma_data_LCF/ngram_1_1",
        "filol_scores/keywords_lemma/philologist_M2",
        "analytical_data/F1-score-KeyBERT/M2",
    ),
    (
        "models/unified_data_custom/raw_text_data/ngram_1_1",
        "filol_scores/keywords/philologist_M2",
        "analytical_data/F1-score-KeyBERT/M2",
    ),
    (
        "models/unified_data_custom/raw_text_data_LCF/ngram_1_1",
        "filol_scores/keywords/philologist_M2",
        "analytical_data/F1-score-KeyBERT/M2",
    ),
]

for model_dir, human_dir, output_base_dir in configurations:
    scores_df, file_count = calculate_f1_scores(model_dir, human_dir, output_base_dir)
    print(
        f"Processed {file_count} files for {human_dir}. Results saved to {output_base_dir} as file {model_dir.split('/')[-1]}"
    )


Processed 1980 files for filol_scores/keywords_lemma/philologist_M1. Results saved to analytical_data/F1-score-KeyBERT/M1 as file ngram_1_1
Processed 1980 files for filol_scores/keywords_lemma/philologist_M1. Results saved to analytical_data/F1-score-KeyBERT/M1 as file ngram_1_1
Processed 1980 files for filol_scores/keywords/philologist_M1. Results saved to analytical_data/F1-score-KeyBERT/M1 as file ngram_1_1
Processed 1980 files for filol_scores/keywords/philologist_M1. Results saved to analytical_data/F1-score-KeyBERT/M1 as file ngram_1_1
Processed 1980 files for filol_scores/keywords_lemma/philologist_M2. Results saved to analytical_data/F1-score-KeyBERT/M2 as file ngram_1_1
Processed 1980 files for filol_scores/keywords_lemma/philologist_M2. Results saved to analytical_data/F1-score-KeyBERT/M2 as file ngram_1_1
Processed 1980 files for filol_scores/keywords/philologist_M2. Results saved to analytical_data/F1-score-KeyBERT/M2 as file ngram_1_1
Processed 1980 files for filol_scores/

In [38]:
import os
import pandas as pd


def preprocess_keywords(keywords):
    """Preprocess a series of keywords: lowercasing and stripping spaces."""
    return keywords.str.lower().str.strip()


def calculate_f1_scores(model_base_dir, human_base_dir, output_base_dir):
    model_scores = {}
    count_files = 0
    model_diversity_scores = {}
    model_averages = {}
    diversity_averages = {}

    for diversity in range(11):
        model_dir = os.path.join(model_base_dir, f"diversity_{diversity}")
        files = [f for f in os.listdir(model_dir) if f.endswith(".csv")]
        diversity_scores = []

        for file_name in files:
            model_file_path = os.path.join(model_dir, file_name)
            human_file_path = os.path.join(human_base_dir, file_name)

            if os.path.exists(human_file_path):
                model_keywords = pd.read_csv(model_file_path, delimiter=";")
                human_keywords = pd.read_csv(human_file_path)

                human_set = set(preprocess_keywords(human_keywords["keyword"].dropna()))
                for column in model_keywords:
                    model_set = set(
                        preprocess_keywords(model_keywords[column].dropna())
                    )
                    tp = len(model_set & human_set)
                    fp = len(model_set - human_set)
                    fn = len(human_set - model_set)

                    precision = tp / (tp + fp) if tp + fp > 0 else 0
                    recall = tp / (tp + fn) if tp + fn > 0 else 0
                    f1_score = (
                        2 * precision * recall / (precision + recall)
                        if precision + recall > 0
                        else 0
                    )

                    key = (diversity, column)
                    if key not in model_diversity_scores:
                        model_diversity_scores[key] = []
                    model_diversity_scores[key].append(f1_score)
                    diversity_scores.append(f1_score)

                    model_averages.setdefault(column, []).append(f1_score)

                count_files += 1

        if diversity_scores:
            diversity_averages[diversity] = sum(diversity_scores) / len(
                diversity_scores
            )

    csv_data_model = []
    csv_data_diversity = []
    for (diversity, model), scores in model_diversity_scores.items():
        average_score = sum(scores) / len(scores) * 100
        csv_data_model.append([model, average_score])
        csv_data_diversity.append([diversity, average_score])

    df_model = pd.DataFrame(csv_data_model, columns=["Model name", "F1 score"])
    df_diversity = pd.DataFrame(
        csv_data_diversity, columns=["Diversity Number", "F1 score"]
    )

    output_dir_model = os.path.join(
        output_base_dir, "Average_F1_Scores_across_all_files_and_diversities"
    )
    output_dir_diversity = os.path.join(
        output_base_dir, "Average_F1_Scores_for_each_Diversity"
    )
    os.makedirs(output_dir_model, exist_ok=True)
    os.makedirs(output_dir_diversity, exist_ok=True)
    output_file_name_model = model_base_dir.split("/")[-2] + ".csv"
    output_file_name_diversity = "diversity_scores.csv"
    output_file_path_model = os.path.join(output_dir_model, output_file_name_model)
    output_file_path_diversity = os.path.join(
        output_dir_diversity, output_file_name_diversity
    )
    df_model.to_csv(output_file_path_model, index=False)
    df_diversity.to_csv(output_file_path_diversity, index=False)

    return df_model, df_diversity, count_files


configurations = [
    (
        "models/unified_data_custom/raw_text_lemma_data/ngram_1_1",
        "filol_scores/keywords_lemma/philologist_M1",
        "analytical_data/F1-score-KeyBERT/M1",
    ),
    (
        "models/unified_data_custom/raw_text_lemma_data_LCF/ngram_1_1",
        "filol_scores/keywords_lemma/philologist_M1",
        "analytical_data/F1-score-KeyBERT/M1",
    ),
    (
        "models/unified_data_custom/raw_text_data/ngram_1_1",
        "filol_scores/keywords/philologist_M1",
        "analytical_data/F1-score-KeyBERT/M1",
    ),
    (
        "models/unified_data_custom/raw_text_data_LCF/ngram_1_1",
        "filol_scores/keywords/philologist_M1",
        "analytical_data/F1-score-KeyBERT/M1",
    ),
    (
        "models/unified_data_custom/raw_text_lemma_data/ngram_1_1",
        "filol_scores/keywords_lemma/philologist_M2",
        "analytical_data/F1-score-KeyBERT/M2",
    ),
    (
        "models/unified_data_custom/raw_text_lemma_data_LCF/ngram_1_1",
        "filol_scores/keywords_lemma/philologist_M2",
        "analytical_data/F1-score-KeyBERT/M2",
    ),
    (
        "models/unified_data_custom/raw_text_data/ngram_1_1",
        "filol_scores/keywords/philologist_M2",
        "analytical_data/F1-score-KeyBERT/M2",
    ),
    (
        "models/unified_data_custom/raw_text_data_LCF/ngram_1_1",
        "filol_scores/keywords/philologist_M2",
        "analytical_data/F1-score-KeyBERT/M2",
    ),
]

for model_dir, human_dir, output_base_dir in configurations:
    df_model, df_diversity, file_count = calculate_f1_scores(
        model_dir, human_dir, output_base_dir
    )
    print(
        f"Processed {file_count} files for {human_dir}. Results saved to {output_base_dir} as file {model_dir.split('/')[-1]}"
    )


Processed 1980 files for filol_scores/keywords_lemma/philologist_M1. Results saved to analytical_data/F1-score-KeyBERT/M1 as file ngram_1_1
Processed 1980 files for filol_scores/keywords_lemma/philologist_M1. Results saved to analytical_data/F1-score-KeyBERT/M1 as file ngram_1_1
Processed 1980 files for filol_scores/keywords/philologist_M1. Results saved to analytical_data/F1-score-KeyBERT/M1 as file ngram_1_1
Processed 1980 files for filol_scores/keywords/philologist_M1. Results saved to analytical_data/F1-score-KeyBERT/M1 as file ngram_1_1
Processed 1980 files for filol_scores/keywords_lemma/philologist_M2. Results saved to analytical_data/F1-score-KeyBERT/M2 as file ngram_1_1
Processed 1980 files for filol_scores/keywords_lemma/philologist_M2. Results saved to analytical_data/F1-score-KeyBERT/M2 as file ngram_1_1
Processed 1980 files for filol_scores/keywords/philologist_M2. Results saved to analytical_data/F1-score-KeyBERT/M2 as file ngram_1_1
Processed 1980 files for filol_scores/

In [39]:
# SIMPEL MATHS AND TEXTRANK F1 MEASURING
import pandas as pd
import os


def preprocess_keywords(keywords):
    """Preprocess a series of keywords: lowercasing and stripping spaces."""
    return keywords.apply(lambda x: x.lower().strip() if isinstance(x, str) else x)


def calculate_f1_scores(model_dir, human_dir, keyword_column):
    total_f1_score = 0
    files_counted = 0

    if not os.path.exists(model_dir) or not os.path.exists(human_dir):
        print(f"Directory not found: {model_dir} or {human_dir}")
        return None

    files = [f for f in os.listdir(model_dir) if f.endswith(".csv")]

    for file_name in files:
        model_file_path = os.path.join(model_dir, file_name)
        human_file_path = os.path.join(human_dir, file_name)

        if os.path.exists(human_file_path):
            model_keywords = pd.read_csv(model_file_path, delimiter=";")
            human_keywords = pd.read_csv(human_file_path)

            if "keyword" in human_keywords.columns:
                human_set = set(preprocess_keywords(human_keywords["keyword"].dropna()))
            else:
                continue

            if keyword_column in model_keywords.columns:
                model_set = set(
                    preprocess_keywords(model_keywords[keyword_column].dropna())
                )
                tp = len(model_set & human_set)
                fp = len(model_set - human_set)
                fn = len(human_set - model_set)

                precision = tp / (tp + fp) if tp + fp > 0 else 0
                recall = tp / (tp + fn) if tp + fn > 0 else 0
                f1_score = (
                    2 * precision * recall / (precision + recall)
                    if precision + recall > 0
                    else 0
                )

                total_f1_score += f1_score
                files_counted += 1

    if files_counted > 0:
        average_f1_score = total_f1_score / files_counted
        return average_f1_score
    else:
        return None


def main():
    methods = ["TextRank", "SimpleMaths"]
    data_types = ["lemmas", "lemmas_LCF", "words", "words_LCF"]
    human_groups = {
        "lemmas": "keywords_lemma",
        "lemmas_LCF": "keywords_lemma",
        "words": "keywords",
        "words_LCF": "keywords",
    }
    model_variants = ["M1", "M2"]
    column_names = {
        "TextRank": "Keyword",
        "SimpleMaths": {
            "lemmas": "lemma",
            "lemmas_LCF": "lemma",
            "words": "word",
            "words_LCF": "word",
        },
    }

    results = []

    output_dir = "analytical_data"
    output_file = "f1_comparison_results.csv"
    full_output_path = os.path.join(output_dir, output_file)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created directory {output_dir}")

    for method in methods:
        for data_type in data_types:
            for variant in model_variants:
                human_group = human_groups[data_type]
                model_dir = f"scores/{method}/{data_type}"
                human_dir = f"filol_scores/{human_group}/philologist_{variant}"
                keyword_column = (
                    column_names[method]
                    if method == "TextRank"
                    else column_names[method][data_type]
                )
                score = calculate_f1_scores(model_dir, human_dir, keyword_column)
                if score is not None:
                    result = {
                        "Method": method,
                        "Data_Type": data_type,
                        "Human_Group": f"{human_group}_{variant}",
                        "F1 Score": score,
                    }
                    results.append(result)

    df = pd.DataFrame(results)
    df.to_csv(full_output_path, index=False)
    print(f"Results saved to '{full_output_path}'")


if __name__ == "__main__":
    main()


Results saved to 'analytical_data\f1_comparison_results.csv'
