In [19]:
import pandas as pd
import os

# Base and output directories for ONE manually selected configuration.
# To evaluate a different configuration, edit the three paths below and re-run
# this cell.
#
# base_path / output_dir end in the data type; the original note lists
# raw_text_data, raw_text_data_lcf, raw_text_lemma_data, raw_text_lemma_data_LCF.
# NOTE(review): the batch cell further down spells the second one
# "raw_text_data_LCF" - confirm the spelling against the directory on disk.
base_path = "./models/unified_data_10/raw_text_lemma_data_LCF"
# philologist_path is one of: keywords/philologist_M1, keywords/philologist_M2,
# keywords_lemma/philologist_M1, keywords_lemma/philologist_M2.
philologist_path = "./filol_scores/keywords_lemma/philologist_M2"
output_dir = "./models/diversity_accuracy/raw_text_lemma_data_LCF"

# N-gram sub-directories looked up under base_path and mirrored under output_dir.
ngram_dirs = ["ngram_1_1", "ngram_2_2", "ngram_3_3"]

os.makedirs(output_dir, exist_ok=True)

# Run statistics updated by process_ngram_directory():
# processed_files_count is incremented once per diversity-level file that is read;
# unprocessed_files collects philologist files with no matching model output
# (one entry per n-gram directory, so a name can appear several times).
processed_files_count = 0
unprocessed_files = []

def process_ngram_directory(ngram_dir):
    """Score model keyword output against philologist keywords for one n-gram setting.

    For every ``*.csv`` in ``philologist_path`` the function looks for a file
    with the same name in ``{base_path}/{ngram_dir}/diversity_0`` through
    ``diversity_10``. Each model-output file is read as a ``;``-separated table
    with one column per model. Every cell is lower-cased and split on
    whitespace, and the resulting tokens are intersected with the lower-cased
    philologist keywords.

    One CSV per philologist file is written to ``{output_dir}/{ngram_dir}/``
    with the columns ``diversity``, one hit count per model,
    ``expected_amount`` (number of distinct philologist keywords) and ``words``
    (all matched keywords of that diversity level, sorted, joined by ``"; "``).

    Args:
        ngram_dir: Name of the n-gram sub-directory, e.g. ``"ngram_1_1"``.

    Side effects:
        Increments the module-level ``processed_files_count`` once per
        model-output file read, and appends to ``unprocessed_files`` the name
        of every philologist file that has no model output at any diversity
        level.
    """
    global processed_files_count

    level_count = 11
    level_labels = [f"diversity_{i}" for i in range(level_count)]
    diversity_paths = [f"{base_path}/{ngram_dir}/{label}" for label in level_labels]
    model_names = [
        "EstBERT",
        "est-roberta",
        "LaBSE",
        "bertMulti",
        "distilbertMulti",
        "MiniLM_multi",
        "MiniLM-L12_multi",
        "multi_e5",
        "xml_roberta",
    ]

    # Sorted so that the run (and the unprocessed-file report) is reproducible.
    for philologist_file in sorted(os.listdir(philologist_path)):
        if not philologist_file.endswith(".csv"):
            continue

        df_philologist = pd.read_csv(
            os.path.join(philologist_path, philologist_file), delimiter=";"
        )
        # Built once per document; astype(str) keeps .str usable even when
        # pandas inferred a numeric dtype for the column.
        keyword_set = set(
            df_philologist["keyword"].dropna().astype(str).str.lower()
        )
        expected_amount = len(keyword_set)

        # (level index, path) for every diversity level that has output for
        # this document.
        available = [
            (idx, os.path.join(path, philologist_file))
            for idx, path in enumerate(diversity_paths)
        ]
        available = [(idx, path) for idx, path in available if os.path.exists(path)]
        if not available:
            unprocessed_files.append(philologist_file)
            continue

        results = {model: [0] * level_count for model in model_names}
        found_words = [set() for _ in range(level_count)]

        for idx, diversity_file_path in available:
            df_diversity = pd.read_csv(
                diversity_file_path, delimiter=";", names=model_names
            )
            for model in model_names:
                # astype(str): a column that is entirely empty or numeric is
                # not read as strings, and .str would raise on it.
                entries = (
                    df_diversity[model].dropna().astype(str).str.lower().str.split()
                )
                tokens = set()
                for entry in entries:
                    tokens.update(entry)
                matched_words = tokens & keyword_set
                results[model][idx] = len(matched_words)
                found_words[idx] |= matched_words
            processed_files_count += 1

        output_df = pd.DataFrame({"diversity": level_labels, **results})
        output_df["expected_amount"] = expected_amount
        # sorted(): set iteration order is arbitrary, which made the CSV
        # differ between otherwise identical runs.
        output_df["words"] = ["; ".join(sorted(words)) for words in found_words]

        ngram_output_dir = os.path.join(output_dir, ngram_dir)
        os.makedirs(ngram_output_dir, exist_ok=True)
        output_df.to_csv(os.path.join(ngram_output_dir, philologist_file), index=False)


# Evaluate each n-gram configuration in turn, then report the run totals.
for ngram_dir in ngram_dirs:
    process_ngram_directory(ngram_dir)

print(f"Total number of files processed: {processed_files_count}")
if not unprocessed_files:
    print("All files were successfully processed.")
else:
    # One file name per line; a name repeats once per n-gram directory in
    # which it had no model output.
    print("Files not processed across all folders:")
    print("\n".join(unprocessed_files))


Total number of files processed: 5907
Files not processed across all folders:
t103140.csv
t103140.csv
t103140.csv


In [None]:
import pandas as pd
import os


def _match_keywords_per_model(diversity_file_path, model_names, keyword_set):
    """Return ``{model name: set of matched keywords}`` for one model-output CSV."""
    df_diversity = pd.read_csv(diversity_file_path, delimiter=";", names=model_names)
    matches = {}
    for model_name in model_names:
        tokens = set()
        # Each cell is lower-cased and split on whitespace; astype(str) keeps
        # .str usable for columns pandas did not read as strings.
        for entry in (
            df_diversity[model_name].dropna().astype(str).str.lower().str.split()
        ):
            tokens.update(entry)
        matches[model_name] = tokens & keyword_set
    return matches


def _score_ngram_directory(
    base_path, philologist_path, output_dir, ngram_dir, skip_missing
):
    """Write one accuracy CSV per philologist file; return the skipped file names."""
    model_names = [
        "EstBERT",
        "est-roberta",
        "LaBSE",
        "bertMulti",
        "distilbertMulti",
        "MiniLM_multi",
        "MiniLM-L12_multi",
        "multi_e5",
        "xml_roberta",
    ]
    level_count = 11
    level_labels = [f"diversity_{i}" for i in range(level_count)]
    diversity_paths = [f"{base_path}/{ngram_dir}/{label}" for label in level_labels]

    # Created up front so the directory exists even if every file is skipped.
    ngram_output_dir = os.path.join(output_dir, ngram_dir)
    os.makedirs(ngram_output_dir, exist_ok=True)

    skipped = []
    for philologist_file in sorted(os.listdir(philologist_path)):
        if not philologist_file.endswith(".csv"):
            continue

        df_philologist = pd.read_csv(
            os.path.join(philologist_path, philologist_file), delimiter=";"
        )
        keyword_set = set(
            df_philologist["keyword"].dropna().astype(str).str.lower()
        )

        # (level index, path) for each diversity level that has output for
        # this document.
        available = [
            (idx, os.path.join(path, philologist_file))
            for idx, path in enumerate(diversity_paths)
        ]
        available = [(idx, path) for idx, path in available if os.path.exists(path)]
        if skip_missing and not available:
            # No model produced anything for this document; writing eleven
            # all-zero rows would lower every mean computed from these files.
            skipped.append(philologist_file)
            continue

        results = {model_name: [0] * level_count for model_name in model_names}
        found_words = [set() for _ in range(level_count)]
        for idx, diversity_file_path in available:
            matches = _match_keywords_per_model(
                diversity_file_path, model_names, keyword_set
            )
            for model_name, matched_words in matches.items():
                results[model_name][idx] = len(matched_words)
                found_words[idx] |= matched_words

        # The diversity_N index labels are NOT written (index=False below);
        # rows are in level order 0..10.
        output_df = pd.DataFrame(results, index=level_labels)
        output_df["expected_amount"] = len(keyword_set)
        # sorted(): set iteration order is arbitrary, which made the output
        # differ between otherwise identical runs.
        output_df["words"] = ["; ".join(sorted(words)) for words in found_words]
        output_df.to_csv(os.path.join(ngram_output_dir, philologist_file), index=False)

    return skipped


def process_all(sizes, models, data_types, skip_missing=True):
    """Score model keyword output against philologist keywords for a whole grid.

    For every combination of ``size``, ``model`` and ``data_type`` the model
    output under ``./models/unified_data_{size}/{data_type}/ngram_*/diversity_0..10``
    is compared with the philologist keyword files. Those are read from
    ``./filol_scores/keywords_lemma/philologist_{model}`` when ``data_type``
    contains ``"lemma"``, otherwise from ``./filol_scores/keywords/philologist_{model}``.
    Results are written to
    ``./models/diversity_accuracy_{size}_{model}/{data_type}/{ngram_dir}/``.

    Args:
        sizes: Values for the ``{size}`` part of the directory names.
        models: Philologist identifiers, e.g. ``["M1", "M2"]``.
        data_types: Data-type directory names, e.g. ``"raw_text_lemma_data"``.
        skip_missing: When true (default), a philologist file with no model
            output at any diversity level is skipped and reported. Pass
            ``False`` to write an all-zero CSV for it instead.
    """
    ngram_dirs = ["ngram_1_1", "ngram_2_2", "ngram_3_3"]

    for size in sizes:
        for model in models:
            for data_type in data_types:
                base_path = f"./models/unified_data_{size}/{data_type}"
                keyword_kind = "keywords_lemma" if "lemma" in data_type else "keywords"
                philologist_path = (
                    f"./filol_scores/{keyword_kind}/philologist_{model}"
                )
                output_dir = f"./models/diversity_accuracy_{size}_{model}/{data_type}"
                os.makedirs(output_dir, exist_ok=True)

                for ngram_dir in ngram_dirs:
                    skipped = _score_ngram_directory(
                        base_path, philologist_path, output_dir, ngram_dir, skip_missing
                    )
                    if skipped:
                        print(
                            f"{output_dir}/{ngram_dir}: skipped {len(skipped)} "
                            f"file(s) without model output: {', '.join(skipped)}"
                        )


# Evaluation grid handed to process_all().
# sizes fill the {size} slot of "unified_data_{size}" (input) and
# "diversity_accuracy_{size}_{model}" (output).
sizes = [10, 50, 200]
# models select the philologist keyword directory "philologist_{model}".
models = ["M1", "M2"]
# Data types containing "lemma" are scored against filol_scores/keywords_lemma,
# all others against filol_scores/keywords.
data_types = [
    "raw_text_data",
    "raw_text_data_LCF",
    "raw_text_lemma_data",
    "raw_text_lemma_data_LCF",
]

# Runs the whole grid: every size x model x data type x n-gram directory.
process_all(sizes, models, data_types)


In [None]:
import pandas as pd
import os


def process_data(filename):
    """Load one per-document accuracy CSV and normalise it for aggregation.

    A ``diversity`` column is (re)computed from the row position so that the
    first row is 0 and the last row is 10. Every model column that is present
    is divided by ``expected_amount``, turning hit counts into the fraction of
    expected keywords found.

    Args:
        filename: Path of a CSV holding model hit counts and an
            ``expected_amount`` column.

    Returns:
        The loaded DataFrame with the ``diversity`` column set and the model
        columns normalised.
    """
    data = pd.read_csv(filename)
    model_columns = [
        "EstBERT",
        "est-roberta",
        "LaBSE",
        "bertMulti",
        "distilbertMulti",
        "MiniLM_multi",
        "MiniLM-L12_multi",
        "multi_e5",
        "xml_roberta",
    ]
    row_count = data.shape[0]
    # A single-row file has no span to scale over; the unguarded
    # 10 / (row_count - 1) raised ZeroDivisionError there.
    step = 10 / (row_count - 1) if row_count > 1 else 0.0
    data["diversity"] = pd.Series(range(row_count)) * step
    for column in model_columns:
        if column in data.columns:
            data[column] = data[column] / data["expected_amount"]
    return data


def aggregate_and_rank(files, model_columns):
    """Average the normalised model scores of all files per diversity value.

    Args:
        files: Paths of per-document accuracy CSVs (each is passed to
            ``process_data``).
        model_columns: Names of the model columns to average.

    Returns:
        A DataFrame indexed by ``diversity`` holding the mean of each model
        column, or an empty DataFrame when there is nothing to aggregate.
    """
    frames = []
    for path in files:
        frames.append(process_data(path))
    if len(frames) == 0:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    if "diversity" in combined.columns:
        return combined.groupby("diversity")[model_columns].mean()
    return pd.DataFrame()


def get_best_combinations(mean_scores):
    """Pick the best-scoring model for every diversity value.

    Args:
        mean_scores: DataFrame indexed by diversity value with one column of
            mean scores per model.

    Returns:
        A DataFrame with the columns ``Diversity`` (``"diversity_<int>"``),
        ``Model`` and ``Score``, sorted by ``Score`` in descending order.
        It has no rows when ``mean_scores`` is empty.
    """
    column_names = ["Diversity", "Model", "Score"]
    if mean_scores.empty:
        return pd.DataFrame(columns=column_names)

    # One [label, winning model, winning score] triple per diversity value.
    winners = [
        [f"diversity_{int(level)}", scores.idxmax(), scores.max()]
        for level, scores in mean_scores.iterrows()
    ]
    ranking = pd.DataFrame(winners, columns=column_names)
    return ranking.sort_values(by="Score", ascending=False)


# NOTE: despite its name, diversity_levels fills the first slot of
# "models/diversity_accuracy_{...}_{model_type}", i.e. the same slot the
# scoring cell fills with its `size` values (10 / 50 / 200).
diversity_levels = ["10", "50", "200"]
model_types = ["M1", "M2"]
subfolders = ["ngram_1_1", "ngram_2_2", "ngram_3_3"]
model_columns = [
    "EstBERT",
    "est-roberta",
    "LaBSE",
    "bertMulti",
    "distilbertMulti",
    "MiniLM_multi",
    "MiniLM-L12_multi",
    "multi_e5",
    "xml_roberta",
]
common_analytical_base = "analytical_data"

# exist_ok=True replaces the exists()-then-makedirs() pair.
os.makedirs(common_analytical_base, exist_ok=True)

# For every configuration and n-gram directory: average the per-document
# scores and store the best model per diversity value.
for diversity in diversity_levels:
    for model_type in model_types:
        input_base = f"models/diversity_accuracy_{diversity}_{model_type}"

        for data_type in [
            "raw_text_lemma_data_LCF",
            "raw_text_lemma_data",
            "raw_text_data",
            "raw_text_data_LCF",
        ]:
            input_directory_path = os.path.join(input_base, data_type)
            output_directory_path = os.path.join(
                common_analytical_base,
                f"analytical_data_{diversity}_{model_type}",
                data_type,
            )
            os.makedirs(output_directory_path, exist_ok=True)

            for subfolder in subfolders:
                print(
                    f"--- Processing data for {subfolder} in {input_directory_path} ---"
                )
                directory_path = os.path.join(input_directory_path, subfolder)
                if os.path.isdir(directory_path):
                    # Sorted so the aggregation order is reproducible.
                    files = sorted(
                        os.path.join(directory_path, file)
                        for file in os.listdir(directory_path)
                        if file.endswith(".csv")
                    )
                else:
                    # A missing directory is treated like an empty one (a
                    # header-only CSV is written) instead of aborting the batch.
                    print(f"Input directory not found: {directory_path}")
                    files = []
                mean_scores = aggregate_and_rank(files, model_columns)
                best_combinations = get_best_combinations(mean_scores)

                output_csv_file = os.path.join(
                    output_directory_path,
                    f"{subfolder}_best_combinations.csv",
                )
                best_combinations.to_csv(output_csv_file, index=False)

                print(f"Results written to {output_csv_file}")
                print("\n" + "=" * 50 + "\n")


In [None]:
import pandas as pd
import os


def process_data(filename):
    """Load one per-document accuracy CSV and normalise it for aggregation.

    A ``diversity`` column is (re)computed from the row position so that the
    first row is 0 and the last row is 10. Every model column that is present
    is divided by ``expected_amount``, turning hit counts into the fraction of
    expected keywords found.

    Args:
        filename: Path of a CSV holding model hit counts and an
            ``expected_amount`` column.

    Returns:
        The loaded DataFrame with the ``diversity`` column set and the model
        columns normalised.
    """
    data = pd.read_csv(filename)
    model_columns = [
        "EstBERT",
        "est-roberta",
        "LaBSE",
        "bertMulti",
        "distilbertMulti",
        "MiniLM_multi",
        "MiniLM-L12_multi",
        "multi_e5",
        "xml_roberta",
    ]
    row_count = data.shape[0]
    # A single-row file has no span to scale over; the unguarded
    # 10 / (row_count - 1) raised ZeroDivisionError there.
    step = 10 / (row_count - 1) if row_count > 1 else 0.0
    data["diversity"] = pd.Series(range(row_count)) * step
    for column in model_columns:
        if column in data.columns:
            data[column] = data[column] / data["expected_amount"]
    return data


def aggregate_and_rank(files, model_columns):
    """Average the normalised model scores of all files per diversity value.

    Args:
        files: Paths of per-document accuracy CSVs (each is passed to
            ``process_data``).
        model_columns: Names of the model columns to average.

    Returns:
        A DataFrame indexed by ``diversity`` holding the mean of each model
        column, or an empty DataFrame when ``files`` is empty.
    """
    processed_files = [process_data(file) for file in files]
    if not processed_files:
        # pd.concat() raises ValueError on an empty list.
        return pd.DataFrame()
    aggregated_data = pd.concat(processed_files, ignore_index=True)
    return aggregated_data.groupby("diversity")[model_columns].mean()


def get_top_combinations(mean_scores, model_columns, top_n=10):
    """Return the ``top_n`` best (diversity, model) pairs by mean score.

    Args:
        mean_scores: DataFrame indexed by diversity value with one column of
            mean scores per model.
        model_columns: Not used by this function; kept so existing calls keep
            working.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with the columns ``Diversity``, ``Model`` and
        ``Mean Score``, sorted by ``Mean Score`` in descending order. It has
        no rows when ``mean_scores`` is empty.
    """
    column_names = ["Diversity", "Model", "Mean Score"]
    if mean_scores.empty:
        # Stacking an empty frame does not yield three columns, so assigning
        # the column names below would fail.
        return pd.DataFrame(columns=column_names)

    # Long format: one row per (diversity, model) pair.
    mean_scores_flat = mean_scores.stack().reset_index()
    mean_scores_flat.columns = column_names
    return mean_scores_flat.sort_values(by="Mean Score", ascending=False).head(top_n)


# NOTE: despite its name, diversity_levels fills the first slot of
# "models/diversity_accuracy_{...}_{model_type}", i.e. the same slot the
# scoring cell fills with its `size` values (10 / 50 / 200).
diversity_levels = ["10", "50", "200"]
model_types = ["M1", "M2"]
subfolders = ["ngram_1_1", "ngram_2_2", "ngram_3_3"]
model_columns = [
    "EstBERT",
    "est-roberta",
    "LaBSE",
    "bertMulti",
    "distilbertMulti",
    "MiniLM_multi",
    "MiniLM-L12_multi",
    "multi_e5",
    "xml_roberta",
]
common_analytical_base = "analytical_data"

# exist_ok=True replaces the exists()-then-makedirs() pairs.
os.makedirs(common_analytical_base, exist_ok=True)

# For every configuration: pool the per-document scores of all n-gram
# directories and store the ten best (diversity, model) pairs.
for diversity in diversity_levels:
    for model_type in model_types:
        input_base = f"models/diversity_accuracy_{diversity}_{model_type}"
        output_base = os.path.join(
            common_analytical_base, f"analytical_data_{diversity}_{model_type}"
        )

        for data_type in [
            "raw_text_lemma_data_LCF",
            "raw_text_lemma_data",
            "raw_text_data",
            "raw_text_data_LCF",
        ]:
            input_directory_path = os.path.join(input_base, data_type)
            output_directory_path = os.path.join(output_base, data_type)
            # Also creates output_base when it does not exist yet.
            os.makedirs(output_directory_path, exist_ok=True)

            all_files = []
            for subfolder in subfolders:
                directory_path = os.path.join(input_directory_path, subfolder)
                if not os.path.isdir(directory_path):
                    # Report and carry on instead of aborting the whole batch.
                    print(f"Input directory not found: {directory_path}")
                    continue
                # Sorted so the aggregation order is reproducible.
                all_files.extend(
                    sorted(
                        os.path.join(directory_path, file)
                        for file in os.listdir(directory_path)
                        if file.endswith(".csv")
                    )
                )

            if not all_files:
                # Nothing to aggregate for this configuration.
                print(f"No input files found under {input_directory_path}; skipping")
                continue

            mean_scores = aggregate_and_rank(all_files, model_columns)
            top_combinations = get_top_combinations(mean_scores, model_columns)

            output_csv_file = os.path.join(
                output_directory_path, "across_all_ngrams_top_10_combinations.csv"
            )
            top_combinations.to_csv(output_csv_file, index=False)

            print(f"Results written to {output_csv_file}")
            print("\n" + "=" * 50 + "\n")


In [3]:
import pandas as pd
import os


def load_keywords(file_path, limit=None):
    """Read the keywords stored in the first column of a CSV file.

    The file is parsed with pandas' default (comma) separator. Each value of
    the first column is then cut at the first ``;``, so a row such as
    ``word;0.93`` yields ``word``. Keywords are lower-cased and de-duplicated
    in order of first appearance.

    Args:
        file_path: Path of the CSV file.
        limit: If truthy, keep only the first ``limit`` distinct keywords.

    Returns:
        A set of keyword strings. If the file cannot be read or parsed, the
        error is printed and an empty set is returned.
    """
    try:
        df = pd.read_csv(file_path)
    except (OSError, ValueError) as e:
        # Missing/unreadable file, or pandas' empty-data/parser/decoding
        # errors (all ValueError subclasses). Best effort: report and go on.
        print(f"Error reading {file_path}: {e}")
        return set()

    # dropna(): a missing cell is not a keyword (it used to end up in the set
    # as NaN). astype(str): a non-string first cell made the former
    # `";" in df.iloc[0, 0]` test raise, which discarded the whole file.
    first_column = df.iloc[:, 0].dropna().astype(str)
    # Splitting is a no-op for values without ";", so every row gets the same
    # treatment instead of deciding from the first row only.
    keywords = first_column.str.split(";").str[0].str.lower().unique()
    if limit:
        keywords = keywords[:limit]
    return set(keywords)


def compare_keywords(base_keywords, comparison_files, limit):
    """Count the keywords each comparison file shares with ``base_keywords``.

    Args:
        base_keywords: Set of reference keywords.
        comparison_files: Paths of the keyword files to compare against.
        limit: Passed to ``load_keywords`` as its ``limit`` for each file.

    Returns:
        A dict mapping each path to the number of keywords it has in common
        with ``base_keywords``.
    """
    return {
        path: len(base_keywords.intersection(load_keywords(path, limit=limit)))
        for path in comparison_files
    }


def main():
    """Score extracted keyword lists against the philologist keyword lists.

    For every philologist directory, comparison directory and keyword limit,
    each philologist file is compared with the same-named file in the
    comparison directory. The per-file score is the number of shared keywords
    divided by the number of philologist keywords (0 when there are none).
    The mean score of each combination is written to
    ``analytical_data/results.csv`` with the columns ``M``, ``Method``,
    ``word_limit``, ``wordtype`` and ``score``.
    """
    data_sets = [
        {
            "label": "lemmas",
            "base_dirs": [
                "filol_scores/keywords_lemma/philologist_M1",
                "filol_scores/keywords_lemma/philologist_M2",
            ],
            "comparison_dirs": [
                "scores/SimpleMaths/lemmas",
                "scores/TextRank/lemmas",
                "scores/SimpleMaths/lemmas_LCF",
                "scores/TextRank/lemmas_LCF",
            ],
        },
        {
            "label": "words",
            "base_dirs": [
                "filol_scores/keywords/philologist_M1",
                "filol_scores/keywords/philologist_M2",
            ],
            "comparison_dirs": [
                "scores/SimpleMaths/words",
                "scores/TextRank/words",
                "scores/SimpleMaths/words_LCF",
                "scores/TextRank/words_LCF",
            ],
        },
    ]

    keyword_limits = [200, 50, 10]
    results = []

    for data_set in data_sets:
        for base_dir in data_set["base_dirs"]:
            # "philologist_M1" -> "M1"
            M = base_dir.split("/")[-1].split("_")[1]
            # The listing depends only on base_dir, so it is built once here
            # instead of once per comparison directory and keyword limit.
            # Sorted so the averaging order is reproducible.
            file_paths = sorted(
                os.path.join(base_dir, file)
                for file in os.listdir(base_dir)
                if file.endswith(".csv")
            )

            for comparison_dir in data_set["comparison_dirs"]:
                # "scores/<method>/<wordtype>"
                method, wordtype = comparison_dir.split("/")[-2:]

                for keyword_limit in keyword_limits:
                    total_scores = []
                    for base_file in file_paths:
                        base_keywords = load_keywords(base_file, limit=keyword_limit)
                        comparison_file = os.path.join(
                            comparison_dir, os.path.basename(base_file)
                        )
                        common_keywords = compare_keywords(
                            base_keywords, [comparison_file], keyword_limit
                        )
                        if base_keywords:
                            score = common_keywords[comparison_file] / len(
                                base_keywords
                            )
                        else:
                            score = 0
                        total_scores.append(score)

                    # No row is recorded when the directory holds no CSV files.
                    if total_scores:
                        results.append(
                            {
                                "M": M,
                                "Method": method,
                                "word_limit": keyword_limit,
                                "wordtype": wordtype,
                                "score": sum(total_scores) / len(total_scores),
                            }
                        )

    df = pd.DataFrame(results)
    save_path = "analytical_data/results.csv"
    os.makedirs("analytical_data", exist_ok=True)
    df.to_csv(save_path, index=False)
    print(f"Results saved to {save_path}")


# Run the comparison when this cell/script is executed directly; the captured
# output below ("Results saved to ...") shows the guard passes in the notebook.
if __name__ == "__main__":
    main()


Results saved to analytical_data/results.csv
