In [2]:
import os


def verify_directory_structure(base_path=None, expected_files_count=180):
    """Check that the ``models`` result tree is complete.

    The expected layout is
    ``<base_path>/models/<text variant>/<model>/<ngram>/<diversity>/`` and
    every ``diversity_*`` leaf directory must contain exactly
    ``expected_files_count`` entries whose names end in ``.csv``. The check
    stops at the first problem, prints a message describing it and returns
    ``False``.

    Parameters
    ----------
    base_path : str or os.PathLike, optional
        Directory that contains the ``models`` folder. Defaults to the
        current working directory.
    expected_files_count : int, optional
        Required number of ``.csv`` entries per leaf directory (default 180).

    Returns
    -------
    bool
        ``True`` when every expected directory exists and every leaf holds
        the expected number of CSV files, ``False`` otherwise.
    """
    if base_path is None:
        base_path = os.getcwd()
    models_path = os.path.join(base_path, "models")

    expected_subdirs = [
        "raw_text_lemma_data_LCF",
        "raw_text_lemma_data",
        "raw_text_data_LCF",
        "raw_text_data",
    ]
    expected_models = [
        "EstBERT",
        "est-roberta",
        "LaBSE",
        "bertMulti",
        "distilbertMulti",
        "MiniLM_multi",
        "MiniLM-L12_multi",
        "multi_e5",
        "xml_roberta",
    ]
    expected_ngrams = ["ngram_1_1", "ngram_2_2", "ngram_3_3"]
    expected_diversities = [f"diversity_{i}" for i in range(11)]

    def directory_present(path):
        # isdir (not exists): a regular file sitting where a directory is
        # expected must be reported, otherwise os.listdir below would raise.
        if os.path.isdir(path):
            return True
        print(f"Directory missing: {path}")
        return False

    if not directory_present(models_path):
        return False

    for sub_dir in expected_subdirs:
        full_sub_dir = os.path.join(models_path, sub_dir)
        if not directory_present(full_sub_dir):
            return False
        for model in expected_models:
            full_model_dir = os.path.join(full_sub_dir, model)
            if not directory_present(full_model_dir):
                return False
            for ngram in expected_ngrams:
                full_ngram_dir = os.path.join(full_model_dir, ngram)
                if not directory_present(full_ngram_dir):
                    return False
                for diversity in expected_diversities:
                    full_diversity_dir = os.path.join(full_ngram_dir, diversity)
                    if not directory_present(full_diversity_dir):
                        return False
                    # Only *.csv entries count; other files are ignored.
                    csv_files = [
                        f for f in os.listdir(full_diversity_dir) if f.endswith(".csv")
                    ]
                    if len(csv_files) != expected_files_count:
                        print(
                            f"Error: Expected {expected_files_count} CSV files in {full_diversity_dir}, found {len(csv_files)}"
                        )
                        return False
    print("All directories and files are correctly structured and present.")
    return True


# Run the check; as the cell's final expression, the boolean result is displayed too.
verify_directory_structure()


All directories and files are correctly structured and present.


True

In [1]:
import os
import pandas as pd


def _read_top_entries(file_path, top_n):
    """Return the first ``top_n`` values of the first column of a ``;``-separated CSV."""
    frame = pd.read_csv(file_path, delimiter=";", header=0)
    return frame.iloc[:top_n, 0]


def _pad_with_na(series, length):
    """Return ``series`` extended with ``pd.NA`` up to ``length`` entries."""
    shortfall = length - len(series)
    if shortfall <= 0:
        return series
    return pd.concat(
        [series, pd.Series([pd.NA] * shortfall)],
        ignore_index=True,
    )


def create_unified_data(top_n, base_path=None):
    """Merge the per-model CSV results into one CSV per input file.

    For every text variant / n-gram range / diversity combination, each
    ``*.csv`` file found in the first model's directory is looked up in every
    model's directory. The first ``top_n`` values of the first column are
    taken from each copy that exists and written side by side, one column per
    model (named after the model), to
    ``<base_path>/models/unified_data_<top_n>/<variant>/<ngram>/<diversity>/<file>``.
    Columns shorter than the longest one are padded with missing values.
    Models that lack a given file are left out of that file's output.

    Parameters
    ----------
    top_n : int
        Maximum number of rows to keep from each model's file.
    base_path : str or os.PathLike, optional
        Directory that contains the ``models`` folder. Defaults to the
        current working directory.

    Raises
    ------
    FileNotFoundError
        If the first model's directory is missing for any combination.
    """
    if base_path is None:
        base_path = os.getcwd()
    models_path = os.path.join(base_path, "models")
    unified_path = os.path.join(models_path, f"unified_data_{top_n}")
    os.makedirs(unified_path, exist_ok=True)

    subdirs = [
        "raw_text_lemma_data_LCF",
        "raw_text_lemma_data",
        "raw_text_data_LCF",
        "raw_text_data",
    ]
    ngrams = ["ngram_1_1", "ngram_2_2", "ngram_3_3"]
    diversities = [f"diversity_{i}" for i in range(11)]
    models = [
        "EstBERT",
        "est-roberta",
        "LaBSE",
        "bertMulti",
        "distilbertMulti",
        "MiniLM_multi",
        "MiniLM-L12_multi",
        "multi_e5",
        "xml_roberta",
    ]

    for subdir in subdirs:
        for ngram in ngrams:
            for diversity in diversities:
                unified_sub_path = os.path.join(unified_path, subdir, ngram, diversity)
                os.makedirs(unified_sub_path, exist_ok=True)
                model_paths = {
                    model: os.path.join(models_path, subdir, model, ngram, diversity)
                    for model in models
                }

                # The first model's directory defines which files get merged.
                # Only regular *.csv files are used, so stray entries such as
                # a ".ipynb_checkpoints" folder cannot break the CSV parsing
                # or leak into the output.
                reference_dir = model_paths[models[0]]
                file_names = sorted(
                    name
                    for name in os.listdir(reference_dir)
                    if name.endswith(".csv")
                    and os.path.isfile(os.path.join(reference_dir, name))
                )

                for file_name in file_names:
                    columns = {}
                    for model, path in model_paths.items():
                        file_path = os.path.join(path, file_name)
                        if os.path.isfile(file_path):
                            columns[model] = _read_top_entries(file_path, top_n)

                    max_length = max(
                        (len(column) for column in columns.values()), default=0
                    )
                    combined_df = pd.DataFrame(
                        {
                            model: _pad_with_na(column, max_length)
                            for model, column in columns.items()
                        }
                    )
                    combined_df.to_csv(
                        os.path.join(unified_sub_path, file_name), index=False, sep=";"
                    )


# Build one unified data set per cut-off: the top 200, 50 and 10 entries.
for top_n in (200, 50, 10):
    create_unified_data(top_n=top_n)
