In [2]:
import pandas as pd

def load_and_process_data(filenames):
    # Combine all CSV files into a single DataFrame
    data_frames = [pd.read_csv(filename) for filename in filenames]
    combined_df = pd.concat(data_frames, ignore_index=True)
    
    # Calculate total possible words in each row
    combined_df['Total_Words'] = combined_df['words'].apply(lambda x: len(x.split('; ')))
    
    # List of model columns (edit this list based on your actual model columns)
    model_columns = ['EstBERT', 'est-roberta', 'LaBSE', 'bertMulti', 'distilbertMulti', 'MiniLM_multi', 'MiniLM-L12_multi', 'multi_e5', 'xml_roberta']
    
    # Normalize scores by the total possible words for each model
    for model in model_columns:
        combined_df[model + '_Normalized'] = combined_df[model] / combined_df['Total_Words']
    
    return combined_df

def calculate_average_scores(df, model_columns):
    # Calculate average normalized score for each model
    average_scores = {model: df[model + '_Normalized'].mean() for model in model_columns}
    return average_scores

def find_best_model(average_scores):
    # Find the model with the highest average score
    best_model = max(average_scores, key=average_scores.get)
    return best_model

# List your CSV filenames here
filenames = ['t887.csv', 't896.csv', 't903.csv']  # Update this list with your file paths

# Processing data
combined_df = load_and_process_data(filenames)

# List of models (update this if it changes)
model_columns = ['EstBERT', 'est-roberta', 'LaBSE', 'bertMulti', 'distilbertMulti', 'MiniLM_multi', 'MiniLM-L12_multi', 'multi_e5', 'xml_roberta']

# Calculating scores
average_scores = calculate_average_scores(combined_df, model_columns)
best_model = find_best_model(average_scores)

# Output results
print("Average Normalized Scores by Model:", average_scores)
print("Best Model:", best_model)


Average Normalized Scores by Model: {'EstBERT': 0.49772727272727274, 'est-roberta': 0.3888888888888889, 'LaBSE': 0.39562289562289554, 'bertMulti': 0.33670033670033667, 'distilbertMulti': 0.5161616161616163, 'MiniLM_multi': 0.3061447811447812, 'MiniLM-L12_multi': 0.3971380471380472, 'multi_e5': 0.48265993265993257, 'xml_roberta': 0.39789562289562286}
Best Model: distilbertMulti


In [10]:
import pandas as pd
import os

# Base and output directories
base_path = "./models/unified_data/raw_text_data"
philologist_path = "./filol_scores/keywords/philologist_M1"
output_dir = "./models/diversity_accuracy/raw_text_data"

# Ngram directories to process
ngram_dirs = ["ngram_1_1", "ngram_2_2", "ngram_3_3"]

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Initialize counters and lists for tracking
processed_files_count = 0
unprocessed_files = []

# Function to process each directory
def process_ngram_directory(ngram_dir):
    global processed_files_count, unprocessed_files
    diversity_paths = [f"{base_path}/{ngram_dir}/diversity_{i}" for i in range(11)]
    
    for philologist_file in os.listdir(philologist_path):
        if philologist_file.endswith(".csv"):
            philologist_file_path = os.path.join(philologist_path, philologist_file)
            df_philologist = pd.read_csv(philologist_file_path, delimiter=';')
            philologist_keywords = df_philologist['keyword'].dropna().str.lower().unique()
            expected_amount = len(philologist_keywords)  # Total number of unique expected keywords
            model_names = ["EstBERT", "est-roberta", "LaBSE", "bertMulti", "distilbertMulti", "MiniLM_multi", "MiniLM-L12_multi", "multi_e5", "xml_roberta"]
            results = {model: [0]*11 for model in model_names}
            found_words = [{model: set() for model in model_names} for _ in range(11)]  # List of dicts to store found words uniquely per model and diversity

            # Check if any matching files exist in the diversity folder
            matching_files_exist = False
            for idx, path in enumerate(diversity_paths):
                diversity_file_path = os.path.join(path, philologist_file)
                if os.path.exists(diversity_file_path):
                    matching_files_exist = True
                    break
            
            if matching_files_exist:
                for idx, path in enumerate(diversity_paths):
                    diversity_file_path = os.path.join(path, philologist_file)
                    if os.path.exists(diversity_file_path):
                        df_diversity = pd.read_csv(diversity_file_path, delimiter=';', names=model_names)
                        for model in model_names:
                            entries = df_diversity[model].dropna().str.lower().str.split()
                            model_specific_words = set()
                            for entry in entries:
                                matched_words = set(entry) & set(philologist_keywords)
                                model_specific_words.update(matched_words)
                            found_words[idx][model] = model_specific_words  # Update with unique words for this model
                            results[model][idx] = len(model_specific_words)  # Count unique words
                        processed_files_count += 1

                # Prepare output DataFrame
                output_df = pd.DataFrame(results, index=[f"diversity_{i}" for i in range(11)])
                output_df.reset_index(inplace=True)
                output_df.rename(columns={'index': 'diversity'}, inplace=True)
                output_df['expected_amount'] = expected_amount  # Add expected_amount as a new column
                output_df['words'] = ['; '.join(set().union(*(d.values()))) for d in found_words]  # Combine words from all models

                # Save the results
                ngram_output_dir = os.path.join(output_dir, ngram_dir)
                os.makedirs(ngram_output_dir, exist_ok=True)
                output_path = os.path.join(ngram_output_dir, philologist_file)
                output_df.to_csv(output_path, index=False)
            else:
                unprocessed_files.append(philologist_file)

# Process each ngram directory
for ngram_dir in ngram_dirs:
    process_ngram_directory(ngram_dir)

# Print summary of processed and unprocessed files
print(f"Total number of files processed: {processed_files_count}")
if unprocessed_files:
    print("Files not processed across all folders:")
    for file in unprocessed_files:
        print(file)
else:
    print("All files were successfully processed.")


Total number of files processed: 5940
Files not processed across all folders:
t100098.csv
t100101.csv
t100218.csv
t100506.csv
t100648.csv
t101044.csv
t101345.csv
t103140.csv
t1064.csv
t1189.csv
t120660.csv
t12088.csv
t1423.csv
t1505.csv
t1637.csv
t177159.csv
t179339.csv
t179429.csv
t179494.csv
t179497.csv
t100098.csv
t100101.csv
t100218.csv
t100506.csv
t100648.csv
t101044.csv
t101345.csv
t103140.csv
t1064.csv
t1189.csv
t120660.csv
t12088.csv
t1423.csv
t1505.csv
t1637.csv
t177159.csv
t179339.csv
t179429.csv
t179494.csv
t179497.csv
t100098.csv
t100101.csv
t100218.csv
t100506.csv
t100648.csv
t101044.csv
t101345.csv
t103140.csv
t1064.csv
t1189.csv
t120660.csv
t12088.csv
t1423.csv
t1505.csv
t1637.csv
t177159.csv
t179339.csv
t179429.csv
t179494.csv
t179497.csv


In [28]:
import pandas as pd

def process_data(filename):
    data = pd.read_csv(filename)
    model_columns = ['EstBERT', 'est-roberta', 'LaBSE', 'bertMulti', 'distilbertMulti', 'MiniLM_multi', 'MiniLM-L12_multi', 'multi_e5', 'xml_roberta']
    for column in model_columns:
        data[column] = data[column] / data['expected_amount']
    return data

def aggregate_and_rank(files):
    # Aggregate data from multiple files
    aggregated_data = pd.concat([process_data(file) for file in files], ignore_index=True)

    # Calculate mean scores across all entries for each model and diversity
    mean_scores = aggregated_data.groupby('diversity')[['EstBERT', 'est-roberta', 'LaBSE', 'bertMulti', 'distilbertMulti', 'MiniLM_multi', 'MiniLM-L12_multi', 'multi_e5', 'xml_roberta']].mean()

    # Best model for each diversity
    best_models_per_diversity = mean_scores.idxmax(axis=1)
    
    # Best diversity for each model
    best_diversities_per_model = mean_scores.idxmax(axis=0)

    # Overall best model and diversity combination
    best_overall_diversity = mean_scores.mean(axis=1).idxmax()
    best_overall_model = mean_scores.loc[best_overall_diversity].idxmax()

    # Ranking of diversities and models by their average scores
    diversity_rankings = mean_scores.mean(axis=1).sort_values(ascending=False)
    model_rankings = mean_scores.mean(axis=0).sort_values(ascending=False)

    return {
        'Best Models per Diversity': best_models_per_diversity,
        'Best Diversities per Model': best_diversities_per_model,
        'Best Overall Model and Diversity': (best_overall_model, best_overall_diversity),
        'Diversity Rankings': diversity_rankings,
        'Model Rankings': model_rankings
    }

# List of files to process
files = ['t887.csv', 't903.csv']

# Get results
results = aggregate_and_rank(files)
for key, value in results.items():
    print(f"{key}:\n{value}\n")


Best Models per Diversity:
diversity
diversity_0              EstBERT
diversity_1              EstBERT
diversity_10            multi_e5
diversity_2                LaBSE
diversity_3      distilbertMulti
diversity_4      distilbertMulti
diversity_5      distilbertMulti
diversity_6     MiniLM-L12_multi
diversity_7     MiniLM-L12_multi
diversity_8            bertMulti
diversity_9             multi_e5
dtype: object

Best Diversities per Model:
EstBERT             diversity_0
est-roberta         diversity_6
LaBSE               diversity_2
bertMulti           diversity_8
distilbertMulti     diversity_5
MiniLM_multi        diversity_3
MiniLM-L12_multi    diversity_6
multi_e5            diversity_9
xml_roberta         diversity_9
dtype: object

Best Overall Model and Diversity:
('LaBSE', 'diversity_2')

Diversity Rankings:
diversity
diversity_2     0.492593
diversity_0     0.490741
diversity_1     0.481481
diversity_4     0.466667
diversity_5     0.466667
diversity_3     0.462037
diversity_6   

Best Models per Diversity:
diversity
diversity_0               LaBSE
diversity_1               LaBSE
diversity_10    distilbertMulti
diversity_2               LaBSE
diversity_3               LaBSE
diversity_4               LaBSE
diversity_5               LaBSE
diversity_6     distilbertMulti
diversity_7     distilbertMulti
diversity_8            multi_e5
diversity_9            multi_e5
dtype: object

Best Diversities per Model:
EstBERT              diversity_0
est-roberta          diversity_1
LaBSE                diversity_0
bertMulti            diversity_8
distilbertMulti      diversity_7
MiniLM_multi         diversity_4
MiniLM-L12_multi     diversity_2
multi_e5             diversity_9
xml_roberta         diversity_10
dtype: object

Best Overall Model and Diversity:
('LaBSE', 'diversity_0')

Diversity Rankings:
diversity
diversity_0     0.248379
diversity_2     0.248248
diversity_1     0.247992
diversity_3     0.243845
diversity_4     0.241258
diversity_8     0.240145
diversity_6     

In [39]:
import pandas as pd
import os

def process_data(filename):
    data = pd.read_csv(filename)
    model_columns = ['EstBERT', 'est-roberta', 'LaBSE', 'bertMulti', 'distilbertMulti', 'MiniLM_multi', 'MiniLM-L12_multi', 'multi_e5', 'xml_roberta']
    for column in model_columns:
        # Normalizing the scores by the expected amount
        data[column] = data[column] / data['expected_amount']
    return data

def aggregate_and_rank(files):
    # Aggregate data from multiple files
    aggregated_data = pd.concat([process_data(file) for file in files], ignore_index=True)

    # Calculate mean scores across all entries for each model and diversity
    mean_scores = aggregated_data.groupby('diversity')[model_columns].mean()

    # Calculate rankings and best performances
    best_models_per_diversity = mean_scores.idxmax(axis=1)
    best_diversities_per_model = mean_scores.idxmax(axis=0)
    best_overall_diversity = mean_scores.mean(axis=1).idxmax()
    best_overall_model = mean_scores.loc[best_overall_diversity].idxmax()
    diversity_rankings = mean_scores.mean(axis=1).sort_values(ascending=False)
    model_rankings = mean_scores.mean(axis=0).sort_values(ascending=False)

    return {
        'Best Models per Diversity': best_models_per_diversity,
        'Best Diversities per Model': best_diversities_per_model,
        'Best Overall Model and Diversity': (best_overall_model, best_overall_diversity),
        'Diversity Rankings': diversity_rankings,
        'Model Rankings': model_rankings
    }

base_directory_path = 'models/diversity_accuracy/raw_text_data'
subfolders = ['ngram_1_1', 'ngram_2_2', 'ngram_3_3']

for subfolder in subfolders:
    print(f"--- Results for {subfolder} ---\n")
    directory_path = os.path.join(base_directory_path, subfolder)
    files = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.csv')]
    results = aggregate_and_rank(files)

    print("=== Best Models per Diversity ===")
    print(results['Best Models per Diversity'])

    print("\n=== Best Diversities per Model ===")
    print(results['Best Diversities per Model'])

    print("\n=== Best Overall Model and Diversity ===")
    print(f"Best Model: {results['Best Overall Model and Diversity'][0]} at Diversity: {results['Best Overall Model and Diversity'][1]}")

    print("\n=== Diversity Rankings ===")
    print("Average normalized scores per diversity (the mean score across all models for each diversity):")
    print(results['Diversity Rankings'].rename('Average Normalized Score'))

    print("\n=== Model Rankings ===")
    print("Average normalized scores per model (the mean score across all diversities for each model):")
    print(results['Model Rankings'].rename('Average Normalized Score'))

    print("\n" + "="*50 + "\n")


--- Results for ngram_1_1 ---

=== Best Models per Diversity ===
diversity
diversity_0               LaBSE
diversity_1               LaBSE
diversity_10    distilbertMulti
diversity_2               LaBSE
diversity_3               LaBSE
diversity_4               LaBSE
diversity_5               LaBSE
diversity_6     distilbertMulti
diversity_7     distilbertMulti
diversity_8            multi_e5
diversity_9            multi_e5
dtype: object

=== Best Diversities per Model ===
EstBERT              diversity_0
est-roberta          diversity_1
LaBSE                diversity_0
bertMulti            diversity_8
distilbertMulti      diversity_7
MiniLM_multi         diversity_4
MiniLM-L12_multi     diversity_2
multi_e5             diversity_9
xml_roberta         diversity_10
dtype: object

=== Best Overall Model and Diversity ===
Best Model: LaBSE at Diversity: diversity_0

=== Diversity Rankings ===
Average normalized scores per diversity (the mean score across all models for each diversity):
div

In [33]:
import pandas as pd
import os
import numpy as np

def process_data(filename):
    data = pd.read_csv(filename)
    model_columns = ['EstBERT', 'est-roberta', 'LaBSE', 'bertMulti', 'distilbertMulti', 'MiniLM_multi', 'MiniLM-L12_multi', 'multi_e5', 'xml_roberta']
    for column in model_columns:
        data[column] = data[column] / data['expected_amount']
    return data

def aggregate_and_rank(files):
    # Aggregate data from multiple files
    aggregated_data = pd.concat([process_data(file) for file in files], ignore_index=True)

    # Calculate mean scores across all entries for each model and diversity
    mean_scores = aggregated_data.groupby('diversity')[model_columns].mean()

    return mean_scores

# Base directory path
base_directory_path = 'models/diversity_accuracy/raw_text_data'

# Subfolder names
subfolders = ['ngram_1_1', 'ngram_2_2', 'ngram_3_3']

# Initialize a DataFrame to collect all aggregated data from each n-gram
all_ngrams_data = pd.DataFrame()

# Collect data from each subfolder
for subfolder in subfolders:
    directory_path = os.path.join(base_directory_path, subfolder)
    files = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.csv')]
    ngram_data = aggregate_and_rank(files)
    ngram_data['ngram'] = subfolder  # Add a column to track n-gram source
    all_ngrams_data = pd.concat([all_ngrams_data, ngram_data], ignore_index=False)

# Exclude the 'ngram' column for mean calculation
numeric_data = all_ngrams_data.select_dtypes(include=[np.number])
overall_mean_scores = numeric_data.groupby(numeric_data.index).mean()
overall_std_devs = numeric_data.groupby(numeric_data.index).std()

# Identify the overall best model and diversity
best_model_per_diversity = overall_mean_scores.idxmax(axis=1)
best_diversity = overall_mean_scores.mean(axis=1).idxmax()
best_model = overall_mean_scores.loc[best_diversity].idxmax()

# Display the results
print("Overall Best Model per Diversity:")
print(best_model_per_diversity)
print("\nStatistical Numbers (Mean Scores):")
print(overall_mean_scores)
print("\nStatistical Numbers (Standard Deviations):")
print(overall_std_devs)
print("\nOverall Best Diversity:", best_diversity)
print("Overall Best Model at this Diversity:", best_model)


Overall Best Model per Diversity:
diversity
diversity_0             EstBERT
diversity_1             EstBERT
diversity_10    distilbertMulti
diversity_2             EstBERT
diversity_3             EstBERT
diversity_4             EstBERT
diversity_5               LaBSE
diversity_6               LaBSE
diversity_7               LaBSE
diversity_8               LaBSE
diversity_9               LaBSE
dtype: object

Statistical Numbers (Mean Scores):
               EstBERT  est-roberta     LaBSE  bertMulti  distilbertMulti  \
diversity                                                                   
diversity_0   0.422903     0.370988  0.379979   0.332841         0.303094   
diversity_1   0.423294     0.372361  0.385398   0.333995         0.310107   
diversity_10  0.282627     0.325807  0.402252   0.375972         0.406655   
diversity_2   0.425702     0.373392  0.393347   0.337309         0.317320   
diversity_3   0.423243     0.369304  0.402481   0.341431         0.326545   
diversity_4   0

In [40]:
import pandas as pd
import os

def process_data(filename):
    data = pd.read_csv(filename)
    model_columns = ['EstBERT', 'est-roberta', 'LaBSE', 'bertMulti', 'distilbertMulti', 'MiniLM_multi', 'MiniLM-L12_multi', 'multi_e5', 'xml_roberta']
    for column in model_columns:
        data[column] = data[column] / data['expected_amount']
    return data

def aggregate_and_rank(files):
    # Aggregate data from multiple files
    aggregated_data = pd.concat([process_data(file) for file in files], ignore_index=True)

    # Calculate mean scores across all entries for each model and diversity
    mean_scores = aggregated_data.groupby('diversity')[model_columns].mean()

    return mean_scores

def get_top_combinations(mean_scores, top_n=10):
    # Flatten the DataFrame to have model and diversity as a MultiIndex
    mean_scores_flat = mean_scores.stack().reset_index()
    mean_scores_flat.columns = ['Diversity', 'Model', 'Mean Score']

    # Sort the flattened DataFrame by mean scores in descending order
    top_combinations = mean_scores_flat.sort_values(by='Mean Score', ascending=False).head(top_n)

    return top_combinations

base_directory_path = 'models/diversity_accuracy/raw_text_data'
subfolders = ['ngram_1_1', 'ngram_2_2', 'ngram_3_3']

for subfolder in subfolders:
    print(f"--- Results for {subfolder} ---\n")
    directory_path = os.path.join(base_directory_path, subfolder)
    files = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.csv')]
    mean_scores = aggregate_and_rank(files)
    
    top_combinations = get_top_combinations(mean_scores)
    print("=== Top 10 Best Model and Diversity Combinations ===")
    for index, row in top_combinations.iterrows():
        print(f"{index + 1}. Model: {row['Model']} at Diversity: {row['Diversity']} with Mean Score: {row['Mean Score']:.3f}")

    print("\n" + "="*50 + "\n")


--- Results for ngram_1_1 ---

=== Top 10 Best Model and Diversity Combinations ===
3. Model: LaBSE at Diversity: diversity_0 with Mean Score: 0.352
12. Model: LaBSE at Diversity: diversity_1 with Mean Score: 0.347
30. Model: LaBSE at Diversity: diversity_2 with Mean Score: 0.340
39. Model: LaBSE at Diversity: diversity_3 with Mean Score: 0.314
48. Model: LaBSE at Diversity: diversity_4 with Mean Score: 0.299
57. Model: LaBSE at Diversity: diversity_5 with Mean Score: 0.282
98. Model: multi_e5 at Diversity: diversity_9 with Mean Score: 0.280
34. Model: MiniLM-L12_multi at Diversity: diversity_2 with Mean Score: 0.280
1. Model: EstBERT at Diversity: diversity_0 with Mean Score: 0.278
43. Model: MiniLM-L12_multi at Diversity: diversity_3 with Mean Score: 0.278


--- Results for ngram_2_2 ---

=== Top 10 Best Model and Diversity Combinations ===
37. Model: EstBERT at Diversity: diversity_3 with Mean Score: 0.476
71. Model: multi_e5 at Diversity: diversity_6 with Mean Score: 0.473
46. Mode

In [41]:
import pandas as pd
import os

def process_data(filename):
    data = pd.read_csv(filename)
    model_columns = ['EstBERT', 'est-roberta', 'LaBSE', 'bertMulti', 'distilbertMulti', 'MiniLM_multi', 'MiniLM-L12_multi', 'multi_e5', 'xml_roberta']
    for column in model_columns:
        data[column] = data[column] / data['expected_amount']
    return data

def aggregate_and_rank(files):
    # Aggregate data from multiple files
    aggregated_data = pd.concat([process_data(file) for file in files], ignore_index=True)

    # Calculate mean scores across all entries for each model and diversity
    mean_scores = aggregated_data.groupby('diversity')[model_columns].mean()

    return mean_scores

def get_top_combinations(mean_scores, top_n=10):
    # Flatten the DataFrame to have model and diversity as a MultiIndex
    mean_scores_flat = mean_scores.stack().reset_index()
    mean_scores_flat.columns = ['Diversity', 'Model', 'Mean Score']

    # Sort the flattened DataFrame by mean scores in descending order
    top_combinations = mean_scores_flat.sort_values(by='Mean Score', ascending=False).head(top_n)

    return top_combinations

base_directory_path = 'models/diversity_accuracy/raw_text_data'
subfolders = ['ngram_1_1', 'ngram_2_2', 'ngram_3_3']

# Collect all files from all subfolders
all_files = []
for subfolder in subfolders:
    directory_path = os.path.join(base_directory_path, subfolder)
    all_files.extend([os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.csv')])

# Aggregate and calculate mean scores across all n-grams
mean_scores = aggregate_and_rank(all_files)

# Get top combinations across all n-grams
top_combinations = get_top_combinations(mean_scores)
print("=== Top 10 Best Model and Diversity Combinations Across All N-grams ===")
for index, row in top_combinations.iterrows():
    print(f"{index + 1}. Model: {row['Model']} at Diversity: {row['Diversity']} with Mean Score: {row['Mean Score']:.3f}")

print("\n" + "="*50 + "\n")


=== Top 10 Best Model and Diversity Combinations Across All N-grams ===
28. Model: EstBERT at Diversity: diversity_2 with Mean Score: 0.426
10. Model: EstBERT at Diversity: diversity_1 with Mean Score: 0.423
37. Model: EstBERT at Diversity: diversity_3 with Mean Score: 0.423
66. Model: LaBSE at Diversity: diversity_6 with Mean Score: 0.423
1. Model: EstBERT at Diversity: diversity_0 with Mean Score: 0.423
71. Model: multi_e5 at Diversity: diversity_6 with Mean Score: 0.419
75. Model: LaBSE at Diversity: diversity_7 with Mean Score: 0.418
80. Model: multi_e5 at Diversity: diversity_7 with Mean Score: 0.417
46. Model: EstBERT at Diversity: diversity_4 with Mean Score: 0.416
57. Model: LaBSE at Diversity: diversity_5 with Mean Score: 0.415


