# **SETUP**

In [1]:
import csv
import logging
import pandas as pd
import numpy as np
import torch

from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
from tqdm import tqdm
from typing import List, Dict, Tuple, Iterable, Type, Union, Callable, Optional

In [2]:
def split_dataframes(original_df, alternative_naming=False):
    """
    Splits the original dataframe into two dataframes: one for German and one for French.

    Parameters:
    original_df (DataFrame): The original dataframe.

    Returns:
    Tuple[DataFrame, DataFrame]: A tuple containing the German and French dataframes.
    """

    # Creating the German dataframe
    if alternative_naming: # ALTENRATIVE NAMING FORMAT
      german_df = original_df[['de_sent1', 'fr_sent1', 'de_adv1', 'de_adv2', 'de_adv3', 'de_adv4']].copy()
      german_df.rename(columns={
            'de_sent1': 'Original',
            'fr_sent1': 'French',
            'de_adv1': 'Sentences1',
            'de_adv2': 'Sentences2',
            'de_adv3': 'Sentences3',
            'de_adv4': 'Sentences4'
        }, inplace=True)
        # Creating the French dataframe
      french_df = original_df[['de_sent1', 'fr_sent1', 'fr_adv1', 'fr_adv2', 'fr_adv3', 'fr_adv4']].copy()
      french_df.rename(columns={
            'de_sent1': 'German',
            'fr_sent1': 'Original',
            'fr_adv1': 'Sentences1',
            'fr_adv2': 'Sentences2',
            'fr_adv3': 'Sentences3',
            'fr_adv4': 'Sentences4'
        }, inplace=True)
    else: # WMT Format
      german_df = original_df[['German', 'French', 'DESentence1', 'DESentence2', 'DESentence3', 'DESentence4']].copy()
      german_df.rename(columns={
          'German': 'Original',
          'de_adv1': 'Sentences1',
          'de_adv2': 'Sentences2',
          'de_adv3': 'Sentences3',
          'de_adv4': 'Sentences4'
      }, inplace=True)

      # Creating the French dataframe
      french_df = original_df[['German', 'French', 'FRSentence1', 'FRSentence2', 'FRSentence3', 'FRSentence4']].copy()
      french_df.rename(columns={
          'French': 'Original',
          'fr_adv1': 'Sentences1',
          'fr_adv2': 'Sentences2',
          'fr_adv3': 'Sentences3',
          'fr_adv4': 'Sentences4'
      }, inplace=True)

    return german_df, french_df

In [3]:
from pathlib import Path

# Assuming your notebook is inside "notebooks/" and the data is in "../evaluation_sets/"
evaluation_sets_dir = Path('evaluation_sets')

# --- filenames ---
wmt19_adv_df_file              = evaluation_sets_dir / 'CLSD_wmt2019_adversarial_dataset.csv'

wmt21_adv_df_file              = evaluation_sets_dir / 'CLSD_wmt2021_adversarial_dataset.csv'

wmt19_adv_df_m2m_mt_file       = evaluation_sets_dir / 'MT_M2M_wmt2019_adversarial_dataset.csv'
wmt21_adv_df_m2m_mt_file       = evaluation_sets_dir / 'MT_M2M_wmt2021_adversarial_dataset.csv'



wmt19_adv_df = pd.read_csv(wmt19_adv_df_file)
wmt21_adv_df = pd.read_csv(wmt21_adv_df_file)


wmt19_adv_df_m2m_mt = pd.read_csv(wmt19_adv_df_m2m_mt_file)
wmt21_adv_df_m2m_mt = pd.read_csv(wmt21_adv_df_m2m_mt_file)

# Now we can use this function on the loaded data
wmt19_adv_df.rename(columns={
            'German' : 'de_sent1',
            'French' : 'fr_sent1',
}, inplace=True)

wmt21_adv_df.rename(columns={
            'German' : 'de_sent1',
            'French' : 'fr_sent1',
}, inplace=True)

wmt19_de_adv_df , wmt19_fr_adv_df = split_dataframes(wmt19_adv_df, alternative_naming=True)
wmt21_de_adv_df , wmt21_fr_adv_df = split_dataframes(wmt21_adv_df, alternative_naming=True)


# Now we can use this function on the loaded data
wmt19_adv_df_m2m_mt.rename(columns={
            'German' : 'de_sent1',
            'French' : 'fr_sent1',
}, inplace=True)

wmt21_adv_df_m2m_mt.rename(columns={
            'German' : 'de_sent1',
            'French' : 'fr_sent1',
}, inplace=True)


wmt19_de_adv_m2m_mt_df , wmt19_fr_adv_m2m_mt_df = split_dataframes(wmt19_adv_df_m2m_mt, alternative_naming=True)
wmt21_de_adv_m2m_mt_df , wmt21_fr_adv_m2m_mt_df = split_dataframes(wmt21_adv_df_m2m_mt, alternative_naming=True)

e5_model = False
eval_file_list = []

In [4]:
bitext_file_list = [('WMT21 German Synthetic 4 sentences', wmt21_de_adv_df)]
bitext_file_list.insert(0, ('WMT19 German Synthetic 4 sentences', wmt19_de_adv_df))
bitext_file_list.insert(0, ('WMT21 French Synthetic 4 sentences', wmt21_fr_adv_df))
bitext_file_list.insert(0, ('WMT19 French Synthetic 4 sentences', wmt19_fr_adv_df))
bitext_file_list.insert(0, ('M2M MT WMT21 German Synthetic 4 sentences', wmt21_de_adv_m2m_mt_df))
bitext_file_list.insert(0, ('M2M MT WMT19 German Synthetic 4 sentences', wmt19_de_adv_m2m_mt_df))
bitext_file_list.insert(0, ('M2M MT WMT21 French Synthetic 4 sentences', wmt21_fr_adv_m2m_mt_df))
bitext_file_list.insert(0, ('M2M MT WMT19 French Synthetic 4 sentences', wmt19_fr_adv_m2m_mt_df))

In [5]:
len(bitext_file_list)

8

In [6]:
# Function to Evaluate Cosine Similarity
def evaluate_cosine_similarity(model, eval_df='None'):

    eval_df.columns = map(str.lower, eval_df.columns)  # normalize column names to lowercase
    german_sentences = eval_df['german'].tolist()
    french_sentences = eval_df['french'].tolist()

    german_embeddings = model.encode(german_sentences, convert_to_tensor=True).cpu()
    french_embeddings = model.encode(french_sentences, convert_to_tensor=True).cpu()
    similarities = 1 - (paired_cosine_distances(german_embeddings, french_embeddings))
    if 'score' in eval_df:
      scores = eval_df['score']
      return round(spearmanr(similarities, scores)[0], 3) # returns tuple, first is pearson r
    else:
      return round(np.mean(similarities), 3)

In [7]:
def mass_evaluate_files(model, file_list, bitext_files):
  results = []
  predictions = []
  for file in file_list:
    similarity = evaluate_cosine_similarity(model, eval_df=file[1])
    results.append((file[0], similarity))
  print('EVALUATION RESULTS')
  print('------------------')
  for result in results:
    print(result[0] +' : ' + str(result[1]))
  all_cosines_dict = {}
  print('------------------')
  for file in bitext_files:
      percentage, predictions_list, all_cosines = adversarial_evaluation(model, file[1])
      predictions.append(predictions_list)
      all_cosines_dict[file[0]] = all_cosines
      print('Bitext mining accuracy of ' + file[0] + ' : ' + percentage)
  return predictions, all_cosines_dict

In [8]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

import torch.nn.functional as F

def adversarial_evaluation(model, adv_df):
    if 'French' in adv_df.columns:
        comparison_embeddings = model.encode(adv_df['French'].tolist(), convert_to_tensor=True).cpu()
    else:
        comparison_embeddings = model.encode(adv_df['German'].tolist(), convert_to_tensor=True).cpu()

    # Normalize the comparison embeddings
    comparison_embeddings = F.normalize(comparison_embeddings, p=2, dim=1)

    original_embeddings = model.encode(adv_df['Original'].tolist(), convert_to_tensor=True).cpu()

    # Normalize the original embeddings
    original_embeddings = F.normalize(original_embeddings, p=2, dim=1)

    original_results = (1 - (paired_cosine_distances(original_embeddings, comparison_embeddings)))

    # Prepare an empty DataFrame to store results
    result_df = pd.DataFrame()
    result_df['Original'] = original_results  # Assuming the result is a tensor

    # For each adversarial column, compute the results and add to the result_df
    for idx, sentence in enumerate(['Sentences1', 'Sentences2', 'Sentences3', 'Sentences4']):
        adv_df[sentence] = adv_df[sentence].fillna('Template Sentence') # arbitrary choice
        adv_embeddings = model.encode(adv_df[sentence].tolist(), convert_to_tensor=True).cpu()

        # Normalize the adversarial embeddings
        adv_embeddings = F.normalize(adv_embeddings, p=2, dim=1)

        current_adv_results = (1 - (paired_cosine_distances(adv_embeddings, comparison_embeddings)))

        # Add to the result DataFrame
        result_df[f'Adversarial{idx+1}'] = current_adv_results  # Assuming the result is a tensor

    # Find the maximum value in each row
    max_values = result_df.max(axis=1)

    # Initialize a list to track if the original is the true maximum
    is_original_max = []

    # Check for ties and prioritize adversarial columns in case of a tie
    for idx, row in result_df.iterrows():
        # Check if there's a tie between 'Original' and any 'Adversarial' columns
        original_value = row['Original']
        max_value = max_values[idx]

        # If there's a tie, treat it as negative (not original) if an adversarial shares the max value
        adversarial_cols = [col for col in result_df.columns if 'Adversarial' in col]
        if original_value == max_value and any(row[col] == max_value for col in adversarial_cols):
            # Tie exists, treat it as negative by setting 'is_original_max' to False
            is_original_max.append(False)
        else:
            # If the original is the only maximum, treat as positive
            is_original_max.append(original_value == max_value)

    # Convert is_original_max to a numpy array for easier calculations
    is_original_max = np.array(is_original_max)

    # Calculate the percentage where the original is the maximum
    percentage_max_in_x = str(round(is_original_max.sum() / len(adv_df) * 100, 2)) + '%'

    # Convert the boolean list to a list of 1s and 0s (1 if original is max, 0 otherwise)
    predictions_list = is_original_max.astype(int).tolist()

    return percentage_max_in_x, predictions_list, result_df


# **Embedding Models INFERENCE AND RESULTS**

## Variable Setup

In [9]:
model_correct_predictions = {}
model_all_cosines = {}
lev_distance_dict = {}

In [10]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.14.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packag

## Leveshtein and Jaccard similarity Calculations

In [11]:
import Levenshtein
def calculate_levenshtein_similarity(original, modified):
  distance = Levenshtein.distance(original, modified)
  max_len = max(len(original), len(modified))
  if max_len == 0:  # To avoid division by zero for two empty strings
      return 1.0
  similarity = 1 - (distance / max_len)
  return similarity

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
import numpy as np
def calculate_jaccard_similarity_bag_of_words(text1, text2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([text1, text2]).toarray()  # Convert to dense array format

    # Since we're working with binary occurrence data, use 'jaccard' metric directly
    jaccard_distance = pairwise_distances(X[0].reshape(1, -1), X[1].reshape(1, -1), metric='jaccard')
    jaccard_similarity = 1 - jaccard_distance

    return jaccard_similarity[0][0]  # Return the similarity value

In [13]:
# Creating a new DataFrame with similarities
columns = ['Sentences1', 'Sentences2', 'Sentences3', 'Sentences4']
distractor_map_dict = {
    'Sentences1': 'Adversarial1',
    'Sentences2': 'Adversarial2',
    'Sentences3': 'Adversarial3',
    'Sentences4': 'Adversarial4',
    }

for bitext_file in bitext_file_list:
    new_df = bitext_file[1].copy(deep=True)

    # Calculate Levenshtein similarities
    for col in columns:
        new_df[col+'_orig_lev'] = new_df.apply(lambda row: calculate_levenshtein_similarity(row['Original'], row[col]), axis=1)

    # Calculate Jaccard similarities for Bag of Words
    for col in columns:
        new_df[col+'_orig_jac'] = new_df.apply(lambda row: calculate_jaccard_similarity_bag_of_words(row['Original'], row[col]), axis=1)

    for index, row in new_df.iterrows():
        diff_list = []
        for i in range(len(columns)):
            # Levenshtein diversity calculation
            total_lev_original = calculate_levenshtein_similarity(str(row[columns[i]]), str(row['Original']))
            total_lev_distractors = 0
            # Jaccard diversity calculation
            total_jac_original = calculate_jaccard_similarity_bag_of_words(str(row['Original']), str(row[columns[i]]))
            total_jac_distractors = 0

            for j in range(len(columns)):
                if i == j:
                    continue
                # Levenshtein difference calculation
                diff_lev = abs(row[columns[i]+'_orig_lev'] - row[columns[j]+'_orig_lev'])
                diff_list.append((diff_lev, columns[i], columns[j]))
                total_lev_distractors += calculate_levenshtein_similarity(str(row[columns[i]]), str(row[columns[j]]))

                # Jaccard difference calculation (Bag of Words)
                diff_jac = abs(row[columns[i]+'_orig_jac'] - row[columns[j]+'_orig_jac'])
                total_jac_distractors += calculate_jaccard_similarity_bag_of_words(str(row[columns[i]]), str(row[columns[j]]))

            # Levenshtein Diversity Index Calculation
            new_df.at[index, 'Lev_Diversity_Index' + str(i+1)] = min(abs((total_lev_distractors / (len(columns) - 1)) - total_lev_original) / (total_lev_original+0.0000000001), 1)
            # Jaccard Diversity Index Calculation
            new_df.at[index, 'Jac_Diversity_Index' + str(i+1)] = total_jac_distractors / (len(columns) - 1)

        max_diff, col1, col2 = max(diff_list)
        new_df.at[index, 'HighDiffColumn1'] = distractor_map_dict[col1]
        new_df.at[index, 'HighDiffColumn2'] = distractor_map_dict[col2]
        new_df.at[index, 'HighDiffValue'] = max_diff
        new_df.at[index, 'Avg_Lev_Diversity_Index'] = sum(new_df.at[index, 'Lev_Diversity_Index'+str(i+1)] for i in range(len(columns))) / len(columns)
        new_df.at[index, 'Avg_Jac_Diversity_Index'] = sum(new_df.at[index, 'Jac_Diversity_Index'+str(i+1)] for i in range(len(columns))) / len(columns)

    # Store the DataFrame with all calculated fields
    lev_distance_dict[bitext_file[0]] = new_df[['Original', 'Sentences1', 'Sentences2', 'Sentences3', 'Sentences4', 'HighDiffColumn1', 'HighDiffColumn2', 'HighDiffValue', 'Lev_Diversity_Index1', 'Lev_Diversity_Index2', 'Lev_Diversity_Index3', 'Lev_Diversity_Index4', 'Avg_Lev_Diversity_Index', 'Jac_Diversity_Index1', 'Jac_Diversity_Index2', 'Jac_Diversity_Index3', 'Jac_Diversity_Index4', 'Avg_Jac_Diversity_Index']].rename(columns = distractor_map_dict)

columns = ['Adversarial1', 'Adversarial2', 'Adversarial3', 'Adversarial4']

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [14]:
# Extracting all Jaccard similarities from each dataframe in lev_distance_dict
jac_similarities = []
for df in lev_distance_dict.values():
    jac_similarities.extend([
        df['Jac_Diversity_Index1'],
        df['Jac_Diversity_Index2'],
        df['Jac_Diversity_Index3'],
        df['Jac_Diversity_Index4']
    ])

# Concatenating the list of Jaccard similarities into a single Series
jac_similarities_all = pd.concat(jac_similarities)

# Calculating the mean and standard deviation
mean_jac_sim = jac_similarities_all.mean()
std_dev_jac_sim = jac_similarities_all.std()

print("Mean Jaccard Similarity:", round(mean_jac_sim,2))
print("Standard Deviation of Jaccard Similarity:", round(std_dev_jac_sim,2))

Mean Jaccard Similarity: 0.46
Standard Deviation of Jaccard Similarity: 0.16


## INFERENCE

In [15]:
from sentence_transformers import SentenceTransformer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
model_name = 'Alibaba-NLP/gte-multilingual-base'
model = SentenceTransformer(model_name, trust_remote_code=True)
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/611M [00:00<?, ?B/s]

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False, 'architecture': 'NewModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [17]:
# Evaluate Vanilla model
print(model_name)
predictions_list, all_cosines = mass_evaluate_files(model, eval_file_list, bitext_file_list)
model_correct_predictions[model_name] =  predictions_list
model_all_cosines[model_name] = all_cosines

Alibaba-NLP/gte-multilingual-base
EVALUATION RESULTS
------------------
------------------
Bitext mining accuracy of M2M MT WMT19 French Synthetic 4 sentences : 89.61%
Bitext mining accuracy of M2M MT WMT21 French Synthetic 4 sentences : 90.26%
Bitext mining accuracy of M2M MT WMT19 German Synthetic 4 sentences : 89.07%
Bitext mining accuracy of M2M MT WMT21 German Synthetic 4 sentences : 92.95%
Bitext mining accuracy of WMT19 French Synthetic 4 sentences : 90.22%
Bitext mining accuracy of WMT21 French Synthetic 4 sentences : 90.48%
Bitext mining accuracy of WMT19 German Synthetic 4 sentences : 89.68%
Bitext mining accuracy of WMT21 German Synthetic 4 sentences : 91.6%


In [18]:
model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
model = SentenceTransformer(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False, 'architecture': 'XLMRobertaModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [19]:
# Evaluate Vanilla model
print(model_name)
predictions_list, all_cosines = mass_evaluate_files(model, eval_file_list, bitext_file_list)
model_correct_predictions[model_name] =  predictions_list
model_all_cosines[model_name] = all_cosines

sentence-transformers/paraphrase-multilingual-mpnet-base-v2
EVALUATION RESULTS
------------------
------------------
Bitext mining accuracy of M2M MT WMT19 French Synthetic 4 sentences : 87.85%
Bitext mining accuracy of M2M MT WMT21 French Synthetic 4 sentences : 88.02%
Bitext mining accuracy of M2M MT WMT19 German Synthetic 4 sentences : 87.64%
Bitext mining accuracy of M2M MT WMT21 German Synthetic 4 sentences : 91.27%
Bitext mining accuracy of WMT19 French Synthetic 4 sentences : 91.31%
Bitext mining accuracy of WMT21 French Synthetic 4 sentences : 91.15%
Bitext mining accuracy of WMT19 German Synthetic 4 sentences : 91.11%
Bitext mining accuracy of WMT21 German Synthetic 4 sentences : 92.95%


In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = 'sentence-transformers/LaBSE'
model = SentenceTransformer(model_name)
model.to(device)

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 768, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
  (3): Normalize()
)

In [21]:
# Evaluate Vanilla model
print(model_name)
predictions_list, all_cosines = mass_evaluate_files(model, eval_file_list, bitext_file_list)
model_correct_predictions[model_name] =  predictions_list
model_all_cosines[model_name] = all_cosines

sentence-transformers/LaBSE
EVALUATION RESULTS
------------------
------------------
Bitext mining accuracy of M2M MT WMT19 French Synthetic 4 sentences : 90.84%
Bitext mining accuracy of M2M MT WMT21 French Synthetic 4 sentences : 91.94%
Bitext mining accuracy of M2M MT WMT19 German Synthetic 4 sentences : 90.09%
Bitext mining accuracy of M2M MT WMT21 German Synthetic 4 sentences : 94.4%
Bitext mining accuracy of WMT19 French Synthetic 4 sentences : 95.18%
Bitext mining accuracy of WMT21 French Synthetic 4 sentences : 94.06%
Bitext mining accuracy of WMT19 German Synthetic 4 sentences : 94.3%
Bitext mining accuracy of WMT21 German Synthetic 4 sentences : 94.18%


In [22]:
def append_query_prefix(df):
    for col in df.columns:
        df[col] = df[col].apply(lambda x: f"query: {x}" if isinstance(x, str) else x)
    return df


e5_model = True

eval_file_list_e5 = []

for name, df in eval_file_list:
    df_e5 = append_query_prefix(df.copy())
    eval_file_list_e5.append((f"{name}_e5", df_e5))

bitext_file_list_e5 = []

for name, df in bitext_file_list:
    df_e5 = append_query_prefix(df.copy())
    bitext_file_list_e5.append((name, df_e5))

In [23]:
model_name = 'intfloat/multilingual-e5-base'
model = SentenceTransformer(model_name)
model.to(device)

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'XLMRobertaModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [24]:
# Evaluate Vanilla model
print(model_name)
predictions_list, all_cosines = mass_evaluate_files(model, eval_file_list_e5, bitext_file_list_e5)
model_correct_predictions[model_name] =  predictions_list
model_all_cosines[model_name] = all_cosines

intfloat/multilingual-e5-base
EVALUATION RESULTS
------------------
------------------
Bitext mining accuracy of M2M MT WMT19 French Synthetic 4 sentences : 90.36%
Bitext mining accuracy of M2M MT WMT21 French Synthetic 4 sentences : 89.81%
Bitext mining accuracy of M2M MT WMT19 German Synthetic 4 sentences : 89.55%
Bitext mining accuracy of M2M MT WMT21 German Synthetic 4 sentences : 93.17%
Bitext mining accuracy of WMT19 French Synthetic 4 sentences : 91.51%
Bitext mining accuracy of WMT21 French Synthetic 4 sentences : 86.34%
Bitext mining accuracy of WMT19 German Synthetic 4 sentences : 88.46%
Bitext mining accuracy of WMT21 German Synthetic 4 sentences : 81.97%


In [25]:
model_name = 'intfloat/multilingual-e5-large'
model = SentenceTransformer(model_name)
model.to(device)

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'XLMRobertaModel'})
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [26]:
# Evaluate Vanilla model
print(model_name)
predictions_list, all_cosines = mass_evaluate_files(model, eval_file_list_e5, bitext_file_list_e5)
model_correct_predictions[model_name] =  predictions_list
model_all_cosines[model_name] = all_cosines

intfloat/multilingual-e5-large
EVALUATION RESULTS
------------------
------------------
Bitext mining accuracy of M2M MT WMT19 French Synthetic 4 sentences : 90.22%
Bitext mining accuracy of M2M MT WMT21 French Synthetic 4 sentences : 91.83%
Bitext mining accuracy of M2M MT WMT19 German Synthetic 4 sentences : 90.02%
Bitext mining accuracy of M2M MT WMT21 German Synthetic 4 sentences : 92.95%
Bitext mining accuracy of WMT19 French Synthetic 4 sentences : 94.5%
Bitext mining accuracy of WMT21 French Synthetic 4 sentences : 91.38%
Bitext mining accuracy of WMT19 German Synthetic 4 sentences : 91.51%
Bitext mining accuracy of WMT21 German Synthetic 4 sentences : 87.57%


In [27]:
e5_model = False

# Error Analysis of Models Performance

## Error Analysis UTILITIES

In [28]:
# Function to find the highest value columns for specified indices
def find_max_columns_for_indices(df, indices):
    max_columns = []
    for index in indices:
        if index < len(df):
            # Get the column name with the max value for the specific row index
            max_col = df.loc[index].idxmax()
            max_columns.append((index,max_col))
    return max_columns

# Function to find indices where the maximum column matches in both dataframes and store the column names
def max_col_match_indices_and_names(df1, df2, indices):
    matched_info = []
    for index in indices:
        if index < len(df1) and index < len(df2):
            # Get the column index of the max value for both rows
            max_col_df1 = df1.loc[index].idxmax()
            max_col_df2 = df2.loc[index].idxmax()
            # Check if they match
            if max_col_df1 == max_col_df2:
                matched_info.append((index, max_col_df1))
    return matched_info

# First, define a function to select the appropriate fr_adv column based on the 'selected_distractor' value
def get_failed_distractor(row, prefix='fr'):
    # Use the value in 'selected_distractor' to build the column name
    return row[f'{prefix}_adv{row["selected_distractor"]}']

## Leveshtein Similarity Buckets and Distractors

In [29]:
de_dfs = {'WMT19 German Synthetic 4 sentences' : 6, 'WMT21 German Synthetic 4 sentences' : 7}
fr_dfs = {'WMT19 French Synthetic 4 sentences' : 4, 'WMT21 French Synthetic 4 sentences' : 5}

### M-GTE

In [30]:
model_name = 'Alibaba-NLP/gte-multilingual-base'
lev_similarity_list_de = []
lev_similarity_list_fr = []
for key, index in de_dfs.items():
  failed_indices = np.array(model_correct_predictions[model_name][index]) == 0
  current_df = model_all_cosines[model_name][key].loc[failed_indices] # filters only sucessful distractors
  # Find the column index of the maximum value for each row
  max_column_indices = current_df.idxmax(axis=1)
  values_df = bitext_file_list[index][1].loc[failed_indices]
  original_texts = values_df['Original']
  adversarial_texts = values_df.to_numpy()[np.arange(len(values_df)), values_df.columns.get_indexer(max_column_indices)]
  # Compute Levenshtein similarity for each pair
  similarities = [
      Levenshtein.ratio(str(orig), str(adv))
      for orig, adv in zip(original_texts, adversarial_texts)
  ]
  lev_similarity_list_de.extend(similarities)


for key, index in fr_dfs.items():
  failed_indices = np.array(model_correct_predictions[model_name][index]) == 0
  current_df = model_all_cosines[model_name][key].loc[failed_indices] # filters only sucessful distractors
  # Find the column index of the maximum value for each row
  max_column_indices = current_df.idxmax(axis=1)
  values_df = bitext_file_list[index][1].loc[failed_indices]
  original_texts = values_df['Original']
  adversarial_texts = values_df.to_numpy()[np.arange(len(values_df)), values_df.columns.get_indexer(max_column_indices)]
  # Compute Levenshtein similarity for each pair
  similarities = [
      Levenshtein.ratio(str(orig), str(adv))
      for orig, adv in zip(original_texts, adversarial_texts)
  ]
  lev_similarity_list_fr.extend(similarities)

print(len(lev_similarity_list_de))
print(len(lev_similarity_list_fr))

227
229


In [31]:
# Define bins and labels
bins = [0.3, 0.6, 0.7, 0.8, 0.9, 0.999]
labels = ['0.3–0.6', '0.6–0.7', '0.7–0.8', '0.8–0.9', '0.9–0.99']

# German
binned_de = pd.cut(lev_similarity_list_de, bins=bins, labels=labels, right=False)
binned_table_de = pd.Series(binned_de).value_counts(normalize=True).sort_index(ascending=False) * 100
formatted_de = binned_table_de.apply(lambda x: f"{x:05.2f}%")

# French
binned_fr = pd.cut(lev_similarity_list_fr, bins=bins, labels=labels, right=False)
binned_table_fr = pd.Series(binned_fr).value_counts(normalize=True).sort_index(ascending=False) * 100
formatted_fr = binned_table_fr.apply(lambda x: f"{x:05.2f}%")

# Print results
print(model_name)
print("German")
print(formatted_de)
print("French")
print(formatted_fr)

Alibaba-NLP/gte-multilingual-base
German
0.9–0.99    29.96%
0.8–0.9     33.48%
0.7–0.8     22.91%
0.6–0.7     10.57%
0.3–0.6     03.08%
Name: proportion, dtype: object
French
0.9–0.99    26.20%
0.8–0.9     31.88%
0.7–0.8     26.20%
0.6–0.7     12.66%
0.3–0.6     03.06%
Name: proportion, dtype: object


## M-MPNet

In [32]:
model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
lev_similarity_list_de = []
lev_similarity_list_fr = []
for key, index in de_dfs.items():
  failed_indices = np.array(model_correct_predictions[model_name][index]) == 0
  current_df = model_all_cosines[model_name][key].loc[failed_indices] # filters only sucessful distractors
  # Find the column index of the maximum value for each row
  max_column_indices = current_df.idxmax(axis=1)
  values_df = bitext_file_list[index][1].loc[failed_indices]
  original_texts = values_df['Original']
  adversarial_texts = values_df.to_numpy()[np.arange(len(values_df)), values_df.columns.get_indexer(max_column_indices)]
  # Compute Levenshtein similarity for each pair
  similarities = [
      Levenshtein.ratio(str(orig), str(adv))
      for orig, adv in zip(original_texts, adversarial_texts)
  ]
  lev_similarity_list_de.extend(similarities)


for key, index in fr_dfs.items():
  failed_indices = np.array(model_correct_predictions[model_name][index]) == 0
  current_df = model_all_cosines[model_name][key].loc[failed_indices] # filters only sucessful distractors
  # Find the column index of the maximum value for each row
  max_column_indices = current_df.idxmax(axis=1)
  values_df = bitext_file_list[index][1].loc[failed_indices]
  original_texts = values_df['Original']
  adversarial_texts = values_df.to_numpy()[np.arange(len(values_df)), values_df.columns.get_indexer(max_column_indices)]
  # Compute Levenshtein similarity for each pair
  similarities = [
      Levenshtein.ratio(str(orig), str(adv))
      for orig, adv in zip(original_texts, adversarial_texts)
  ]
  lev_similarity_list_fr.extend(similarities)

print(len(lev_similarity_list_de))
print(len(lev_similarity_list_fr))

194
207


In [33]:
# Define bins and labels
bins = [0.3, 0.6, 0.7, 0.8, 0.9, 0.999]
labels = ['0.3–0.6', '0.6–0.7', '0.7–0.8', '0.8–0.9', '0.9–0.99']

# German
binned_de = pd.cut(lev_similarity_list_de, bins=bins, labels=labels, right=False)
binned_table_de = pd.Series(binned_de).value_counts(normalize=True).sort_index(ascending=False) * 100
formatted_de = binned_table_de.apply(lambda x: f"{x:05.2f}%")

# French
binned_fr = pd.cut(lev_similarity_list_fr, bins=bins, labels=labels, right=False)
binned_table_fr = pd.Series(binned_fr).value_counts(normalize=True).sort_index(ascending=False) * 100
formatted_fr = binned_table_fr.apply(lambda x: f"{x:05.2f}%")

# Print results
print(model_name)
print("German")
print(formatted_de)
print("French")
print(formatted_fr)

sentence-transformers/paraphrase-multilingual-mpnet-base-v2
German
0.9–0.99    28.87%
0.8–0.9     35.05%
0.7–0.8     22.16%
0.6–0.7     10.31%
0.3–0.6     03.61%
Name: proportion, dtype: object
French
0.9–0.99    29.95%
0.8–0.9     24.64%
0.7–0.8     28.50%
0.6–0.7     13.04%
0.3–0.6     03.86%
Name: proportion, dtype: object


## ME5-B

In [34]:
model_name = 'intfloat/multilingual-e5-base'
lev_similarity_list_de = []
lev_similarity_list_fr = []
for key, index in de_dfs.items():
  failed_indices = np.array(model_correct_predictions[model_name][index]) == 0
  current_df = model_all_cosines[model_name][key].loc[failed_indices] # filters only sucessful distractors
  # Find the column index of the maximum value for each row
  max_column_indices = current_df.idxmax(axis=1)
  values_df = bitext_file_list[index][1].loc[failed_indices]
  original_texts = values_df['Original']
  adversarial_texts = values_df.to_numpy()[np.arange(len(values_df)), values_df.columns.get_indexer(max_column_indices)]
  # Compute Levenshtein similarity for each pair
  similarities = [
      Levenshtein.ratio(str(orig), str(adv))
      for orig, adv in zip(original_texts, adversarial_texts)
  ]
  lev_similarity_list_de.extend(similarities)


for key, index in fr_dfs.items():
  failed_indices = np.array(model_correct_predictions[model_name][index]) == 0
  current_df = model_all_cosines[model_name][key].loc[failed_indices] # filters only sucessful distractors
  # Find the column index of the maximum value for each row
  max_column_indices = current_df.idxmax(axis=1)
  values_df = bitext_file_list[index][1].loc[failed_indices]
  original_texts = values_df['Original']
  adversarial_texts = values_df.to_numpy()[np.arange(len(values_df)), values_df.columns.get_indexer(max_column_indices)]
  # Compute Levenshtein similarity for each pair
  similarities = [
      Levenshtein.ratio(str(orig), str(adv))
      for orig, adv in zip(original_texts, adversarial_texts)
  ]
  lev_similarity_list_fr.extend(similarities)

print(len(lev_similarity_list_de))
print(len(lev_similarity_list_fr))

331
247


In [35]:
# Define bins and labels
bins = [0.3, 0.6, 0.7, 0.8, 0.9, 0.999]
labels = ['0.3–0.6', '0.6–0.7', '0.7–0.8', '0.8–0.9', '0.9–0.99']

# German
binned_de = pd.cut(lev_similarity_list_de, bins=bins, labels=labels, right=False)
binned_table_de = pd.Series(binned_de).value_counts(normalize=True).sort_index(ascending=False) * 100
formatted_de = binned_table_de.apply(lambda x: f"{x:05.2f}%")

# French
binned_fr = pd.cut(lev_similarity_list_fr, bins=bins, labels=labels, right=False)
binned_table_fr = pd.Series(binned_fr).value_counts(normalize=True).sort_index(ascending=False) * 100
formatted_fr = binned_table_fr.apply(lambda x: f"{x:05.2f}%")

# Print results
print(model_name)
print("German")
print(formatted_de)
print("French")
print(formatted_fr)

intfloat/multilingual-e5-base
German
0.9–0.99    28.40%
0.8–0.9     32.93%
0.7–0.8     25.38%
0.6–0.7     10.27%
0.3–0.6     03.02%
Name: proportion, dtype: object
French
0.9–0.99    25.51%
0.8–0.9     27.53%
0.7–0.8     31.58%
0.6–0.7     12.55%
0.3–0.6     02.83%
Name: proportion, dtype: object


## ME5-L

In [36]:
model_name = 'intfloat/multilingual-e5-large'
lev_similarity_list_de = []
lev_similarity_list_fr = []
for key, index in de_dfs.items():
  failed_indices = np.array(model_correct_predictions[model_name][index]) == 0
  current_df = model_all_cosines[model_name][key].loc[failed_indices] # filters only sucessful distractors
  # Find the column index of the maximum value for each row
  max_column_indices = current_df.idxmax(axis=1)
  values_df = bitext_file_list[index][1].loc[failed_indices]
  original_texts = values_df['Original']
  adversarial_texts = values_df.to_numpy()[np.arange(len(values_df)), values_df.columns.get_indexer(max_column_indices)]
  # Compute Levenshtein similarity for each pair
  similarities = [
      Levenshtein.ratio(str(orig), str(adv))
      for orig, adv in zip(original_texts, adversarial_texts)
  ]
  lev_similarity_list_de.extend(similarities)


for key, index in fr_dfs.items():
  failed_indices = np.array(model_correct_predictions[model_name][index]) == 0
  current_df = model_all_cosines[model_name][key].loc[failed_indices] # filters only sucessful distractors
  # Find the column index of the maximum value for each row
  max_column_indices = current_df.idxmax(axis=1)
  values_df = bitext_file_list[index][1].loc[failed_indices]
  original_texts = values_df['Original']
  adversarial_texts = values_df.to_numpy()[np.arange(len(values_df)), values_df.columns.get_indexer(max_column_indices)]
  # Compute Levenshtein similarity for each pair
  similarities = [
      Levenshtein.ratio(str(orig), str(adv))
      for orig, adv in zip(original_texts, adversarial_texts)
  ]
  lev_similarity_list_fr.extend(similarities)

print(len(lev_similarity_list_de))
print(len(lev_similarity_list_fr))

236
158


In [37]:
import pandas as pd

# Define bins and labels
bins = [0.3, 0.6, 0.7, 0.8, 0.9, 0.999]
labels = ['0.3–0.6', '0.6–0.7', '0.7–0.8', '0.8–0.9', '0.9–0.99']

# German
binned_de = pd.cut(lev_similarity_list_de, bins=bins, labels=labels, right=False)
binned_table_de = pd.Series(binned_de).value_counts(normalize=True).sort_index(ascending=False) * 100
formatted_de = binned_table_de.apply(lambda x: f"{x:05.2f}%")

# French
binned_fr = pd.cut(lev_similarity_list_fr, bins=bins, labels=labels, right=False)
binned_table_fr = pd.Series(binned_fr).value_counts(normalize=True).sort_index(ascending=False) * 100
formatted_fr = binned_table_fr.apply(lambda x: f"{x:05.2f}%")

# Print results
print(model_name)
print("German")
print(formatted_de)
print("French")
print(formatted_fr)

intfloat/multilingual-e5-large
German
0.9–0.99    28.39%
0.8–0.9     32.63%
0.7–0.8     23.73%
0.6–0.7     11.86%
0.3–0.6     03.39%
Name: proportion, dtype: object
French
0.9–0.99    19.62%
0.8–0.9     24.05%
0.7–0.8     36.71%
0.6–0.7     16.46%
0.3–0.6     03.16%
Name: proportion, dtype: object


## LaBSE

In [38]:
model_name = 'sentence-transformers/LaBSE'
lev_similarity_list_de = []
lev_similarity_list_fr = []
for key, index in de_dfs.items():
  failed_indices = np.array(model_correct_predictions[model_name][index]) == 0
  current_df = model_all_cosines[model_name][key].loc[failed_indices] # filters only sucessful distractors
  # Find the column index of the maximum value for each row
  max_column_indices = current_df.idxmax(axis=1)
  values_df = bitext_file_list[index][1].loc[failed_indices]
  original_texts = values_df['Original']
  adversarial_texts = values_df.to_numpy()[np.arange(len(values_df)), values_df.columns.get_indexer(max_column_indices)]
  # Compute Levenshtein similarity for each pair
  similarities = [
      Levenshtein.ratio(str(orig), str(adv))
      for orig, adv in zip(original_texts, adversarial_texts)
  ]
  lev_similarity_list_de.extend(similarities)


for key, index in fr_dfs.items():
  failed_indices = np.array(model_correct_predictions[model_name][index]) == 0
  current_df = model_all_cosines[model_name][key].loc[failed_indices] # filters only sucessful distractors
  # Find the column index of the maximum value for each row
  max_column_indices = current_df.idxmax(axis=1)
  values_df = bitext_file_list[index][1].loc[failed_indices]
  original_texts = values_df['Original']
  adversarial_texts = values_df.to_numpy()[np.arange(len(values_df)), values_df.columns.get_indexer(max_column_indices)]
  # Compute Levenshtein similarity for each pair
  similarities = [
      Levenshtein.ratio(str(orig), str(adv))
      for orig, adv in zip(original_texts, adversarial_texts)
  ]
  lev_similarity_list_fr.extend(similarities)

print(len(lev_similarity_list_de))
print(len(lev_similarity_list_fr))

136
124


In [39]:
import pandas as pd

# Define bins and labels
bins = [0.3, 0.6, 0.7, 0.8, 0.9, 0.999]
labels = ['0.3–0.6', '0.6–0.7', '0.7–0.8', '0.8–0.9', '0.9–0.99']

# German
binned_de = pd.cut(lev_similarity_list_de, bins=bins, labels=labels, right=False)
binned_table_de = pd.Series(binned_de).value_counts(normalize=True).sort_index(ascending=False) * 100
formatted_de = binned_table_de.apply(lambda x: f"{x:05.2f}%")

# French
binned_fr = pd.cut(lev_similarity_list_fr, bins=bins, labels=labels, right=False)
binned_table_fr = pd.Series(binned_fr).value_counts(normalize=True).sort_index(ascending=False) * 100
formatted_fr = binned_table_fr.apply(lambda x: f"{x:05.2f}%")

# Print results
print(model_name)
print("German")
print(formatted_de)
print("French")
print(formatted_fr)

sentence-transformers/LaBSE
German
0.9–0.99    40.44%
0.8–0.9     32.35%
0.7–0.8     19.85%
0.6–0.7     05.88%
0.3–0.6     01.47%
Name: proportion, dtype: object
French
0.9–0.99    32.26%
0.8–0.9     28.23%
0.7–0.8     25.00%
0.6–0.7     12.10%
0.3–0.6     02.42%
Name: proportion, dtype: object


## OTHER ERROR ANALYSIS

### Cross Lingual or Machine Translation, which errors differ?

##### French

In [40]:
model = 'sentence-transformers/LaBSE'

orig_dfs = {'WMT19 French Synthetic 4 sentences' : 4, 'WMT21 French Synthetic 4 sentences' : 5}
mt_dfs = {'M2M MT WMT19 French Synthetic 4 sentences' : 0, 'M2M MT WMT21 French Synthetic 4 sentences' : 1}

list1 = []

# df1 = model_all_cosines[model1][name_of_df]
# df2 = model_all_cosines[model2][name_of_df]
print(len(model_correct_predictions[model]))
for key,index in orig_dfs.items():
  list1.extend(model_correct_predictions[model][index])

list2 = []
for key,index in mt_dfs.items():
  list2.extend(model_correct_predictions[model][index])


8


In [41]:
# Indices where both lists are 0
both_tricked = [i for i, (x, y) in enumerate(zip(list1, list2)) if x == 0 and y == 0]

# Indices where only the first list is 0
orig_tricked_only = [i for i, (x, y) in enumerate(zip(list1, list2)) if x == 0 and y != 0]

# Indices where only the second list is 0
mt_tricked_only = [i for i, (x, y) in enumerate(zip(list1, list2)) if x != 0 and y == 0]

In [42]:
print(len(both_tricked))
print(len(orig_tricked_only))
print(len(mt_tricked_only))

52
72
155


In [43]:
len(wmt19_fr_adv_df)

1473

In [44]:
# Split mt_tricked_only based on the value 1472
df_19_mt_errors_indices = [x for x in mt_tricked_only if x <= 1472]
df_21_mt_errors_indices = [x - 1472 for x in mt_tricked_only if x > 1472]

In [45]:
df_21_mt_errors_indices

[3,
 45,
 58,
 78,
 108,
 111,
 136,
 166,
 171,
 173,
 174,
 208,
 238,
 244,
 318,
 356,
 391,
 394,
 407,
 409,
 416,
 455,
 460,
 466,
 492,
 505,
 511,
 517,
 543,
 558,
 570,
 611,
 625,
 636,
 639,
 649,
 665,
 692,
 695,
 733,
 738,
 742,
 749,
 751,
 803,
 805,
 820,
 834,
 855,
 873,
 889,
 891,
 892]

In [46]:
# Assuming the dataframe looks like this:
df = model_all_cosines[model_name]['WMT19 French Synthetic 4 sentences']
df = df.loc[df_19_mt_errors_indices]
# Select columns 'Adversarial1', 'Adversarial2', 'Adversarial3', and 'Adversarial4'
adversarial_df = df[['Adversarial1', 'Adversarial2', 'Adversarial3', 'Adversarial4']]

# Find the column index of the maximum value for each row
max_column_indices = adversarial_df.idxmax(axis=1)

# Extract the last character of each column name
last_character_of_column = max_column_indices.str[-1]

# Save or display the result
last_character_of_column

Unnamed: 0,0
19,3
21,3
56,1
71,4
78,2
...,...
1395,3
1442,2
1449,1
1454,4


In [49]:
# Save or display the result
df_19 = pd.read_csv('evaluation_sets/CLSD_wmt2019_adversarial_dataset.csv')
df_19 = df_19.loc[df_19_mt_errors_indices]
df_19['selected_distractor'] = last_character_of_column
df_19.head()

# Apply the function to each row to create the new 'Failed X-Distractor' column
df_19['Failed X-Distractor'] = df_19.apply(get_failed_distractor, axis=1)

# Now filter the dataset to include only 'Index', 'French', 'German', and 'Failed X-Distractor'
main_fr_df = df_19[['Index', 'French', 'German', 'Failed X-Distractor']]
# Rename 'French' and 'German' columns
main_fr_df = main_fr_df.rename(columns={'French': 'French(Orig)', 'German': 'German(Orig)'})

df_19_mt = pd.read_csv('evaluation_sets/MT_M2M_wmt2019_adversarial_dataset.csv')

df_19_mt = df_19_mt.loc[df_19_mt_errors_indices]
df_19_mt['selected_distractor'] = last_character_of_column
# Apply the function to each row to create the new 'Failed X-Distractor' column
df_19_mt['Successful M-Distractor'] = df_19_mt.apply(get_failed_distractor, axis=1)
df_19_mt = df_19_mt[['Index', 'French', 'German', 'Successful M-Distractor']]
df_19_mt = df_19_mt.rename(columns={'French': 'French(MT)', 'German': 'German(MT)'})

df_19_merged = pd.merge(main_fr_df, df_19_mt, on='Index')
df_19_merged.head(2)

Unnamed: 0,Index,French(Orig),German(Orig),Failed X-Distractor,French(MT),German(MT),Successful M-Distractor
0,19,"Originellement, un nouveau départ de l'UE étai...","Ursprünglich gefordert wurde ein EU-Neustart ""...","Au commencement, un repositionnement de l’UE é...","Originally, a new exit from the EU was claimed...","Initially, it was called for a restart of the ...","Initially, a repositioning of the EU was pledg..."
1,21,Gysi pour un net positionnement pro-européen. ...,Gysi für klare pro-europäische PositionierungS...,Gysi pour un angagement net envers des valeurs...,Before the European Congress of the Die Linke ...,Even before the start of the European Party Da...,Before the liberal assembly of the Die Linke p...


In [50]:
df_19_merged.to_csv("df19_fr_mistakes.csv")

In [51]:
# Assuming the dataframe looks like this:
df = model_all_cosines[model_name]['WMT21 French Synthetic 4 sentences']
df = df.loc[df_21_mt_errors_indices]
# Select columns 'Adversarial1', 'Adversarial2', 'Adversarial3', and 'Adversarial4'
adversarial_df = df[['Adversarial1', 'Adversarial2', 'Adversarial3', 'Adversarial4']]

# Find the column index of the maximum value for each row
max_column_indices = adversarial_df.idxmax(axis=1)

# Extract the last character of each column name
last_character_of_column = max_column_indices.str[-1]

# Save or display the result
last_character_of_column

Unnamed: 0,0
3,3
45,3
58,2
78,4
108,2
111,1
136,1
166,2
171,2
173,1


In [52]:
# Save or display the result
df_21 = pd.read_csv('evaluation_sets/CLSD_wmt2021_adversarial_dataset.csv')
df_21 = df_21.loc[df_21_mt_errors_indices]
df_21['selected_distractor'] = last_character_of_column
df_21.head()

# Apply the function to each row to create the new 'Failed X-Distractor' column
df_21['Failed X-Distractor'] = df_21.apply(get_failed_distractor, axis=1)

# Now filter the dataset to include only 'Index', 'French', 'German', and 'Failed X-Distractor'
main_fr_df = df_21[['Index', 'French', 'German', 'Failed X-Distractor']]
# Rename 'French' and 'German' columns
main_fr_df = main_fr_df.rename(columns={'French': 'French(Orig)', 'German': 'German(Orig)'})

df_21_mt = pd.read_csv('evaluation_sets/MT_M2M_wmt2021_adversarial_dataset.csv')

df_21_mt = df_21_mt.loc[df_21_mt_errors_indices]
df_21_mt['selected_distractor'] = last_character_of_column
# Apply the function to each row to create the new 'Failed X-Distractor' column
df_21_mt['Successful M-Distractor'] = df_21_mt.apply(get_failed_distractor, axis=1)
df_21_mt = df_21_mt[['Index', 'French', 'German', 'Successful M-Distractor']]
df_21_mt = df_21_mt.rename(columns={'French': 'French(MT)', 'German': 'German(MT)'})

df_21_merged = pd.merge(main_fr_df, df_21_mt, on='Index')
df_21_merged.head(2)

Unnamed: 0,Index,French(Orig),German(Orig),Failed X-Distractor,French(MT),German(MT),Successful M-Distractor
0,3,"Le S&P-500, plus large, a perdu 20,03 points, ...","Der breiter gefasste S&P 500 fiel um 20,03 Pun...","Le S&P-500, moins large, a perdu 20,03 points,...","The wider S&P-500, lost 20,03 points, or -0,62...","The wider S&P 500 fell by 20,03 points or -0,6...","The S&P-500, the wider, lost 20,03 points, or ..."
1,45,"En revanche, il estime qu'un salary-cap comme ...","Dagegen ist er der Ansicht, dass eine Gehaltso...","Toutefois, il pense qu'une régulation des prix...","On the other hand, he believes that a salary c...","On the other hand, he believes that a salary c...","However, he believes that a price regulation l..."


In [53]:
df_21_merged.to_csv("df21_fr_mistakes.csv")

##### German

In [54]:
model = 'sentence-transformers/LaBSE'

orig_dfs = {'WMT19 German Synthetic 4 sentences' : 6, 'WMT21 German Synthetic 4 sentences' : 7}
mt_dfs = {'M2M MT German French Synthetic 4 sentences' : 2, 'M2M MT WMT21 German Synthetic 4 sentences' : 3}

list1 = []

print(len(model_correct_predictions[model]))
for key,index in orig_dfs.items():
  list1.extend(model_correct_predictions[model][index])

list2 = []
for key,index in mt_dfs.items():
  list2.extend(model_correct_predictions[model][index])


8


In [55]:
# Indices where both lists are 0
both_tricked = [i for i, (x, y) in enumerate(zip(list1, list2)) if x == 0 and y == 0]

# Indices where only the first list is 0
orig_tricked_only = [i for i, (x, y) in enumerate(zip(list1, list2)) if x == 0 and y != 0]

# Indices where only the second list is 0
mt_tricked_only = [i for i, (x, y) in enumerate(zip(list1, list2)) if x != 0 and y == 0]

In [56]:
print(len(both_tricked))
print(len(orig_tricked_only))
print(len(mt_tricked_only))

68
68
128


In [57]:
# Split mt_tricked_only based on the value 1472
df_19_mt_errors_indices = [x for x in mt_tricked_only if x <= 1472]
df_21_mt_errors_indices = [x - 1472 for x in mt_tricked_only if x > 1472]

In [58]:
# Assuming the dataframe looks like this:
df = model_all_cosines[model_name]['WMT19 German Synthetic 4 sentences']
df = df.loc[df_19_mt_errors_indices]
# Select columns 'Adversarial1', 'Adversarial2', 'Adversarial3', and 'Adversarial4'
adversarial_df = df[['Adversarial1', 'Adversarial2', 'Adversarial3', 'Adversarial4']]

# Find the column index of the maximum value for each row
max_column_indices = adversarial_df.idxmax(axis=1)

# Extract the last character of each column name
last_character_of_column = max_column_indices.str[-1]

# Save or display the result
last_character_of_column

Unnamed: 0,0
0,2
35,3
49,1
50,3
56,1
...,...
1452,2
1454,4
1456,2
1457,1


In [60]:
# Save or display the result
df_19 = pd.read_csv('evaluation_sets/CLSD_wmt2019_adversarial_dataset.csv')
df_19 = df_19.loc[df_19_mt_errors_indices]
df_19['selected_distractor'] = last_character_of_column
df_19.head()

# Apply the function to each row to create the new 'Failed X-Distractor' column
df_19['Failed X-Distractor'] = df_19.apply(lambda row: get_failed_distractor(row, prefix='de'), axis=1)
# Now filter the dataset to include only 'Index', 'French', 'German', and 'Failed X-Distractor'
main_fr_df = df_19[['Index', 'French', 'German', 'Failed X-Distractor']]
# Rename 'French' and 'German' columns
main_fr_df = main_fr_df.rename(columns={'French': 'French(Orig)', 'German': 'German(Orig)'})

df_19_mt = pd.read_csv('evaluation_sets/MT_M2M_wmt2019_adversarial_dataset.csv')

df_19_mt = df_19_mt.loc[df_19_mt_errors_indices]
df_19_mt['selected_distractor'] = last_character_of_column
# Apply the function to each row to create the new 'Failed X-Distractor' column
df_19_mt['Successful M-Distractor'] = df_19_mt.apply(lambda row: get_failed_distractor(row, prefix='de'), axis=1)
df_19_mt = df_19_mt[['Index', 'French', 'German', 'Successful M-Distractor']]
df_19_mt = df_19_mt.rename(columns={'French': 'French(MT)', 'German': 'German(MT)'})

df_19_merged = pd.merge(main_fr_df, df_19_mt, on='Index')
df_19_merged = df_19_merged[['Index', 'German(Orig)', 'French(Orig)', 'German(MT)', 'French(MT)', 'Failed X-Distractor', 'Successful M-Distractor']]
print(len(df_19_merged))
df_19_merged.head(2)

94


Unnamed: 0,Index,German(Orig),French(Orig),German(MT),French(MT),Failed X-Distractor,Successful M-Distractor
0,0,Europa-Parteitag der Linken : Kipping: Europa ...,Kipping au congrès de die Linke sur l'Europe :...,Kipping: Europe has long been a continent of i...,Kipping at die Linke’s Congress on Europe: Eur...,Europa-Parteitag der Linken: Kipping: Europa i...,Kipping: Europe has always been a continent of...
1,35,Aus den Umfragewerten in den EU-Staaten ergibt...,Les résultats des sondages dans les pays europ...,"According to survey figures in EU countries, t...",The results of polls in European countries rev...,Aus den Pollingdaten in den EU-Nationen zeigt ...,Polling data in EU countries show that the num...


In [61]:
df_19_merged.to_csv("df19_de_mistakes.csv")

In [62]:
# Assuming the dataframe looks like this:
df = model_all_cosines[model_name]['WMT21 German Synthetic 4 sentences']
df = df.loc[df_21_mt_errors_indices]
# Select columns 'Adversarial1', 'Adversarial2', 'Adversarial3', and 'Adversarial4'
adversarial_df = df[['Adversarial1', 'Adversarial2', 'Adversarial3', 'Adversarial4']]

# Find the column index of the maximum value for each row
max_column_indices = adversarial_df.idxmax(axis=1)

# Extract the last character of each column name
last_character_of_column = max_column_indices.str[-1]

# Save or display the result
last_character_of_column

Unnamed: 0,0
3,3
74,3
77,3
112,3
180,2
218,2
227,1
278,4
286,4
306,3


In [65]:
# Save or display the result
df_21 = pd.read_csv('evaluation_sets/CLSD_wmt2021_adversarial_dataset.csv')
df_21 = df_21.loc[df_21_mt_errors_indices]
df_21['selected_distractor'] = last_character_of_column
df_21.head()

# Apply the function to each row to create the new 'Failed X-Distractor' column
df_21['Failed X-Distractor'] = df_21.apply(lambda row: get_failed_distractor(row, prefix='de'), axis=1)

# Now filter the dataset to include only 'Index', 'French', 'German', and 'Failed X-Distractor'
main_fr_df = df_21[['Index', 'French', 'German', 'Failed X-Distractor']]
# Rename 'French' and 'German' columns
main_fr_df = main_fr_df.rename(columns={'French': 'French(Orig)', 'German': 'German(Orig)'})

df_21_mt = pd.read_csv('evaluation_sets/MT_M2M_wmt2021_adversarial_dataset.csv')

df_21_mt = df_21_mt.loc[df_21_mt_errors_indices]
df_21_mt['selected_distractor'] = last_character_of_column
# Apply the function to each row to create the new 'Failed X-Distractor' column
df_21_mt['Successful M-Distractor'] = df_21_mt.apply(lambda row: get_failed_distractor(row, prefix='de'), axis=1)
df_21_mt = df_21_mt[['Index', 'French', 'German', 'Successful M-Distractor']]
df_21_mt = df_21_mt.rename(columns={'French': 'French(MT)', 'German': 'German(MT)'})

df_21_merged = pd.merge(main_fr_df, df_21_mt, on='Index')
print(len(df_21_merged))
df_21_merged = df_21_merged[['Index', 'German(Orig)', 'French(Orig)', 'German(MT)', 'French(MT)', 'Failed X-Distractor', 'Successful M-Distractor']]
df_21_merged.head(2)

34


Unnamed: 0,Index,German(Orig),French(Orig),German(MT),French(MT),Failed X-Distractor,Successful M-Distractor
0,3,"Der breiter gefasste S&P 500 fiel um 20,03 Pun...","Le S&P-500, plus large, a perdu 20,03 points, ...","The wider S&P 500 fell by 20,03 points or -0,6...","The wider S&P-500, lost 20,03 points, or -0,62...","Der weniger breit gefasste S&P 500 fiel um 20,...","The less broad S&P 500 fell by 20,03 points or..."
1,74,„Ich möchte mich für diesen Fehler entschuldig...,""" Je veux m'excuser pour cette erreur "", a-t-i...","“I would like to apologize for this mistake,” ...","“I want to apologize for this mistake,” he sai...","""Ich wollte mich für diesen Fehler entschuldig...","""I wanted to apologize for this mistake,"" he s..."


In [66]:
df_21_merged.to_csv("df21_de_mistakes.csv")

### Which mistakes and where

French

In [67]:
model1 = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
model2 = 'intfloat/multilingual-e5-base'
index_of_df = 4
name_of_df = 'WMT19 French Synthetic 4 sentences'
# WMT19 German Synthetic 4 sentences
print(model_all_cosines[model1])
print(model_all_cosines[model2].keys())

list1 = model_correct_predictions[model1][index_of_df]
list2 = model_correct_predictions[model2][index_of_df]

df1 = model_all_cosines[model1][name_of_df]
df2 = model_all_cosines[model2][name_of_df]

{'M2M MT WMT19 French Synthetic 4 sentences':       Original  Adversarial1  Adversarial2  Adversarial3  Adversarial4
0     0.885400      0.707780      0.616641      0.502215      0.702412
1     0.709437      0.585411      0.657904      0.815933      0.617610
2     0.891520      0.701339      0.716763      0.712008      0.673212
3     0.903697      0.795672      0.729993      0.837563      0.818597
4     0.941846      0.612531      0.697050      0.607133      0.785027
...        ...           ...           ...           ...           ...
1468  0.847171      0.677770      0.867713      0.766742      0.644600
1469  0.974292      0.804811      0.790705      0.807827      0.863070
1470  0.903102      0.858007      0.632467      0.752324      0.737265
1471  0.943348      0.701374      0.699014      0.649922      0.659088
1472  0.943097      0.473502      0.714565      0.675799      0.505746

[1473 rows x 5 columns], 'M2M MT WMT21 French Synthetic 4 sentences':      Original  Adversarial1  Ad

In [68]:
# Indices where both lists are 0
both_tricked = [i for i, (x, y) in enumerate(zip(list1, list2)) if x == 0 and y == 0]

# Indices where only the first list is 0
mpnet_tricked_only = [i for i, (x, y) in enumerate(zip(list1, list2)) if x == 0 and y != 0]

# Indices where only the second list is 0
e5_tricked_only = [i for i, (x, y) in enumerate(zip(list1, list2)) if x != 0 and y == 0]

In [69]:
print(len(both_tricked))
print(len(mpnet_tricked_only))
print(len(e5_tricked_only))

40
88
85


In [70]:
# Get matched indices and column names
matched_info = max_col_match_indices_and_names(df1, df2, both_tricked)
len(matched_info)

25

German

In [71]:
model1 = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
model2 = 'intfloat/multilingual-e5-base'
index_of_df = 6
name_of_df = 'WMT19 German Synthetic 4 sentences'
# WMT19 German Synthetic 4 sentences
print(model_all_cosines[model1])
print(model_all_cosines[model2].keys())

list1 = model_correct_predictions[model1][index_of_df]
list2 = model_correct_predictions[model2][index_of_df]

df1 = model_all_cosines[model1][name_of_df]
df2 = model_all_cosines[model2][name_of_df]

{'M2M MT WMT19 French Synthetic 4 sentences':       Original  Adversarial1  Adversarial2  Adversarial3  Adversarial4
0     0.885400      0.707780      0.616641      0.502215      0.702412
1     0.709437      0.585411      0.657904      0.815933      0.617610
2     0.891520      0.701339      0.716763      0.712008      0.673212
3     0.903697      0.795672      0.729993      0.837563      0.818597
4     0.941846      0.612531      0.697050      0.607133      0.785027
...        ...           ...           ...           ...           ...
1468  0.847171      0.677770      0.867713      0.766742      0.644600
1469  0.974292      0.804811      0.790705      0.807827      0.863070
1470  0.903102      0.858007      0.632467      0.752324      0.737265
1471  0.943348      0.701374      0.699014      0.649922      0.659088
1472  0.943097      0.473502      0.714565      0.675799      0.505746

[1473 rows x 5 columns], 'M2M MT WMT21 French Synthetic 4 sentences':      Original  Adversarial1  Ad

In [72]:
# Indices where both lists are 0
both_tricked = [i for i, (x, y) in enumerate(zip(list1, list2)) if x == 0 and y == 0]

# Indices where only the first list is 0
mpnet_tricked_only = [i for i, (x, y) in enumerate(zip(list1, list2)) if x == 0 and y != 0]

# Indices where only the second list is 0
e5_tricked_only = [i for i, (x, y) in enumerate(zip(list1, list2)) if x != 0 and y == 0]

In [73]:
print(len(both_tricked))
print(len(mpnet_tricked_only))
print(len(e5_tricked_only))

75
56
95


In [74]:
# Get matched indices and column names
matched_info = max_col_match_indices_and_names(df1, df2, both_tricked)
len(matched_info)

50