In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/translated-wikipedia-biographies/Translated Wikipedia Biographies - EN_DE.csv
/kaggle/input/translated-wikipedia-biographies/Translated Wikipedia Biographies - EN_ES.csv
/kaggle/input/translated-wikipedia-biographies/Data Card.pdf


We will measure the similarity of meaning between these two texts to see if the translation is faithful.

To do this, we use a Hugging Face model (sentence transformers) that transforms a text into a numerical vector (embedding).

Then, we calculate a cosine similarity between the two vectors. If the score is close to 1, the two texts mean almost the same thing.

In [2]:
df1 = pd.read_csv("/kaggle/input/translated-wikipedia-biographies/Translated Wikipedia Biographies - EN_DE.csv")
df2 = pd.read_csv("/kaggle/input/translated-wikipedia-biographies/Translated Wikipedia Biographies - EN_ES.csv")

data = pd.concat([df1, df2], ignore_index=True)
data.to_csv("concatene.csv", index=False)

In [3]:
print(data.head(5))

  sourceLanguage targetLanguage  documentID stringID  \
0             en             de           1      1-1   
1             en             de           1      1-2   
2             en             de           1      1-3   
3             en             de           1      1-4   
4             en             de           1      1-5   

                                          sourceText  \
0  Kaisa-Leena Mäkäräinen (born 11 January 1983) ...   
1  Outside sports, Mäkäräinen is currently studyi...   
2  Her team coach is Jonne Kähkönen, while Jarmo ...   
3  Mäkäräinen was originally a cross-country skie...   
4     She started training for the biathlon in 2003.   

                                      translatedText perceivedGender  \
0  Kaisa-Leena Mäkäräinen (geboren am 11. Januar ...          Female   
1  Neben dem Sport studiert Mäkäräinen derzeit Ph...          Female   
2  Ihr Mannschaftstrainer ist Jonne Kähkönen, Jar...          Female   
3  Mäkäräinen war ursprünglich Langläu

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Loading embedding model
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

df = pd.DataFrame(data)

def compute_similarity(text1, text2):
     # Tokenize and encoded text
    inputs = tokenizer([text1, text2], return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    # Split the two embeddings and calculate the cosinus similarity
    emb1, emb2 = embeddings[0], embeddings[1]
    score = F.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()
    return score

df["similarityScore"] = df.apply(lambda row: compute_similarity(row["sourceText"], row["translatedText"]), axis=1)

print(df[["sourceText", "translatedText", "similarityScore"]])
        

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

2025-05-17 13:53:15.323303: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747489995.586109      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747489995.659774      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

                                             sourceText  \
0     Kaisa-Leena Mäkäräinen (born 11 January 1983) ...   
1     Outside sports, Mäkäräinen is currently studyi...   
2     Her team coach is Jonne Kähkönen, while Jarmo ...   
3     Mäkäräinen was originally a cross-country skie...   
4        She started training for the biathlon in 2003.   
...                                                 ...   
2937  Speaking to Madrid-based Diario AS in 2013 abo...   
2938  Rossell proceeded to try again first under San...   
2939  In the documentary "Un Sueño Real", she reveal...   
2940                  Her struggle proved unsuccessful.   
2941  It wasn't until 2013, in Perez's second stint ...   

                                         translatedText  similarityScore  
0     Kaisa-Leena Mäkäräinen (geboren am 11. Januar ...         0.939866  
1     Neben dem Sport studiert Mäkäräinen derzeit Ph...         0.878054  
2     Ihr Mannschaftstrainer ist Jonne Kähkönen, Jar...         0.

As long as the score, the more faithful the translation is to the meaning of the original text