In [1]:
# Dependencies: pandas for CSV I/O, the MarianMT model/tokenizer classes from
# Hugging Face transformers, and tqdm for progress bars.
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm

# Status marker: only reached if every import above succeeded.
print("Cell 1 OK")

  from .autonotebook import tqdm as notebook_tqdm


Cell 1 OK


In [2]:
# Hugging Face Hub id of the OPUS-MT checkpoint used for translation (the
# `en-tl` suffix and the sample output in this notebook indicate
# English -> Tagalog).
model_name = "Helsinki-NLP/opus-mt-en-tl"

# `tokenizer` and `model` are kernel-level globals: translate_batch reads them
# directly instead of taking them as parameters.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Status marker: only reached once both from_pretrained calls have returned.
print("Cell 2 OK — model loaded")

Cell 2 OK — model loaded


In [3]:
def translate_batch(texts, batch_size=2):
    """Translate a list of strings with the notebook-global MarianMT model.

    The inputs are processed in consecutive slices of ``batch_size`` items, so
    only one small batch is tokenized and generated at a time.

    Args:
        texts: Sequence of source strings. It must support ``len()`` and
            slicing; each slice is passed straight to ``tokenizer``.
        batch_size: Number of strings per ``model.generate`` call. Must be
            at least 1.

    Returns:
        list: The decoded strings, concatenated in input order. An empty
        ``texts`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Notes:
        Relies on the module-level ``tokenizer`` and ``model`` objects.
        ``truncation=True`` is passed to the tokenizer, so inputs longer than
        the tokenizer's length limit are presumably shortened before
        translation rather than rejected -- verify that this is acceptable
        for long documents.
    """
    # A negative step makes range() empty, which previously returned [] for
    # non-empty input without any error. Fail loudly instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    results = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # padding=True lets strings of different lengths share one tensor batch.
        encoded = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        generated = model.generate(**encoded)
        # skip_special_tokens keeps special marker tokens out of the text.
        results.extend(
            tokenizer.batch_decode(generated, skip_special_tokens=True)
        )
    return results

# Status marker: printed once the cell defining translate_batch has executed.
print("Cell 3 OK — translate_batch ready")

Cell 3 OK — translate_batch ready


In [4]:
# Two-sentence smoke test: as the cell's last expression, the returned list is
# displayed so the translations can be checked by eye.
test_sentences = [
    "The cat is sleeping on the sofa.",
    "This experiment measures translation quality."
]

translate_batch(test_sentences)

['Ang pusa ay natutulog sa sofa.',
 'Ang eksperimentong ito ay sumusukat sa kalidad ng pagsasalin.']

In [5]:
# Load the cleaned BCOPA CSV (path is relative to the kernel's working
# directory) and display its column names.
df_bcopa = pd.read_csv("../../datasets/cleaned/cleaned_bcopa.csv")
df_bcopa.columns

Index(['id', 'premise', 'choice1', 'choice2', 'question', 'label', 'mirrored'], dtype='object')

In [6]:
# Translate the BCOPA `premise` column chunk by chunk, then save the frame.
premises = df_bcopa["premise"].tolist()

chunk_size = 50  # safe on CPU
translated_premises = []

for offset in range(0, len(premises), chunk_size):
    # translate_batch still generates two premises at a time; the outer
    # chunking mainly serves to print progress between chunks.
    translated_premises += translate_batch(
        premises[offset:offset + chunk_size],
        batch_size=2,
    )
    print(f"Translated {len(translated_premises)} / {len(premises)}")

# Store the translations next to the source text and write the CSV relative
# to the kernel's working directory (index=False omits the row index).
df_bcopa["premise_opus"] = translated_premises
df_bcopa.to_csv("cleaned_bcopa_opus_translated.csv", index=False)

print("BCOPA translation finished")


Translated 50 / 400
Translated 100 / 400
Translated 150 / 400
Translated 200 / 400
Translated 250 / 400
Translated 300 / 400
Translated 350 / 400
Translated 400 / 400
BCOPA translation finished


In [7]:
# Load the cleaned XLSUM CSV (path is relative to the kernel's working
# directory) and display its column names.
df_xlsum = pd.read_csv("../../datasets/cleaned/cleaned_xlsum.csv")
df_xlsum.columns

Index(['text', 'summary'], dtype='object')

In [8]:
# Translate the XLSUM `text` column chunk by chunk, then save the frame.
# NOTE(review): translate_batch tokenizes with truncation=True, so articles
# longer than the tokenizer's limit are presumably translated only in part --
# confirm this is acceptable for XLSUM.
texts = df_xlsum["text"].tolist()

chunk_size = 20  # smaller because XLSUM texts are long
translated_texts = []

for offset in range(0, len(texts), chunk_size):
    translated_texts += translate_batch(
        texts[offset:offset + chunk_size],
        batch_size=2,
    )
    print(f"Translated {len(translated_texts)} / {len(texts)}")

# Store the translations next to the source text and write the CSV relative
# to the kernel's working directory (index=False omits the row index).
df_xlsum["text_opus"] = translated_texts
df_xlsum.to_csv("cleaned_xlsum_opus_translated.csv", index=False)

print("XLSUM translation finished")

Translated 20 / 100
Translated 40 / 100
Translated 60 / 100
Translated 80 / 100
Translated 100 / 100
XLSUM translation finished


In [9]:
# Load the cleaned XNLI CSV (path is relative to the kernel's working
# directory) and display its column names.
df_xnli = pd.read_csv("../../datasets/cleaned/cleaned_xnli.csv")
df_xnli.columns

Index(['gold_label', 'sentence1', 'sentence2'], dtype='object')

In [10]:
# Translate both XNLI sentence columns over the same row windows, then save.
s1 = df_xnli["sentence1"].tolist()
s2 = df_xnli["sentence2"].tolist()

chunk_size = 50  # safe size on CPU
translated_s1, translated_s2 = [], []

for offset in range(0, len(s1), chunk_size):
    window = slice(offset, offset + chunk_size)
    # The same window is applied to both columns; sentence1 is translated
    # first, then sentence2.
    translated_s1 += translate_batch(s1[window], batch_size=2)
    translated_s2 += translate_batch(s2[window], batch_size=2)

    # Progress is reported against sentence1 only.
    print(f"Translated {len(translated_s1)} / {len(s1)}")

# Store the translations next to the source text and write the CSV relative
# to the kernel's working directory (index=False omits the row index).
df_xnli["sentence1_opus"] = translated_s1
df_xnli["sentence2_opus"] = translated_s2
df_xnli.to_csv("cleaned_xnli_opus_translated.csv", index=False)

print("XNLI translation finished")

Translated 50 / 600
Translated 100 / 600
Translated 150 / 600
Translated 200 / 600
Translated 250 / 600
Translated 300 / 600
Translated 350 / 600
Translated 400 / 600
Translated 450 / 600
Translated 500 / 600
Translated 550 / 600
Translated 600 / 600
XNLI translation finished


In [None]:
# NOTE(review): every other load in this notebook reads from
# ../../datasets/cleaned/, and a later cell loads cleaned_paws.csv from there
# into the same `df` name -- this looks like a superseded first attempt;
# confirm and remove.
df = pd.read_csv("../../data/paws.csv")

In [None]:
# Scratch cell: show the kernel's working directory, which the relative paths
# in this notebook are resolved against.
import os

os.getcwd()

In [None]:
# Scratch cell: list the entries of the working directory's parent.
os.listdir("..")

In [None]:
# Scratch cell: list the entries two levels above the working directory.
os.listdir("../..")

In [None]:
# Scratch cell: list the entries of the `datasets` directory two levels up.
os.listdir("../../datasets")

In [None]:
# Scratch cell: list the entries of the `data-raw` directory two levels up.
os.listdir("../../data-raw")


In [None]:
# Scratch cell: list the entries of the `data` directory two levels up.
os.listdir("../../data")

In [None]:
# Scratch cell: list the entries of `data/paws` two levels up.
os.listdir("../../data/paws")

In [None]:
# NOTE(review): identical to an earlier scratch cell that lists
# ../../datasets; candidate for removal.
os.listdir("../../datasets")

In [None]:
# Scratch cell: list the cleaned dataset files that the read_csv cells load.
# NOTE(review): `os` is already imported by an earlier cell; the repeated
# import is redundant.
import os
os.listdir("../../datasets/cleaned")

In [None]:
# NOTE(review): identical to the listing in the previous cell; candidate for
# removal.
os.listdir("../../datasets/cleaned")

In [None]:
# Load the cleaned PAWS CSV into `df` (path is relative to the kernel's
# working directory).
df = pd.read_csv("../../datasets/cleaned/cleaned_paws.csv")

In [None]:
# Preview the first rows and the column names. Only a cell's last expression
# is echoed automatically, so the preview is passed to display() explicitly;
# as a bare statement its value was computed and then thrown away.
display(df.head())
df.columns

In [None]:
# Pull both PAWS sentence columns out as plain Python lists for translate_batch.
sentence1_list = df["sentence1"].tolist()
sentence2_list = df["sentence2"].tolist()

# Displayed as a (len, len) tuple: a quick check that both lists line up.
len(sentence1_list), len(sentence2_list)

In [None]:
# NOTE(review): this repeats the import and the model-loading cell from the
# top of the notebook with the same checkpoint name. It rebinds the global
# `tokenizer` / `model` that translate_batch reads; candidate for removal.
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-tl"

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

In [None]:
from tqdm import tqdm

def translate_batch(sentences, batch_size=8):
    """Translate a list of strings, showing a tqdm progress bar over batches.

    NOTE(review): this is the second ``translate_batch`` in the notebook. Once
    this cell runs it replaces the earlier definition, whose default
    ``batch_size`` is 2 and which shows no progress bar.

    Args:
        sentences: Sequence of source strings. It must support ``len()`` and
            slicing; each slice is passed straight to ``tokenizer``.
        batch_size: Number of strings per ``model.generate`` call. Must be
            at least 1.

    Returns:
        list: The decoded strings, concatenated in input order. An empty
        ``sentences`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Notes:
        Relies on the module-level ``tokenizer`` and ``model`` objects.
        ``truncation=True`` is passed to the tokenizer, so inputs longer than
        the tokenizer's length limit are presumably shortened before
        translation rather than rejected -- verify for long inputs.
    """
    # A negative step makes range() empty, which previously returned [] for
    # non-empty input without any error. Fail loudly instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    translations = []

    # The progress bar advances once per batch, not once per sentence.
    for start in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[start:start + batch_size]
        encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        generated = model.generate(**encoded)
        translations.extend(
            tokenizer.batch_decode(generated, skip_special_tokens=True)
        )

    return translations

In [None]:
# Dry run on the first ten PAWS sentences before committing to the full
# column; the result is displayed as the cell's last expression.
test_translations = translate_batch(sentence1_list[:10])
test_translations

In [None]:
# Translate both full PAWS sentence lists (no batch_size is passed, so the
# default of the translate_batch currently defined in the kernel applies) and
# store the results as new columns of `df`.
df["sentence1_opus"] = translate_batch(sentence1_list)
df["sentence2_opus"] = translate_batch(sentence2_list)

In [None]:
# Save `df` as CSV in the kernel's working directory; index=False omits the
# row index.
df.to_csv("cleaned_paws_opus_translated.csv", index=False)

## BCOPA — Helsinki Opus Translation

In [None]:
# NOTE(review): reloads the same CSV as the earlier BCOPA load cell, so
# `df_bcopa` is rebound to a fresh frame read from disk and any column added
# to the previous in-memory frame is no longer on it.
df_bcopa = pd.read_csv("../../datasets/cleaned/cleaned_bcopa.csv")
df_bcopa.columns

In [None]:
# Translate every BCOPA premise in a single translate_batch call (eight
# premises per generate step) and save the frame.
bcopa_list = df_bcopa["premise"].tolist()
bcopa_translations = translate_batch(bcopa_list, batch_size=8)
df_bcopa["premise_opus"] = bcopa_translations

# NOTE(review): this is the same output filename that the earlier chunked
# BCOPA cell writes, so running both overwrites the first result -- confirm
# which variant should be kept.
df_bcopa.to_csv("cleaned_bcopa_opus_translated.csv", index=False)