In [1]:
import multiprocessing as mp

import evaluate
import datasets

from src.translator import Translator

N_PROC = mp.cpu_count()

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Read FRMT dataset.

In [2]:
frmt = datasets.load_dataset("hugosousa/frmt", split="test")
frmt = frmt.filter(lambda x: x["pt"] is not None)
frmt = frmt.filter(lambda x: x["br"] is not None)
frmt = frmt.remove_columns(["tw", "cn", "tw_simplified"])
frmt = frmt.to_pandas().drop_duplicates()
frmt = datasets.Dataset.from_pandas(frmt)

Built the translator.

In [3]:
translator = Translator("pt", "en")

Make the translations from PT.

In [4]:
def translate(example):
    example["pt_en"] = translator.translate(example["pt"])
    example["br_en"] = translator.translate(example["pt"])
    return example

In [5]:
frmt = frmt.map(lambda x: translate(x), num_proc=N_PROC)

Map (num_proc=96): 100%|██████████| 2611/2611 [00:35<00:00, 74.30 examples/s] 


Evaluate the translations.

In [6]:
# count the number of examples that pt_en and br_en are the same
count = 0
for example in frmt:
    if example["pt_en"] == example["br_en"]:
        count += 1
print(f"Number of examples where pt_en and br_en are the same: {count}/{len(frmt)}")

Number of examples where pt_en and br_en are the same: 2277/2611


In [7]:
blue = evaluate.load("bleu")

In [8]:
blue_pt = blue.compute(predictions=frmt["pt_en"], references=frmt["en"])
print(f"BLEU score for pt: {blue_pt['bleu']:.2f}")
print(blue_pt)


BLEU score for pt: 0.51
{'bleu': 0.5074696103170111, 'precisions': [0.7640640934300044, 0.5652728643507511, 0.44074922395346733, 0.3483859836245569], 'brevity_penalty': 1.0, 'length_ratio': 1.0153761046255976, 'translation_length': 71583, 'reference_length': 70499}


In [9]:
blue_br = blue.compute(predictions=frmt["br_en"], references=frmt["en"])
print(f"BLEU score for br: {blue_br['bleu']:.2f}")
print(blue_br)

BLEU score for br: 0.51
{'bleu': 0.5062229945125449, 'precisions': [0.7638807388157711, 0.5642735959862535, 0.4393471282383615, 0.34677229586634245], 'brevity_penalty': 1.0, 'length_ratio': 1.0152484432403297, 'translation_length': 71574, 'reference_length': 70499}


In [10]:
blue_pt_br = blue.compute(predictions=frmt["br_en"], references=frmt["pt_en"])
print(f"BLEU score for pt_br: {blue_pt_br['bleu']:.2f}")
print(blue_pt_br)

BLEU score for pt_br: 0.97
{'bleu': 0.9680847157768272, 'precisions': [0.986615251348255, 0.9724054928004873, 0.9617349630009193, 0.9524041101262845], 'brevity_penalty': 0.9998742639201975, 'length_ratio': 0.9998742718243159, 'translation_length': 71574, 'reference_length': 71583}


Checkout the differences.

In [20]:
errors = frmt.filter(lambda x: x["pt_en"] != x["br_en"])

Filter: 100%|██████████| 2611/2611 [00:00<00:00, 43300.43 examples/s]


In [26]:
# print the first 10 errors
for example in errors.select(range(10)):
    print(f"en:\t{example['en']}")
    print(f"pt_en:\t{example['pt_en']}")
    print(f"br_en:\t{example['br_en']}")
    print()

en:	A dense forest that existed until the 20th century, in Quinta da Cerca in this area, that protected and sheltered the settlement from winds.
pt_en:	A dense forest that existed until the 20th century, in Quinta da Cerca in this area, which protected and sheltered the village from the winds.
br_en:	A dense forest that existed until the 20th century, at Quinta da Cerca in this area, which protected and sheltered the village from the winds.

en:	A hierarchy of buses of six sizes feed one other.
pt_en:	A hierarchy of buses of six sizes are controlled automatically.
br_en:	A hierarchy of buses of six sizes are automatically controlled.

en:	A punch line used by then, O Beirão de que todos gostam (port.
pt_en:	The closing phrase used at the time, "The Beirão that everyone likes",
br_en:	The shot used at the time, "O Beirão that everyone likes",

en:	A well-known spa town is Caldas de Monchique.
pt_en:	Some very well-known spas are Caldas de Monchique.
br_en:	One very well-known spa is Cal