## Install Necessary Dependencies

In [1]:
!pip install sacrebleu
!pip install evaluate
!pip install bert_score
!pip install -U accelerate
!pip install -U transformers
!pip install chinese-converter
!pip install unbabel-comet
!pip install hLepor
!pip install nptyping

Collecting sacrebleu
  Downloading sacrebleu-2.4.0-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.0
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m999.2 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloadin

## Import

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import evaluate
import random
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from tqdm import tqdm
import chinese_converter
from os import listdir
from hlepor import hlepor_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Data

In [4]:
base_path = "/content/drive/MyDrive/Cantonese-NLP/"

In [5]:
with open(f"{base_path}Result/bing_translated.txt", "r", encoding='utf-8') as f:
  bing_translated = f.readlines()

In [6]:
with open(f"{base_path}Result/baidu_translated.txt", "r", encoding='utf-8') as f:
  baidu_translated = f.readlines()

In [7]:
with open(f"{base_path}Result/gpt4_translated.txt", "r", encoding='utf-8') as f:
  gpt4_translated = f.readlines()

In [8]:
with open(f"{base_path}/Processed-Data/test/en.txt", "r", encoding='utf-8') as f:
  reference = f.readlines()

In [9]:
with open(f"{base_path}/Processed-Data/test/yue.txt", "r", encoding='utf-8') as f:
  test_data = f.readlines()

## Get Result for Each Model

In [None]:
nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    src_lang="yue_Hant",
    tgt_lang="eng_Latn"
)
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt"
)
mbart_tokenizer.src_lang = "zh_CN"

opus_tokenizer = AutoTokenizer.from_pretrained(
    "Helsinki-NLP/opus-mt-zh-en"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]



In [None]:
def get_test_translation_nllb(model, nllb_tokenizer=nllb_tokenizer, data=test_data, TRANSLATION_BATCH_SIZE=50):
  pred = []
  for i in tqdm(range(0, len(data), TRANSLATION_BATCH_SIZE)):
    inputs = nllb_tokenizer(
        data[i:i+TRANSLATION_BATCH_SIZE],
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to("cuda")
    translated_tokens = model.generate(
        **inputs, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=100
    )
    pred.extend(nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True))
  return pred


In [None]:
def get_test_translation_mbart(model, tokenizer=mbart_tokenizer, data=test_data, batch_size=50):
  pred = []
  source_lang = "zh_CN"
  target_lang = "en_XX"

  for i in tqdm(range(0, len(data), batch_size)):
    batch = data[i:i + batch_size]
    encoded_input = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length = 512,
        return_tensors="pt"
    ).to("cuda")
    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang]
    )
    translated_batch = tokenizer.batch_decode(
        generated_tokens,
        skip_special_tokens=True
    )
    for ind, sentence in enumerate(translated_batch):
      translated_batch[ind] = chinese_converter.to_traditional(sentence)
    pred.extend(translated_batch)

  return pred


In [None]:
def get_test_translation_opus(model, tokenizer=opus_tokenizer, data=test_data, batch_size=50):
  pred = []

  for i in tqdm(range(0, len(data), batch_size)):
    batch = data[i:i + batch_size]
    encoded_input = tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to("cuda")
    translated_output = model.generate(**encoded_input)
    translated_batch = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_output]
    pred.extend(translated_batch)

  return pred

In [None]:
nllb_baseline = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned",
    local_files_only=True
).to("cuda")

In [None]:
nllb_baseline_translated = get_test_translation_nllb(nllb_baseline)

100%|██████████| 60/60 [00:37<00:00,  1.62it/s]


In [None]:
with open(f"{base_path}Result/nllb_bl_translated.txt", "w", encoding='utf-8') as f:
  for line in nllb_baseline_translated:
    f.write(f"{line}\n")

In [None]:
nllb_1to1_10E_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-it1-1:1-10E",
    local_files_only=True
).to("cuda")

In [None]:
nllb_1t1_10E_translated = get_test_translation_nllb(nllb_1to1_10E_model)

100%|██████████| 60/60 [00:36<00:00,  1.64it/s]


In [None]:
with open(f"{base_path}Result/nllb_1to1_10E_translated.txt", "w", encoding='utf-8') as f:
  for line in nllb_1t1_10E_translated:
    f.write(f"{line}\n")

In [None]:
nllb_1to1_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-it1-1:1",
    local_files_only=True
).to("cuda")

In [None]:
nllb_1t1_translated = get_test_translation_nllb(nllb_1to1_model)

100%|██████████| 60/60 [00:36<00:00,  1.63it/s]


In [None]:
with open(f"{base_path}Result/nllb_1to1_translated.txt", "w", encoding='utf-8') as f:
  for line in nllb_1t1_translated:
    f.write(f"{line}\n")

In [None]:
nllb_1to3_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-it1-1:3",
    local_files_only=True
).to("cuda")

In [None]:
nllb_1t3_translated = get_test_translation_nllb(nllb_1to3_model)

100%|██████████| 60/60 [00:37<00:00,  1.61it/s]


In [None]:
with open(f"{base_path}Result/nllb_1to3_translated.txt", "w", encoding='utf-8') as f:
  for line in nllb_1t3_translated:
    f.write(f"{line}\n")

In [None]:
nllb_1to5_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-it1-1:5",
    local_files_only=True
).to("cuda")

In [None]:
nllb_1t5_translated = get_test_translation_nllb(nllb_1to5_model)

100%|██████████| 60/60 [00:36<00:00,  1.62it/s]


In [None]:
with open(f"{base_path}Result/nllb_1to5_translated.txt", "w", encoding='utf-8') as f:
  for line in nllb_1t5_translated:
    f.write(f"{line}\n")

In [None]:
nllb_1to1_opus__model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-opus-1:1",
    local_files_only=True
).to("cuda")

In [None]:
nllb_1t1_opus_translated = get_test_translation_nllb(nllb_1to1_opus__model)

100%|██████████| 60/60 [00:37<00:00,  1.58it/s]


In [None]:
with open(f"{base_path}Result/nllb_1to1_opus_translated.txt", "w", encoding='utf-8') as f:
  for line in nllb_1t1_opus_translated:
    f.write(f"{line}\n")

In [None]:
nllb_1to3_opus__model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-opus-1:3",
    local_files_only=True
).to("cuda")

In [None]:
nllb_1t3_opus_translated = get_test_translation_nllb(nllb_1to3_opus__model)

100%|██████████| 60/60 [00:37<00:00,  1.60it/s]


In [None]:
with open(f"{base_path}Result/nllb_1to3_opus_translated.txt", "w", encoding='utf-8') as f:
  for line in nllb_1t3_opus_translated:
    f.write(f"{line}\n")

In [None]:
nllb_1to1_mbart__model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-mbart-1:1",
    local_files_only=True
).to("cuda")

In [None]:
nllb_1t1_mbart_translated = get_test_translation_nllb(nllb_1to1_mbart__model)

100%|██████████| 60/60 [00:35<00:00,  1.70it/s]


In [None]:
with open(f"{base_path}Result/nllb_1to1_mbart_translated.txt", "w", encoding='utf-8') as f:
  for line in nllb_1t1_mbart_translated:
    f.write(f"{line}\n")

In [None]:
nllb_1to3_mbart__model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-mbart-1:3",
    local_files_only=True
).to("cuda")

In [None]:
nllb_1t3_mbart_translated = get_test_translation_nllb(nllb_1to1_mbart__model)

100%|██████████| 60/60 [00:35<00:00,  1.69it/s]


In [None]:
with open(f"{base_path}Result/nllb_1to3_mbart_translated.txt", "w", encoding='utf-8') as f:
  for line in nllb_1t3_mbart_translated:
    f.write(f"{line}\n")

In [None]:
mBart_1t1_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/mBart-1:1-nllb",
    local_files_only=True
).to("cuda")

In [None]:
mBart_1t1_translated = get_test_translation_mbart(mBart_1t1_model)

100%|██████████| 60/60 [01:43<00:00,  1.72s/it]


In [None]:
with open(f"{base_path}Result/mbart_1to1_translated.txt", "w", encoding='utf-8') as f:
  for line in mBart_1t1_translated:
    f.write(f"{line}\n")

In [None]:
mBart_1t3_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/mBart-1:3-nllb",
    local_files_only=True
).to("cuda")

In [None]:
mBart_1t3_translated = get_test_translation_mbart(mBart_1t3_model)

100%|██████████| 60/60 [01:43<00:00,  1.73s/it]


In [None]:
with open(f"{base_path}Result/mbart_1to3_translated.txt", "w", encoding='utf-8') as f:
  for line in mBart_1t3_translated:
    f.write(f"{line}\n")

In [None]:
mBart_ft_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/mBart-baseline",
    local_files_only=True
).to("cuda")

In [None]:
mBart_ft_translated = get_test_translation_mbart(mBart_ft_model)

100%|██████████| 60/60 [01:46<00:00,  1.78s/it]


In [None]:
with open(f"{base_path}Result/mbart_ft_translated.txt", "w", encoding='utf-8') as f:
  for line in mBart_ft_translated:
    f.write(f"{line}\n")

In [None]:
opus_ft_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/opus-mt-zh-en-finetuned",
    local_files_only=True
).to("cuda")

In [None]:
opus_ft_translated = get_test_translation_opus(opus_ft_model)

100%|██████████| 60/60 [01:01<00:00,  1.03s/it]


In [None]:
with open(f"{base_path}Result/opus_ft_translated.txt", "w", encoding='utf-8') as f:
  for line in opus_ft_translated:
    f.write(f"{line}\n")

In [None]:
opus_1t1_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/opus-mt-zh-en-1:1-10E-nllb",
    local_files_only=True
).to("cuda")

In [None]:
opus_1t1_translated = get_test_translation_opus(opus_1t1_model)

100%|██████████| 60/60 [01:47<00:00,  1.79s/it]


In [None]:
with open(f"{base_path}Result/opus_1t1_translated.txt", "w", encoding='utf-8') as f:
  for line in opus_1t1_translated:
    f.write(f"{line}\n")

In [None]:
opus_1t3_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/opus-mt-zh-en-1:3-10E-nllb",
    local_files_only=True
).to("cuda")

In [None]:
opus_1t3_translated = get_test_translation_opus(opus_1t3_model)

100%|██████████| 60/60 [01:49<00:00,  1.83s/it]


In [None]:
with open(f"{base_path}Result/opus_1t3_translated.txt", "w", encoding='utf-8') as f:
  for line in opus_1t3_translated:
    f.write(f"{line}\n")

## Get Model Data

In [11]:
translations = {}
for fn in listdir(f"{base_path}Result"):
  with open(f"{base_path}Result/{fn}", "r", encoding='utf-8') as f:
    translated = f.readlines()
  model_name = fn.rpartition("_")[0]
  translations[model_name] = translated

## SacreBleu Evaluation

In [None]:
sacrebleu = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
for model, translated in translations.items():
  res = sacrebleu.compute(predictions=translated, references=reference)["score"]
  print(f"{model}: {res}")

bing: 17.109785638339876
baidu: 16.566853140833853
gpt4: 19.16223505955193
nllb_bl: 16.511663482120092
nllb_1to1_10E: 16.52034387806999
nllb_1to1: 16.59010962870376
nllb_1to3: 15.917471594249239
nllb_1to5: 15.80736184949603
nllb_1to1_opus: 16.553720523092817
nllb_1to3_opus: 15.934790800528097
nllb_1to1_mbart: 16.807699923840037
nllb_1to3_mbart: 16.807699923840037
mbart_1to1: 16.035756358682907
mbart_1to3: 15.325995016467633
mbart_ft: 15.751320872414007
opus_ft: 15.060205105877598
opus_1t1: 13.062283169701946
opus_1t3: 13.366554317566488


## BertScore Evaluation

In [None]:
bertscore = evaluate.load("bertscore")

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [None]:
for model, translated in translations.items():
  res = bertscore.compute(predictions=translated, references=reference, lang="en")
  bert_f1 = sum(res["f1"])/len(res["f1"])
  print(f"Bert F1 Score for {model}: {bert_f1}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Bert F1 Score for bing: 0.9258104058106741
Bert F1 Score for baidu: 0.9242970142761866
Bert F1 Score for gpt4: 0.9359668886860212
Bert F1 Score for nllb_bl: 0.9247503545681636
Bert F1 Score for nllb_1to1_10E: 0.9247346252202988
Bert F1 Score for nllb_1to1: 0.9249665616353353
Bert F1 Score for nllb_1to3: 0.9240035186211268
Bert F1 Score for nllb_1to5: 0.9237049496769905
Bert F1 Score for nllb_1to1_opus: 0.9253989733854929
Bert F1 Score for nllb_1to3_opus: 0.9242393557826678
Bert F1 Score for nllb_1to1_mbart: 0.9255628749529521
Bert F1 Score for nllb_1to3_mbart: 0.9255628749529521
Bert F1 Score for mbart_1to1: 0.9241247810522715
Bert F1 Score for mbart_1to3: 0.9224773534735043
Bert F1 Score for mbart_ft: 0.9227211884260178
Bert F1 Score for opus_ft: 0.9218569373687109
Bert F1 Score for opus_1t1: 0.9164279973308246
Bert F1 Score for opus_1t3: 0.9167215960423152


## Comet

In [None]:
comet = evaluate.load('comet')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [None]:
for model, translated in translations.items():
  res = comet.compute(predictions=translated, references=reference, sources=test_data)
  comet_score = sum(res["scores"])/len(res["scores"])
  print(f"Comet Score for {model}: {comet_score}")

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for bing: 0.7473919637103875


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for baidu: 0.7400669962366422


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for gpt4: 0.805035679101944


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for nllb_bl: 0.7376299103001753


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for nllb_1to1_10E: 0.7379801347951094


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for nllb_1to1: 0.7409206756651402


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for nllb_1to3: 0.7375647439161936


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for nllb_1to5: 0.7386101472874482


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for nllb_1to1_opus: 0.7416294215023518


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for nllb_1to3_opus: 0.7373770111103852


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for nllb_1to1_mbart: 0.7424624876181285


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for nllb_1to3_mbart: 0.7424624876181285


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for mbart_1to1: 0.7379750194648902


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for mbart_1to3: 0.731935152053833


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for mbart_ft: 0.73143410607179


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for opus_ft: 0.7193340418636799


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Comet Score for opus_1t1: 0.6896684716343879
Comet Score for opus_1t3: 0.6957021108269692


## hLepor

In [None]:
for model, translated in translations.items():
  score = hlepor_score(reference, translated)
  print(f"HLepor Score for {model}: {round(score, 4)}")

HLepor Score for bing: 0.5735
HLepor Score for baidu: 0.5654
HLepor Score for gpt4: 0.5917
HLepor Score for nllb_bl: 0.5651
HLepor Score for nllb_1to1_10E: 0.5689
HLepor Score for nllb_1to1: 0.5686
HLepor Score for nllb_1to3: 0.5626
HLepor Score for nllb_1to5: 0.562
HLepor Score for nllb_1to1_opus: 0.5704
HLepor Score for nllb_1to3_opus: 0.5651
HLepor Score for nllb_1to1_mbart: 0.571
HLepor Score for nllb_1to3_mbart: 0.571
HLepor Score for mbart_1to1: 0.5681
HLepor Score for mbart_1to3: 0.5584
HLepor Score for mbart_ft: 0.5623
HLepor Score for opus_ft: 0.5581
HLepor Score for opus_1t1: 0.5409
HLepor Score for opus_1t3: 0.5442


### MetricX XXL

In [21]:
import json
for model, translated in translations.items():
  with open(f"{base_path}json/{model}.jsonl", "w") as f:
    for ref, trans in zip(reference, translated):
      ref = ref.rstrip()
      trans = trans.rstrip()
      item = {
          "reference" : ref,
          "hypothesis": trans,
      }
      f.write(f"{json.dumps(item)}\n")
