In [None]:
import json
import datasets

import numpy as np

from src.data.eval import FRMT
from src.metrics import VIdScore
from src.constants import RESULTS_PATH, CACHE_PATH

## FRMT

In [4]:
# FRMT evaluation data (imported from src.data.eval). The inspection cells
# below read `frmt.source[idx]` and `frmt.target[idx]` for each sample index.
frmt = FRMT()

### Deepl

In [5]:
# Cached system translations for FRMT; later cells look entries up with
# `str(sample_index)` keys.
# Each file is read inside a `with` block so its handle is closed right after
# parsing (a bare `.open()` passed to json.load is never closed explicitly).
with (CACHE_PATH / "frmt_deepl.json").open() as cache_file:
    pred_deepl_frmt = json.load(cache_file)
with (CACHE_PATH / "frmt_llama3_fft.json").open() as cache_file:
    pred_llama_frmt = json.load(cache_file)

In [23]:
def _load_frmt_results(system_name):
    """Parse ``RESULTS_PATH / "frmt" / "<system_name>.json"`` and return its content.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the JSON has been parsed instead of being left open.
    """
    with (RESULTS_PATH / "frmt" / f"{system_name}.json").open() as results_file:
        return json.load(results_file)


# Stored evaluation results, one JSON file per translation system.
deepl_frmt = _load_frmt_results("deepl")
google_br_frmt = _load_frmt_results("google_br")
google_pt_frmt = _load_frmt_results("google_pt")
llama_frmt = _load_frmt_results("llama3_fft")

In [24]:
# Pull the COMET score list out of every results dict and turn each one
# into a NumPy array, keeping the same system order as the loading cell.
deepl_comet, google_br_comet, google_pt_comet, llama_comet = (
    np.array(system_results["comet"]["scores"])
    for system_results in (deepl_frmt, google_br_frmt, google_pt_frmt, llama_frmt)
)

Inspect the examples where the COMET score difference between DeepL and Llama is largest.

In [6]:
# Element-wise COMET gap: positive values mean DeepL scored higher than Llama
# on that sample, negative values mean Llama scored higher.
diff = deepl_comet - llama_comet
print(
    f"Min diff: {diff.min()}",
    f"Max diff: {diff.max()}",
    f"Mean diff: {diff.mean()}",
    sep="\n",
)

Min diff: -0.244423508644104
Max diff: 0.6132988035678864
Mean diff: 0.020507604523425177


#### Where DeepL is better

In [7]:
# argsort orders indices by ascending diff, so the last ten entries are the
# samples with the largest (DeepL - Llama) gap, biggest gap last.
ascending_order = np.argsort(diff)
top_10 = ascending_order[-10:]
print(f"Top 10 differences:\n{diff[top_10]}")

Top 10 differences:
[0.2358793  0.23833495 0.23924768 0.24147129 0.24796951 0.26088756
 0.26499724 0.2705037  0.27102154 0.6132988 ]


Check the translations for these cases.

In [8]:
# For every selected sample, show the source, the reference and both system
# outputs next to the COMET gap, followed by one blank separator line.
for idx in top_10:
    report_lines = [
        f"Sample {idx}",
        f"Eng: {frmt.source[idx]}",
        f"Pt: {frmt.target[idx]}",
        f"Deepl: {pred_deepl_frmt[str(idx)]}",  # prediction dicts are keyed by str(index)
        f"Llama: {pred_llama_frmt[str(idx)]}",
        f"Comet diff: {diff[idx]}",
        "",  # trailing empty entry yields the blank line between samples
    ]
    print("\n".join(report_lines))

Sample 1307
Eng: In China, the one child policy was largely responsible for an unbalanced sex ratio.
Pt: Na China, a política de filho único é responsável por uma relação sexual desequilibrada.
Deepl: Na China, a política do filho único foi em grande parte responsável por um rácio sexual desequilibrado.
Llama: Na China, a política de uma criança foi em grande parte responsável por uma relação de sexos desequilibrada.
Comet diff: 0.2358793020248413

Sample 462
Eng: Ltda.
Pt: Ltda.
Deepl: Ltda.
Llama: Lda.
Comet diff: 0.23833495378494263

Sample 2235
Eng: No east–west highway existed at this point yet, leading to a spur end.
Pt: Neste momento, ainda não existiam estradas este-oeste, levando a um final não refletido.
Deepl: Ainda não existia uma autoestrada este-oeste neste ponto, o que levou a um ramal.
Llama: Ainda não existia nesta altura uma autoestrada nascente-poente, a dar num extremo de espigão.
Comet diff: 0.23924767971038818

Sample 238
Eng: He had traveled

#### Where Llama is better

In [9]:
# Sorting the negated gap puts the most negative diffs last, so the final ten
# indices are the samples where Llama beats DeepL by the widest margin.
# NOTE: this rebinds `top_10`, replacing the indices chosen in the DeepL section.
order_by_negated_diff = np.argsort(-diff)
top_10 = order_by_negated_diff[-10:]
print(f"Top 10 differences:\n{diff[top_10]}")

Top 10 differences:
[-0.12054902 -0.12777668 -0.12931502 -0.1294899  -0.13128668 -0.13474226
 -0.16152954 -0.1876933  -0.22183281 -0.24442351]


Check the translations for these cases.

In [10]:
# Print source, reference, both system outputs and the COMET gap for each
# selected sample; `end="\n\n"` leaves one blank line between samples.
for idx in top_10:
    print(
        f"Sample {idx}",
        f"Eng: {frmt.source[idx]}",
        f"Pt: {frmt.target[idx]}",
        f"Deepl: {pred_deepl_frmt[str(idx)]}",  # prediction dicts are keyed by str(index)
        f"Llama: {pred_llama_frmt[str(idx)]}",
        f"Comet diff: {diff[idx]}",
        sep="\n",
        end="\n\n",
    )

Sample 1390
Eng: Integral designs have the advantages that they have been well-tested for strength and stability, and also are off-the-shelf.
Pt: Os designs integrais têm as vantagens de terem sido sujeitos a testes rigorosos de resistência e estabilidade e estão também prontos a ser utilizados.
Deepl: As concepções integrais têm a vantagem de terem sido bem testadas quanto à sua resistência e estabilidade, e de serem também de prateleira.
Llama: Os designs integrais têm as vantagens de terem sido bem testados para a resistência e estabilidade, e também são off-the-shelf.
Comet diff: -0.1205490231513977

Sample 1575
Eng: Some unusual old patterns such as diamonds are now rare everywhere.
Pt: Alguns padrões antigos invulgares, como losangos, não são usados com frequência.
Deepl: Alguns padrões antigos invulgares, como os diamantes, são agora raros em todo o lado.
Llama: Alguns padrões antigos invulgares como os losangos são agora raros em todo o lado.
Comet diff: -

## Confidence intervals

In [19]:
from src.metrics import compute_confidence_interval

In [20]:
# Mean and 95% confidence interval of the DeepL COMET scores.
mean, confidence_interval = compute_confidence_interval(deepl_comet)
print(
    f"Mean: {mean}",
    f"95% Confidence Interval: {confidence_interval}",
    sep="\n",
)

Mean: 0.885
95% Confidence Interval: (0.882, 0.887)


In [21]:
# Mean and 95% confidence interval of the Google (BR) COMET scores.
mean, confidence_interval = compute_confidence_interval(google_br_comet)
print(
    f"Mean: {mean}",
    f"95% Confidence Interval: {confidence_interval}",
    sep="\n",
)

Mean: 0.874
95% Confidence Interval: (0.872, 0.877)


In [25]:
# Mean and 95% confidence interval of the Google (PT) COMET scores.
mean, confidence_interval = compute_confidence_interval(google_pt_comet)
print(
    f"Mean: {mean}",
    f"95% Confidence Interval: {confidence_interval}",
    sep="\n",
)

Mean: 0.879
95% Confidence Interval: (0.876, 0.881)


In [22]:
# Mean and 95% confidence interval of the Llama COMET scores.
mean, confidence_interval = compute_confidence_interval(llama_comet)
print(
    f"Mean: {mean}",
    f"95% Confidence Interval: {confidence_interval}",
    sep="\n",
)

Mean: 0.864
95% Confidence Interval: (0.861, 0.867)
