In [71]:
%load_ext autoreload
%autoreload 2

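# Metrics: `evaluate` provides BLEU, chrF and COMET (COMET loads the Unbabel wmt22-comet-da checkpoint);
# `sacrebleu` is called directly for sentence-level BLEU.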
import evaluate
import sacrebleu
import multiprocessing as mp

# Load each metric once at module level; the scoring helpers in later cells
# read `chrf` and `comet` as globals. In the visible cells `bleu` is only
# referenced from commented-out code.
bleu = evaluate.load("bleu")
chrf = evaluate.load("chrf")
comet = evaluate.load("comet")

# Best-effort switch of the multiprocessing start method to "spawn"; a
# RuntimeError raised by set_start_method is deliberately ignored.
# NOTE(review): presumably needed so COMET prediction works from inside the
# notebook kernel — confirm. Also confirm whether RuntimeError is reachable
# at all when force=True is passed.
try:
    mp.set_start_method('spawn', force=True)
except RuntimeError:
    pass

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../../../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Encoder model frozen.
/Users/yixiantan/opt/miniconda3/envs/omscs-llm/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [86]:
def calculate_single_ref_score(model, predictions, reference, source=None):
    """
    Score every candidate translation against one shared reference.

    Args:
        model: Name of the metric to apply: "sentence_bleu", "chrf" or
            "comet".
        predictions: Candidate translations. Any iterable of strings is
            accepted; a lone string is treated as a single candidate instead
            of being iterated character by character.
        reference: The reference translation each candidate is compared with.
        source: The text in the original language. COMET requires it; the
            other metrics ignore it.

    Returns:
        A list with one score per prediction, in input order. The list is
        empty when ``predictions`` is empty.

    Raises:
        ValueError: If ``model`` is not a supported metric name, or if
            ``model`` is "comet" and ``source`` is None. Both conditions are
            checked before any scoring, so they are reported even when
            ``predictions`` is empty.
    """
    # Validate up front so a bad call fails fast instead of only failing
    # once the first prediction is reached.
    if model not in ("sentence_bleu", "chrf", "comet"):
        raise ValueError(f"Unknown model: {model}")
    if model == "comet" and source is None:
        raise ValueError("COMET requires source texts for scoring.")

    # A bare string would otherwise be scored one character at a time.
    if isinstance(predictions, str):
        candidates = [predictions]
    else:
        candidates = list(predictions)

    if not candidates:
        return []

    if model == "sentence_bleu":
        return [
            sacrebleu.sentence_bleu(candidate, [reference]).score
            for candidate in candidates
        ]

    if model == "chrf":
        # Each compute() call yields a single "score", so candidates are
        # scored one call at a time to keep per-sentence values.
        return [
            chrf.compute(predictions=[candidate], references=[reference])["score"]
            for candidate in candidates
        ]

    # COMET returns one entry in "scores" per input segment, so all
    # candidates are scored in one call with the reference and source
    # repeated. This avoids setting up a new prediction run per candidate.
    # NOTE(review): batched scoring may differ from one-at-a-time scoring by
    # floating-point noise — confirm this is acceptable.
    count = len(candidates)
    result = comet.compute(
        predictions=candidates,
        references=[reference] * count,
        sources=[source] * count,
        gpus=1,
        progress_bar=False,
    )
    return list(result["scores"])


def review_all_models(predictions, reference, sources=None):
    """
    Score the predictions against one reference with every supported metric.

    Args:
        predictions: Candidate translations to score.
        reference: The single reference translation shared by all
            predictions.
        sources: Source-language text for COMET. Despite the plural name it
            is forwarded unchanged as the ``source`` argument of
            ``calculate_single_ref_score``.

    Returns:
        A dict with the keys "sentence_bleu", "chrf" and "comet", each
        mapping to the list of scores returned by
        ``calculate_single_ref_score`` for that metric. When ``sources`` is
        None, COMET is skipped and "comet" maps to an empty list.
    """
    metric_names = ("sentence_bleu", "chrf", "comet")
    scores = {name: [] for name in metric_names}

    for name in metric_names:
        # COMET cannot run without the source text. Skip it rather than
        # raising after the other metrics have already been computed.
        if name == "comet" and sources is None:
            continue
        scores[name] = calculate_single_ref_score(name, predictions, reference, sources)

    return scores

In [88]:
# Smoke test: two candidates scored against one reference. The first
# candidate is identical to the reference.
predictions = ["this is a test", "this is a test too"]
reference = "this is a test"
source = "le test est un test"

# sentence_bleu
scores = calculate_single_ref_score("sentence_bleu", predictions, reference)
print(f"Sentence BLEU score: {scores}")  # ➜ list with one score per prediction, e.g. [100.0, ≈66.87]

# chrf
scores = calculate_single_ref_score("chrf", predictions, reference)
print(f"CHRF score: {scores}")  # ➜ list with one score per prediction, e.g. [100.0, ≈93.21]

# comet (needs the source text as the fourth argument)
scores = calculate_single_ref_score("comet", predictions, reference, source)
print(f"COMET score: {scores}")  # ➜ list with one score per prediction, e.g. [≈0.990, ≈0.894]; exact values may vary

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Sentence BLEU score: [100.00000000000004, 66.87403049764218]
CHRF score: [100.0, 93.2143290609026]


Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


COMET score: [0.9899783730506897, 0.8943490386009216]


In [80]:
# Run every metric in one call; reuses `predictions`, `reference` and `source`
# defined in an earlier cell.
# NOTE(review): the recorded output shows a single score per metric while the
# `predictions` list defined above holds two items, and the execution counts
# are out of order (In [80] vs In [88]) — the output looks stale; re-run to
# confirm.
review_all_scores = review_all_models(predictions, reference, source)
print(f"Review all scores: {review_all_scores}")  # ➜ dict keyed by metric name, each value a list of scores

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Review all scores: {'sentence_bleu': [100.00000000000004], 'chrf': [100.0], 'comet': [0.9899783730506897]}


In [91]:
# Chinese → English example with three candidates of decreasing quality:
# a passive-voice paraphrase, a sentence with the wrong nouns, and a
# nonsense string.
source = "经理审查了报告，并提供了反馈。"
reference = "The manager reviewed the report and provided feedback."
predictions = ["Feedback was provided by the manager after reviewing the report.", "The manager reviewed the cat and provided dog", "a1a2"]

# Returns a dict keyed by metric name with one score per prediction.
review_all_scores = review_all_models(predictions, reference, source)
print(f"Review all scores: {review_all_scores}")

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Review all scores: {'sentence_bleu': [9.425159511373677, 36.28241434631104, 0.0], 'chrf': [61.89834169664148, 66.83394759294124, 1.3020833333333335], 'comet': [0.8905420899391174, 0.581606924533844, 0.34312549233436584]}
