In [1]:
%load_ext autoreload
%autoreload 2

import logging
import multiprocessing as mp
import os

import evaluate
import sacrebleu

# The Hugging Face `tokenizers` library prints a fork warning during COMET
# scoring and asks for TOKENIZERS_PARALLELISM to be set explicitly. Decide it
# here, before any tokenizer is created. setdefault() keeps a value the user
# has already exported in their environment.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Metric handles used by the scoring helpers defined later in the notebook.
# Loading COMET fetches and loads a model checkpoint, so this cell can be slow.
bleu = evaluate.load("bleu")
chrf = evaluate.load("chrf")
comet = evaluate.load("comet")

# Request the "spawn" start method for multiprocessing. Best effort: if the
# runtime refuses to change it, keep whatever is already configured.
try:
    mp.set_start_method("spawn", force=True)
except RuntimeError:
    pass

# Only let ERROR-level records through Lightning's rank-zero logger.
logging.getLogger("lightning.pytorch.utilities.rank_zero").setLevel(logging.ERROR)
# For older Lightning versions the logger lives under the `pytorch_lightning` name.
logging.getLogger("pytorch_lightning.utilities.rank_zero").setLevel(logging.ERROR)


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../../../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Encoder model frozen.
/Users/yixiantan/opt/miniconda3/envs/omscs-llm/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [2]:
def calculate_single_ref_score(model, predictions, reference, source=None):
    """
    COMET requires the source text in the original language.
    """

    scores = []
    for prediction in predictions:

        match model:
            # case "bleu":
            #     scores.append(bleu.compute(predictions=[prediction], references=[reference]))
            case "sentence_bleu":
                scores.append(sacrebleu.sentence_bleu(prediction, [reference]).score)
            case "chrf":
                scores.append(chrf.compute(predictions=[prediction], references=[reference])["score"])
            case "comet":
                if source is None:
                    raise ValueError("COMET requires source texts for scoring.")
                scores.append(comet.compute(predictions=[prediction], references=[reference], sources=[source], gpus=1, progress_bar=False)["scores"][0])
            case _:
                raise ValueError(f"Unknown model: {model}")

    return scores


def review_all_models(predictions, reference, sources=None):
    """
    Score every prediction against one reference with all available metrics.

    Args:
        predictions: Iterable of candidate translation strings.
        reference: The single reference translation they are compared to.
        sources: The source-language sentence, forwarded unchanged as the
            `source` argument of `calculate_single_ref_score`. Optional.

    Returns:
        A dict mapping metric name to a list of per-prediction scores. It
        always contains "sentence_bleu" and "chrf". "comet" is included only
        when `sources` is provided, because COMET cannot be computed without
        the source text.
    """
    metric_names = ["sentence_bleu", "chrf"]
    # Skip COMET instead of raising when no source is given, so the default
    # `sources=None` is actually usable, as the signature promises.
    if sources is not None:
        metric_names.append("comet")

    return {
        name: calculate_single_ref_score(name, predictions, reference, sources)
        for name in metric_names
    }

In [3]:
# Smoke test: two candidate translations scored against one reference.
# The first prediction is identical to the reference; the second appends a word.
predictions = ["this is a test", "this is a test too"]
reference = "this is a test"
source = "le test est un test"

# sentence_bleu
scores = calculate_single_ref_score("sentence_bleu", predictions, reference)
print(f"Sentence BLEU score: {scores}")  # ➜ a list with one score per prediction

# chrf
scores = calculate_single_ref_score("chrf", predictions, reference)
print(f"CHRF score: {scores}")  # ➜ a list with one score per prediction

# comet (the only metric here that is also given the source sentence)
scores = calculate_single_ref_score("comet", predictions, reference, source)
print(f"COMET score: {scores}")  # ➜ a list with one score per prediction (exact values may vary)

Sentence BLEU score: [100.00000000000004, 66.87403049764218]
CHRF score: [100.0, 93.2143290609026]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


COMET score: [0.9899783730506897, 0.8943490386009216]


In [4]:
# Same inputs as the previous cell (predictions / reference / source), scored
# with every metric in one call.
review_all_scores = review_all_models(predictions, reference, source)
print(f"Review all scores: {review_all_scores}")  # ➜ dict: metric name -> list with one score per prediction

Review all scores: {'sentence_bleu': [100.00000000000004, 66.87403049764218], 'chrf': [100.0, 93.2143290609026], 'comet': [0.9899783730506897, 0.8943490386009216]}


In [5]:
# Chinese source. The three candidates differ from the reference in different ways:
#   1. a reworded paraphrase,
#   2. the reference with "manager" and "report" swapped,
#   3. the reference with the verbs left uninflected ("review", "provide").
source = "经理审查了报告，并提供了反馈。"
reference = "The manager reviewed the report and provided feedback."
predictions = ["After reviewing the report, the manager gave some feedback", "The report reviewed the manager and provided feedback.", "The manager review the report and provide feedback."]

review_all_scores = review_all_models(predictions, reference, source)
print(f"Review all scores: {review_all_scores}")

Review all scores: {'sentence_bleu': [9.980099403873663, 39.281465090051306, 26.084743001221455], 'chrf': [54.45980419390965, 84.94445636002766, 82.81454822707872], 'comet': [0.9057448506355286, 0.8854628801345825, 0.9285221695899963]}


In [6]:
# Japanese source. The first candidate is a plainer sentence that drops the
# reference's "kindly" / "valuable"; the second rewords the reference almost entirely.
# NOTE(review): presumably meant to probe how the metrics treat register and
# paraphrase — confirm intent.
source = "部長におかれましては、報告書をご確認の上、貴重なご意見を賜りました。"
reference = "The department head kindly reviewed the report and provided valuable feedback."
predictions = ["The manager reviewed the report and provided feedback.", "The head of department graciously looked over the report and shared constructive insights."]
review_all_scores = review_all_models(predictions, reference, source)
print(f"Review all scores: {review_all_scores}")

Review all scores: {'sentence_bleu': [38.03141958086991, 13.06511329838856], 'chrf': [55.24003630622099, 36.57487303249166], 'comet': [0.8867788910865784, 0.8811579942703247]}


In [7]:
# Indonesian source. The first candidate differs from the reference by one word
# ("then" instead of "and"); the second also spells out "rice grains" and
# "cooked rice" where the reference just says "rice".
source = "Saya pergi ke pasar untuk membeli beras, lalu menyiapkan semangkuk nasi yang lezat."
reference = "I went to the market to buy rice, and prepared a delicious bowl of rice."
predictions = ["I went to the market to buy rice, then prepared a delicious bowl of rice.", 
               "I went to the market to buy rice grains, then prepared a delicious bowl of cooked rice."]
review_all_scores = review_all_models(predictions, reference, source)
print(f"Review all scores: {review_all_scores}")

Review all scores: {'sentence_bleu': [82.82477531331043, 60.04981752197521], 'chrf': [89.94918001993271, 80.94468790216838], 'comet': [0.9501563906669617, 0.9194379448890686]}


In [8]:
# Same source as the previous example, but paired with a reference ("I am dead")
# that shares no wording with the first candidate. The second candidate is a
# verbatim copy of that reference.
# NOTE(review): presumably a sanity check of how strongly each metric follows
# the reference rather than the source — confirm intent.
source = "Saya pergi ke pasar untuk membeli beras, lalu menyiapkan semangkuk nasi yang lezat."
reference = "I am dead"
predictions = ["I went to the market to buy rice, then prepared a delicious bowl of rice.", 
               "I am dead"]
review_all_scores = review_all_models(predictions, reference, source)
print(f"Review all scores: {review_all_scores}")

Review all scores: {'sentence_bleu': [2.2869567780619007, 100.00000000000004], 'chrf': [8.743825924442664, 100.0], 'comet': [0.44467708468437195, 0.9866496920585632]}
