### Generate ratings (chrF, BLEU, COMET)
TODO: Present the correlations for each language as a table in the paper.

In [2]:
%load_ext autoreload
%autoreload 2

import json
import os, sys
ROOT = os.path.abspath("..")
sys.path.insert(0, ROOT)

from rich import print
from tqdm import tqdm
from scipy.stats import pearsonr, spearmanr

from src.utils.trad_metrics import TradMetrics
from src.utils.altscore import get_altscores
from lang_datasets.scripts.constants import LANGUAGES

In [4]:
# JSON file with the ratings; loaded into `ratings` below.
RATINGS_FILE = "../ratings/flores200-ratings.json"
# JSON file with the sample translations; loaded into `translations` below.
TRANSLATIONS_FILE = "../lang_datasets/sample_dataset/flores_sample_translations.json"
# Maps the candidate ids used in the ratings data to a translation-system name.
# NOTE(review): only the number of entries is used in this notebook (to build
# the ALTScore result skeleton) -- confirm the names match the ratings data.
CANDIDATE_MAP = {
    "candidate_1": "google_translate",
    "candidate_2": "nllb_translate",
    "candidate_3": "llm_translate",
}

In [None]:
# Shared metrics helper; get_candidate_ratings() calls its review_all_models()
# method to obtain the sentence-BLEU, chrF and COMET scores.
trad_metrics = TradMetrics()

In [6]:
# Read the ratings and the sample translations from their JSON files.
with open(RATINGS_FILE, encoding="utf-8") as ratings_handle:
    ratings = json.load(ratings_handle)

with open(TRANSLATIONS_FILE, encoding="utf-8") as translations_handle:
    translations = json.load(translations_handle)

In [18]:
# Per language model rating
def get_candidate_ratings(candidate_id, languages):
    """Score one candidate's translations with sentence-BLEU, chrF and COMET.

    Walks the module-level ``ratings`` list and, for every evaluation whose
    ``candidate_id`` matches, calls ``trad_metrics.review_all_models`` with the
    candidate sentence, the rating's ``original_english_text`` (passed as the
    reference) and its ``source_sentence``. The first score returned for each
    metric is stored together with the translation it belongs to.

    Args:
        candidate_id: Id to select, e.g. ``"candidate_1"``.
        languages: Iterable of language codes to pre-create result slots for.

    Returns:
        ``{candidate_id: {language: {metric: {"scores": [...], "avg": x}}}}``
        where ``metric`` is ``"sentence_bleu"``, ``"chrf"`` or ``"comet"``,
        each entry of ``scores`` is ``{"score": ..., "translation": ...}`` and
        ``avg`` is the mean score, or ``None`` when nothing was scored.

    Raises:
        KeyError: If a rating's ``language_code`` is not in ``languages``.
    """
    metric_names = ("sentence_bleu", "chrf", "comet")

    per_language = {
        language: {
            metric: {"scores": [], "avg": None} for metric in metric_names
        }
        for language in languages
    }

    for data in tqdm(ratings):
        language_scores = per_language[data["language_code"]]
        source_sentence = data["source_sentence"]
        reference_sentence = data["original_english_text"]

        for candidate in data["candidate_sentence_evaluations"]:
            if candidate["candidate_id"] != candidate_id:
                continue

            candidate_sentence = candidate["candidate_sentence"]
            review_all_scores = trad_metrics.review_all_models(
                [candidate_sentence], reference_sentence, source_sentence
            )

            # Record the scores inside the matching branch only. Doing it
            # after the inner loop would re-append the previous rating's
            # scores (or hit an undefined name) whenever a rating has no
            # evaluation for this candidate.
            for metric in metric_names:
                language_scores[metric]["scores"].append(
                    {
                        "score": review_all_scores[metric][0],
                        "translation": candidate_sentence,
                    }
                )

    # Compute each average once at the end instead of re-summing the whole
    # list after every rating.
    for language_scores in per_language.values():
        for metric_entry in language_scores.values():
            scored = metric_entry["scores"]
            if scored:
                metric_entry["avg"] = (
                    sum(item["score"] for item in scored) / len(scored)
                )

    return {candidate_id: per_language}

def generate_ratings(cands_output_files, output_path):
    """Generate and save the BLEU/chrF/COMET ratings unless they already exist.

    If any file named in ``cands_output_files`` is missing from
    ``output_path``, the ratings for all three candidates are regenerated and
    written to ``google_ratings.json``, ``nllb_ratings.json`` and
    ``llm_ratings.json`` inside ``output_path``. The written file names are
    fixed; ``cands_output_files`` is only used for the existence check.

    Args:
        cands_output_files: File names (relative to ``output_path``) whose
            presence decides whether generation is skipped.
        output_path: Directory that receives the ratings files.

    Returns:
        A completion message when ratings were generated, otherwise ``None``
        (a notice is printed instead).
    """
    first_missing = next(
        (
            name
            for name in cands_output_files
            if not os.path.exists(os.path.join(output_path, name))
        ),
        None,
    )

    if first_missing is None:
        print("All ratings files already exist, skipping generation...")
        return None

    print(f"{first_missing} does not exist, generating ratings for all...")

    # Create the target directory before the slow scoring step so a missing
    # directory does not surface only after all scores have been computed.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    outputs = (
        ("candidate_1", "google_ratings.json"),
        ("candidate_2", "nllb_ratings.json"),
        ("candidate_3", "llm_ratings.json"),
    )

    # Score every candidate first, then write, so no file is written unless
    # all three scoring passes succeeded.
    generated = [
        (filename, get_candidate_ratings(candidate_id, LANGUAGES.keys()))
        for candidate_id, filename in outputs
    ]

    for filename, candidate_ratings in generated:
        target = os.path.join(output_path, filename)
        with open(target, "w", encoding="utf-8") as f:
            json.dump(candidate_ratings, f, indent=4, ensure_ascii=False)

    return f"Rating generation complete for chrF, BLEU, COMET scores. Saved at {output_path}."

In [9]:
# Directory passed to generate_ratings() as the destination for the ratings files.
OUTPUT_PATH = "../ratings/"
# File names whose presence generate_ratings() checks before regenerating.
cands_output_files = ["google_ratings.json", "nllb_ratings.json", "llm_ratings.json"]

In [None]:
# Generate the ratings files if any is missing; the cell shows the returned message.
generate_ratings(cands_output_files=cands_output_files, output_path=OUTPUT_PATH)

  0%|          | 0/120 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 120/120 [02:12<00:00,  1.10s/it]
100%|██████████| 120/120 [01:50<00:00,  1.09it/s]
100%|██████████| 120/120 [03:06<00:00,  1.55s/it]


'Rating generation complete for chrF, BLEU, COMET scores. Saved at ../ratings/.'

### Generate ratings (ALTScore)

In [19]:
# Result skeleton: one slot per candidate and language, filled in below.
ratings_dict = {
    candidate: {
        lang: {"scores": [], "avg_score": None} for lang in LANGUAGES.keys()
    }
    for candidate in CANDIDATE_MAP
}

# Set when a rating could not be processed, so the final message is honest
# about the output being partial.
altscore_failed_at = None

for i, rating in enumerate(ratings):
    try:
        # Named differently from the `altscores` global that is loaded from
        # disk further down, so re-running cells out of order cannot mix them up.
        rating_altscores = get_altscores(rating)
        lang = rating["language_code"]
        for candidate, candidate_score in rating_altscores.items():
            bucket = ratings_dict[candidate][lang]
            bucket["scores"].append(candidate_score)
            bucket["avg_score"] = sum(
                entry["altscore"] for entry in bucket["scores"]
            ) / len(bucket["scores"])
    except Exception as e:
        # Best-effort: report the offending rating and stop; whatever was
        # collected so far is still written below.
        print(f"Error processing rating {i}: {e}")
        print(rating)
        altscore_failed_at = i
        break

# Save ratings dict to a JSON file
OUTPUT_PATH_ALT = os.path.join(OUTPUT_PATH, "alt_bench_ratings.json")

with open(OUTPUT_PATH_ALT, "w", encoding="utf-8") as f:
    json.dump(ratings_dict, f, indent=4, ensure_ascii=False)

if altscore_failed_at is None:
    print(f"Rating generation complete for ALTScore. Ratings saved to {OUTPUT_PATH_ALT}.")
else:
    print(
        f"Rating generation for ALTScore stopped at rating {altscore_failed_at}. "
        f"Partial ratings saved to {OUTPUT_PATH_ALT}."
    )

### Computing Pearson and Spearman correlations

In [20]:
# Directory the ratings JSON files are read from in the next cell (same value
# as the OUTPUT_PATH used when the ratings were generated).
OUTPUT_PATH = "../ratings/"

In [21]:
# Load the ratings
def _read_ratings_json(filename):
    """Load one JSON file from OUTPUT_PATH and close the file handle afterwards."""
    with open(os.path.join(OUTPUT_PATH, filename), "r", encoding="utf-8") as fh:
        return json.load(fh)


google_ratings = _read_ratings_json("google_ratings.json")
nllb_ratings = _read_ratings_json("nllb_ratings.json")
llm_ratings = _read_ratings_json("llm_ratings.json")
altscores = _read_ratings_json("alt_bench_ratings.json")

In [22]:
# Add the altscores to the ratings
def add_alt_scores(rating, altscores, candidate_id):
    """Attach the ALTScore entries to a candidate's ratings, in place.

    For every language under ``rating[candidate_id]`` an ``"altscore"`` entry
    is set to ``{"scores": <list>}``, where the list is the very same object
    stored at ``altscores[candidate_id][language]["scores"]`` (not a copy).
    Returns ``None``.
    """
    per_language = rating[candidate_id]
    alt_per_language = altscores[candidate_id]
    for language in per_language:
        per_language[language]["altscore"] = {
            "scores": alt_per_language[language]["scores"]
        }

# Merge the ALTScore entries into each system's ratings under its candidate id.
add_alt_scores(rating=google_ratings, altscores=altscores, candidate_id="candidate_1")
add_alt_scores(rating=nllb_ratings, altscores=altscores, candidate_id="candidate_2")
add_alt_scores(rating=llm_ratings, altscores=altscores, candidate_id="candidate_3")

In [25]:
def compute_correlations(ratings, candidate_id, lang):
    """Correlate the metric scores of one candidate for one language.

    Reads the ``sentence_bleu``, ``chrf``, ``comet`` and ``altscore`` entries
    under ``ratings[candidate_id][lang]``, checks that all four list the same
    translations in the same order, and computes Pearson and Spearman
    correlations (via ``scipy.stats``) for the pairs BLEU/ALTScore,
    chrF/ALTScore, COMET/ALTScore, BLEU/COMET and chrF/BLEU.

    Args:
        ratings: Nested ratings dict that already contains the ``altscore``
            entries for the candidate.
        candidate_id: Key of the candidate inside ``ratings``.
        lang: Language code inside ``ratings[candidate_id]``.

    Returns:
        A 1-tuple whose only element is a dict with the ``pearson_*`` and
        ``spearman_*`` results (stored exactly as returned by scipy) followed
        by the four raw score lists.
        NOTE(review): the tuple wrapper comes from a trailing comma in the
        original return statement and looks unintentional; it is kept so the
        shape of anything derived from this result does not change.

    Raises:
        AssertionError: If the metrics do not cover the same translations.
    """
    per_metric = ratings[candidate_id][lang]

    # Metric name -> key holding the numeric value in each score entry.
    value_keys = {
        "sentence_bleu": "score",
        "chrf": "score",
        "comet": "score",
        "altscore": "altscore",
    }

    # Ensure that the translations are in order
    translation_lists = [
        [entry["translation"] for entry in per_metric[metric]["scores"]]
        for metric in value_keys
    ]
    if any(other != translation_lists[0] for other in translation_lists[1:]):
        # Raised explicitly instead of via `assert` so the check still runs
        # under `python -O`; the exception type and message are unchanged.
        raise AssertionError(
            "Translations for the different metrics do not match. Please check the ratings data."
        )

    # Extract the scores for the specified candidate and language
    scores = {
        metric: [entry[value_key] for entry in per_metric[metric]["scores"]]
        for metric, value_key in value_keys.items()
    }

    # (result-key suffix, first metric, second metric), in output order.
    metric_pairs = (
        ("bleu_altscore", "sentence_bleu", "altscore"),
        ("chrf_altscore", "chrf", "altscore"),
        ("comet_altscore", "comet", "altscore"),
        ("bleu_comet", "sentence_bleu", "comet"),
        ("chrf_bleu", "chrf", "sentence_bleu"),
    )

    # Calculate Pearson and Spearman correlations
    result = {}
    for suffix, first, second in metric_pairs:
        result[f"pearson_{suffix}"] = pearsonr(scores[first], scores[second])
        result[f"spearman_{suffix}"] = spearmanr(scores[first], scores[second])

    result["sentence_bleu_scores"] = scores["sentence_bleu"]
    result["chrf_scores"] = scores["chrf"]
    result["comet_scores"] = scores["comet"]
    result["altscore_scores"] = scores["altscore"]

    return (result,)


# Try the correlation computation on a single language (Thai) for each system.
google_corrs = compute_correlations(google_ratings, candidate_id="candidate_1", lang="tha_Thai")
nllb_corrs = compute_correlations(nllb_ratings, candidate_id="candidate_2", lang="tha_Thai")
llm_corrs = compute_correlations(llm_ratings, candidate_id="candidate_3", lang="tha_Thai")

In [26]:
# Correlations for every language, one dict per translation system.
google_corrs = {
    lang: compute_correlations(google_ratings, "candidate_1", lang)
    for lang in LANGUAGES.keys()
}
nllb_corrs = {
    lang: compute_correlations(nllb_ratings, "candidate_2", lang)
    for lang in LANGUAGES.keys()
}
llm_corrs = {
    lang: compute_correlations(llm_ratings, "candidate_3", lang)
    for lang in LANGUAGES.keys()
}

all_corrs = {
    "google": google_corrs,
    "nllb": nllb_corrs,
    "llm": llm_corrs,
}

# Save the correlations to a JSON file. os.path.join avoids the doubled slash
# that plain string concatenation produces when OUTPUT_PATH ends with "/".
CORRELATIONS_OUTPUT_PATH = os.path.join(OUTPUT_PATH, "correlations.json")
with open(CORRELATIONS_OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(all_corrs, f, indent=4, ensure_ascii=False)
print(f"Correlations saved to {CORRELATIONS_OUTPUT_PATH}.")