# Analyze Human Rankings

A parallel corpus of medical texts was assembled and translated with four translators: DeepL, CEF, Google, and IBM. The translated sentences were compared with expected translations using three different similarity scores: BLEU, ROUGE, and BLEURT.

Additionally, two human raters (DP and TS) compared the translated sentences pairwise, judging whether one translation was more, equally, or less similar to the expected translation than the other. In the same pairwise manner, the raters also assessed the (subjective) translation quality; the data contains such ratings from both DP and TS.

## Human Ranking

The following code analyzes how the translators rank according to similarity (as judged by DP, by TS, and by the automatic scores BLEU, ROUGE, and BLEURT) and according to translation quality (as judged by DP and TS).

In [3]:
import pandas as pd
import sys
import typing

# Pairwise comparison ratings, one row per compared pair of translations.
# The file is a semicolon-separated CSV.
rating_data: pd.DataFrame = pd.read_csv('FinalCombinedRating.csv', sep=';')

# Rating columns that are analysed. Each one holds a pairwise verdict per row:
# '>', '=', '<', or 'E' for a corpus error.
SCORE_NAMES: list[str] = [
    'Similarity.rating.DP',
    'Translation.rating.DP',
    'Similarity.rating.TS',
    'Translation.rating.TS',
    'Similarity.rating.BLEU',
    'Similarity.rating.ROUGE',
    'Similarity.rating.BLEURT',
]

# Translator labels; they must match the values found in the
# 'Satz.1.TranslatorSource' / 'Satz.2.TranslatorSource' columns.
TRANSLATORS: list[str] = [
    'CEF',
    'Deepl',
    'Google',
    'IBM',
]

# When true, every pairwise verdict is cross-checked against the win counts
# derived from all verdicts for the same sentence.
TEST_INTERNAL_VALIDITY = True

def test_sentence_uniqueness(original_sentence: str, sentence_ratings: pd.DataFrame) -> None:
    # Each sentence should have six comparisons (combinations of four translators)
    sentence_unique = len(sentence_ratings) == 6
    if not sentence_unique:
        sys.exit(
            '[Error] Original sentence covered multiple times in the data: ' + original_sentence)


def test_valid_corpus_error_entry(original_sentence: str, sentence_ratings: pd.DataFrame, score_name: str) -> None:
    if any(sentence_ratings[score_name] == 'E') and not all(sentence_ratings[score_name] == 'E'):
        sys.exit(
            '[Error] Not all comparisons for the same original sentence were noted as corpus error: ' + original_sentence + ' (' + score_name + ')')


def get_translator_ranking(translator_ratings: dict[str, int], translator: str) -> dict[str, int]:
    """Return the dense rank of one translator among all rated translators.

    Translators with equal ratings share a rank, and ranks are counted over
    the distinct rating values only (highest rating = rank 1).

    Args:
        translator_ratings: Rating value per translator.
        translator: The translator whose rank is wanted.

    Returns:
        ``{'rank': r, 'of': n}`` where ``n`` is the number of distinct rating
        values and ``r`` is the 1-based position of the translator's value.

    Raises:
        KeyError: If ``translator`` is not a key of ``translator_ratings``.
    """
    distinct_ratings = set(translator_ratings.values())
    own_rating = translator_ratings[translator]
    # The rank equals one plus the number of distinct values that beat ours.
    strictly_better = sum(1 for rating in distinct_ratings if rating > own_rating)
    return {'rank': strictly_better + 1, 'of': len(distinct_ratings)}
    

# Per score column and translator: the total number of pairwise wins
# ('times_rated_better') and the translator's rank within every single
# sentence ('sentence_rankings').
translator_ratings: dict[str, dict[str, dict[str, typing.Union[int, list[dict[str, int]]]]]] = {
    score_name: {
        translator: {'times_rated_better': 0, 'sentence_rankings': []}
        for translator in TRANSLATORS
    }
    for score_name in SCORE_NAMES
}
original_sentences: set[str] = set(rating_data['Original'])
# Walk the sentences in order of first appearance instead of iterating the set:
# the iteration order of a set of strings differs between interpreter runs,
# which would reorder the 'sentence_rankings' lists and the warnings each run.
for original_sentence in dict.fromkeys(rating_data['Original']):
    sentence_ratings: pd.DataFrame = rating_data[rating_data['Original'] == original_sentence]
    test_sentence_uniqueness(original_sentence, sentence_ratings)
    for score_name in SCORE_NAMES:
        test_valid_corpus_error_entry(original_sentence, sentence_ratings, score_name)
        # (verdict, first translator, second translator) for every comparison of
        # this sentence; extracted once and reused by both passes below.
        comparisons = list(zip(
            sentence_ratings[score_name],
            sentence_ratings['Satz.1.TranslatorSource'],
            sentence_ratings['Satz.2.TranslatorSource'],
        ))
        # Build overall rating based on single ratings: one point per won comparison
        current_translator_ratings: dict[str, int] = dict.fromkeys(TRANSLATORS, 0)
        for operator, first_translator, second_translator in comparisons:
            if operator == ">":
                current_translator_ratings[first_translator] += 1
            elif operator == "<":
                current_translator_ratings[second_translator] += 1
        # Store rating and single sentence ranking
        for translator in TRANSLATORS:
            translator_entry = translator_ratings[score_name][translator]
            translator_entry['times_rated_better'] += current_translator_ratings[translator]
            translator_entry['sentence_rankings'].append(
                get_translator_ranking(current_translator_ratings, translator))

        if TEST_INTERNAL_VALIDITY:
            # Test internal validity of ratings: each single verdict must agree
            # with the ordering implied by the win counts of the whole sentence.
            for operator, first_translator, second_translator in comparisons:
                if operator not in (">", "=", "<"):
                    # Corpus errors and missing verdicts make no ordering claim.
                    continue
                first_wins = current_translator_ratings[first_translator]
                second_wins = current_translator_ratings[second_translator]
                contradicted = (
                    (operator == ">" and first_wins <= second_wins)
                    or (operator == "=" and first_wins != second_wins)
                    or (operator == "<" and first_wins >= second_wins)
                )
                if contradicted:
                    print(f'[WARNING] Internally invalid rating for {score_name} in {original_sentence}'
                          f' – {first_translator} should be {operator} {second_translator} but is not')

# Assemble overall ranking table

# Total number of pairwise wins per score column and translator.
rating_table_data: dict[str, dict[str, int]] = {
    score_name: {
        translator: translator_ratings[score_name][translator]['times_rated_better']
        for translator in TRANSLATORS
    }
    for score_name in SCORE_NAMES
}

# One column per translator and one row per score; each cell reads "rank (wins)".
ranking_cells: dict[str, list[str]] = {}
for translator in TRANSLATORS:
    column: list[str] = []
    for score_name in SCORE_NAMES:
        current_translator_ratings = rating_table_data[score_name]
        translator_ranking = get_translator_ranking(current_translator_ratings, translator)
        column.append(f"{translator_ranking['rank']} ({current_translator_ratings[translator]})")
    ranking_cells[translator] = column

ranking_table_data = pd.DataFrame(ranking_cells, index=SCORE_NAMES)
print(ranking_table_data)


                             CEF    Deepl  Google     IBM
Similarity.rating.DP      3 (49)   1 (68)  2 (64)  3 (49)
Translation.rating.DP     4 (20)   1 (58)  2 (41)  3 (28)
Similarity.rating.TS      4 (44)   2 (69)  1 (72)  3 (45)
Translation.rating.TS     4 (15)   1 (51)  2 (42)  3 (27)
Similarity.rating.BLEU    4 (66)  1 (103)  2 (92)  3 (70)
Similarity.rating.ROUGE   3 (71)   2 (85)  1 (92)  3 (71)
Similarity.rating.BLEURT  4 (72)   1 (98)  2 (97)  3 (80)
