# Analyze Human Rankings

A parallel corpus of medical texts was assembled and translated with four translators: DeepL, CEF, Google, and IBM. The translated sentences were compared with expected translations using three different similarity scores: BLEU, ROUGE, and BLEURT.

Additionally, two human raters compared the translated sentences pairwise with respect to (1) their similarity to the expected translation, judging whether one sentence was more, equally, or less similar to the expected translation than the other, and (2) their (subjective) translation quality.

The following code analyzes how the translators rank under each score: the similarity and translation-quality ratings given by the human raters (DP and TS) and the automatic similarity scores (BLEU, ROUGE, and BLEURT).

In [1]:
import pandas as pd
import sys
import typing

# Pairwise rating table, read from a semicolon-separated CSV file
rating_data: pd.DataFrame = pd.read_csv('FinalCombinedRating.csv', sep=';')

# Columns of the rating table that hold one comparison operator per row
SCORE_NAMES: list[str] = [
    'Similarity.rating.DP',
    'Translation.rating.DP',
    'Similarity.rating.TS',
    'Translation.rating.TS',
    'Similarity.rating.BLEU',
    'Similarity.rating.ROUGE',
    'Similarity.rating.BLEURT',
]
# Translator names used as keys of the per-translator tallies
TRANSLATORS: list[str] = [
    'CEF',
    'Deepl',
    'Google',
    'IBM',
]

def test_sentence_uniqueness(original_sentence: str, sentence_ratings: pd.DataFrame) -> None:
    # Each sentence should have six comparisons (combinations of four translators)
    sentence_unique = len(sentence_ratings) == 6
    if not sentence_unique:
        sys.exit(
            '[Error] Original sentence covered multiple times in the data: ' + original_sentence)

def test_valid_corpus_error_entry(original_sentence: str, sentence_ratings: pd.DataFrame, score_name: str) -> None:
    if any(sentence_ratings[score_name] == 'E') and not all(sentence_ratings[score_name] == 'E'):
        sys.exit(
            '[Error] Not all comparisons for the same original sentence were noted as corpus error: ' + original_sentence + ' (' + score_name + ')')

def get_sentence_ratings(rating_data: pd.DataFrame, original_sentence: str) -> pd.DataFrame:
    """Return the rows of rating_data whose 'Original' column equals original_sentence."""
    belongs_to_sentence = rating_data['Original'].eq(original_sentence)
    return rating_data.loc[belongs_to_sentence]

def get_translator_ranking(translator_ratings: dict[str, int], translator: str) -> dict[str, int]:
    """Return the rank of one translator among the distinct rating values.

    Translators with equal ratings share a rank; the highest rating has
    rank 1.

    Args:
        translator_ratings: Rating value per translator.
        translator: The translator whose rank is requested.

    Returns:
        A dict with 'rank' (1-based position of the translator's rating among
        the distinct ratings, highest first) and 'of' (number of distinct
        ratings).
    """
    distinct_ratings = list(set(translator_ratings.values()))
    distinct_ratings.sort(reverse=True)
    own_rating = translator_ratings[translator]
    position = 1 + distinct_ratings.index(own_rating)
    return {'rank': position, 'of': len(distinct_ratings)}

# Accumulated results per score and translator.
# translator_ratings: {
#   SCORE_NAME: {
#     TRANSLATOR: {
#       'times_rated_better': int,
#       'sentence_rankings': [
#         { 'rank': int, 'of': int }, # per original sentence
#         ...
#       ]
#     }
#   },
#   ...
# }
translator_ratings: dict[str, dict[str, dict[str, typing.Union[int, list[dict[str, int]]]]]] = {}
# Comparisons that contradict the tally of their own sentence, per score.
# internal_invalidities: {
#   SCORE_NAME: {
#     ORIGINAL_SENTENCE: [
#       {
#         'first_translator': TRANSLATOR,
#         'second_translator': TRANSLATOR,
#         'operator': OPERATOR
#       }
#     ]
#   }
# }
# Type arguments are separated by commas; `dict[str: X]` would be a slice, not a key/value pair.
internal_invalidities: dict[str, dict[str, list[dict[str, str]]]] = {}

# Start every (score, translator) pair with zero wins and no per-sentence rankings
for score_name in SCORE_NAMES:
    translator_ratings[score_name] = {
        translator: {'times_rated_better': 0, 'sentence_rankings': []}
        for translator in TRANSLATORS
    }
# Distinct values of the 'Original' column; each one is analysed once
original_sentences: set[str] = set(rating_data['Original'])

# For every sentence and score: tally pairwise wins per translator, check that
# each single comparison agrees with that tally, and store either the
# contradicting comparisons or the resulting rating and ranking.
for original_sentence in original_sentences:
    sentence_ratings: pd.DataFrame = get_sentence_ratings(rating_data, original_sentence)
    test_sentence_uniqueness(original_sentence, sentence_ratings)
    for score_name in SCORE_NAMES:
        test_valid_corpus_error_entry(original_sentence, sentence_ratings, score_name)
        # Read each comparison once as (operator, first translator, second translator)
        comparisons: list[tuple[str, str, str]] = list(zip(
            sentence_ratings[score_name],
            sentence_ratings['Satz.1.TranslatorSource'],
            sentence_ratings['Satz.2.TranslatorSource']))
        # Build overall rating based on single ratings: one point per comparison won.
        # Operators other than ">" and "<" (e.g. "=" or the marker 'E') score nothing.
        current_translator_ratings: dict[str, int] = {translator: 0 for translator in TRANSLATORS}
        for operator, first_translator, second_translator in comparisons:
            if operator == ">":
                current_translator_ratings[first_translator] += 1
            elif operator == "<":
                current_translator_ratings[second_translator] += 1
        # Test if rating is internally valid: every single comparison must
        # point the same way as the two translators' accumulated points
        internally_invalid_rankings: list[dict[str, str]] = []
        for operator, first_translator, second_translator in comparisons:
            if (operator == ">" and current_translator_ratings[first_translator] <= current_translator_ratings[second_translator]) or \
                (operator == "=" and current_translator_ratings[first_translator] != current_translator_ratings[second_translator]) or \
                (operator == "<" and current_translator_ratings[first_translator] >= current_translator_ratings[second_translator]):
                internally_invalid_rankings.append({
                    'first_translator': first_translator,
                    'second_translator': second_translator,
                    'operator': operator
                })
        if internally_invalid_rankings:
            # Store invalid rankings; the sentence is skipped for this score
            internal_invalidities.setdefault(score_name, {}).setdefault(
                original_sentence, []).extend(internally_invalid_rankings)
        else:
            # Store rating and single sentence ranking
            for translator in TRANSLATORS:
                translator_ratings[score_name][translator]['times_rated_better'] += current_translator_ratings[translator]
                translator_ratings[score_name][translator]['sentence_rankings'].append(
                    get_translator_ranking(current_translator_ratings, translator))

# Print internal invalidities: one heading per affected score, followed by the
# original sentences that were skipped for that score
if len(internal_invalidities) > 0:
    print('[WARING] Internal invalidities were found; the regarding sentences were skipped for the score ranking')
for score_name, sentence_invalidities in internal_invalidities.items():
    print('  {}'.format(score_name))
    # Only the sentence text is printed, so iterating the keys is enough
    for original_sentence in sentence_invalidities:
        print('  - {}'.format(original_sentence))

# Assemble overall ranking table

# Accumulated number of comparisons won, per score and translator
rating_table_data: dict[str, dict[str, int]] = {
    score_name: {
        translator: translator_ratings[score_name][translator]['times_rated_better']
        for translator in TRANSLATORS
    }
    for score_name in SCORE_NAMES
}

# One column per translator, one "rank (times rated better)" cell per score
ranking_table_data: dict[str, list[str]] = {}
for translator in TRANSLATORS:
    cells = []
    for score_name in SCORE_NAMES:
        wins_by_translator = rating_table_data[score_name]
        position = get_translator_ranking(wins_by_translator, translator)
        cells.append('{} ({})'.format(position['rank'], wins_by_translator[translator]))
    ranking_table_data[translator] = cells

# Turn the columns into a table whose rows are labelled with the score names
ranking_table_data = pd.DataFrame(ranking_table_data)
ranking_table_data.index = SCORE_NAMES
for output_line in ('', 'Rank (times rated better) per translator per score', '', ranking_table_data):
    print(output_line)


[WARING] Internal invalidities were found; the regarding sentences were skipped for the score ranking
  Similarity.rating.TS
  - Die Tonsillektomie zhlt zu den in Deutschland am hufigsten durchgefhrten Operationen, deren gefhrlichste Komplikation vor allem fr Kinder die Nachblutung darstellt.
  - Entsorgen Sie Kanle und Spritze entsprechend der Anweisung Ihres Arztes, der medizinischen Fachkraft oder Ihres Apothekers.
  - Der klinische Krankheitsverlauf beginnt mit grippehnlichen Symptomen, die sich schnell zu einer schweren Erkrankung mit Blutungen weiterentwickeln.
  Similarity.rating.DP
  - In Studien wurden seitdem auch Fledermuse, Frettchen und Hauskatzen mit dem SARS-CoV infiziert, und es wurde festgestellt, dass sie das Virus bertragen knnen.
  - Entsorgen Sie Kanle und Spritze entsprechend der Anweisung Ihres Arztes, der medizinischen Fachkraft oder Ihres Apothekers.
  Translation.rating.TS
  - Es kann sein, dass die Anwendung von Fluoxetin beendet werden muss.
  - Ausschlag (1