# Translated MedMentions Datasets

We use [EasyProject](https://github.com/edchengg/easyproject) to translate the English MedMentions dataset to other languages and align the entities at the same time, so we can use the translated datasets for training rerankers in the target language (see [../benchmarks/translate_dataset.py](../benchmarks/translate_dataset.py). Here, we report the results only.

In [1]:
import sys
sys.path.append('../benchmarks')

In [2]:
from xmen.data import get_cuis
import pandas as pd
import dataloaders
import datasets

dataset_en = dataloaders.load_medmentions_st21pv()[0]

Found cached dataset medmentions (/home/Florian.Borchert/.cache/huggingface/datasets/bigbio___medmentions/medmentions_st21pv_bigbio_kb/1.0.0/4ed5b6a69d807969022e559198c5a7386b9a978268a558758a090db6b451d6c4)


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/2635 [00:00<?, ? examples/s]

Map:   0%|          | 0/878 [00:00<?, ? examples/s]

Map:   0%|          | 0/879 [00:00<?, ? examples/s]

## Statistics

In [3]:
def count_ents(ds_dict):
    return sum([len(get_cuis(d)) for d in ds_dict.values()])
    
n_ents_en = count_ents(dataset_en)
stats = [{'dataset' : 'MedMentions (en)', 'n_entities' : n_ents_en, 'loss (%)': '-'}]

translated_datasets = { lang : datasets.load_from_disk(f'../data/medmentions_st21pv_en_{lang}/') for lang in ['es', 'fr', 'nl', 'de'] }

for lang, translated_dataset in translated_datasets.items():    
    n_ents = count_ents(translated_dataset)
    loss = (n_ents_en - n_ents) / n_ents_en * 100
    stats.append({'dataset' : f'MedMentions ({lang})', 'n_entities' : n_ents, 'loss (%)' : f'{loss:.2f}'})

stats = pd.DataFrame(stats).set_index('dataset')
stats

Unnamed: 0_level_0,n_entities,loss (%)
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
MedMentions (en),203282,-
MedMentions (es),185029,8.98
MedMentions (fr),181958,10.49
MedMentions (nl),200231,1.50
MedMentions (de),199006,2.10


## Examples

In [4]:
def to_ent_df(ds):
    errors = 0
    df = []
    for d in ds:
        text = ""
        for p in d['passages']:
            for txt in p['text']:
                text += (txt + " ")
        for e in d['entities']:
            e = e.copy()
            e['normalized'] = [n['db_id'] for n in e['normalized']]
            df.append({'document_id': d['document_id']} | e)
            for o, txt in zip(e['offsets'], e['text']):
                if not text[o[0]:o[1]] == txt:
                    errors += 1
    return pd.DataFrame(df).set_index(['document_id', 'id']), errors

def get_entity_dataframe(dataset, translated_dataset, indices, split='validation'):
    df_orig, ec_orig = to_ent_df(dataset[split])
    assert ec_orig == 0
    df_trans, ec_trans = to_ent_df(translated_dataset[split])
    if ec_trans > 0:
        print('Misaligned entities:', ec_trans)
    return pd.merge(df_orig, df_trans, left_index=True, right_index=True, suffixes=['_orig', '_trans'], how='outer').iloc[indices]

In [5]:
import numpy as np
indices = np.random.randint(0, 1000, 10) # Pick some examples
indices

array([806, 793, 861, 381, 244, 639, 139, 932, 386, 255])

### Spanish

In [6]:
get_entity_dataframe(dataset_en, translated_datasets['es'], indices)

Unnamed: 0_level_0,Unnamed: 1_level_0,type_orig,text_orig,offsets_orig,normalized_orig,type_trans,text_trans,offsets_trans,normalized_trans
document_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27251117,864,T017,[organelle],"[[1124, 1133]]",[C0029219],T017,[organelos],"[[1325, 1334]]",[C0029219]
27251117,851,T038,[pole-ward drift],"[[727, 742]]",[C0007613],T038,[ hacia el polo],"[[882, 896]]",[C0007613]
27252074,925,T058,[flow cytometry],"[[760, 774]]",[C0016263],T058,[ citometría de flujo],"[[857, 877]]",[C0016263]
27236780,409,T058,[cauterized],"[[1054, 1064]]",[C0007471],T058,[cauterizados],"[[1141, 1153]]",[C0007471]
27235870,266,T103,[Neurotrophins],"[[0, 13]]",[C0027754],T103,[Neurotrofinas],"[[0, 13]]",[C0027754]
27242761,685,T103,[chromate],"[[1119, 1127]]",[C0008543],T103,[cromato],"[[1217, 1224]]",[C0008543]
27131339,149,T017,[HERG1],"[[913, 918]]",[C1416571],T017,[de HERG1],"[[1010, 1018]]",[C1416571]
27252721,1017,T082,[Italy],"[[995, 1000]]",[C0022277],T082,[de Italia],"[[1134, 1143]]",[C0022277]
27236780,414,T058,[cautery],"[[1142, 1149]]",[C0007471],,,,
27235870,277,T103,[TrkB],"[[177, 181]]",[C0084873],T103,[TrkB],"[[176, 180]]",[C0084873]


### French

In [7]:
get_entity_dataframe(dataset_en, translated_datasets['fr'], indices)

Unnamed: 0_level_0,Unnamed: 1_level_0,type_orig,text_orig,offsets_orig,normalized_orig,type_trans,text_trans,offsets_trans,normalized_trans
document_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27251117,864,T017,[organelle],"[[1124, 1133]]",[C0029219],T017,[organelles],"[[1390, 1400]]",[C0029219]
27251117,851,T038,[pole-ward drift],"[[727, 742]]",[C0007613],T038,[ vers les pôles],"[[901, 916]]",[C0007613]
27252074,925,T058,[flow cytometry],"[[760, 774]]",[C0016263],T058,[citométrie en flux],"[[898, 916]]",[C0016263]
27236780,409,T058,[cauterized],"[[1054, 1064]]",[C0007471],T058,[cautérisés],"[[1189, 1199]]",[C0007471]
27235870,266,T103,[Neurotrophins],"[[0, 13]]",[C0027754],T103,[Neurotrophines],"[[0, 14]]",[C0027754]
27242761,685,T103,[chromate],"[[1119, 1127]]",[C0008543],T103,[chromate],"[[1228, 1236]]",[C0008543]
27131339,149,T017,[HERG1],"[[913, 918]]",[C1416571],T017,[des niveaux d'HERG1],"[[1058, 1077]]",[C1416571]
27252721,1017,T082,[Italy],"[[995, 1000]]",[C0022277],T082,[de l'Italie],"[[1205, 1216]]",[C0022277]
27236780,414,T058,[cautery],"[[1142, 1149]]",[C0007471],T058,[cauterie],"[[1303, 1311]]",[C0007471]
27235870,277,T103,[TrkB],"[[177, 181]]",[C0084873],T103,[TrkB],"[[189, 193]]",[C0084873]


### German

In [8]:
get_entity_dataframe(dataset_en, translated_datasets['de'], indices)

Unnamed: 0_level_0,Unnamed: 1_level_0,type_orig,text_orig,offsets_orig,normalized_orig,type_trans,text_trans,offsets_trans,normalized_trans
document_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27251117,864,T017,[organelle],"[[1124, 1133]]",[C0029219],T017,[organellen],"[[1313, 1323]]",[C0029219]
27251117,851,T038,[pole-ward drift],"[[727, 742]]",[C0007613],T038,[pole-ward drift],"[[852, 867]]",[C0007613]
27252074,925,T058,[flow cytometry],"[[760, 774]]",[C0016263],T058,[Flusszytometrie],"[[846, 861]]",[C0016263]
27236780,409,T058,[cauterized],"[[1054, 1064]]",[C0007471],T058,[kauteriert],"[[1138, 1148]]",[C0007471]
27235870,266,T103,[Neurotrophins],"[[0, 13]]",[C0027754],T103,[Neurotrophine],"[[0, 13]]",[C0027754]
27242761,685,T103,[chromate],"[[1119, 1127]]",[C0008543],T103,[Chromat],"[[1197, 1204]]",[C0008543]
27131339,149,T017,[HERG1],"[[913, 918]]",[C1416571],T017,[HERG1],"[[947, 952]]",[C1416571]
27252721,1017,T082,[Italy],"[[995, 1000]]",[C0022277],T082,[Italien],"[[1099, 1106]]",[C0022277]
27236780,414,T058,[cautery],"[[1142, 1149]]",[C0007471],T058,[Cautery],"[[1231, 1238]]",[C0007471]
27235870,277,T103,[TrkB],"[[177, 181]]",[C0084873],T103,[TrkB],"[[181, 185]]",[C0084873]


### Dutch

In [9]:
get_entity_dataframe(dataset_en, translated_datasets['nl'], indices)

Unnamed: 0_level_0,Unnamed: 1_level_0,type_orig,text_orig,offsets_orig,normalized_orig,type_trans,text_trans,offsets_trans,normalized_trans
document_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27251117,864,T017,[organelle],"[[1124, 1133]]",[C0029219],T017,[organellen],"[[1253, 1263]]",[C0029219]
27251117,851,T038,[pole-ward drift],"[[727, 742]]",[C0007613],T038,[pole-ward drift],"[[815, 830]]",[C0007613]
27252074,925,T058,[flow cytometry],"[[760, 774]]",[C0016263],T058,[stroomcytometrie],"[[794, 810]]",[C0016263]
27236780,409,T058,[cauterized],"[[1054, 1064]]",[C0007471],T058,[gecauterizeerd],"[[1106, 1120]]",[C0007471]
27235870,266,T103,[Neurotrophins],"[[0, 13]]",[C0027754],T103,[Neurotrofinen],"[[0, 13]]",[C0027754]
27242761,685,T103,[chromate],"[[1119, 1127]]",[C0008543],T103,[chroom],"[[1017, 1023]]",[C0008543]
27131339,149,T017,[HERG1],"[[913, 918]]",[C1416571],T017,[HERG1],"[[918, 923]]",[C1416571]
27252721,1017,T082,[Italy],"[[995, 1000]]",[C0022277],T082,[Italië],"[[1069, 1075]]",[C0022277]
27236780,414,T058,[cautery],"[[1142, 1149]]",[C0007471],T058,[cautery],"[[1206, 1213]]",[C0007471]
27235870,277,T103,[TrkB],"[[177, 181]]",[C0084873],T103,[TrkB],"[[174, 178]]",[C0084873]
