# Translated MedMentions datasets

In [1]:
import sys
sys.path.append('../benchmarks')

In [2]:
from xmen.data import get_cuis
import pandas as pd
import dataloaders
import datasets

dataset_en = dataloaders.load_medmentions_st21pv()[0]

Found cached dataset medmentions (/home/Florian.Borchert/.cache/huggingface/datasets/bigbio___medmentions/medmentions_st21pv_bigbio_kb/1.0.0/4ed5b6a69d807969022e559198c5a7386b9a978268a558758a090db6b451d6c4)


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/2635 [00:00<?, ? examples/s]

Map:   0%|          | 0/878 [00:00<?, ? examples/s]

Map:   0%|          | 0/879 [00:00<?, ? examples/s]

In [3]:
def count_ents(ds_dict):
    return sum([len(get_cuis(d)) for d in ds_dict.values()])
    
n_ents_en = count_ents(dataset_en)
stats = [{'dataset' : 'MedMentions (en)', 'n_entities' : n_ents_en, 'loss (%)': '-'}]

translated_datasets = { lang : datasets.load_from_disk(f'../data/medmentions_st21pv_en_{lang}/') for lang in ['es', 'fr', 'nl', 'de'] }

for lang, translated_dataset in translated_datasets.items():    
    n_ents = count_ents(translated_dataset)
    loss = (n_ents_en - n_ents) / n_ents_en * 100
    stats.append({'dataset' : f'MedMentions ({lang})', 'n_entities' : n_ents, 'loss (%)' : f'{loss:.2f}'})

stats = pd.DataFrame(stats).set_index('dataset')
stats

Unnamed: 0_level_0,n_entities,loss (%)
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
MedMentions (en),203282,-
MedMentions (es),185029,8.98
MedMentions (fr),181958,10.49
MedMentions (nl),200231,1.50
MedMentions (de),199006,2.10


# Examples

In [4]:
def to_ent_df(ds):
    errors = 0
    df = []
    for d in ds:
        text = ""
        for p in d['passages']:
            for txt in p['text']:
                text += (txt + " ")
        for e in d['entities']:
            e = e.copy()
            e['normalized'] = [n['db_id'] for n in e['normalized']]
            df.append({'document_id': d['document_id']} | e)
            for o, txt in zip(e['offsets'], e['text']):
                if not text[o[0]:o[1]] == txt:
                    errors += 1
    return pd.DataFrame(df).set_index(['document_id', 'id']), errors

def get_entity_dataframe(dataset, translated_dataset, indices, split='validation'):
    df_orig, ec_orig = to_ent_df(dataset[split])
    assert ec_orig == 0
    df_trans, ec_trans = to_ent_df(translated_dataset[split])
    if ec_trans > 0:
        print('Misaligned entities:', ec_trans)
    return pd.merge(df_orig, df_trans, left_index=True, right_index=True, suffixes=['_orig', '_trans'], how='outer').iloc[indices]

In [5]:
import numpy as np
indices = np.random.randint(0, 1000, 10) # Pick some examples
indices

array([281, 751, 471, 379, 861, 642, 891, 312, 186, 752])

### Spanish

In [6]:
get_entity_dataframe(dataset_en, translated_datasets['es'], indices)

Unnamed: 0_level_0,Unnamed: 1_level_0,type_orig,text_orig,offsets_orig,normalized_orig,type_trans,text_trans,offsets_trans,normalized_trans
document_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27235870,303,T017,[tracts],"[[665, 671]]",[C0029954],T017,[tractos],"[[698, 705]]",[C0029954]
27250823,806,T058,[EBUS],"[[1121, 1125]]",[C2959489],T058,[EBUS],"[[1258, 1262]]",[C2959489]
27239027,505,T204,[human],"[[358, 363]]",[C0086418],T204,[humano],"[[423, 429]]",[C0086418]
27236780,407,T082,[focally],"[[980, 987]]",[C0205234],T082,[focal],"[[1059, 1064]]",[C0205234]
27252074,925,T058,[flow cytometry],"[[760, 774]]",[C0016263],T058,[ citometría de flujo],"[[857, 877]]",[C0016263]
27242761,688,T038,[oxidative stress],"[[1181, 1197]]",[C0242606],T038,[elicitores de estrés oxidativo],"[[1276, 1306]]",[C0242606]
27252074,955,T017,[CTL],"[[1308, 1311]]",[C0039195],T017,[CTL],"[[1474, 1477]]",[C0039195]
27235870,334,T017,[ductal epithelium],"[[1308, 1325]]",[C1512086],T017,[epitelio ductal],"[[1406, 1421]]",[C1512086]
27233918,202,T038,[acclimatization],"[[133, 148]]",[C0000934],T038,[aclimatación],"[[145, 157]]",[C0000934]
27250823,807,T074,[probe],"[[1126, 1131]]",[C0182400],T074,[probe],"[[1263, 1268]]",[C0182400]


### French

In [7]:
get_entity_dataframe(dataset_en, translated_datasets['fr'], indices)

Unnamed: 0_level_0,Unnamed: 1_level_0,type_orig,text_orig,offsets_orig,normalized_orig,type_trans,text_trans,offsets_trans,normalized_trans
document_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27235870,303,T017,[tracts],"[[665, 671]]",[C0029954],T017,[tracts],"[[729, 735]]",[C0029954]
27250823,806,T058,[EBUS],"[[1121, 1125]]",[C2959489],T058,[EBUS],"[[1266, 1270]]",[C2959489]
27239027,505,T204,[human],"[[358, 363]]",[C0086418],T204,[humain],"[[432, 438]]",[C0086418]
27236780,407,T082,[focally],"[[980, 987]]",[C0205234],T082,[focalement],"[[1105, 1115]]",[C0205234]
27252074,925,T058,[flow cytometry],"[[760, 774]]",[C0016263],T058,[citométrie en flux],"[[898, 916]]",[C0016263]
27242761,688,T038,[oxidative stress],"[[1181, 1197]]",[C0242606],T038,[stress oxydatif],"[[1309, 1324]]",[C0242606]
27252074,955,T017,[CTL],"[[1308, 1311]]",[C0039195],T017,[CTL],"[[1594, 1597]]",[C0039195]
27235870,334,T017,[ductal epithelium],"[[1308, 1325]]",[C1512086],,,,
27233918,202,T038,[acclimatization],"[[133, 148]]",[C0000934],T038,[l'acclimatation],"[[168, 183]]",[C0000934]
27250823,807,T074,[probe],"[[1126, 1131]]",[C0182400],T074,[probe],"[[1271, 1276]]",[C0182400]


### German

In [8]:
get_entity_dataframe(dataset_en, translated_datasets['de'], indices)

Unnamed: 0_level_0,Unnamed: 1_level_0,type_orig,text_orig,offsets_orig,normalized_orig,type_trans,text_trans,offsets_trans,normalized_trans
document_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27235870,303,T017,[tracts],"[[665, 671]]",[C0029954],T017,[tracts],"[[706, 712]]",[C0029954]
27250823,806,T058,[EBUS],"[[1121, 1125]]",[C2959489],T058,[EBUS],"[[1278, 1282]]",[C2959489]
27239027,505,T204,[human],"[[358, 363]]",[C0086418],T204,[menschliches],"[[388, 400]]",[C0086418]
27236780,407,T082,[focally],"[[980, 987]]",[C0205234],T082,[fokal],"[[1042, 1047]]",[C0205234]
27252074,925,T058,[flow cytometry],"[[760, 774]]",[C0016263],T058,[Flusszytometrie],"[[846, 861]]",[C0016263]
27242761,688,T038,[oxidative stress],"[[1181, 1197]]",[C0242606],T038,[Oxidationsstress],"[[1293, 1309]]",[C0242606]
27252074,955,T017,[CTL],"[[1308, 1311]]",[C0039195],T017,[CTL],"[[1450, 1453]]",[C0039195]
27235870,334,T017,[ductal epithelium],"[[1308, 1325]]",[C1512086],T017,[duktale Epithel],"[[1424, 1439]]",[C1512086]
27233918,202,T038,[acclimatization],"[[133, 148]]",[C0000934],T038,[Aklimatisierung],"[[157, 172]]",[C0000934]
27250823,807,T074,[probe],"[[1126, 1131]]",[C0182400],T074,[probe],"[[1283, 1288]]",[C0182400]


### Dutch

In [9]:
get_entity_dataframe(dataset_en, translated_datasets['nl'], indices)

Unnamed: 0_level_0,Unnamed: 1_level_0,type_orig,text_orig,offsets_orig,normalized_orig,type_trans,text_trans,offsets_trans,normalized_trans
document_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27235870,303,T017,[tracts],"[[665, 671]]",[C0029954],T017,[tracts],"[[698, 704]]",[C0029954]
27250823,806,T058,[EBUS],"[[1121, 1125]]",[C2959489],T058,[EBUS],"[[1208, 1212]]",[C2959489]
27239027,505,T204,[human],"[[358, 363]]",[C0086418],T204,[menselijk],"[[392, 401]]",[C0086418]
27236780,407,T082,[focally],"[[980, 987]]",[C0205234],T082,[focaal],"[[1029, 1035]]",[C0205234]
27252074,925,T058,[flow cytometry],"[[760, 774]]",[C0016263],T058,[stroomcytometrie],"[[794, 810]]",[C0016263]
27242761,688,T038,[oxidative stress],"[[1181, 1197]]",[C0242606],T038,[oxidatieve stress],"[[1090, 1107]]",[C0242606]
27252074,955,T017,[CTL],"[[1308, 1311]]",[C0039195],T017,[CTL],"[[1378, 1381]]",[C0039195]
27235870,334,T017,[ductal epithelium],"[[1308, 1325]]",[C1512086],T017,[duktale epitheel],"[[1362, 1378]]",[C1512086]
27233918,202,T038,[acclimatization],"[[133, 148]]",[C0000934],T038,[acclimatisering],"[[156, 171]]",[C0000934]
27250823,807,T074,[probe],"[[1126, 1131]]",[C0182400],T074,[probe],"[[1213, 1218]]",[C0182400]
