# Evaluación de los modelos entrenados

In [None]:
!pip install transformers -q
!pip install evaluate -q

[K     |████████████████████████████████| 5.8 MB 4.8 MB/s 
[K     |████████████████████████████████| 182 kB 85.7 MB/s 
[K     |████████████████████████████████| 7.6 MB 65.5 MB/s 
[K     |████████████████████████████████| 72 kB 675 kB/s 
[K     |████████████████████████████████| 132 kB 9.3 MB/s 
[K     |████████████████████████████████| 451 kB 66.1 MB/s 
[K     |████████████████████████████████| 212 kB 88.1 MB/s 
[K     |████████████████████████████████| 127 kB 89.4 MB/s 
[?25h

In [None]:
import torch
from torch.utils.data import DataLoader

import pandas as pd
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import pipeline

from pathlib import Path
from urllib.request import urlopen
import json

In [None]:
# Taken from https://huggingface.co/spaces/evaluate-metric/squad
from evaluate import load
# Loaded once at module level; evaluate_XQuAD calls squad_metric.compute(...)
# to obtain the per-language scores.
squad_metric = load("squad")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

## Carga de datos

In [None]:
# Language name -> file name of that language's XQuAD JSON dump.
# Insertion order matters: the evaluation code iterates over this mapping.
dataset = {
    'Arabic': 'xquad.ar.json',
    'German': 'xquad.de.json',
    'Greek': 'xquad.el.json',
    'English': 'xquad.en.json',
    'Spanish': 'xquad.es.json',
    'Hindi': 'xquad.hi.json',
    'Russian': 'xquad.ru.json',
    'Thai': 'xquad.th.json',
    'Turkish': 'xquad.tr.json',
    'Vietnamese': 'xquad.vi.json',
    'Chinese': 'xquad.zh.json',
    'Romanian': 'xquad.ro.json',
}

# URL template; the placeholder receives one of the file names above.
base_url = 'https://raw.githubusercontent.com/deepmind/xquad/master/{}'

def read_crossxsquad(language, is_url=True):
  """Build Cross-XQuAD triples: context and answer in one language, question in Spanish.

  The target-language file and the Spanish file are walked in parallel
  (article by article, paragraph by paragraph, question by question), so the
  Spanish question is paired with the context/answer found at the same position.

  Args:
    language: When ``is_url`` is true, a key of the module-level ``dataset``
      mapping (e.g. ``'German'``). When ``is_url`` is false, the path of a
      local SQuAD-format JSON file.
    is_url: Download the file from ``base_url`` (default) or read it from disk.

  Returns:
    A list of ``(context, question_es, answer)`` tuples, where ``answer`` is
    the answer dict taken from the target-language file.
  """
  if is_url:
    # Context manager so the HTTP response is closed once it has been read.
    with urlopen(base_url.format(dataset[language])) as response:
      squad_dict = json.loads(response.read())
  else:
    # This branch used to read an unassigned `path` and hand a Path object to
    # json.load (which needs a file object), so it could never succeed.
    with Path(language).open(encoding='utf-8') as local_file:
      squad_dict = json.load(local_file)

  # The questions always come from the Spanish file.
  with urlopen(base_url.format(dataset['Spanish'])) as response:
    es_squad_dict = json.loads(response.read())

  data = []
  for group, group_es in zip(squad_dict['data'], es_squad_dict['data']):
    for passage, passage_es in zip(group['paragraphs'], group_es['paragraphs']):
      context = passage['context']
      for qa, qa_es in zip(passage['qas'], passage_es['qas']):
        question_es = qa_es['question']
        # Still zipped with the Spanish answers so the number of triples stays
        # bounded by the shorter of the two answer lists.
        for answer, _ in zip(qa['answers'], qa_es['answers']):
          data.append((context, question_es, answer))

  return data

def split_squad(data, train_size=10, only_testing=True):
  """Split (context, question, answer) triples into train and test partitions.

  Args:
    data: Sequence of 3-item rows ``(context, question, answer)``.
    train_size: Forwarded unchanged to ``train_test_split``.
    only_testing: Accepted for compatibility; the function body never reads it.

  Returns:
    ``(train, test)`` where each element is a tuple of three lists:
    ``(contexts, questions, answers)``.
  """
  # Fixed random_state keeps the partition identical between runs.
  train_rows, test_rows = train_test_split(data, train_size=train_size, random_state=13)

  def as_columns(rows):
    # Turn a list of triples into three parallel lists.
    return (
        [row[0] for row in rows],
        [row[1] for row in rows],
        [row[2] for row in rows],
    )

  return as_columns(train_rows), as_columns(test_rows)

# For every language, download the Cross-XQuAD triples and keep only the test
# partition, stored as three parallel lists.
data_cross_XQuAD = {}
for language in dataset:
  language_data = read_crossxsquad(language)
  _, test_data = split_squad(language_data, train_size=10)
  data_cross_XQuAD[language] = dict(
      zip(('test_contexts', 'test_questions', 'test_answers'), test_data))

## Carga de los modelos

In [None]:
# Display name -> Hugging Face Hub model id.
# The first entry is the baseline; the rest are the Cross-XQuAD fine-tuned variants.
models = {
    'BERT-XQuAD': 'mrm8488/bert-multi-cased-finetuned-xquadv1',
    'BERT-CrossXQuAD-1Sample': 'LeoAngel/bert-finetuned-crossxquadv1_1sbl',
    'BERT-CrossXQuAD-2Samples': 'LeoAngel/bert-finetuned-crossxquadv1_2sbl',
    'BERT-CrossXQuAD-5Samples': 'LeoAngel/bert-finetuned-crossxquadv1_5sbl',
    'BERT-CrossXQuAD-10Samples': 'LeoAngel/bert-finetuned-crossxquadv1_10sbl',
    'BERT-CrossXQuAD-20Samples': 'LeoAngel/bert-finetuned-crossxquadv1_20sbl',
    'BERT-CrossXQuAD-25Samples': 'LeoAngel/bert-finetuned-crossxquadv1_25sbl',
}

# Evaluación sobre Cross-XQuAD (spanish)

A continuación se evaluará cada uno de los modelos entrenados en el notebook CrossLanguageExperiment.ipynb sobre el dataset Cross-XQuAD (spanish), que consiste en triplas de *(Contexto, Pregunta, Respuesta)* donde *Pregunta* siempre está en español y *Contexto* y *Respuesta* están en el mismo idioma (no necesariamente español).

Al ejecutar este notebook, se debe ajustar el número de registros que se usarán por idioma para evaluar el modelo. Si no es muy demorado, se recomienda usar el máximo.

In [None]:
# Controls how many test examples per language evaluate_XQuAD scores (it reads
# this module-level name); lower it for quicker runs.
max_samples_by_language = 50 #@param {type:"slider", min:0, max:5000, step:1}

In [None]:
def evaluate_XQuAD(pipeline, data_to_evaluate, max_samples=None):
  """Score a question-answering pipeline on each language of a test set.

  Args:
    pipeline: Callable QA pipeline. It is called with a dict holding
      ``'context'`` and ``'question'`` and must return a mapping with an
      ``'answer'`` entry. (The name shadows the imported
      ``transformers.pipeline`` inside this function; it is kept so existing
      callers are unaffected.)
    data_to_evaluate: Mapping ``language -> split`` where ``split`` has the
      parallel lists ``'test_contexts'``, ``'test_questions'`` and
      ``'test_answers'``; every answer is a dict with ``'text'`` and
      ``'answer_start'``.
    max_samples: Maximum number of examples scored per language. ``None``
      (default) falls back to the module-level ``max_samples_by_language``.

  Returns:
    A DataFrame indexed by language with the metric's ``exact_match`` and
    ``f1`` columns, sorted by ``f1`` in descending order. A language for which
    no example was scored gets NaN in both columns.
  """
  limit = max_samples_by_language if max_samples is None else max_samples

  results = []
  for language, split in data_to_evaluate.items():
    predictions = []
    references = []
    examples = zip(split['test_contexts'], split['test_questions'], split['test_answers'])
    for idx, (context, question, answer) in enumerate(examples):
      # Check the cap *before* running the model; checking afterwards scored
      # one example more than requested.
      if idx >= limit:
        break
      prediction = pipeline({'context': context,
                             'question': question})
      predictions.append({'prediction_text': prediction['answer'], 'id': str(idx)})
      references.append({'answers': {'answer_start': [answer['answer_start']],
                                     'text': [answer['text']]},
                         'id': str(idx)})

    if predictions:
      # The metric is computed exactly once per language.
      scores = dict(squad_metric.compute(predictions=predictions, references=references))
    else:
      # Nothing to score (limit of 0 or an empty split): report NaN rather
      # than feeding the metric an empty list.
      scores = {'exact_match': float('nan'), 'f1': float('nan')}
    scores['language'] = language
    results.append(scores)

  return pd.DataFrame(results).set_index('language').sort_values(by='f1', ascending=False)

### BERT junto con fine-tuning sobre XQuAD
Este corresponde al modelo base: BERT entrenado con XQuAD. Este modelo no fue entrenado para el dataset Cross-XQuAD (spanish).

In [None]:
# Baseline checkpoint, scored on the Cross-XQuAD test split.
model_name = 'BERT-XQuAD'
url_model = models[model_name]
qa_pipeline = pipeline(
    "question-answering",
    model=url_model,
    tokenizer=url_model,
)

print(model_name)
# Banner: rule, title, rule (no trailing newline after the second rule).
print('_'*70, 'Results over Cross-XQuAD', '_'*70, sep='\n', end='')
evaluate_XQuAD(qa_pipeline, data_cross_XQuAD)

Downloading:   0%|          | 0.00/657 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

BERT-XQuAD
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
Spanish,96.078431,98.996265
English,94.117647,95.798319
German,90.196078,90.686275
Arabic,76.470588,82.945268
Russian,74.509804,81.305633
Romanian,64.705882,77.323605
Vietnamese,70.588235,75.470588
Chinese,68.627451,71.24183
Greek,56.862745,68.499066
Hindi,58.823529,67.846314


Los modelos subsecuentes fueron entrenados usando como base el anterior y usando n datos de entrenamiento por idioma (n<=25).

### BERT-CrossXQuAD usando solo un ejemplo de entrenamiento por idioma

In [None]:
# 1-sample fine-tuned checkpoint, scored on the Cross-XQuAD test split.
model_name = 'BERT-CrossXQuAD-1Sample'
url_model = models[model_name]
qa_pipeline = pipeline(
    "question-answering",
    model=url_model,
    tokenizer=url_model,
)

print(model_name)
# Banner: rule, title, rule (no trailing newline after the second rule).
print('_'*70, 'Results over Cross-XQuAD', '_'*70, sep='\n', end='')
evaluate_XQuAD(qa_pipeline, data_cross_XQuAD)

Downloading:   0%|          | 0.00/878 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/709M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/603 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

BERT-CrossXQuAD-1Sample
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
Spanish,84.313725,87.058824
English,80.392157,81.372549
Russian,62.745098,66.375944
German,52.941176,59.155354
Vietnamese,47.058824,53.924053
Hindi,43.137255,51.842275
Romanian,41.176471,49.774096
Arabic,43.137255,48.857047
Greek,41.176471,48.193277
Turkish,35.294118,45.644532


### BERT-CrossXQuAD usando 2 ejemplos de entrenamiento por idioma

In [None]:
# 2-sample fine-tuned checkpoint, scored on the Cross-XQuAD test split.
model_name = 'BERT-CrossXQuAD-2Samples'
url_model = models[model_name]
qa_pipeline = pipeline(
    "question-answering",
    model=url_model,
    tokenizer=url_model,
)

print(model_name)
# Banner: rule, title, rule (no trailing newline after the second rule).
print('_'*70, 'Results over Cross-XQuAD', '_'*70, sep='\n', end='')
evaluate_XQuAD(qa_pipeline, data_cross_XQuAD)

Downloading:   0%|          | 0.00/878 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/709M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/603 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

BERT-CrossXQuAD-2Samples
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
Spanish,66.666667,75.355933
Vietnamese,41.176471,46.469734
German,39.215686,44.110791
English,35.294118,42.119514
Hindi,29.411765,37.071034
Romanian,25.490196,35.620799
Greek,27.45098,35.420168
Russian,27.45098,34.625881
Turkish,25.490196,34.097691
Arabic,27.45098,31.770212


### BERT-CrossXQuAD usando 5 ejemplos de entrenamiento por idioma

In [None]:
# 5-sample fine-tuned checkpoint, scored on the Cross-XQuAD test split.
model_name = 'BERT-CrossXQuAD-5Samples'
url_model = models[model_name]
qa_pipeline = pipeline(
    "question-answering",
    model=url_model,
    tokenizer=url_model,
)

print(model_name)
# Banner: rule, title, rule (no trailing newline after the second rule).
print('_'*70, 'Results over Cross-XQuAD', '_'*70, sep='\n', end='')
evaluate_XQuAD(qa_pipeline, data_cross_XQuAD)

Downloading:   0%|          | 0.00/878 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/709M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/603 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

BERT-CrossXQuAD-5Samples
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
Spanish,78.431373,80.694197
English,54.901961,60.949834
Vietnamese,43.137255,53.455827
German,39.215686,44.141915
Romanian,33.333333,43.153749
Russian,35.294118,39.119345
Arabic,33.333333,36.514161
Chinese,29.411765,34.640523
Hindi,25.490196,33.627059
Thai,25.490196,29.45845


### BERT-CrossXQuAD usando 10 ejemplos de entrenamiento por idioma

In [None]:
# 10-sample fine-tuned checkpoint, scored on the Cross-XQuAD test split.
model_name = 'BERT-CrossXQuAD-10Samples'
url_model = models[model_name]
qa_pipeline = pipeline(
    "question-answering",
    model=url_model,
    tokenizer=url_model,
)

print(model_name)
# Banner: rule, title, rule (no trailing newline after the second rule).
print('_'*70, 'Results over Cross-XQuAD', '_'*70, sep='\n', end='')
evaluate_XQuAD(qa_pipeline, data_cross_XQuAD)

Downloading:   0%|          | 0.00/878 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/709M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/603 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

BERT-CrossXQuAD-10Samples
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
Spanish,58.823529,65.423647
English,43.137255,49.936382
German,37.254902,40.506117
Hindi,35.294118,39.300046
Vietnamese,33.333333,36.992317
Chinese,31.372549,35.294118
Romanian,23.529412,28.9472
Turkish,21.568627,27.561744
Greek,21.568627,27.550717
Russian,23.529412,24.888097


### BERT-CrossXQuAD usando 20 ejemplos de entrenamiento por idioma

In [None]:
# 20-sample fine-tuned checkpoint, scored on the Cross-XQuAD test split.
model_name = 'BERT-CrossXQuAD-20Samples'
url_model = models[model_name]
qa_pipeline = pipeline(
    "question-answering",
    model=url_model,
    tokenizer=url_model,
)

print(model_name)
# Banner: rule, title, rule (no trailing newline after the second rule).
print('_'*70, 'Results over Cross-XQuAD', '_'*70, sep='\n', end='')
evaluate_XQuAD(qa_pipeline, data_cross_XQuAD)

Downloading:   0%|          | 0.00/878 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/709M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/603 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

BERT-CrossXQuAD-20Samples
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
Spanish,74.509804,79.315565
English,70.588235,73.230134
German,54.901961,60.82669
Vietnamese,52.941176,58.655462
Russian,47.058824,49.180036
Chinese,45.098039,49.019608
Hindi,41.176471,46.487826
Romanian,35.294118,46.415327
Arabic,41.176471,44.242141
Greek,39.215686,44.171874


### BERT-CrossXQuAD usando 25 ejemplos de entrenamiento por idioma

In [None]:
# 25-sample fine-tuned checkpoint, scored on the Cross-XQuAD test split.
model_name = 'BERT-CrossXQuAD-25Samples'
url_model = models[model_name]
qa_pipeline = pipeline(
    "question-answering",
    model=url_model,
    tokenizer=url_model,
)

print(model_name)
# Banner: rule, title, rule (no trailing newline after the second rule).
print('_'*70, 'Results over Cross-XQuAD', '_'*70, sep='\n', end='')
evaluate_XQuAD(qa_pipeline, data_cross_XQuAD)

Downloading:   0%|          | 0.00/878 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/709M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/603 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

BERT-CrossXQuAD-25Samples
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
English,68.627451,73.721886
Spanish,64.705882,68.951277
Vietnamese,45.098039,54.027596
Chinese,43.137255,49.673203
German,43.137255,49.1939
Russian,47.058824,48.529412
Turkish,39.215686,47.654623
Hindi,37.254902,44.139194
Romanian,31.372549,40.114899
Arabic,31.372549,38.723499


# Evaluación sobre XQuAD

Con esta evaluación se pretende revisar qué tanta capacidad pierde o gana cada modelo de QA después de ser entrenado sobre Cross-XQuAD.

In [None]:
def read_xquad(language, is_url=True):
  """Flatten one XQuAD file into (context, question, answer) triples.

  Args:
    language: When ``is_url`` is true, a key of the module-level ``dataset``
      mapping (e.g. ``'German'``). When ``is_url`` is false, the path of a
      local SQuAD-format JSON file.
    is_url: Download the file from ``base_url`` (default) or read it from disk.

  Returns:
    A list with one ``(context, question, answer)`` tuple per answer, in file
    order; ``answer`` is the answer dict exactly as stored in the file.
    Questions without answers contribute no tuple.
  """
  if is_url:
    # Context manager so the HTTP response is closed once it has been read.
    with urlopen(base_url.format(dataset[language])) as response:
      squad_dict = json.loads(response.read())
  else:
    # This branch used to read an unassigned `path` and hand a Path object to
    # json.load (which needs a file object), so it could never succeed.
    with Path(language).open(encoding='utf-8') as local_file:
      squad_dict = json.load(local_file)

  return [
      (passage['context'], qa['question'], answer)
      for group in squad_dict['data']
      for passage in group['paragraphs']
      for qa in passage['qas']
      for answer in qa['answers']
  ]

# For every language, download the plain XQuAD triples and keep only the test
# partition, stored as three parallel lists.
data_XQUAD = {}
for language in dataset:
  language_XQUAD_data = read_xquad(language)
  _, test_data = split_squad(language_XQUAD_data, train_size=10)
  data_XQUAD[language] = dict(
      zip(('test_contexts', 'test_questions', 'test_answers'), test_data))

### BERT-XQuAD 

In [None]:
# Baseline checkpoint, scored on the plain XQuAD test split.
model_name = 'BERT-XQuAD'
url_model = models[model_name]
qa_pipeline = pipeline("question-answering", model=url_model, tokenizer=url_model)

print(model_name)
# The banner names the dataset actually evaluated below (data_XQUAD); it used
# to say "Cross-XQuAD", mislabelling this table.
print('_'*70+'\n'+'Results over XQuAD', end='\n'+'_'*70)
evaluate_XQuAD(qa_pipeline, data_XQUAD)

BERT-XQuAD
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
English,100.0,100.0
Spanish,96.078431,98.996265
German,98.039216,98.529412
Vietnamese,94.117647,96.843837
Russian,92.156863,95.866791
Turkish,92.156863,95.447962
Chinese,94.117647,94.117647
Hindi,86.27451,93.672644
Arabic,88.235294,92.893055
Greek,78.431373,89.876284


### BERT-CrossXQuAD usando solo un ejemplo de entrenamiento por idioma

In [None]:
# 1-sample fine-tuned checkpoint, scored on the plain XQuAD test split.
model_name = 'BERT-CrossXQuAD-1Sample'
url_model = models[model_name]
qa_pipeline = pipeline("question-answering", model=url_model, tokenizer=url_model)

print(model_name)
# The banner names the dataset actually evaluated below (data_XQUAD); it used
# to say "Cross-XQuAD", mislabelling this table.
print('_'*70+'\n'+'Results over XQuAD', end='\n'+'_'*70)
evaluate_XQuAD(qa_pipeline, data_XQUAD)

BERT-CrossXQuAD-1Sample
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
English,90.196078,90.196078
Spanish,84.313725,87.058824
Russian,80.392157,84.973686
Chinese,78.431373,81.045752
German,78.431373,80.429864
Turkish,74.509804,79.282877
Vietnamese,74.509804,79.221802
Arabic,66.666667,74.064019
Greek,64.705882,69.680083
Hindi,62.745098,68.835739


### BERT-CrossXQuAD usando 2 ejemplos de entrenamiento por idioma

In [None]:
# 2-sample fine-tuned checkpoint, scored on the plain XQuAD test split.
model_name = 'BERT-CrossXQuAD-2Samples'
url_model = models[model_name]
qa_pipeline = pipeline("question-answering", model=url_model, tokenizer=url_model)

print(model_name)
# The banner names the dataset actually evaluated below (data_XQUAD); it used
# to say "Cross-XQuAD", mislabelling this table.
print('_'*70+'\n'+'Results over XQuAD', end='\n'+'_'*70)
evaluate_XQuAD(qa_pipeline, data_XQUAD)

BERT-CrossXQuAD-2Samples
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
Spanish,66.666667,75.355933
English,62.745098,70.504202
Vietnamese,58.823529,62.73271
Russian,56.862745,61.089466
Turkish,50.980392,59.32645
Hindi,54.901961,59.31193
Arabic,52.941176,57.833082
German,50.980392,56.699346
Greek,50.980392,53.071895
Chinese,45.098039,52.941176


### BERT-CrossXQuAD usando 5 ejemplos de entrenamiento por idioma

In [None]:
# 5-sample fine-tuned checkpoint, scored on the plain XQuAD test split.
model_name = 'BERT-CrossXQuAD-5Samples'
url_model = models[model_name]
qa_pipeline = pipeline("question-answering", model=url_model, tokenizer=url_model)

print(model_name)
# The banner names the dataset actually evaluated below (data_XQUAD); it used
# to say "Cross-XQuAD", mislabelling this table.
print('_'*70+'\n'+'Results over XQuAD', end='\n'+'_'*70)
evaluate_XQuAD(qa_pipeline, data_XQUAD)

BERT-CrossXQuAD-5Samples
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
Spanish,78.431373,80.694197
English,72.54902,75.014006
German,70.588235,71.926548
Vietnamese,66.666667,71.401598
Russian,64.705882,68.302351
Turkish,58.823529,63.875865
Greek,56.862745,62.602496
Arabic,58.823529,61.24183
Hindi,54.901961,57.415787
Chinese,49.019608,51.633987


### BERT-CrossXQuAD usando 10 ejemplos de entrenamiento por idioma

In [None]:
# 10-sample fine-tuned checkpoint, scored on the plain XQuAD test split.
model_name = 'BERT-CrossXQuAD-10Samples'
url_model = models[model_name]
qa_pipeline = pipeline("question-answering", model=url_model, tokenizer=url_model)

print(model_name)
# The banner names the dataset actually evaluated below (data_XQUAD); it used
# to say "Cross-XQuAD", mislabelling this table.
print('_'*70+'\n'+'Results over XQuAD', end='\n'+'_'*70)
evaluate_XQuAD(qa_pipeline, data_XQUAD)

BERT-CrossXQuAD-10Samples
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
Spanish,58.823529,65.423647
English,58.823529,62.088151
Turkish,47.058824,51.548672
Vietnamese,43.137255,49.505026
German,45.098039,48.311547
Hindi,41.176471,45.58644
Chinese,39.215686,44.444444
Arabic,39.215686,42.110177
Greek,37.254902,40.616246
Russian,33.333333,34.58061


### BERT-CrossXQuAD usando 20 ejemplos de entrenamiento por idioma

In [None]:
# 20-sample fine-tuned checkpoint, scored on the plain XQuAD test split.
model_name = 'BERT-CrossXQuAD-20Samples'
url_model = models[model_name]
qa_pipeline = pipeline("question-answering", model=url_model, tokenizer=url_model)

print(model_name)
# The banner names the dataset actually evaluated below (data_XQUAD); it used
# to say "Cross-XQuAD", mislabelling this table.
print('_'*70+'\n'+'Results over XQuAD', end='\n'+'_'*70)
evaluate_XQuAD(qa_pipeline, data_XQUAD)

BERT-CrossXQuAD-20Samples
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
Spanish,74.509804,79.315565
English,72.54902,79.139225
Vietnamese,64.705882,71.594686
German,68.627451,70.424837
Chinese,56.862745,65.686275
Russian,60.784314,62.284611
Hindi,58.823529,61.710838
Turkish,54.901961,58.169935
Arabic,50.980392,55.40206
Greek,43.137255,51.262716


### BERT-CrossXQuAD usando 25 ejemplos de entrenamiento por idioma

In [None]:
# 25-sample fine-tuned checkpoint, scored on the plain XQuAD test split.
model_name = 'BERT-CrossXQuAD-25Samples'
url_model = models[model_name]
qa_pipeline = pipeline("question-answering", model=url_model, tokenizer=url_model)

print(model_name)
# The banner names the dataset actually evaluated below (data_XQUAD); it used
# to say "Cross-XQuAD", mislabelling this table.
print('_'*70+'\n'+'Results over XQuAD', end='\n'+'_'*70)
evaluate_XQuAD(qa_pipeline, data_XQUAD)

BERT-CrossXQuAD-25Samples
______________________________________________________________________
Results over Cross-XQuAD
______________________________________________________________________

Unnamed: 0_level_0,exact_match,f1
language,Unnamed: 1_level_1,Unnamed: 2_level_1
English,70.588235,74.198568
Spanish,64.705882,68.951277
Vietnamese,60.784314,64.851126
Turkish,56.862745,64.604514
Russian,58.823529,63.883584
German,58.823529,61.546841
Hindi,54.901961,58.300654
Arabic,50.980392,56.239839
Chinese,50.980392,55.882353
Greek,49.019608,55.73296
