In this notebook we will generate the QALD9-ES dataset by adding our Spanish translations to QALD_9_plus, in addition we will make an updated verion of QALD9 by replacing the old translations with the new native translations generated.

In [1]:
import json

def read_json(filename):
    with open(filename, 'r', encoding="utf8") as f:
        return json.load(f)
    
def save_json(filename, data):
    """save json"""
    with open(filename, 'w', encoding="utf8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

In [2]:
import pandas as pd
new_translations_train = pd.read_excel('../Review files/qald9_train_set_review_table.xlsx')
new_translations_test = pd.read_excel('../Review files/qald9_test_set_review_table.xlsx')

In [3]:
new_translations_train.head()

Unnamed: 0,id,en_question,es_question,native_translation,en_keywords,es_keywords,native_keywords,Case 1,Case 2,Case 3,Case 4,Case 5,Case 6,Case 7
0,1,List all boardgames by GMT.,Lista todas juegos de mesa por GMT.,¿Qué juegos de mesa fueron hechos por GMT?,"boardgame, GMT","juego de mesa , GMT","juego de mesa , GMT",x,,,,,x,
1,2,Who developed Skype?,Quien desarrollado Skype?,¿Quién desarrolló Skype?,"develop, Skype","desarrollar , Skype","desarrollar , Skype",x,,,,x,,
2,3,Which people were born in Heraklion?,Cual gente fueron nacido en Heraklion?,¿Qué personas nacieron en Heraklion?,"people, born, heraklion","gente , nacido , heraklion","personas , nacer , Heraklion",x,,,x,x,,
3,4,In which U.S. state is Area 51 located?,En cual Nosotros estado es Zona 51 ¿situado?,¿En cuál estado de Estados Unidos esta situada...,"Area 51, located, U.S. state","Zona 51 , situado , Nosotros estado",,x,x,,,x,,
4,5,Who is the mayor of New York City?,¿Quién es el alcalde de la cuidad de Nueva York?,¿Quién es el alcalde de la cuidad de Nueva York?,"New York City, mayor","cuidad de Nueva York, alcalde","cuidad de Nueva York, alcalde",,,,,,,x


# New QALD9 dataset
We will replace QALD9 translations and save the updated dataset  in the folder "updated QALD_9_plus".

In [4]:
def modify_question(question, modification, mode):
    if mode == 'add':
        es_translation = {
            'language' : 'es',
            'string' : modification.iloc[0]['native_translation']
        }
        question.get('question').append(es_translation)
    else:
        es_question = next(filter((lambda x: x.get('language') == 'es'), question.get('question')))
        es_question['string'] = modification.iloc[0]['native_translation']
        es_question['keywords'] = modification.iloc[0]['native_keywords']
    
    return question

def update_dataset(path, new_translations, mode, destiny_path):
    data = read_json(path)
    for question in data.get('questions'):
        #search the id in the modifications list
        correction = new_translations.loc[new_translations['id'] == int(question.get('id'))]
        if not correction.empty:
            question = modify_question(question, correction, mode)
    save_json(destiny_path, data)

In [5]:
update_dataset('../QALD9_data/qald-9-test-multilingual.json', new_translations_test, 'upadte', '../QALD9_ES_data/updated QALD9/updated-qald-9-test-multilingual.json' )

In [6]:
update_dataset('../QALD9_data/qald-9-train-multilingual.json', new_translations_train, 'upadte', '../QALD9_ES_data/updated QALD9/updated-qald-9-train-multilingual.json' )

The resulting dataset:

In [7]:
train_updated_QALD9 = read_json('../QALD9_ES_data/updated QALD9/updated-qald-9-train-multilingual.json')
train_updated_QALD9

{'dataset': {'id': 'qald-9-train-multilingual'},
 'questions': [{'id': '1',
   'answertype': 'resource',
   'aggregation': False,
   'onlydbo': True,
   'hybrid': False,
   'question': [{'language': 'de',
     'string': 'Liste alle Brettspiele von GMT. ',
     'keywords': 'Brettspiel ,  GMT'},
    {'language': 'ru',
     'string': 'Список все настольные игры от ВРЕМЯ ПО ГРИНВИЧУ. ',
     'keywords': 'настольная игра ,  время по Гринвичу '},
    {'language': 'pt',
     'string': 'Liste todos os jogos de mesa da GMT.',
     'keywords': 'jogo de tabuleiro ,  GMT '},
    {'language': 'en',
     'string': 'List all boardgames by GMT.',
     'keywords': 'boardgame, GMT'},
    {'language': 'hi_IN',
     'string': 'जीएमटी द्वारा सभी बोर्ड गेम सूचीबद्ध करें।',
     'keywords': 'विशेष प्रकार के बोर्ड या पट्टे के खेल जैसे शतरंज, साँप सीढ़ी आदि, जीएमटी'},
    {'language': 'fa',
     'string': 'همه بازی های تخته ای GMT را لیست کن',
     'keywords': '  بازی تخته,    GMT'},
    {'language': 'it',
   

In [8]:
test_updated_QALD9 = read_json('../QALD9_ES_data/updated QALD9/updated-qald-9-test-multilingual.json')
test_updated_QALD9

{'dataset': {'id': 'qald-9-test-multilingual'},
 'questions': [{'id': '99',
   'answertype': 'resource',
   'aggregation': False,
   'onlydbo': True,
   'hybrid': False,
   'question': [{'language': 'de',
     'string': 'In welcher Zeitzone liegt Salt Lake City?',
     'keywords': 'Salt Lake City,  Zeitzone'},
    {'language': 'ru',
     'string': 'Какие является  время зона из Поваренная соль Озеро Город? ',
     'keywords': 'Поваренная соль Озеро город ,   время зона '},
    {'language': 'pt',
     'string': 'Qual é o fuso horário da Salt Lake City?',
     'keywords': 'Cidade de Salt Lake, fuso horário'},
    {'language': 'en',
     'string': 'What is the time zone of Salt Lake City?',
     'keywords': 'Salt Lake City,  time zone'},
    {'language': 'hi_IN',
     'string': 'साल्ट लेक सिटी का समय क्षेत्र क्या है?',
     'keywords': 'साल्ट लेक सिटी,  समय क्षेत्र'},
    {'language': 'fa',
     'string': 'منطقه زمانی سالت لیک سیتی چیست؟',
     'keywords': 'سالت\u200cلیک\u200cسیتی,  منطقه

# QALD9-ES
We well create QALD9-ES by adding our translations to the improved translations of QALD_9_plus, for this dataset they've decided to remove the question keywords, we will respect this, and only add an object containing the question and the language.

In [9]:
update_dataset('../QALD_9_plus_data/qald_9_plus_test_wikidata.json', new_translations_test, 'add', '../QALD9_ES_data/QALD9-ES/qald9_es_test_wikidata.json')

In [10]:
update_dataset('../QALD_9_plus_data/qald_9_plus_test_dbpedia.json', new_translations_test, 'add', '../QALD9_ES_data/QALD9-ES/qald9_es_test_dbpedia.json')

In [11]:
update_dataset('../QALD_9_plus_data/qald_9_plus_train_wikidata.json', new_translations_train, 'add', '../QALD9_ES_data/QALD9-ES/qald9_es_train_wikidata.json')

In [12]:
update_dataset('../QALD_9_plus_data/qald_9_plus_train_dbpedia.json', new_translations_train, 'add', '../QALD9_ES_data/QALD9-ES/qald9_es_train_dbpedia.json')

The resulting dataset:

In [13]:
train_wikidata_QALD9_ES = read_json('../QALD9_ES_data/QALD9-ES/qald9_es_train_wikidata.json')
train_wikidata_QALD9_ES

{'questions': [{'id': '1',
   'question': [{'language': 'en', 'string': 'List all boardgames by GMT.'},
    {'language': 'de', 'string': 'Liste die Brettspiele von GMT auf.'},
    {'language': 'de', 'string': 'Zeige mir alle Brettspiele von GMT.'},
    {'language': 'ru', 'string': 'Перечислите все игры GMT.'},
    {'language': 'lt',
     'string': 'Išvardinkite visus stalo žaidimus pagal GMT.'},
    {'language': 'uk', 'string': 'Перерахуйте всі ігри GMT.'},
    {'language': 'lt', 'string': 'Išvardykite visus GMT žaidimus.'},
    {'language': 'fr', 'string': 'Listez tous les jeux de société de GMT.'},
    {'language': 'es',
     'string': '¿Qué juegos de mesa fueron hechos por GMT?'}],
   'query': {'sparql': 'PREFIX wdt: <http://www.wikidata.org/prop/direct/> PREFIX wd: <http://www.wikidata.org/entity/> SELECT ?uri WHERE { ?uri wdt:P31 wd:Q131436 . }'},
   'answers': [{'head': {'vars': ['uri']},
     'results': {'bindings': [{'uri': {'type': 'uri',
         'value': 'http://www.wikidata

In [14]:
test_wikidata_QALD9_ES = read_json('../QALD9_ES_data/QALD9-ES/qald9_es_test_wikidata.json')
test_wikidata_QALD9_ES

{'questions': [{'id': '99',
   'question': [{'language': 'en',
     'string': 'What is the time zone of Salt Lake City?'},
    {'language': 'de', 'string': 'In welcher Zeitzone liegt Salt Lake City?'},
    {'language': 'de', 'string': 'Was ist die Zeitzone von Salt Lake City?'},
    {'language': 'ru', 'string': 'Какой часовой пояс в Солт-Лейк-Сити'},
    {'language': 'ru',
     'string': 'В каком часовом поясе расположен Солт-Лейк-Сити?'},
    {'language': 'uk', 'string': 'Який часовий пояс у Солт-Лейк Сіті?'},
    {'language': 'lt', 'string': 'Kokia Solt Leik Sičio laiko zona?'},
    {'language': 'be', 'string': 'Які гадзінны пояс у Солт-Лэйк-Сіці'},
    {'language': 'lt', 'string': 'Kokia laiko juosta yra Solt Leik Sityjes'},
    {'language': 'ba', 'string': 'Ниндей вакыт поясы Солт-Лейк-Ситила'},
    {'language': 'es',
     'string': '¿En qué zona horaria se encuentra Salt Lake City?'}],
   'query': {'sparql': 'SELECT DISTINCT ?o1 WHERE { <http://www.wikidata.org/entity/Q23337>  <ht

In [15]:
train_dbpedia_QALD9_ES = read_json('../QALD9_ES_data/QALD9-ES/qald9_es_train_dbpedia.json')
train_dbpedia_QALD9_ES

{'questions': [{'id': '1',
   'question': [{'language': 'en', 'string': 'List all boardgames by GMT.'},
    {'language': 'de', 'string': 'Liste die Brettspiele von GMT auf.'},
    {'language': 'de', 'string': 'Zeige mir alle Brettspiele von GMT.'},
    {'language': 'ru', 'string': 'Перечислите все игры GMT.'},
    {'language': 'lt',
     'string': 'Išvardinkite visus stalo žaidimus pagal GMT.'},
    {'language': 'uk', 'string': 'Перерахуйте всі ігри GMT.'},
    {'language': 'lt', 'string': 'Išvardykite visus GMT žaidimus.'},
    {'language': 'fr', 'string': 'Listez tous les jeux de société de GMT.'},
    {'language': 'es',
     'string': '¿Qué juegos de mesa fueron hechos por GMT?'}],
   'query': {'sparql': 'PREFIX dbo: <http://dbpedia.org/ontology/> PREFIX res: <http://dbpedia.org/resource/> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT ?uri WHERE { ?uri dbo:publisher res:GMT_Games }'},
   'answers': [{'head': {'link': [], 'vars': ['uri']},
     'results': {'bindings': [

In [16]:
test_dbpedia_QALD9_ES = read_json('../QALD9_ES_data/QALD9-ES/qald9_es_test_dbpedia.json')
test_dbpedia_QALD9_ES

{'questions': [{'id': '99',
   'question': [{'language': 'en',
     'string': 'What is the time zone of Salt Lake City?'},
    {'language': 'de', 'string': 'In welcher Zeitzone liegt Salt Lake City?'},
    {'language': 'de', 'string': 'Was ist die Zeitzone von Salt Lake City?'},
    {'language': 'ru', 'string': 'Какой часовой пояс в Солт-Лейк-Сити'},
    {'language': 'ru',
     'string': 'В каком часовом поясе расположен Солт-Лейк-Сити?'},
    {'language': 'uk', 'string': 'Який часовий пояс у Солт-Лейк Сіті?'},
    {'language': 'lt', 'string': 'Kokia Solt Leik Sičio laiko zona?'},
    {'language': 'be', 'string': 'Які гадзінны пояс у Солт-Лэйк-Сіці'},
    {'language': 'lt', 'string': 'Kokia laiko juosta yra Solt Leik Sityjes'},
    {'language': 'ba', 'string': 'Ниндей вакыт поясы Солт-Лейк-Ситила'},
    {'language': 'es',
     'string': '¿En qué zona horaria se encuentra Salt Lake City?'}],
   'query': {'sparql': 'PREFIX res: <http://dbpedia.org/resource/> PREFIX dbp: <http://dbpedia.o

# Generating testing english translations (for gerbril)
We will replace QALD9 and QALD9-ES english tranlsations with MT to compare the spanish tranlsations using GERBRIL, as we are only interest in mesuring the quality of the translations, we will compare QALD9 with the updated version of QALD9 that has the QALD9-ES translations

In [17]:
! pip install transformers



In [18]:
! pip install sacremoses



In [19]:
from transformers import pipeline

translator = pipeline('translation', model='Helsinki-NLP/opus-mt-es-en')

translator('Donde nació el papa Juan Pablo II?')

[{'translation_text': 'Where was Pope John Paul II born?'}]

In [20]:
mt_qald9_train = read_json('../QALD9_data/qald-9-train-multilingual.json')
mt_qald9_test = read_json('../QALD9_data/qald-9-test-multilingual.json')

In [38]:
def replace_with_mt(path, destiny_path):
    dataset = read_json(path)
    for question in dataset.get('questions'):
        es_question = next(filter((lambda x: x.get('language') == 'es'), question.get('question'))).get('string')
        en_question = next(filter((lambda x: x.get('language') == 'en'), question.get('question')))
        #print(es_question)
        #print(en_question)
        if es_question is not None:
            en_question['string'] = translator(es_question)[0].get('translation_text')
        else:
            en_question['string'] = ''
    save_json(destiny_path, dataset)

In [32]:
replace_with_mt('../QALD9_data/qald-9-test-multilingual.json', '../Test datasets/QALD9/mt-qald-9-test-multilingual.json')

In [39]:
replace_with_mt('../QALD9_data/qald-9-train-multilingual.json', '../Test datasets/QALD9/mt-qald-9-train-multilingual.json')

In [41]:
replace_with_mt('../QALD9_ES_data/updated QALD9/updated-qald-9-test-multilingual.json', '../Test datasets/QALD9-es/mt-updated-qald-9-test-multilingual.json')

In [42]:
replace_with_mt('../QALD9_ES_data/updated QALD9/updated-qald-9-train-multilingual.json', '../Test datasets/QALD9-es/mt-updated-qald-9-train-multilingual.json')