In [1]:
import getpass
import json
import re
import shutil
import tempfile
from pathlib import Path

import deepl
import gdown
import sqlparse
from tqdm import tqdm

## Download the original Spider dataset

In [2]:
def download_spider(directory_name):
    """Download the Spider archive from Google Drive and place it at ``directory_name``.

    The zip file is fetched with ``gdown`` into a temporary directory and
    extracted there as well, so nothing is written to the current working
    directory. The archive's top-level ``spider`` folder is then moved to
    ``directory_name``.

    Args:
        directory_name: Destination path for the extracted ``spider`` folder.
    """
    file_id = "1TqleXec_OykOYFREKKtschzY29dUcVAQ"
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir)
        zip_path = str(tmp_path / "spider.zip")
        gdown.download(id=file_id, output=zip_path, quiet=False)
        # Extract next to the zip (not into cwd) so stray archive entries and
        # partially extracted files are cleaned up together with the temp dir.
        shutil.unpack_archive(zip_path, tmp_dir)
        shutil.move(str(tmp_path / 'spider'), directory_name)

In [3]:
# Fetch the dataset into ../../spider-en (path is relative to the working directory).
download_spider('../../spider-en')

Downloading...
From (uriginal): https://drive.google.com/uc?id=1TqleXec_OykOYFREKKtschzY29dUcVAQ
From (redirected): https://drive.google.com/uc?id=1TqleXec_OykOYFREKKtschzY29dUcVAQ&confirm=t&uuid=e5443417-8c83-44d6-a146-deece5a3310c
To: /tmp/tmp1r6xhh2f/spider.zip
100%|██████████| 99.7M/99.7M [00:01<00:00, 68.0MB/s]


## DeepL Translation toolbox

In [3]:
# Shared DeepL client used by the translation helpers below.
# getpass is used instead of input() so the API key is not echoed into the
# cell output that gets saved with the notebook.
translator = deepl.Translator(getpass.getpass("Enter DeepL API key"))

def translate_sentence(question_en):
    """Translate ``question_en`` from English to Polish with the shared client.

    The request asks DeepL for ``formality='prefer_less'``.

    Returns:
        The ``text`` attribute of the DeepL result.
    """
    request_options = {
        "source_lang": "EN",
        "target_lang": "PL",
        "formality": 'prefer_less',
    }
    result = translator.translate_text(question_en, **request_options)
    return result.text
    
    
def translate_phrase(value_en):
    """Translate a short English fragment to Polish with the shared client.

    Whitespace-only (or empty) input is returned as ``''`` without calling
    the API. Otherwise the fragment is sent with ``preserve_formatting=True``,
    ``split_sentences='off'`` and ``formality='prefer_less'``.

    Returns:
        The ``text`` attribute of the DeepL result, or ``''`` for blank input.
    """
    if not value_en.strip():
        return ''

    request_options = dict(
        source_lang="EN",
        target_lang="PL",
        preserve_formatting=True,
        split_sentences='off',
        formality='prefer_less',
    )
    response = translator.translate_text(value_en, **request_options)
    return response.text

## Functions for translation

In [4]:
def find_tokens_to_translate(statement):
    """Collect the string-literal tokens of ``statement`` that should be translated.

    ``statement`` must provide ``flatten()`` yielding tokens with a ``ttype``
    attribute (``translate_query`` passes a parsed sqlparse statement).
    Tokens whose text is only whitespace are ignored. A token is selected
    when its ``ttype`` starts with ``Token.Literal.String``, unless the
    preceding non-blank token is ``LIKE`` (case-insensitive).

    Returns:
        A list of the selected token objects, in statement order.
    """
    visible = [tok for tok in statement.flatten() if str(tok).strip()]

    selected = []
    previous = None
    for current in visible:
        is_string_literal = str(current.ttype).startswith('Token.Literal.String')
        # do not translate the pattern operand of a LIKE comparison
        follows_like = previous is not None and str(previous).lower() == 'like'
        if is_string_literal and not follows_like:
            selected.append(current)
        previous = current

    return selected

In [5]:
def translate_token(token):
    """Translate the quoted literal held in ``token.value`` in place.

    Quote characters and spaces are stripped from both ends of the value,
    the remainder goes through ``translate_phrase``, and the result is
    written back to ``token.value`` wrapped in double quotes.

    Raises:
        AssertionError: if ``token.value`` contains neither ``'`` nor ``"``.
    """
    raw_value = token.value
    assert "'" in raw_value or '"' in raw_value
    translated = translate_phrase(raw_value.strip("'\" "))
    token.value = f'"{translated}"'

In [6]:
def translate_query(query):
    """Return ``query`` with its translatable string literals translated.

    Only the first statement returned by ``sqlparse.parse`` is processed.
    The tokens chosen by ``find_tokens_to_translate`` are rewritten in place
    by ``translate_token`` and the statement is rendered back to a string.
    """
    first_statement = sqlparse.parse(query)[0]
    for literal in find_tokens_to_translate(first_statement):
        translate_token(literal)
    return str(first_statement)

In [13]:
def translate_quotes(sentence):
    """Run every quoted span of ``sentence`` through ``translate_phrase``.

    A span starts at a ``'`` or ``"`` and ends at the next ``'`` or ``"``;
    the delimiters are part of the text handed to ``translate_phrase``.
    Spans are replaced from right to left so the offsets of the remaining
    matches stay valid even when a replacement changes the length.

    Returns:
        The sentence with each matched span replaced by its translation.
    """
    quoted_spans = [found.span() for found in re.finditer(r"['\"](.*?)['\"]", sentence)]
    for begin, finish in quoted_spans[::-1]:
        fragment = sentence[begin:finish]
        sentence = sentence[:begin] + translate_phrase(fragment) + sentence[finish:]
    return sentence

In [8]:
def translate_question(question):
    """Translate a question in two passes.

    The whole question first goes through ``translate_phrase``; the result
    is then handed to ``translate_quotes``, which re-translates each quoted
    span on its own.
    """
    return translate_quotes(translate_phrase(question))

## Translation

In [10]:
def _write_samples(samples, dst_path):
    """Write ``samples`` to ``dst_path`` as indented UTF-8 JSON (non-ASCII kept as-is)."""
    with open(dst_path, 'w', encoding='utf-8') as f:
        json.dump(samples, f, indent=4, ensure_ascii=False)


def translate_file(src_path, dst_path):
    """Translate every sample of a JSON file and store the result at ``dst_path``.

    The source file must contain a JSON list of objects with the keys
    ``db_id``, ``question`` and ``query``. Each output record keeps those
    three values and adds ``question_pl`` (from ``translate_question``) and
    ``query_pl`` (from ``translate_query``).

    The partial result is checkpointed to ``dst_path`` on every 10th sample
    so an interrupted run keeps most of its work; the complete list is
    written once after the loop, including when the input is empty.

    Args:
        src_path: Path of the JSON file to read.
        dst_path: Path of the JSON file to write; missing parent directories
            are created.
    """
    with open(src_path, encoding='utf-8') as json_data:
        samples_en = json.load(json_data)

    # Make sure the first checkpoint does not fail on a missing output folder.
    Path(dst_path).parent.mkdir(parents=True, exist_ok=True)

    samples_pl = []

    for i, sample in tqdm(enumerate(samples_en), total=len(samples_en)):
        question_en = sample['question']
        query_en = sample['query']

        samples_pl.append({
            'db_id': sample['db_id'],
            'question': question_en,
            'question_pl': translate_question(question_en),
            'query': query_en,
            'query_pl': translate_query(query_en),
        })

        # Periodic checkpoint only; rewriting the full file on every sample
        # would make the total I/O quadratic in the number of samples.
        if i % 10 == 0:
            _write_samples(samples_pl, dst_path)

    # Final write happens once, after all samples have been processed.
    _write_samples(samples_pl, dst_path)

In [None]:
# File to translate: one of dev, train_others, train_spider.
file = 'train_spider.json'

# Input directory and output directory for the translated copy.
spider_path = Path('../auxiliary/machine_translated')
target_path = Path('../auxiliary/machine_translated2')

translate_file(spider_path.joinpath(file), target_path.joinpath(file))