In [190]:
import getpass
import json
import re
from pprint import pprint

import deepl
from tqdm import tqdm

In [214]:
# Load the schema descriptions from tables.json. JSON text is UTF-8, so decode
# it explicitly rather than relying on the platform's default encoding.
with open('../../spider-en/tables.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [222]:
# Flatten every database schema in `data` into two record lists:
#   tables  - one dict per table (db id, readable name, original name)
#   columns - one dict per real column (the '*' pseudo-column is skipped),
#             with a primary-key flag and the table a foreign key points to.
tables = []
columns = []

for schema in data:
    table_labels = schema['table_names']
    table_labels_original = schema['table_names_original']
    column_labels = schema['column_names']
    column_labels_original = schema['column_names_original']
    db_id = schema['db_id']
    fk_pairs = schema['foreign_keys']
    pk_positions = schema['primary_keys']

    # referencing column index -> name of the table owning the referenced column
    fk_target_table = {
        src: table_labels[column_labels[dst][0]]
        for (src, dst) in fk_pairs
    }

    tables.extend(
        {
            'db_id': db_id,
            'name': label,
            'name_original': label_original,
        }
        for label, label_original in zip(table_labels, table_labels_original)
    )

    paired_columns = zip(column_labels, column_labels_original)
    for position, ((owner_idx, label), (_, label_original)) in enumerate(paired_columns):
        # '*' is the wildcard pseudo-column; it still occupies an index,
        # so it is skipped here rather than filtered before enumerate().
        if label == '*':
            continue

        columns.append({
            'db_id': db_id,
            'table_name': table_labels[owner_idx],
            'column_name': label,
            'column_name_original': label_original,
            'primary_key': position in pk_positions,
            'foreign_key': fk_target_table.get(position, ''),
        })

In [225]:
# Spot-check one flattened column record.
columns[1]

{'db_id': 'perpetrator',
 'table_name': 'perpetrator',
 'column_name': 'people id',
 'column_name_original': 'People_ID',
 'primary_key': False,
 'foreign_key': 'people'}

In [66]:
# Read the DeepL key with getpass so the typed secret is not echoed into the
# notebook's cell output (input() would display it).
translator = deepl.Translator(getpass.getpass("Enter DeepL API key"))
    
def translate_sentence(question_en):
    """Translate an English string to Polish using the shared DeepL client.

    The request goes through the module-level ``translator`` with the
    ``prefer_less`` formality setting and formatting preservation enabled.

    Args:
        question_en: English source text.

    Returns:
        The ``text`` attribute of the DeepL result.
    """
    request_options = {
        'source_lang': "EN",
        'target_lang': "PL",
        'formality': 'prefer_less',
        'preserve_formatting': True,
    }
    result = translator.translate_text(question_en, **request_options)
    return result.text

In [204]:
def translate_name(name, container_name, other_container_name=None, translate=None):
    """Translate a schema identifier, using its container name(s) as context.

    The name is sent as ``"<name> (from <container>)"`` or
    ``"<name> (from <container> and <other>)"`` so the translator sees it in
    context; the parenthesised context is then cut off the translated text.

    Args:
        name: Table or column name to translate.
        container_name: Context name (e.g. the owning table or database).
            Every character other than ASCII letters and spaces is replaced
            by a space before it is sent.
        other_container_name: Optional second context name, sanitised the
            same way. Falsy values mean "no second context".
        translate: Optional callable mapping the source string to its
            translation. Defaults to ``translate_sentence``.

    Returns:
        The translated name with the context suffix removed and surrounding
        whitespace stripped. If the translation contains no ``(`` at all, the
        whole translation is returned stripped instead of raising.
    """
    if translate is None:
        translate = translate_sentence

    def _letters_only(label):
        return re.sub(r'[^a-zA-Z ]', ' ', label)

    context = _letters_only(container_name)
    if other_container_name:
        context = f"{context} and {_letters_only(other_container_name)}"

    text_pl = translate(f"{name} (from {context})")

    # The sanitised context contains no parentheses, so the LAST "(" is the one
    # that opens the appended context; cutting there keeps parentheses that
    # belong to the name itself (e.g. "height (m)").
    paren_idx = text_pl.rfind('(')
    if paren_idx == -1:
        # The translator dropped the parenthesised context entirely.
        return text_pl.strip()
    return text_pl[:paren_idx].strip()

In [205]:
# translate columns
# Each column name is translated with its table and database as context.
# The file is opened as UTF-8 explicitly: ensure_ascii=False writes Polish
# characters verbatim, which the platform default encoding may not support.
path = '../auxiliary/translated_schema/columns_names.json'

for i, column in enumerate(tqdm(columns)):
    name_pl = translate_name(column['column_name'], column['table_name'], column['db_id'])
    column['column_name_pl'] = name_pl

    # checkpoint every 50 items so an interrupted run keeps most of its work
    if i % 50 == 0:
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(columns, f, indent=4, ensure_ascii=False)

# final write with every column translated
with open(path, 'w', encoding='utf-8') as f:
    json.dump(columns, f, indent=4, ensure_ascii=False)

4503it [09:07,  8.23it/s]


In [213]:
# translate tables
# Each table name is translated with its database id as context.
# The file is opened as UTF-8 explicitly: ensure_ascii=False writes Polish
# characters verbatim, which the platform default encoding may not support.
path = '../auxiliary/translated_schema/tables_names.json'

for i, table in enumerate(tqdm(tables)):
    name_pl = translate_name(table['name'], table['db_id'])
    table['name_pl'] = name_pl

    # checkpoint every 50 items so an interrupted run keeps most of its work
    if i % 50 == 0:
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(tables, f, indent=4, ensure_ascii=False)

# final write with every table translated
with open(path, 'w', encoding='utf-8') as f:
    json.dump(tables, f, indent=4, ensure_ascii=False)

100%|██████████| 876/876 [01:44<00:00,  8.38it/s]
