In [1]:
import json
from pprint import pprint

from translation.schema import create_translation_entries

In [2]:
# Locations of the JSON files this notebook reads and rewrites, relative to
# the notebook's working directory.
# TABLES_PATH is only read; the two *_TRANS_PATH files are read, extended and
# written back in the cells below.
TABLES_PATH = '../components/schema/base/tables.json'
COLUMN_TRANS_PATH = '../components/schema/translations/columns_pl.json'
TABLE_TRANS_PATH = '../components/schema/translations/tables_pl.json'

## Add translation entries for the newest tables in tables.json

In [3]:
# Load the schema entries. The encoding is given explicitly so the read does
# not depend on the platform's default locale (the notebook writes its JSON
# files as UTF-8 with non-ASCII characters left unescaped).
with open(TABLES_PATH, encoding="utf-8") as f:
    tables_ent = json.load(f)

In [4]:
# Keep only the last three entries of tables.json.
# NOTE(review): presumably these are the most recently added databases - confirm.
tables_part = tables_ent[-3:]

In [None]:
# Build translation entries (tables first, columns second) for the selected
# schema entries, then preview the first element of each list.
new_tables, new_columns = create_translation_entries(tables_part)

for entries in (new_tables, new_columns):
    pprint(entries[0])

In [16]:
# Append the newly created table entries to the table translation file.
# The file is read with the same encoding it is written with; without an
# explicit encoding the read falls back to the platform default and can
# corrupt non-ASCII characters on a read -> write round trip.
# NOTE: this cell is not idempotent - running it twice appends the entries twice.
with open(TABLE_TRANS_PATH, encoding="utf-8") as f:
    tables_trans = json.load(f)

tables_trans.extend(new_tables)

with open(TABLE_TRANS_PATH, "w", encoding="utf-8") as f:
    json.dump(tables_trans, f, indent=4, ensure_ascii=False)

In [17]:
# Append the newly created column entries to the column translation file.
# The file is read with the same encoding it is written with; without an
# explicit encoding the read falls back to the platform default and can
# corrupt non-ASCII characters on a read -> write round trip.
# NOTE: this cell is not idempotent - running it twice appends the entries twice.
with open(COLUMN_TRANS_PATH, encoding="utf-8") as f:
    columns_trans = json.load(f)

columns_trans.extend(new_columns)

with open(COLUMN_TRANS_PATH, "w", encoding="utf-8") as f:
    json.dump(columns_trans, f, indent=4, ensure_ascii=False)

## Translate schema

In [3]:
from translation.schema import DoubleSchemaTranslation
from common import save_json
from tqdm import tqdm

In [None]:
# Translate column names for the newly added databases and store the results
# on the column entries, checkpointing the column file as the loop advances.
with open(COLUMN_TRANS_PATH, encoding="utf-8") as f:
    column_entries = json.load(f)

with open(TABLE_TRANS_PATH, encoding="utf-8") as f:
    table_entries = json.load(f)

translation = DoubleSchemaTranslation()

# Only entries belonging to these databases are translated.
target_db_ids = ['new_pets_1', 'new_orchestra', 'new_concert_singer']

for i, column_entry in tqdm(enumerate(column_entries), total=len(column_entries)):
    if column_entry['db_id'] not in target_db_ids:
        continue

    # All table entries of the column's database, and the one table the column belongs to.
    db_table_entries = [te for te in table_entries if te['db_id'] == column_entry['db_id']]
    matching_tables = [
        te for te in db_table_entries
        if te['name_original'] == column_entry['table_name_original']
    ]
    if not matching_tables:
        raise IndexError(
            f"no table entry for db_id={column_entry['db_id']!r}, "
            f"table={column_entry['table_name_original']!r}"
        )
    table_entry = matching_tables[0]

    # The loop variable used to be rebound to the table entry, so the
    # translations were stored on the table entry and `column_entries`
    # (the list that is saved) was never updated. Keep the two apart.
    # NOTE(review): argument order (column entry, table entry, db tables) is
    # assumed from the call shape - confirm against translation.schema.
    column_entry['column_name_original_pl'] = translation.translate_column_name_original(
        column_entry, table_entry, db_table_entries
    )
    column_entry['column_name_pl'] = translation.translate_column_name(
        column_entry, table_entry, db_table_entries
    )

    # Periodic checkpoint so an interrupted run keeps its progress.
    if i % 50 == 0:
        save_json(COLUMN_TRANS_PATH, column_entries)

save_json(COLUMN_TRANS_PATH, column_entries)

In [8]:
# Write the in-memory table entries back to the table translation file.
save_json(TABLE_TRANS_PATH, table_entries)

In [None]:
# Translate table names for the newly added databases and store the results
# on the table entries, checkpointing the table file as the loop advances.
# The encoding is explicit so the read does not depend on the platform locale.
with open(TABLE_TRANS_PATH, encoding="utf-8") as f:
    table_entries = json.load(f)

translation = DoubleSchemaTranslation()

# Only entries belonging to these databases are translated.
target_db_ids = ['new_pets_1', 'new_orchestra', 'new_concert_singer']

for i, table_entry in tqdm(enumerate(table_entries), total=len(table_entries)):
    if table_entry['db_id'] not in target_db_ids:
        continue
    table_entry['name_original_pl'] = translation.translate_table_name_original(table_entry)
    table_entry['name_pl'] = translation.translate_table_name(table_entry)

    # Periodic checkpoint so an interrupted run keeps its progress.
    if i % 50 == 0:
        save_json(TABLE_TRANS_PATH, table_entries)

save_json(TABLE_TRANS_PATH, table_entries)

## Translate questions

In [9]:
from translation.samples import translate_question, translate_query
from tqdm import tqdm

In [2]:
from common import load_json, save_json

# Rewrite the spider_dk samples file so every sample has exactly the fields
# used by the translation cells. The file is overwritten in place, so any
# translation already present is kept; only missing Polish fields receive the
# "..." placeholder. This makes the cell safe to re-run after translating.
data = load_json('../components/samples/spider_dk/samples.json')

new_samples = [
    {
        "type": sample["type"],
        "db_id": sample["db_id"],
        "question": sample["question"],
        "question_pl": sample.get("question_pl", "..."),
        "query": sample["query"],
        "query_pl": sample.get("query_pl", "..."),
    }
    for sample in data
]
save_json('../components/samples/spider_dk/samples.json', new_samples)

In [10]:
# Fill in "query_pl" for every sample, writing the whole file back at every
# 50th index and once more after the loop finishes.
data = load_json('../components/samples/spider_dk/samples.json')

progress = tqdm(enumerate(data), total=len(data))
for idx, entry in progress:
    entry["query_pl"] = translate_query(entry["query"])
    checkpoint_due = (idx % 50 == 0)
    if checkpoint_due:
        save_json('../components/samples/spider_dk/samples.json', data)

save_json('../components/samples/spider_dk/samples.json', data)

100%|██████████| 535/535 [00:17<00:00, 31.26it/s] 


## Other

In [2]:
# Sample files used by the cells in this section.
# NOTE(review): these paths start with '../../' while the earlier cells reach
# the spider_dk samples file through '../' - confirm which working directory
# this section is meant to run from.
SPIDER_DK_PATH = '../../components/samples/spider_dk/samples.json'
SPIDER_PATH = '../../components/samples/spider/train_spider.json'

In [None]:
# Load the samples stored at SPIDER_DK_PATH. The encoding is explicit because
# this notebook writes the file as UTF-8 with non-ASCII characters unescaped,
# so a locale-dependent default could fail or mis-decode it.
# NOTE(review): the variable is named `spider_syn` although the path points at
# the spider_dk samples - confirm which dataset is intended.
with open(SPIDER_DK_PATH, encoding="utf-8") as f:
    spider_syn = json.load(f)

In [None]:
# Load the samples stored at SPIDER_PATH, decoding explicitly as UTF-8 so the
# result does not depend on the platform's default locale.
with open(SPIDER_PATH, encoding="utf-8") as f:
    spider = json.load(f)

In [None]:
# Build one output record per (spider_syn, spider) pair. Only the spider_syn
# sample contributes fields; the paired spider sample is not read, but the zip
# still limits the output to the length of the shorter list.
# Polish fields are filled with the "..." placeholder.
new_samples = [
    {
        "db_id": syn_sample["db_id"],
        "question_original_en": syn_sample["SpiderQuestion"],
        "question_original_pl": "...",
        "question_en": syn_sample["SpiderSynQuestion"],
        "question_pl": "...",
        "query_en": syn_sample["query"],
        "query_pl": "...",
    }
    for syn_sample, _ in zip(spider_syn, spider)
]

In [None]:
# Overwrite the file at SPIDER_DK_PATH with the rebuilt samples, keeping
# non-ASCII characters unescaped and using a 4-space indent.
with open(SPIDER_DK_PATH, "w", encoding="utf-8") as out_file:
    json.dump(new_samples, out_file, indent=4, ensure_ascii=False)

In [None]:
# Sanity check: show the sizes of the two loaded sample lists side by side.
len(spider_syn), len(spider)

(7000, 7000)