In [190]:
import json
import re
from collections import Counter
from getpass import getpass
from pprint import pprint

import deepl
from tqdm import tqdm

In [None]:
# Ask for the DeepL API key with getpass instead of input(), so the secret is
# not echoed back into the notebook's cell output.
translator = deepl.Translator(getpass("Enter DeepL API key"))
    
def translate_sentence(question_en):
    """Translate an English text to Polish using the module-level `translator`.

    The request is sent with source language EN, target language PL,
    formality 'prefer_less' and formatting preservation enabled.
    Returns the `.text` attribute of the translation result.
    """
    options = {
        'source_lang': "EN",
        'target_lang': "PL",
        'formality': 'prefer_less',
        'preserve_formatting': True,
    }
    result = translator.translate_text(question_en, **options)
    return result.text

## Simple names translation

In [374]:
# Load the schema descriptions. JSON text is UTF-8, so the encoding is pinned
# instead of relying on the platform default.
with open('../../spider-en/tables.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [222]:
# Flatten every database schema in `data` into two lists of records:
# one record per table and one per column (the '*' pseudo-column is skipped).
tables = []
columns = []

for schema in data:
    db_id = schema['db_id']
    tables_names = schema['table_names']
    column_names = schema['column_names']
    primary_keys = schema['primary_keys']

    # Map the index of a referencing column to the name of the table that
    # owns the referenced column.
    foreign_keys = {
        source_idx: tables_names[column_names[target_idx][0]]
        for source_idx, target_idx in schema['foreign_keys']
    }

    tables.extend(
        {'db_id': db_id, 'name': name, 'name_original': name_original}
        for name, name_original in zip(tables_names, schema['table_names_original'])
    )

    paired_columns = zip(column_names, schema['column_names_original'])
    for column_idx, ((table_idx, name), (_, column_name_original)) in enumerate(paired_columns):
        if name == '*':
            continue

        columns.append({
            'db_id': db_id,
            'table_name': tables_names[table_idx],
            'column_name': name,
            'column_name_original': column_name_original,
            'primary_key': column_idx in primary_keys,
            'foreign_key': foreign_keys.get(column_idx, ''),
        })

In [204]:
def translate_name(name, container_name, other_container_name=None, translate=None):
    """Translate a schema name, passing its container(s) along as context.

    The text sent to the translator is "<name> (from <container>)" or
    "<name> (from <container> and <other container>)"; everything from the
    first '(' of the translated text onwards is then cut off.

    Args:
        name: The name to translate.
        container_name: Name of the enclosing object; every character other
            than an ASCII letter or a space is replaced with a space.
        other_container_name: Optional second container, sanitised the same way.
        translate: Optional callable mapping a text to its translation.
            Defaults to `translate_sentence`.

    Returns:
        The translated name, stripped of surrounding whitespace.
    """
    if translate is None:
        translate = translate_sentence

    containers = [container_name]
    if other_container_name:
        containers.append(other_container_name)
    context = ' and '.join(re.sub(r'[^a-zA-Z ]', ' ', container) for container in containers)

    text_pl = translate(f"{name} (from {context})")
    translated_name, paren, _ = text_pl.partition('(')
    if not paren:
        # The translator dropped the parenthesis, so the context cannot be cut
        # off. Translate the bare name instead of raising ValueError mid-run.
        return translate(name).strip()
    return translated_name.strip()

In [205]:
# translate columns
path = '../auxiliary/translated_schema/columns_names.json'

for i, column in tqdm(enumerate(columns), total=len(columns)):
    # The table name and the database id are passed as context for the translator.
    column['column_name_pl'] = translate_name(column['column_name'], column['table_name'], column['db_id'])

    # Checkpoint regularly so an interrupted run keeps the finished translations.
    # The output holds non-ASCII text (ensure_ascii=False), so UTF-8 is pinned
    # instead of relying on the platform default encoding.
    if i % 50 == 0:
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(columns, f, indent=4, ensure_ascii=False)

with open(path, 'w', encoding='utf-8') as f:
    json.dump(columns, f, indent=4, ensure_ascii=False)

4503it [09:07,  8.23it/s]


In [213]:
# translate tables
path = '../auxiliary/translated_schema/tables_names.json'

for i, table in tqdm(enumerate(tables), total=len(tables)):
    # The database id is passed as context for the translator.
    table['name_pl'] = translate_name(table['name'], table['db_id'])

    # Checkpoint regularly so an interrupted run keeps the finished translations.
    # The output holds non-ASCII text (ensure_ascii=False), so UTF-8 is pinned
    # instead of relying on the platform default encoding.
    if i % 50 == 0:
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(tables, f, indent=4, ensure_ascii=False)

with open(path, 'w', encoding='utf-8') as f:
    json.dump(tables, f, indent=4, ensure_ascii=False)

100%|██████████| 876/876 [01:44<00:00,  8.38it/s]


### Checking suspicious translations

In [None]:
# Collect the column names whose translation is the same as the original
# (compared case-insensitively) - to check them manually.
with open('../auxiliary/translated_schema/columns_names.json', 'r', encoding='utf-8') as f:
    columns = json.load(f)

# Deduplicated and sorted, so the list comes out in the same order on every run.
suspicious_columns = sorted({
    column['column_name_pl']
    for column in columns
    if column['column_name'].lower() == column['column_name_pl'].lower()
})

In [None]:
# Print the table names whose translation is the same as the original
# (compared case-insensitively) - to check them manually.
with open('../auxiliary/translated_schema/tables_names.json', 'r', encoding='utf-8') as f:
    tables = json.load(f)

for table in tables:
    if table['name'].lower() == table['name_pl'].lower():
        print(table['name'])

### Checking name conflicts

In [392]:
# Report translated column names that occur more than once within one table.
with open('../auxiliary/translated_schema/columns_names.json', 'r', encoding='utf-8') as f:
    columns = json.load(f)

# Group the translated names per (database, table) in a single pass instead of
# rescanning the whole column list for every database and table.
names_by_table = {}
for column in columns:
    key = (column['db_id'], column['table_name'])
    names_by_table.setdefault(key, []).append(column['column_name_pl'])

for (db_id, table_name), columns_names in names_by_table.items():
    duplicates = {name for name, count in Counter(columns_names).items() if count > 1}
    if duplicates:
        print(f'Conflicting columns {duplicates} in table {table_name} in database {db_id}')

In [397]:
# Report translated table names that occur more than once within one database.
with open('../auxiliary/translated_schema/tables_names.json', 'r', encoding='utf-8') as f:
    tables = json.load(f)

# Group by the db_id stored on the table records themselves, so this cell does
# not depend on `columns` being loaded.
names_by_db = {}
for table in tables:
    names_by_db.setdefault(table['db_id'], []).append(table['name_pl'])

for db_id, tables_names in names_by_db.items():
    duplicates = {name for name, count in Counter(tables_names).items() if count > 1}
    if duplicates:
        # The conflict is between tables of one database; no single table is involved.
        print(f'Conflicting tables {duplicates} in database {db_id}')

## Original names translation

In [251]:
from random import choice, choices

In [240]:
# Reload the translated column and table records. The files contain non-ASCII
# text, so UTF-8 is pinned instead of relying on the platform default encoding.
with open('../auxiliary/translated_schema/columns_names.json', 'r', encoding='utf-8') as f:
    columns = json.load(f)

with open('../auxiliary/translated_schema/tables_names.json', 'r', encoding='utf-8') as f:
    tables = json.load(f)

In [319]:
def _is_simple_name(name):
    """Tell whether an original column name counts as "simple".

    A name is simple when it is longer than 3 characters and either is
    entirely lower/upper case or every '_'-separated part is upper case,
    lower case or capitalized.
    """
    uniform_case = name == name.lower() or name == name.upper()
    regular_parts = all(
        part in (part.upper(), part.lower(), part.capitalize())
        for part in name.split('_')
    )
    return (uniform_case or regular_parts) and len(name) > 3


# Split the columns into the two buckets, keeping the original order in each.
difficult_columns = []
simple_columns = []
for column in columns:
    bucket = simple_columns if _is_simple_name(column['column_name_original']) else difficult_columns
    bucket.append(column)

print(len(simple_columns), '/', len(difficult_columns))

3847 / 656


In [320]:
# Peek at ten randomly drawn "simple" original names (choices() draws with replacement).
sampled_simple = choices(simple_columns, k=10)
print([sampled['column_name_original'] for sampled in sampled_simple])

['Start_Date', 'msid', 'catalog_entry_id', 'Claim_Status_Code', 'candidate_id', 'Major', 'amenity_name', 'next_entry_id', 'Date_Payment_Made', 'Competition_ID']


In [321]:
# Peek at ten randomly drawn "difficult" original names (choices() draws with replacement).
sampled_difficult = choices(difficult_columns, k=10)
print([sampled['column_name_original'] for sampled in sampled_difficult])

['Aug', 'CheckIn', 'Age', 'ID', 'CheckIn', 'id', 'lat', 'cs', 'eid', 'g_c']


In [332]:
# Display the first column record as a reminder of the fields each entry carries.
columns[0]

{'db_id': 'perpetrator',
 'table_name': 'perpetrator',
 'column_name': 'perpetrator id',
 'column_name_original': 'Perpetrator_ID',
 'primary_key': True,
 'foreign_key': '',
 'column_name_pl': 'identyfikator sprawcy'}

In [335]:
# Lookup from a table's English name to its translated name.
# NOTE(review): keyed by the name alone - if two databases share a table name,
# the later record overwrites the earlier one.
table_names_translations = dict((table['name'], table['name_pl']) for table in tables)

In [None]:
# Print a random batch of "difficult" columns with their context for manual review.
# The table translation is looked up by (db_id, table name): a bare table name
# may occur in more than one database, and each database has its own translation.
table_translation_by_db = {(table['db_id'], table['name']): table['name_pl'] for table in tables}

batch = choices(difficult_columns, k=100)
for column in batch:
    table_name_pl = table_translation_by_db[(column['db_id'], column['table_name'])]
    text = f"{column['column_name_original']} ({column['column_name']} - {column['column_name_pl']}) from {column['table_name']} ({table_name_pl})"
    print(text)

## Auxiliary

In [353]:
def get_columns_from_table(db_id, table_name, tables_path='../../spider-en/tables.json'):
    """Return the column names of one table of one database.

    Args:
        db_id: Value of the 'db_id' field identifying the database.
        table_name: Name of the table as listed under 'table_names'.
        tables_path: Path of the JSON file holding the schema descriptions.

    Returns:
        The names (second item of each 'column_names' entry) of the columns
        whose table index (first item) points at `table_name`, in file order.

    Raises:
        IndexError: If the file contains no database with the given `db_id`.
        ValueError: If the database has no table called `table_name`.
    """
    with open(tables_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    db = next((candidate for candidate in data if candidate['db_id'] == db_id), None)
    if db is None:
        # Same exception type as indexing an empty list, but with a useful message.
        raise IndexError(f'database {db_id!r} not found in {tables_path}')

    table_idx = db['table_names'].index(table_name)
    return [column[1] for column in db['column_names'] if column[0] == table_idx]

In [None]:
# Example: show the columns listed for table 'clients' in database 'cre_Drama_Workshop_Groups'.
get_columns_from_table('cre_Drama_Workshop_Groups', 'clients')