In [12]:
import json
import re
from getpass import getpass
from pprint import pprint
from random import choice, choices

import deepl
from tqdm import tqdm

In [520]:
# Read the API key with getpass so the secret is not echoed into the cell output
# (and therefore not stored in the saved notebook), unlike a plain input() prompt.
translator = deepl.Translator(getpass("Enter DeepL API key"))
    
def translate_sentence(question_en):
    """Translate English text to Polish with the module-level DeepL ``translator``.

    Args:
        question_en: English text to translate.

    Returns:
        The ``text`` attribute of the result returned by ``translator.translate_text``.
    """
    request_options = {
        'source_lang': "EN",
        'target_lang': "PL",
        'formality': 'prefer_less',
        'preserve_formatting': True,
    }
    result = translator.translate_text(question_en, **request_options)
    return result.text
    
def translate_phrase(value_en):
    """Translate a short English phrase to Polish without sentence splitting.

    Uses the module-level DeepL ``translator`` with ``split_sentences='off'``.

    Args:
        value_en: English phrase to translate.

    Returns:
        The ``text`` attribute of the result returned by ``translator.translate_text``.
    """
    request_options = {
        'source_lang': "EN",
        'target_lang': "PL",
        'preserve_formatting': True,
        'split_sentences': 'off',
        'formality': 'prefer_less',
    }
    result = translator.translate_text(value_en, **request_options)
    return result.text

## Simple names translation

In [13]:
# Load the English schema description; the cells below iterate over it as `data`.
with open('../../spider-en/tables.json', 'r') as f:
    data = json.loads(f.read())

In [17]:
# Flatten every database schema in `data` into two record lists:
#   tables  - one dict per table  (db_id, name, name_original)
#   columns - one dict per column (db_id, table names, column names, key info)
tables = []
columns = []

for entry in data:
    db_id = entry['db_id']
    tables_names = entry['table_names']
    tables_names_original = entry['table_names_original']
    column_names = entry['column_names']
    column_names_original = entry['column_names_original']
    primary_keys = entry['primary_keys']

    # Map the index of each referencing column to the name of the table
    # that owns the referenced column.
    foreign_keys = {
        source_idx: tables_names[column_names[target_idx][0]]
        for (source_idx, target_idx) in entry['foreign_keys']
    }

    for name, name_original in zip(tables_names, tables_names_original):
        tables.append({
            'db_id': db_id,
            'name': name,
            'name_original': name_original
        })

    for column_idx, ((table_idx, name), (_, column_name_original)) in enumerate(zip(column_names, column_names_original)):
        # Skip the '*' pseudo-column before its table index is used for lookups.
        if name == '*':
            continue

        columns.append({
            'db_id': db_id,
            # 'table_name' is read by the translation, conflict-check and prompt cells
            # further down; without it they fail with KeyError on a fresh run.
            'table_name': tables_names[table_idx],
            'table_name_original': tables_names_original[table_idx],
            'column_name': name,
            'column_name_original': column_name_original,
            'primary_key': column_idx in primary_keys,
            'foreign_key': foreign_keys.get(column_idx, '')
        })

In [690]:
def translate_name(name, container_name, other_container_name=None):
    """Translate ``name`` to Polish, using its container name(s) as disambiguating context.

    The name is sent to ``translate_sentence`` as ``"<name> (from <context>)"`` and the
    trailing parenthesised context is cut off the translated text again.

    Args:
        name: Text to translate.
        container_name: Name used as context; every character other than ASCII
            letters and spaces is replaced by a space before it is sent.
        other_container_name: Optional second context name, sanitised the same way.

    Returns:
        The translated text preceding the context parenthesis, stripped of
        surrounding whitespace. If the translation contains no ``(`` at all, the
        name is translated again without context and that result is returned.
    """
    def _letters_only(identifier):
        return re.sub(r'[^a-zA-Z ]', ' ', identifier)

    context = _letters_only(container_name)
    if other_container_name:
        context = f"{context} and {_letters_only(other_container_name)}"
    text_pl = translate_sentence(f"{name} (from {context})")

    # The context is the LAST parenthesised group; searching from the right keeps
    # parentheses that belong to the name itself (e.g. "height (cm)").
    paren_idx = text_pl.rfind('(')
    if paren_idx == -1:
        # The translation dropped the parenthesis: fall back to a context-free
        # translation instead of raising ValueError in the middle of a long batch.
        return translate_sentence(name).strip()
    return text_pl[:paren_idx].strip()

In [205]:
# Translate every column name (context: its table and database), saving partial
# results to disk while the loop runs.
path = '../auxiliary/translated_schema/columns_names.json'

for i, table in tqdm(enumerate(columns), total=len(columns)):
    name_pl = translate_name(
        table['column_name'],
        table['table_name'],
        table['db_id'],
    )
    table.update(column_name_pl=name_pl)

    # Checkpoint on every 50th record (including the very first one).
    if not i % 50:
        with open(path, 'w') as f:
            json.dump(columns, f, indent=4, ensure_ascii=False)

# Final write once all records carry a translation.
with open(path, 'w') as f:
    json.dump(columns, f, indent=4, ensure_ascii=False)

4503it [09:07,  8.23it/s]


In [213]:
# Translate every table name (context: its database), saving partial results to
# disk while the loop runs.
path = '../auxiliary/translated_schema/tables_names.json'

for i, table in tqdm(enumerate(tables), total=len(tables)):
    name_pl = translate_name(table['name'], table['db_id'])
    table.update(name_pl=name_pl)

    # Checkpoint on every 50th record (including the very first one).
    if not i % 50:
        with open(path, 'w') as f:
            json.dump(tables, f, indent=4, ensure_ascii=False)

# Final write once all records carry a translation.
with open(path, 'w') as f:
    json.dump(tables, f, indent=4, ensure_ascii=False)

100%|██████████| 876/876 [01:44<00:00,  8.38it/s]


### Checking suspicious translations

In [696]:
# Collect column names whose translation equals the English name (ignoring case),
# so they can be checked manually.
# This section deals with the natural-language names, so compare 'column_name'
# with 'column_name_pl'. The '*_original_pl' keys are only produced later in the
# notebook and are checked in their own cell.
with open('../auxiliary/translated_schema/columns_names.json', 'r') as f:
    columns = json.load(f)

suspicious_columns = sorted({
    column['column_name_pl']
    for column in columns
    if column['column_name'].lower() == column['column_name_pl'].lower()
})

In [None]:
# Print table names whose translation is identical to the original (ignoring case),
# so they can be checked manually.
with open('../auxiliary/translated_schema/tables_names.json', 'r') as f:
    tables = json.load(f)

for table in tables:
    if table['name'].lower() != table['name_pl'].lower():
        continue
    print(table['name'])

### Checking names conflicts

In [677]:
# Report translated column names that occur more than once within the same table.
with open('../auxiliary/translated_schema/columns_names.json', 'r') as f:
    columns = json.load(f)

db_ids = {column['db_id'] for column in columns}
for db_id in db_ids:
    tables_names = {column['table_name'] for column in columns if column['db_id'] == db_id}
    for table_name in tables_names:
        columns_names = [
            column['column_name_pl']
            for column in columns
            if column['db_id'] == db_id and column['table_name'] == table_name
        ]
        # A name is a duplicate when it appears at least twice in this table.
        duplicates = {name for name in columns_names if columns_names.count(name) > 1}
        if duplicates:
            print(f'Conflicting columns {duplicates} in table {table_name} in database {db_id}')

In [680]:
# Report translated table names that occur more than once within the same database.
with open('../auxiliary/translated_schema/tables_names.json', 'r') as f:
    tables = json.load(f)

# Derive the database ids from the tables just loaded, so the cell does not depend
# on `columns` left over from another cell.
db_ids = {table['db_id'] for table in tables}
for db_id in db_ids:
    tables_names = [table['name_pl'] for table in tables if table['db_id'] == db_id]
    duplicates = {name for name in tables_names if tables_names.count(name) > 1}
    if duplicates:
        # The conflict is between tables of one database; no single table is involved.
        print(f'Conflicting tables {duplicates} in database {db_id}')

## Original names translation

In [647]:
# Split columns into "simple" and "difficult" by the casing and length of the original name.
with open('../auxiliary/translated_schema/columns_names.json', 'r') as f:
    columns = json.load(f)

difficult_columns = []
simple_columns = []
for column in columns:
    name = column['column_name_original']
    # Every underscore-separated part is entirely upper-case, lower-case or capitalized.
    simple = all(
        part in (part.upper(), part.lower(), part.capitalize())
        for part in name.split('_')
    )

    # Names of three characters or fewer always count as difficult.
    if len(name) > 3 and (name == name.lower() or name == name.upper() or simple):
        simple_columns.append(column)
    else:
        difficult_columns.append(column)

print(f"{len(simple_columns)} / {len(difficult_columns)}")

3847 / 656


In [648]:
# Split tables into "simple" and "difficult" by the casing and length of the original name.
with open('../auxiliary/translated_schema/tables_names.json', 'r') as f:
    tables = json.load(f)

difficult_tables = []
simple_tables = []
for table in tables:
    name = table['name_original']
    # Every underscore-separated part is entirely upper-case, lower-case or capitalized.
    simple = all([part == part.upper() or part == part.lower() or part == part.capitalize() for part in name.split('_')])

    # Append the table being examined, not the `column` variable left over from
    # the columns cell.
    if (name == name.lower() or name == name.upper() or simple) and len(name) > 3:
        simple_tables.append(table)
    else:
        difficult_tables.append(table)

print(len(simple_tables), '/', len(difficult_tables))

862 / 14


In [698]:
# translate name
def translate_original_name(name, container, other_container=None):
    """Translate an underscore-separated identifier, keeping underscores and Title Case.

    Underscores are turned into spaces, the text is translated with ``translate_name``
    (using ``container`` / ``other_container`` as context), and spaces are turned back
    into underscores.

    Args:
        name: Original identifier, e.g. ``"Pet_Type"``.
        container: Context name forwarded to ``translate_name``.
        other_container: Optional second context name forwarded to ``translate_name``.

    Returns:
        The translated identifier; title-cased when every word of the input started
        with an upper-case (or uncased) character followed by lower-case characters.
    """
    natural_text_en = name.replace('_', ' ')
    # Slices instead of word[0]: leading, trailing or doubled underscores yield empty
    # words, which would otherwise raise IndexError.
    was_titled = all(
        word[:1] == word[:1].upper() and word[1:] == word[1:].lower()
        for word in natural_text_en.split(' ')
    )
    natural_text_pl = translate_name(natural_text_en, container, other_container)
    if was_titled:
        natural_text_pl = natural_text_pl.title()
    return natural_text_pl.replace(' ', "_")

In [671]:
# Translate the original (identifier-style) table names, saving partial results
# to disk while the loop runs.
path = '../auxiliary/translated_schema/tables_names.json'

for i, table in tqdm(enumerate(tables), total=len(tables)):
    name_pl = translate_original_name(table['name_original'], table['db_id'])
    table.update(name_original_pl=name_pl)

    # Checkpoint on every 50th record (including the very first one).
    if not i % 50:
        with open(path, 'w') as f:
            json.dump(tables, f, indent=4, ensure_ascii=False)

# Final write once all records carry a translation.
with open(path, 'w') as f:
    json.dump(tables, f, indent=4, ensure_ascii=False)

100%|██████████| 876/876 [01:36<00:00,  9.04it/s]


In [694]:
# Translate the original (identifier-style) column names, resuming from whatever
# is already stored in the JSON file.
path = '../auxiliary/translated_schema/columns_names.json'

with open(path, 'r') as f:
    columns = json.load(f)

for i, column in tqdm(enumerate(columns), total=len(columns)):
    # Records translated by a previous run are left untouched (and trigger no checkpoint).
    if 'column_name_original_pl' not in column:
        name_pl = translate_original_name(
            column['column_name_original'],
            column['table_name'],
            column['db_id'],
        )
        column['column_name_original_pl'] = name_pl

        # Checkpoint when the freshly translated record sits at a multiple of 50.
        if not i % 50:
            with open(path, 'w') as f:
                json.dump(columns, f, indent=4, ensure_ascii=False)

# Final write once all records carry a translation.
with open(path, 'w') as f:
    json.dump(columns, f, indent=4, ensure_ascii=False)

  0%|          | 0/4503 [00:00<?, ?it/s]

100%|██████████| 4503/4503 [01:43<00:00, 43.43it/s]   


In [765]:
# Collect the distinct original column names that came back from translation unchanged.
with open('../auxiliary/translated_schema/columns_names.json', 'r') as f:
    columns = json.load(f)

suspicious_columns = list({
    column['column_name_original_pl']
    for column in columns
    if column['column_name_original_pl'] == column['column_name_original']
})
len(suspicious_columns)

411

In [764]:
# Show 10 randomly drawn suspicious names for a manual spot check.
# random.choices samples with replacement, so the same name can appear twice.
print(choices(suspicious_columns, k=10))

['bfp', 'buildUpPlayDribblingClass', 'gid', 'eid', 'chanceCreationPassing', 'sec_id', 'defenceTeamWidthClass', 'status', 'InvoiceId', 'AId']


In [767]:
# Mark every original column name that came back from translation unchanged by
# appending '?' to the stored translation.
for column in columns:
    if column['column_name_original'] == column['column_name_original_pl']:
        column['column_name_original_pl'] += '?'

# Write to the columns file explicitly. The shared `path` variable holds whatever
# the last executed translation cell set (possibly the tables file), and relying on
# it could overwrite the wrong JSON file.
with open('../auxiliary/translated_schema/columns_names.json', 'w') as f:
    json.dump(columns, f, indent=4, ensure_ascii=False)

In [769]:
# Where the natural-language column name is just the original identifier in lower
# case with spaces, derive its translation from the translated identifier as well.
with open('../auxiliary/translated_schema/columns_names.json', 'r') as f:
    columns = json.load(f)

for column in columns:
    if column['column_name_original'].lower().replace('_', ' ') != column['column_name'].lower():
        continue
    column.update(column_name_pl=column['column_name_original_pl'].lower().replace('_', ' '))

with open('../auxiliary/translated_schema/columns_names.json', 'w') as f:
    json.dump(columns, f, indent=4, ensure_ascii=False)

In [773]:
def translate_original_name(name):
    """Translate an underscore-separated identifier without any context.

    This rebinds ``translate_original_name``, replacing the three-parameter variant
    defined earlier in the notebook; here the text is translated with
    ``translate_phrase``.

    Args:
        name: Original identifier (or a fragment of one), e.g. ``"Pet_Type"``.

    Returns:
        The translated text with spaces replaced by underscores; title-cased when
        every word of the input started with an upper-case (or uncased) character
        followed by lower-case characters.
    """
    natural_text_en = name.replace('_', ' ')
    # Slices instead of word[0]: an empty fragment or one ending in '_' yields empty
    # words, which would otherwise raise IndexError.
    was_titled = all(
        word[:1] == word[:1].upper() and word[1:] == word[1:].lower()
        for word in natural_text_en.split(' ')
    )
    natural_text_pl = translate_phrase(natural_text_en)
    if was_titled:
        natural_text_pl = natural_text_pl.title()
    return natural_text_pl.replace(' ', "_")

In [774]:
# Fix the _ID issue: when the original name ends in "_id" but the translation starts
# with "id", translate the part before the suffix again and re-attach the original suffix.
with open('../auxiliary/translated_schema/columns_names.json', 'r') as f:
    columns = json.load(f)

for column in columns:
    if not (
        column['column_name_original'].lower().endswith('_id')
        and column['column_name_original_pl'].lower().startswith('id')
    ):
        continue
    # The last three characters are the "_id" suffix in its original casing.
    column['column_name_original_pl'] = (
        translate_original_name(column['column_name_original'][:-3])
        + column['column_name_original'][-3:]
    )
    print('.', end='')

with open('../auxiliary/translated_schema/columns_names.json', 'w') as f:
    json.dump(columns, f, indent=4, ensure_ascii=False)

......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [None]:
# Check for conflicts between translated original table names within one database.
with open('../auxiliary/translated_schema/tables_names.json', 'r') as f:
    tables = json.load(f)

# Derive the database ids from the tables just loaded, so the cell does not depend
# on `columns` left over from another cell.
db_ids = {table['db_id'] for table in tables}
for db_id in db_ids:
    tables_names = [table['name_original_pl'] for table in tables if table['db_id'] == db_id]
    duplicates = {name for name in tables_names if tables_names.count(name) > 1}
    if duplicates:
        # The conflict is between tables of one database; no single table is involved.
        print(f'Conflicting tables {duplicates} in database {db_id}')

In [526]:
# Show the original names of 10 randomly drawn "simple" columns
# (random.choices samples with replacement, so repeats are possible).
print([column['column_name_original'] for column in choices(simple_columns, k=10)])

['line_2', 'Gold', 'resident_id', 'asessment_outcome_code', 'Mountain_ID', 'g_2b', 'player_id', 'All_Games_Percent', 'Profits_in_Billion', 'customer_id']


In [440]:
# Show the original names of 10 randomly drawn "difficult" columns
# (random.choices samples with replacement, so repeats are possible).
print([column['column_name_original'] for column in choices(difficult_columns, k=10)])

['sh', 'Id', 'id', 'seq', 'buildUpPlaySpeed', 'g', 'h', 'w', 'bpf', 'PetType']


In [None]:
# for GPT
# Print 100 prompt lines of the form
#   <original> (<name> - <name_pl>) from "<table>" (<table_pl>) [in relation with ...] ->
# for randomly drawn simple columns.
# Translations are keyed by (db_id, table name): the same table name can occur in
# several databases, and a name-only key would silently use whichever was seen last.
table_names_translations = {
    (table['db_id'], table['name']): table['name_pl']
    for table in tables
}
batch = choices(simple_columns, k=100)
for column in batch:
    table_name_pl = table_names_translations[(column['db_id'], column['table_name'])]
    text = f"{column['column_name_original']} ({column['column_name']} - {column['column_name_pl']}) from \"{column['table_name']}\" ({table_name_pl})"
    if column['foreign_key']:
        # 'foreign_key' holds the name of the referenced table in the same database.
        foreign_key_pl = table_names_translations[(column['db_id'], column['foreign_key'])]
        text += f" in relation with \"{column['foreign_key']}\" ({foreign_key_pl})"
    text += ' -> '
    print(text)

## Auxiliary

In [353]:
def get_columns_from_table(db_id, table_name):
    """Return the column names of one table, read from the English ``tables.json``.

    The schema file is re-read on every call.

    Args:
        db_id: Value of the ``db_id`` field of the database entry to look in.
        table_name: Table name as listed in that entry's ``table_names``.

    Returns:
        List with the second element of every ``column_names`` item whose first
        element is the index of ``table_name``.

    Raises:
        IndexError: If no entry has the given ``db_id``.
        ValueError: If ``table_name`` is not in the entry's ``table_names``.
    """
    with open('../../spider-en/tables.json', 'r') as f:
        schemas = json.load(f)

    matching = [schema for schema in schemas if schema['db_id'] == db_id]
    schema = matching[0]
    wanted_idx = schema['table_names'].index(table_name)

    result = []
    for column in schema['column_names']:
        if column[0] == wanted_idx:
            result.append(column[1])
    return result

In [768]:
# Example: list the column names recorded for table 'tracks' of database 'store_1'.
get_columns_from_table('store_1', 'tracks')

['id',
 'name',
 'album id',
 'media type id',
 'genre id',
 'composer',
 'milliseconds',
 'bytes',
 'unit price']