In [1]:
from pathlib import Path

from common import load_json, Dataset

## Comparing translations

In [None]:
TRANSLATIONS_DIR = Path('../components/schema_trans')

In [26]:
def find_differences_in_translations(path1, path2, attribute_name):
    differences = []
    
    translations1 = load_json(path1)
    translations2 = load_json(path2)
    
    for translation1, translation2 in zip(translations1, translations2):
        value1 = translation1[attribute_name]
        value2 = translation2[attribute_name]
        if value1 != value2:
            differences.append((value1, value2))
            
    translations_count = len(translations1)
    return differences, translations_count

In [27]:
def compare_translations(path1, path2):
    column_diff, column_count = find_differences_in_translations(
        path1 / 'column_trans.json',
        path2 / 'column_trans.json',
        'column_name_original_pl'
    )

    table_diff, table_count = find_differences_in_translations(
        path1 / 'table_trans.json',
        path2 / 'table_trans.json',
        'name_original_pl'
    )

    print('Column differences:', f'{len(column_diff)} / {column_count} ({len(column_diff)/column_count:.3f}%)')
    print('Table differences:', f'{len(table_diff)} / {table_count} ({len(table_diff)/table_count:.3f}%)')

In [28]:
# context vs nocontext

compare_translations(
    TRANSLATIONS_DIR / 'context',
    TRANSLATIONS_DIR / 'nocontext'
)

Column differences: 851 / 4563 (0.187%)
Table differences: 193 / 888 (0.217%)


In [30]:
# context vs context_curated

compare_translations(
    TRANSLATIONS_DIR / 'context',
    TRANSLATIONS_DIR / 'context_curated'
)

Column differences: 615 / 4563 (0.135%)
Table differences: 77 / 888 (0.087%)


## Counting complex queries

In [2]:
import sqlparse

In [3]:
def has_scopes(sql):
    tokens = list(sqlparse.parse(sql)[0].flatten())
    tokens = [token.value.upper() for token in tokens]
    return tokens.count('SELECT') > 1

In [4]:
def count_samples_with_scopes(dataset_name):
    dataset = Dataset.load_by_name(dataset_name)
    
    complex_count = 0
    total_count = 0
    
    for _, samples in dataset.splits_dict.items():
        if samples is None:
            continue
        for sample in samples:
            if has_scopes(sample.query['en']):
                complex_count += 1
            total_count += 1
            
    percent = complex_count / total_count * 100
    print(f'{complex_count} / {total_count} ({percent:.1f}%)')

In [5]:
count_samples_with_scopes('spider')

1509 / 9693 (15.6%)
