# 13_check_data

Check error 1: FRMT has no rows where `text_pt_br` or `text_pt_pt` are empty (null or whitespace) in train/validation/test.


In [1]:
from pathlib import Path
import duckdb
import pandas as pd

def find_project_db(start: Path | None = None, max_up: int = 6) -> Path:
    p = (start or Path.cwd()).resolve()
    for _ in range(max_up + 1):
        cand = p / 'data' / 'duckdb' / 'subs_project.duckdb'
        if cand.exists():
            return cand
        p = p.parent
    raise FileNotFoundError('Could not find data/duckdb/subs_project.duckdb by walking up from cwd.')

PROJECT_DB = find_project_db()
SOURCE_DB = PROJECT_DB.parent / 'subs.duckdb'
print('PROJECT_DB:', PROJECT_DB)
print('SOURCE_DB:', SOURCE_DB)


PROJECT_DB: /home/laiarodrigo/repos/Thesis/data/duckdb/subs_project.duckdb
SOURCE_DB: /home/laiarodrigo/repos/Thesis/data/duckdb/subs.duckdb


In [4]:
con = duckdb.connect(str(PROJECT_DB), read_only=True)

if SOURCE_DB.exists():
    try:
        con.execute(f"ATTACH '{SOURCE_DB.as_posix()}' AS src")
    except Exception as e:
        print('ATTACH skipped:', e)

tables = con.execute("""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema NOT IN ('information_schema', 'pg_catalog')
    ORDER BY table_name
""").df()['table_name'].tolist()

print('tables/views:', tables)
tables = ['test_data', 'train_data']


ATTACH skipped: Binder Error: Unique file handle conflict: Database "src" is already attached with path "/home/laiarodrigo/repos/Thesis/data/duckdb/subs.duckdb", 
tables/views: ['_gold_src_with_rn', 'all_data', 'frmt_dev', 'frmt_dev_clean_v', 'frmt_dev_split_v', 'frmt_test', 'frmt_test_clean_v', 'gold_seen', 'gold_test', 'gold_test_v', 'movies', 'opus_cleaned', 'opus_deleted_log', 'opus_filter_simple', 'opus_foreign_blocks', 'opus_moses', 'opus_moses_backup', 'opus_moses_filtered', 'opus_moses_preview', 'opus_ops_delete', 'opus_ops_progress', 'opus_ops_update', 'opus_replacements', 'opus_source', 'ptbr_processed_v', 'ptbr_split_assign_v', 'ptbr_unique_v', 'ptbrvarid', 'ptbrvarid_jt_dropped_examples', 'ptbrvarid_metrics', 'ptbrvarid_processed_v', 'ptbrvarid_seen', 'ptbrvarid_split_v', 'ptbrvarid_text_v', 'ptbrvarid_v', 'ptbrvid_repaired_v', 'ptbrvid_repaired_v', 'subtitle_pairs', 'subtitle_pairs_2', 'test_data', 'test_data', 'test_pairs_guard', 'train_data', 'train_data']


In [5]:
def table_cols(name: str) -> set[str]:
    info = con.execute(f"PRAGMA table_info('{name}')").df()
    return set(info['name'].tolist())

required = {'dataset', 'text_pt_br', 'text_pt_pt'}
candidates = []
for t in tables:
    cols = table_cols(t)
    if required.issubset(cols):
        candidates.append(t)

split_candidates = [t for t in candidates if 'split' in table_cols(t)]

print('candidates with dataset/text_pt_br/text_pt_pt:', candidates)
print('candidates with split column:', split_candidates)


candidates with dataset/text_pt_br/text_pt_pt: ['test_data', 'train_data']
candidates with split column: ['test_data', 'train_data']


## Quick sanity: sample rows from train_data/test_data

Show 2 rows per dataset and per split to confirm columns and content.


In [1]:
expected_cols = [
    'dataset', 'split', 'source', 'bucket', 'theme', 'domain', 'label',
    'text_pt_br', 'text_pt_pt', 'ref_pt_pt_manual', 'ref_pt_pt_deepl'
]

def sample_rows(table: str) -> pd.DataFrame:
    cols = table_cols(table)
    missing = [c for c in expected_cols if c not in cols]
    print(f"\n[{table}] columns={len(cols)} missing={missing}")

    has_split = 'split' in cols
    select_parts = []
    if 'dataset' in cols:
        select_parts.append('dataset')
    select_parts.append('split' if has_split else "'test' AS split")
    for c in ['source','bucket','theme','domain','label','text_pt_br','text_pt_pt','ref_pt_pt_manual','ref_pt_pt_deepl']:
        if c in cols:
            select_parts.append(c)

    select_list = ', '.join(select_parts)

    q = f"""
        WITH base AS (
          SELECT {select_list}
          FROM {table}
        ),
        ranked AS (
          SELECT *, row_number() OVER (PARTITION BY dataset, split ORDER BY random()) AS rn
          FROM base
        )
        SELECT * FROM ranked
        WHERE rn <= 2
        ORDER BY dataset, split, rn
    """
    return con.execute(q).df()

train_sample = sample_rows('train_data')
display(train_sample)

test_sample = sample_rows('test_data')
display(test_sample)


NameError: name 'pd' is not defined

In [2]:
# Override these if needed
RELATION_WITH_SPLIT = tables[1]  # e.g., 'all_data' or any table/view with a split column
SPLIT_RELATIONS = {}  # e.g., {'train': 'train_data', 'validation': 'val_data', 'test': 'test_data'}

if RELATION_WITH_SPLIT is None and split_candidates:
    RELATION_WITH_SPLIT = split_candidates[0]

if not RELATION_WITH_SPLIT and not SPLIT_RELATIONS:
    # Try to infer per-split relations from names
    for t in candidates:
        name = t.lower()
        if 'train' in name:
            SPLIT_RELATIONS.setdefault('train', t)
        elif 'val' in name or 'valid' in name or 'validation' in name:
            SPLIT_RELATIONS.setdefault('validation', t)
        elif 'test' in name:
            SPLIT_RELATIONS.setdefault('test', t)

print('RELATION_WITH_SPLIT:', RELATION_WITH_SPLIT)
print('SPLIT_RELATIONS:', SPLIT_RELATIONS)


NameError: name 'tables' is not defined

In [14]:
EMPTY_BR = '(text_pt_br IS NULL OR length(trim(text_pt_br)) = 0)'
EMPTY_PT = '(text_pt_pt IS NULL OR length(trim(text_pt_pt)) = 0)'
ANY_EMPTY = f'({EMPTY_BR} OR {EMPTY_PT})'
BOTH_EMPTY = f'({EMPTY_BR} AND {EMPTY_PT})'

def query_counts_from_relation(relation: str, split_value: str | None = None) -> pd.DataFrame:
    where = "dataset = 'FRMT'"
    if split_value is not None:
        where += f" AND lower(split) = '{split_value.lower()}'"
        split_expr = f"'{split_value}'"
    else:
        split_expr = 'lower(split)'
    q = f"""
        SELECT
          {split_expr} AS split,
          count(*) AS n_total,
          sum(CASE WHEN {EMPTY_BR} THEN 1 ELSE 0 END) AS n_empty_br,
          sum(CASE WHEN {EMPTY_PT} THEN 1 ELSE 0 END) AS n_empty_pt,
          sum(CASE WHEN {ANY_EMPTY} THEN 1 ELSE 0 END) AS n_any_empty,
          sum(CASE WHEN {BOTH_EMPTY} THEN 1 ELSE 0 END) AS n_both_empty
        FROM {relation}
        WHERE {where}
        GROUP BY 1
        ORDER BY 1
    """
    return con.execute(q).df()

def query_examples_from_relation(relation: str, split_value: str | None = None, limit: int = 20) -> pd.DataFrame:
    where = "dataset = 'FRMT' AND (" + ANY_EMPTY + ")"
    if split_value is not None:
        where += f" AND lower(split) = '{split_value.lower()}'"
    q = f"""
        SELECT dataset, split, text_pt_br, text_pt_pt
        FROM {relation}
        WHERE {where}
        LIMIT {limit}
    """
    return con.execute(q).df()


In [15]:
if RELATION_WITH_SPLIT:
    # Single relation that includes a split column
    res = query_counts_from_relation(RELATION_WITH_SPLIT)
else:
    # Separate relations for each split
    frames = []
    for split_name, rel in SPLIT_RELATIONS.items():
        frames.append(query_counts_from_relation(rel, split_value=split_name))
    res = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

display(res)

if not res.empty and (res['n_any_empty'] > 0).any():
    print('Found empty text rows. Showing examples:')
    if RELATION_WITH_SPLIT:
        display(query_examples_from_relation(RELATION_WITH_SPLIT))
    else:
        for split_name, rel in SPLIT_RELATIONS.items():
            df_ex = query_examples_from_relation(rel, split_value=split_name)
            if not df_ex.empty:
                print('Split:', split_name)
                display(df_ex)
else:
    print('OK: no empty text rows found for FRMT in the selected splits.')


Unnamed: 0,split,n_total,n_empty_br,n_empty_pt,n_any_empty,n_both_empty
0,train,2502,0.0,0.0,0.0,0.0
1,valid,20,0.0,0.0,0.0,0.0


OK: no empty text rows found for FRMT in the selected splits.


## Check identical pt-BR/pt-PT pairs by dataset/split (excluding PtBrVId)


In [None]:
# Normalize for comparison
def _norm_sql(col: str) -> str:
    return f"lower(trim({col}))"

# train_data (train/valid)
train_same = con.execute("""
    SELECT
        split,
        dataset,
        COUNT(*) AS n_rows,
        SUM(CASE
              WHEN text_pt_br IS NOT NULL AND text_pt_pt IS NOT NULL
               AND length(trim(text_pt_br))>0 AND length(trim(text_pt_pt))>0
               AND lower(trim(text_pt_br)) = lower(trim(text_pt_pt))
              THEN 1 ELSE 0 END) AS n_same
    FROM train_data
    WHERE dataset <> 'PtBrVId'
    GROUP BY split, dataset
    ORDER BY split, dataset
""").df()

# test_data (test)
test_same = con.execute("""
    SELECT
        'test' AS split,
        dataset,
        COUNT(*) AS n_rows,
        SUM(CASE
              WHEN text_pt_br IS NOT NULL AND text_pt_pt IS NOT NULL
               AND length(trim(text_pt_br))>0 AND length(trim(text_pt_pt))>0
               AND lower(trim(text_pt_br)) = lower(trim(text_pt_pt))
              THEN 1 ELSE 0 END) AS n_same
    FROM test_data
    WHERE dataset <> 'PtBrVId'
    GROUP BY dataset
    ORDER BY dataset
""").df()

import pandas as pd
same_stats = pd.concat([train_same, test_same], ignore_index=True)
same_stats['pct_same'] = (same_stats['n_same'] / same_stats['n_rows'] * 100).round(3)
same_stats
