**//IMPORTS**

In [1]:
import duckdb
import pandas as pd, pathlib, itertools, textwrap

**//CONFIGS**

In [2]:
DB_PATH = '../data/duckdb/subs.duckdb'
pd.set_option("display.max_colwidth", None)


**//MAIN CODE**

In [3]:
con = duckdb.connect(pathlib.Path('../data/duckdb/subs.duckdb'), read_only=True)
con.execute("SELECT table_name FROM information_schema.tables WHERE table_schema='main'").fetchall()
con.close()


In [4]:

import duckdb, pathlib, pandas as pd

PROJECT_DB_PATH = pathlib.Path("../data/duckdb/subs_project.duckdb")
SOURCE_DB_PATH  = pathlib.Path("../data/duckdb/subs.duckdb")

PROJECT_DB_STR = PROJECT_DB_PATH.as_posix()
SOURCE_DB_STR  = SOURCE_DB_PATH.as_posix()

def connect_project(read_only: bool = True) -> duckdb.DuckDBPyConnection:
    con = duckdb.connect(PROJECT_DB_STR, read_only=read_only)
    dbl = con.execute("PRAGMA database_list").df()
    if not (dbl["name"] == "src").any():
        con.execute(f"ATTACH '{SOURCE_DB_STR}' AS src")
    return con


con = connect_project(read_only=True)

print(
    con.execute("""
        SELECT COUNT(*) AS n_frmt_train
        FROM train_data
        WHERE dataset = 'FRMT'
    """).df()
)

print(
    con.execute("""
        SELECT COUNT(*) AS n_frmt_test
        FROM test_data
        WHERE dataset = 'FRMT'
    """).df()
)

con.close()


   n_frmt_train
0          3935
   n_frmt_test
0         3987


**//LOAD TEST DATASETS**

In [5]:
# --- SETUP / CONNECT ---
import duckdb, pathlib, pandas as pd
pd.set_option("display.max_colwidth", 180)

PROJECT_DB_PATH = pathlib.Path("../data/duckdb/subs_project.duckdb")
SOURCE_DB_PATH  = pathlib.Path("../data/duckdb/subs.duckdb")

PROJECT_DB_PATH.parent.mkdir(parents=True, exist_ok=True)
con = duckdb.connect(PROJECT_DB_PATH.as_posix())   # read/write OK

# Attach source DB (idempotent)
try:
    con.execute(f"ATTACH '{SOURCE_DB_PATH.as_posix()}' AS src")
except duckdb.BinderException:
    pass

def tbl_exists(q: str) -> bool:
    try:
        con.execute(f"SELECT 1 FROM {q} LIMIT 1")
        return True
    except Exception:
        return False

def lo(c: str) -> str:
    return f"lower(coalesce({c},''))"

print("✓ Connected. src attached:", SOURCE_DB_PATH.exists())

# Find the ptbrvarid source table in subs.duckdb
src_ptbr = None
for cand in ["src.main.ptbrvarid", "src.ptbrvarid", "src.main.PtBrVarId", "src.PtBrVarId"]:
    if tbl_exists(cand):
        src_ptbr = cand
        break
if not src_ptbr:
    raise RuntimeError("Cannot find ptbrvarid in subs.duckdb (tried src.main.ptbrvarid, src.ptbrvarid, ...).")

print("Using ptbrvarid source:", src_ptbr)


✓ Connected. src attached: True
Using ptbrvarid source: src.main.ptbrvarid


In [6]:
from datasets import load_dataset
import duckdb, pathlib, pandas as pd

PROJECT_DB_PATH = pathlib.Path("../data/duckdb/subs_project.duckdb")
con = duckdb.connect(PROJECT_DB_PATH.as_posix(), read_only=False)

# 1) Load FRMT splits
ds_train = load_dataset("google-research/frmt", split="train")
ds_dev   = load_dataset("google-research/frmt", split="dev")
ds_test  = load_dataset("google-research/frmt", split="test")

df_train = ds_train.to_pandas()
df_dev   = ds_dev.to_pandas()
df_test  = ds_test.to_pandas()

print(df_train.columns)


DatasetNotFoundError: Dataset 'google-research/frmt' doesn't exist on the Hub or cannot be accessed.

In [None]:
from datasets import load_dataset

# Load HF splits (each has 500 rows with a "text" field)
bp     = load_dataset("joaosanches/golden_collection", split="gold_collection")
manual = load_dataset("joaosanches/golden_collection", split="referencia_manual")
deepl  = load_dataset("joaosanches/golden_collection", split="referencia_DeepL")
assert len(bp)==len(manual)==len(deepl)==500, (len(bp),len(manual),len(deepl))

# Create target and DELETE everything first
con.execute("""
CREATE TABLE IF NOT EXISTS gold_test (
  bucket TEXT,
  theme  TEXT,
  text_pt_br TEXT,
  ref_pt_pt_manual TEXT,
  ref_pt_pt_deepl  TEXT
);
""")
con.execute("DELETE FROM gold_test;")

def _clean(s: str) -> str:
    if s is None: return None
    return " ".join(str(s).replace("\r"," ").replace("\n"," ").split()).strip()

rows = [
    ("n/a", "n/a", _clean(bp[i]["text"]), _clean(manual[i]["text"]), _clean(deepl[i]["text"]))
    for i in range(500)
]
con.executemany(
    "INSERT INTO gold_test (bucket, theme, text_pt_br, ref_pt_pt_manual, ref_pt_pt_deepl) VALUES (?,?,?,?,?)",
    rows
)

# Sanity on gold
display(con.execute("""
SELECT
  (SELECT COUNT(*) FROM gold_test) AS gold_rows,
  (SELECT COUNT(DISTINCT lower(text_pt_br)) FROM gold_test) AS distinct_bp,
  (SELECT SUM(CASE WHEN coalesce(ref_pt_pt_manual,'')='' THEN 1 ELSE 0 END) FROM gold_test) AS null_manual,
  (SELECT SUM(CASE WHEN coalesce(ref_pt_pt_deepl ,'')='' THEN 1 ELSE 0 END) FROM gold_test) AS null_deepl
""").df())


Unnamed: 0,gold_rows,distinct_bp,null_manual,null_deepl
0,500,500,0.0,0.0


In [None]:
con.execute("DROP VIEW IF EXISTS ptbrvid_repaired_v;")
con.execute("""
CREATE VIEW ptbrvid_repaired_v AS
WITH raw AS (
  SELECT dataset, domain, split, label, text_pt_br, text_pt_pt
  FROM src.main.ptbrvarid
  WHERE dataset='PtBrVId'
),
norm AS (
  SELECT
    -- language
    CASE
      WHEN lower(label) IN ('pt-br','pt-pt') THEN CASE WHEN lower(label)='pt-br' THEN 'pt-BR' ELSE 'pt-PT' END
      WHEN text_pt_br IN ('pt-BR','pt-PT')      THEN text_pt_br
      ELSE NULL
    END AS lang,

    -- text (prefer proper columns; fall back to domain only if it looks like text)
    CASE
      WHEN text_pt_br IS NOT NULL AND text_pt_br NOT IN ('pt-BR','pt-PT') THEN text_pt_br
      WHEN text_pt_pt IS NOT NULL AND text_pt_pt NOT IN ('pt-BR','pt-PT') THEN text_pt_pt
      WHEN domain IS NOT NULL
           AND lower(domain) NOT IN ('journalistic','legal','web','literature','politics','social_media')
           AND length(domain) > 40 THEN domain
      ELSE NULL
    END AS text,

    split, domain
  FROM raw
)
SELECT
  'PtBrVId' AS dataset,
  split,
  lang  AS label,
  CASE WHEN lang='pt-BR' THEN text END AS text_pt_br,
  CASE WHEN lang='pt-PT' THEN text END AS text_pt_pt
FROM norm
WHERE lang IS NOT NULL AND text IS NOT NULL;
""")

print(con.execute("SELECT COUNT(*) AS repaired_rows FROM ptbrvid_repaired_v").df())
print(con.execute("SELECT label, COUNT(*) AS n FROM ptbrvid_repaired_v GROUP BY 1 ORDER BY n DESC").df())


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

   repaired_rows
0        2991704
   label        n
0  pt-PT  2654553
1  pt-BR   337151


In [None]:
con.execute("DROP VIEW IF EXISTS ptbr_unique_v;")
con.execute("""
CREATE VIEW ptbr_unique_v AS
WITH keyed AS (
  SELECT
    label,
    NULLIF(TRIM(text_pt_br),'') AS text_pt_br,
    NULLIF(TRIM(text_pt_pt),'') AS text_pt_pt,
    lower(coalesce(text_pt_br, text_pt_pt, '')) AS k_txt
  FROM ptbrvid_repaired_v
),
clean AS (
  SELECT label, text_pt_br, text_pt_pt, k_txt
  FROM keyed
  WHERE k_txt <> ''
),
agg AS (
  SELECT
    label, k_txt,
    arg_max(text_pt_br, length(coalesce(text_pt_br,''))) AS text_pt_br,
    arg_max(text_pt_pt, length(coalesce(text_pt_pt,''))) AS text_pt_pt
  FROM clean
  GROUP BY label, k_txt
)
SELECT label, text_pt_br, text_pt_pt
FROM agg;
""")

con.execute("DROP VIEW IF EXISTS ptbr_split_assign_v;")
con.execute("""
CREATE VIEW ptbr_split_assign_v AS
WITH base AS (
  SELECT
    label,
    text_pt_br,
    text_pt_pt,
    lower(coalesce(text_pt_br, text_pt_pt, '')) AS k_txt
  FROM ptbr_unique_v
)
SELECT
  CASE WHEN (hash(k_txt, coalesce(label,'')) % 5) = 0 THEN 'test' ELSE 'train' END AS split,
  label, text_pt_br, text_pt_pt
FROM base;
""")

print(con.execute("""
SELECT split, COUNT(*) AS n
FROM ptbr_split_assign_v
GROUP BY 1
ORDER BY 2 DESC
""").df())


In [None]:
con.execute("DROP VIEW IF EXISTS ptbr_split_assign_v;")
con.execute("""
CREATE VIEW ptbr_split_assign_v AS
WITH base AS (
  SELECT
    label,
    text_pt_br,
    text_pt_pt,
    lower(coalesce(text_pt_br, text_pt_pt, '')) AS k_txt,
    hash(lower(coalesce(text_pt_br, text_pt_pt, '')), coalesce(label,'')) AS h
  FROM ptbr_unique_v
)
SELECT
  CASE
    -- ~0.002% for test (about like 35 / 1.78M)
    WHEN (h % 100000) < 2 THEN 'test'
    -- next ~0.056% for valid (about like 1000 / 1.78M)
    WHEN (h % 100000) < 58 THEN 'valid'
    -- everything else is train
    ELSE 'train'
  END AS split,
  label,
  text_pt_br,
  text_pt_pt
FROM base;
""")


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

     n_test  n_total  pct_test
0  597931.0  2990689     19.99


In [None]:
# Optional FRMT/GOLD test-pair guard
def have(q: str) -> bool:
    return tbl_exists(q)

con.execute("DROP VIEW IF EXISTS test_pairs_guard;")
have_frmt = have("frmt_test")
have_gold = have("gold_test")

if have_frmt and have_gold:
    con.execute(f"""
      CREATE VIEW test_pairs_guard AS
      SELECT DISTINCT n_br, n_pt FROM (
        SELECT {lo('text_pt_br')} AS n_br, {lo('text_pt_pt')} AS n_pt FROM frmt_test
        UNION ALL
        SELECT {lo('text_pt_br')} AS n_br, {lo('ref_pt_pt_manual')} AS n_pt FROM gold_test
      );
    """)
elif have_frmt:
    con.execute(f"""
      CREATE VIEW test_pairs_guard AS
      SELECT DISTINCT {lo('text_pt_br')} AS n_br, {lo('text_pt_pt')} AS n_pt FROM frmt_test;
    """)
elif have_gold:
    con.execute(f"""
      CREATE VIEW test_pairs_guard AS
      SELECT DISTINCT {lo('text_pt_br')} AS n_br, {lo('ref_pt_pt_manual')} AS n_pt FROM gold_test;
    """)
else:
    con.execute("CREATE VIEW test_pairs_guard AS SELECT ''::TEXT AS n_br, ''::TEXT AS n_pt WHERE 1=0;")

print("test_pairs_guard rows:", con.execute("SELECT COUNT(*) FROM test_pairs_guard").fetchone()[0])


test_pairs_guard rows: 3108


In [None]:
def has_cols(q: str, want=("sent_pt_br","sent_pt_pt")) -> bool:
    try:
        cols = con.execute(f"DESCRIBE {q}").df()["column_name"].str.lower().tolist()
        return all(w in cols for w in want)
    except Exception:
        return False

candidates = [
    "src.main.opus_moses_filtered", "main.opus_moses_filtered",
    "src.main.opus_filtered",       "main.opus_filtered",
    "src.main.opus_filter_simple",  "main.opus_filter_simple",
    "src.main.opus_moses",          "main.opus_moses",
]
chosen = None
for q in candidates:
    if tbl_exists(q) and has_cols(q):
        chosen = q; break

con.execute("DROP VIEW IF EXISTS opus_source;")
if chosen:
    con.execute(f"CREATE VIEW opus_source AS SELECT * FROM {chosen};")
    print(f"✓ opus_source -> {chosen}")
else:
    con.execute("CREATE VIEW opus_source AS SELECT NULL::TEXT AS sent_pt_br, NULL::TEXT AS sent_pt_pt WHERE 1=0;")
    print("! No OPUS table found with (sent_pt_br, sent_pt_pt)")


✓ opus_source -> src.main.opus_moses_filtered


In [None]:
# TRAIN view
con.execute("""
CREATE OR REPLACE VIEW train_data AS
-- OPUS (anti-join: don't leak known test pairs)
SELECT
  'OpenSubs' AS dataset, 'opus_source' AS source,
  'n/a' AS bucket, 'n/a' AS theme,
  CAST(NULL AS TEXT) AS label,
  o.sent_pt_br AS text_pt_br, o.sent_pt_pt AS text_pt_pt,
  CAST(NULL AS TEXT) AS ref_pt_pt_manual, CAST(NULL AS TEXT) AS ref_pt_pt_deepl
FROM opus_source o
LEFT JOIN test_pairs_guard g
  ON lower(coalesce(o.sent_pt_br,'')) = g.n_br
 AND lower(coalesce(o.sent_pt_pt,'')) = g.n_pt
WHERE g.n_br IS NULL

UNION ALL
-- PtBrVarId (assigned TRAIN, ~80%)
SELECT
  'PtBrVarId' AS dataset,
  p.split     AS split,                 -- 'train' or 'valid'
  'liaad/PtBrVId' AS source,
  'n/a' AS bucket, 'n/a' AS theme,
  p.label,
  p.text_pt_br, p.text_pt_pt,
  CAST(NULL AS TEXT) AS ref_pt_pt_manual,
  CAST(NULL AS TEXT) AS ref_pt_pt_deepl
FROM ptbr_split_assign_v p
WHERE lower(p.split) IN ('train','valid')

UNION ALL
-- FRMT dev (if present)
SELECT
  'FRMT' AS dataset, 'google-research/frmt' AS source,
  f.bucket, 'n/a' AS theme, CAST(NULL AS TEXT) AS label,
  f.text_pt_br, f.text_pt_pt,
  CAST(NULL AS TEXT), CAST(NULL AS TEXT)
FROM frmt_dev f;
""")

# TEST view
con.execute("""
CREATE OR REPLACE VIEW test_data AS
-- PtBrVarId (assigned TEST, ~20%)
SELECT
  'PtBrVarId' AS dataset,
  'test'      AS split,
  'liaad/PtBrVId' AS source,
  'n/a' AS bucket, 'n/a' AS theme,
  p.label,
  p.text_pt_br, p.text_pt_pt,
  CAST(NULL AS TEXT) AS ref_pt_pt_manual,
  CAST(NULL AS TEXT) AS ref_pt_pt_deepl
FROM ptbr_split_assign_v p
WHERE lower(p.split) = 'test'

UNION ALL
-- FRMT test (if present)
SELECT
  'FRMT' AS dataset, 'google-research/frmt' AS source,
  f.bucket, 'n/a' AS theme, CAST(NULL AS TEXT) AS label,
  f.text_pt_br, f.text_pt_pt,
  CAST(NULL AS TEXT), CAST(NULL AS TEXT)
FROM frmt_test f

UNION ALL
-- Gold (500 rows)
SELECT
  'Gold' AS dataset, 'joaosanches/golden_collection' AS source,
  COALESCE(g.bucket,'n/a'), COALESCE(g.theme,'n/a'),
  'n/a' AS label,
  g.text_pt_br, CAST(NULL AS TEXT) AS text_pt_pt,
  g.ref_pt_pt_manual, g.ref_pt_pt_deepl
FROM gold_test g;
""")


<duckdb.duckdb.DuckDBPyConnection at 0x7b400536a9f0>

In [None]:
import duckdb, pathlib

PROJECT_DB_PATH = pathlib.Path("../data/duckdb/subs_project.duckdb")

con = duckdb.connect(PROJECT_DB_PATH.as_posix())

# Just to see that the views exist
print(con.execute("""
SELECT table_name, table_type
FROM information_schema.tables
WHERE table_name IN ('train_data','test_data')
""").df())

# Drop old materializations if they exist
con.execute("DROP TABLE IF EXISTS train_data_instance;")
con.execute("DROP TABLE IF EXISTS test_data_instance;")

# Materialize the current views
con.execute("CREATE TABLE train_data_instance AS SELECT * FROM train_data;")
con.execute("CREATE TABLE test_data_instance  AS SELECT * FROM test_data;")

print("Rows in train_data_instance:",
      con.execute("SELECT COUNT(*) FROM train_data_instance").fetchone()[0])
print("Rows in test_data_instance :",
      con.execute("SELECT COUNT(*) FROM test_data_instance").fetchone()[0])

con.close()


   table_name  table_type
0   test_data  BASE TABLE
1  train_data        VIEW
2   test_data        VIEW
3  train_data        VIEW


In [None]:
# What FRMT relations exist?
frmt_present = {
    "frmt_dev": tbl_exists("frmt_dev"),
    "frmt_test": tbl_exists("frmt_test")
}
display(pd.DataFrame([frmt_present]))

# Counts in FRMT base tables (if present)
if frmt_present["frmt_dev"]:
    print("frmt_dev rows:", con.execute("SELECT COUNT(*) FROM frmt_dev").fetchone()[0])
if frmt_present["frmt_test"]:
    print("frmt_test rows:", con.execute("SELECT COUNT(*) FROM frmt_test").fetchone()[0])

# Where FRMT surfaced in unified views:
print("\nFRMT in train_data:")
display(con.execute("""
SELECT COUNT(*) AS n
FROM train_data
WHERE dataset='FRMT'
""").df())

print("FRMT in test_data:")
display(con.execute("""
SELECT COUNT(*) AS n
FROM test_data
WHERE dataset='FRMT'
""").df())

# Optional: peek a few rows with an inferred origin label (dev/test) based on which view they’re in
print("\nSample FRMT rows from train_data (origin ~ FRMT-dev):")
display(con.execute("""
SELECT dataset, text_pt_br, text_pt_pt
FROM train_data
WHERE dataset='FRMT'
LIMIT 5
""").df())

print("\nSample FRMT rows from test_data (origin ~ FRMT-test):")
display(con.execute("""
SELECT dataset, text_pt_br, text_pt_pt
FROM test_data
WHERE dataset='FRMT'
LIMIT 5
""").df())


Unnamed: 0,frmt_dev,frmt_test
0,True,True


frmt_dev rows: 2521
frmt_test rows: 2608

FRMT in train_data:


Unnamed: 0,n
0,2521


FRMT in test_data:


Unnamed: 0,n
0,2608



Sample FRMT rows from train_data (origin ~ FRMT-dev):


Unnamed: 0,dataset,text_pt_br,text_pt_pt
0,FRMT,"Um telefone móvel, telefone celular, telefone portátil, muitas vezes chamado apenas de celular ou de telefone, é um telefone portátil que faz e recebe chamadas através de um ca...","Um telefone móvel, telefone celular, telefone manual, abreviado para telemóvel, é um telefone portátil que pode fazer e receber chamadas através de uma ligação de radiofrequênc..."
1,FRMT,"O link da frequência de rádio estabelece uma conexão com os sistemas variáveis de uma operadora de celular, o que dá acesso à rede pública de telefonia.",A radiofrequência estabelece uma ligação aos sistemas de comutação de uma operadora de telemóveis que permite o acesso à rede telefónica pública comutada (RTPC).
2,FRMT,"Os serviços modernos de telefonia celular usam uma arquitetura telefônica celular, e, assim, telefones portáteis são chamados de telefones celulares ou ""cell phones"" na América...","Os serviços de telemóveis modernos utilizam uma arquitetura de rede celular e, por conseguinte, em Portugal, chamam-se apenas telemóveis."
3,FRMT,"Além da telefonia, os aparelhos dos anos 2000 fornecem uma variedade de outros serviços, como mensagens de texto, MMS, e-mail, acesso à internet, comunicação sem fio de curto a...","Além da telefonia, os telemóveis da década de 2000 estão equipados com vários outros serviços, como mensagens de texto, MMS, e-mail, acesso à Internet, comunicações sem fios de..."
4,FRMT,"Telefones celulares que oferecem esses outros serviços são chamados de ""feature phones"", os que têm capacidades de computação ainda mais avançadas são chamados de smartphones.","Os telemóveis que apresentam apenas estas capacidades, são conhecidos como telemóveis básicos. Os que incluem capacidades de cálculo muito avançadas são conhecidos como smartph..."



Sample FRMT rows from test_data (origin ~ FRMT-test):


Unnamed: 0,dataset,text_pt_br,text_pt_pt
0,FRMT,"Um ônibus (contração de ômnibus, com as variações multibus, motorbus, autobus etc.) é um veículo rodoviário projetado para transportar muitos passageiros.",Um autocarro é um veículo rodoviário concebido para transportar muitos passageiros.
1,FRMT,Ônibus podem ter capacidade para até 300 passageiros.,Os autocarros têm capacidade para transportar no máximo 300 passageiros.
2,FRMT,"O tipo de ônibus mais comum é o ônibus rígido de um andar, com cargas maiores transportadas por ônibus de dois andares e articulados, e cargas menores transportadas por ônibus ...","Os tipos de autocarros mais comuns são os rígidos de um andar, sendo as cargas maiores nos de dois andares e os autocarros articulados. As cargas menores transportadas por auto..."
3,FRMT,"Muitos tipos de ônibus, como os de transporte urbano e os intermunicipais, cobram uma tarifa.","Muitos tipos de autocarros, como os de trânsito urbano e intercidades, cobram bilhete."
4,FRMT,"Outros tipos, como ônibus escolares do ensino médio ou fundamental ou ônibus de traslado em uma instituição de ensino superior não cobram tarifas.","Outros tipos, como os autocarros da escola primária ou secundária ou de ida e volta num campus pós-secundário não cobram bilhete."


**// Statistics**

In [None]:
import duckdb, pathlib, pandas as pd
pd.set_option("display.max_colwidth", 180)

PROJECT_DB_PATH = pathlib.Path("../data/duckdb/subs_project.duckdb")
con = duckdb.connect(PROJECT_DB_PATH.as_posix())  # read/write OK

def tbl_exists(name: str) -> bool:
    try:
        con.execute(f"SELECT 1 FROM {name} LIMIT 1"); return True
    except Exception:
        return False

def describe_relation(name: str) -> pd.DataFrame:
    try:
        return con.execute(f"DESCRIBE {name}").df()
    except Exception as e:
        return pd.DataFrame({"error":[str(e)]})

def count_rows(name: str) -> int:
    try:
        return con.execute(f"SELECT COUNT(*) FROM {name}").fetchone()[0]
    except Exception:
        return 0

def lo(c: str) -> str:
    return f"lower(coalesce({c},''))"

def random_sample(name: str, n: int = 20) -> pd.DataFrame:
    try:
        return con.execute(f"SELECT * FROM {name} USING SAMPLE {n} ROWS").df()
    except Exception:
        return con.execute(f"SELECT * FROM {name} ORDER BY random() LIMIT {n}").df()


In [None]:
candidates = [
    "train_data", "test_data",
    "ptbrvarid_split_assign", "ptbrvarid_v",
    "opus_source", "frmt_dev", "frmt_test",
    "gold_test"
]
present = [r for r in candidates if tbl_exists(r)]

summary_rows = []
for name in present:
    desc = describe_relation(name)
    cols = ", ".join(desc["column_name"].tolist()) if "column_name" in desc else "(n/a)"
    # find type from information_schema
    rtype = con.execute("""
        SELECT table_type FROM information_schema.tables
        WHERE table_schema='main' AND table_name=?
    """,[name]).fetchone()
    summary_rows.append({
        "relation": name,
        "type": (rtype[0] if rtype else "UNKNOWN"),
        "n_rows": count_rows(name),
        "columns": cols
    })

summary_df = pd.DataFrame(summary_rows).sort_values("relation")
summary_df


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,relation,type,n_rows,columns
4,frmt_dev,BASE TABLE,2521,"en, text_pt_br, text_pt_pt, bucket"
5,frmt_test,BASE TABLE,2608,"en, text_pt_br, text_pt_pt, bucket"
6,gold_test,BASE TABLE,500,"bucket, theme, text_pt_br, ref_pt_pt_manual, ref_pt_pt_deepl"
3,opus_source,VIEW,10347894,"line_no, pair_id, sent_pt_br, sent_pt_pt"
2,ptbrvarid_v,VIEW,337128,"split, label, text_pt_br, text_pt_pt"
1,test_data,BASE TABLE,601039,"dataset, source, bucket, theme, label, text_pt_br, text_pt_pt, ref_pt_pt_manual, ref_pt_pt_deepl"
0,train_data,VIEW,12743162,"dataset, source, bucket, theme, label, text_pt_br, text_pt_pt, ref_pt_pt_manual, ref_pt_pt_deepl"


In [None]:
bd_train = con.execute("""
SELECT dataset, COUNT(*) AS n
FROM train_data
GROUP BY 1 ORDER BY n DESC
""").df() if tbl_exists("train_data") else pd.DataFrame()

bd_test = con.execute("""
SELECT dataset, COUNT(*) AS n
FROM test_data
GROUP BY 1 ORDER BY n DESC
""").df() if tbl_exists("test_data") else pd.DataFrame()

bd_train, bd_test


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

(     dataset         n
 0   OpenSubs  10347883
 1  PtBrVarId   2392758
 2       FRMT      2521,
      dataset       n
 0  PtBrVarId  597931
 1       FRMT    2608
 2       Gold     500)

In [None]:
# Random 20 from train_data and test_data
train20 = con.execute("""
    SELECT * FROM train_data
    ORDER BY random()
    LIMIT 20
""").df()

test20 = con.execute("""
    SELECT * FROM test_data 
    ORDER BY random()
    LIMIT 20
""").df()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:
test20

Unnamed: 0,dataset,source,bucket,theme,label,text_pt_br,text_pt_pt,ref_pt_pt_manual,ref_pt_pt_deepl
0,PtBrVarId,liaad/PtBrVId,,,pt-PT,,"Como verdadeiro tribunal de substituição, a Relação aprecia livremente as provas produzidas segundo a sua prudente convicção acerca de cada facto impugnado, exceto no que respe...",,
1,PtBrVarId,liaad/PtBrVId,,,pt-PT,,"Lembra-se, a propósito, que, no documento sobre a ""reestruturação geral da empresa"", em que a Administração se comprometia a ""reestruturar, redimensionar, reinstalar e reequipa...",,
2,PtBrVarId,liaad/PtBrVId,,,pt-PT,,"Entendemos ser de reconhecer, relativamente às necessidades da menor, que a mesma entrará numa fase do seu desenvolvimento em que o aumento de despesas se faz sentir com mais i...",,
3,PtBrVarId,liaad/PtBrVId,,,pt-BR,"Entre o final dos anos 40 e começo dos 50, era capaz de tudo para chamar atenção em público. Ficou famosa por seus números nas revistas do Teatro Recreio, do empresário Walter ...",,,
4,PtBrVarId,liaad/PtBrVId,,,pt-PT,,"Além disso, entende-se que o conhecimento do mérito da causa, total ou parcialmente, só deve ter lugar quando o processo contenha todos os elementos necessários para uma decisã...",,
5,PtBrVarId,liaad/PtBrVId,,,pt-PT,,"É que o CSC separa, respetivamente a remuneração da pensão de reforma, nos arts.39 e 40; -a atribuição da remuneração é fixada por deliberação da assembleia geral,enquanto que ...",,
6,PtBrVarId,liaad/PtBrVId,,,pt-PT,,"Se votarem ""sim"", e todas as sondagens apontam para que isso aconteça, então Londres vai passar a ser a primeira cidade da Grã-Bretanha a ter um presidente da câmara. Alguém qu...",,
7,PtBrVarId,liaad/PtBrVId,,,pt-BR,"José Dirceu disse que a exploração do fato é uma tentativvva de desestabilizar a campanha de Lula. ""Eu e a Erundina também usamos o carro de som e ninguém falou nada"", disse. P...",,,
8,PtBrVarId,liaad/PtBrVId,,,pt-PT,,"Tão sensível foi o encontro do rei com a imprensa local que o jornal árabe ""Al-Hayat"", impresso em Londres, viu ontem confiscados os seus exemplares distribuídos em Amã, por te...",,
9,PtBrVarId,liaad/PtBrVId,,,pt-PT,,"Ora, no caso dos autos, a ampliação pretendida pelo recorrente, no sentido da condenação da Autora no pagamento desse novo montante, quando formulou pedido de condenação em €24...",,


In [None]:
# ===========================
#  IMPORTS & COMMON SETUP
# ===========================
import duckdb
import pathlib
import pandas as pd
from datasets import load_dataset

pd.set_option("display.max_colwidth", 180)

PROJECT_DB_PATH = pathlib.Path("../data/duckdb/subs_project.duckdb")
SOURCE_DB_PATH  = pathlib.Path("../data/duckdb/subs.duckdb")

PROJECT_DB_PATH.parent.mkdir(parents=True, exist_ok=True)

# Connect and attach src
con = duckdb.connect(PROJECT_DB_PATH.as_posix(), read_only=False)
try:
    con.execute(f"ATTACH '{SOURCE_DB_PATH.as_posix()}' AS src")
except duckdb.BinderException:
    pass

def tbl_exists(q: str) -> bool:
    try:
        con.execute(f"SELECT 1 FROM {q} LIMIT 1")
        return True
    except Exception:
        return False

def lo(c: str) -> str:
    return f"lower(coalesce({c},''))"

print("✓ Connected. src attached:", SOURCE_DB_PATH.exists())

# ===========================
#  GOLD COLLECTION (test only)
# ===========================
# HF: joaosanches/golden_collection
bp     = load_dataset("joaosanches/golden_collection", split="gold_collection")
manual = load_dataset("joaosanches/golden_collection", split="referencia_manual")
deepl  = load_dataset("joaosanches/golden_collection", split="referencia_DeepL")
assert len(bp)==len(manual)==len(deepl)==500, (len(bp),len(manual),len(deepl))

con.execute("""
CREATE TABLE IF NOT EXISTS gold_test (
  bucket TEXT,
  theme  TEXT,
  text_pt_br TEXT,
  ref_pt_pt_manual TEXT,
  ref_pt_pt_deepl  TEXT
);
""")
con.execute("DELETE FROM gold_test;")

def _clean(s: str) -> str:
    if s is None: return None
    return " ".join(str(s).replace("\r"," ").replace("\n"," ").split()).strip()

rows = [
    ("n/a", "n/a",
     _clean(bp[i]["text"]),
     _clean(manual[i]["text"]),
     _clean(deepl[i]["text"]))
    for i in range(500)
]
con.executemany(
    "INSERT INTO gold_test (bucket, theme, text_pt_br, ref_pt_pt_manual, ref_pt_pt_deepl) VALUES (?,?,?,?,?)",
    rows
)

print("Gold rows:", con.execute("SELECT COUNT(*) FROM gold_test").fetchone()[0])

# ===========================
#  PtBrVarId  (train / valid / test via hash)
# ===========================
# Discover the PtBrVarId source table inside src
src_ptbr = None
for cand in ["src.main.ptbrvarid", "src.ptbrvarid", "src.main.PtBrVId", "src.PtBrVId"]:
    if tbl_exists(cand):
        src_ptbr = cand
        break
if not src_ptbr:
    raise RuntimeError("Cannot find ptbrvarid in subs.duckdb (tried src.main.ptbrvarid, src.ptbrvarid, ...).")

print("Using ptbrvarid source:", src_ptbr)

# 1) Repair noisy rows (decode label + text)
con.execute("DROP VIEW IF EXISTS ptbrvid_repaired_v;")
con.execute(f"""
CREATE VIEW ptbrvid_repaired_v AS
WITH raw AS (
  SELECT dataset, domain, split, label, text_pt_br, text_pt_pt
  FROM {src_ptbr}
  WHERE dataset='PtBrVId'
),
norm AS (
  SELECT
    -- language
    CASE
      WHEN lower(label) IN ('pt-br','pt-pt')
           THEN CASE WHEN lower(label)='pt-br' THEN 'pt-BR' ELSE 'pt-PT' END
      WHEN text_pt_br IN ('pt-BR','pt-PT')
           THEN text_pt_br
      ELSE NULL
    END AS lang,

    -- text (prefer proper columns; fall back to domain if it really looks like text)
    CASE
      WHEN text_pt_br IS NOT NULL AND text_pt_br NOT IN ('pt-BR','pt-PT') THEN text_pt_br
      WHEN text_pt_pt IS NOT NULL AND text_pt_pt NOT IN ('pt-BR','pt-PT') THEN text_pt_pt
      WHEN domain IS NOT NULL
           AND lower(domain) NOT IN ('journalistic','legal','web','literature','politics','social_media')
           AND length(domain) > 40 THEN domain
      ELSE NULL
    END AS text
  FROM raw
)
SELECT
  'PtBrVId' AS dataset,
  lang  AS label,
  CASE WHEN lang='pt-BR' THEN text END AS text_pt_br,
  CASE WHEN lang='pt-PT' THEN text END AS text_pt_pt
FROM norm
WHERE lang IS NOT NULL AND text IS NOT NULL;
""")

print("PtBrVId repaired rows:",
      con.execute("SELECT COUNT(*) FROM ptbrvid_repaired_v").fetchone()[0])

# 2) Deduplicate text (per label)
con.execute("DROP VIEW IF EXISTS ptbr_unique_v;")
con.execute("""
CREATE VIEW ptbr_unique_v AS
WITH keyed AS (
  SELECT
    label,
    NULLIF(TRIM(text_pt_br),'') AS text_pt_br,
    NULLIF(TRIM(text_pt_pt),'') AS text_pt_pt,
    lower(coalesce(text_pt_br, text_pt_pt, '')) AS k_txt
  FROM ptbrvid_repaired_v
),
clean AS (
  SELECT label, text_pt_br, text_pt_pt, k_txt
  FROM keyed
  WHERE k_txt <> ''
),
agg AS (
  SELECT
    label, k_txt,
    arg_max(text_pt_br, length(coalesce(text_pt_br,''))) AS text_pt_br,
    arg_max(text_pt_pt, length(coalesce(text_pt_pt,''))) AS text_pt_pt
  FROM clean
  GROUP BY label, k_txt
)
SELECT label, text_pt_br, text_pt_pt
FROM agg;
""")

print("PtBrVId unique rows:",
      con.execute("SELECT COUNT(*) FROM ptbr_unique_v").fetchone()[0])

# 3) Deterministic split: train / valid / test
#    - valid: 1%   (~0.01)
#    - test:  0.05% (~0.0005)
con.execute("DROP VIEW IF EXISTS ptbr_split_assign_v;")
con.execute("""
CREATE VIEW ptbr_split_assign_v AS
WITH base AS (
  SELECT
    label,
    text_pt_br,
    text_pt_pt,
    lower(coalesce(text_pt_br, text_pt_pt, '')) AS k_txt,
    hash(lower(coalesce(text_pt_br, text_pt_pt, '')), coalesce(label,'')) AS h
  FROM ptbr_unique_v
)
SELECT
  CASE
    WHEN (h % 10000) < 5   THEN 'test'   -- 5 / 10000 = 0.05%
    WHEN (h % 10000) < 105 THEN 'valid'  -- next 100 / 10000 = 1%
    ELSE 'train'
  END AS split,
  label,
  text_pt_br,
  text_pt_pt
FROM base;
""")

print("PtBrVId split counts:")
print(con.execute("""
    SELECT split, COUNT(*) AS n
    FROM ptbr_split_assign_v
    GROUP BY 1
    ORDER BY n DESC
""").df())

# ===========================
#  FRMT (hugosousa/frmt)
#        dev: 1% valid, 99% train
#        test: all test
# ===========================
# Load from HF
ds_dev  = load_dataset("hugosousa/frmt", split="dev")
ds_test = load_dataset("hugosousa/frmt", split="test")

df_dev  = ds_dev.to_pandas()
df_test = ds_test.to_pandas()

# Ensure column names are text_pt_br / text_pt_pt.
# (hugosousa/frmt already uses 'br' and 'pt' columns; rename them.)
if "br" in df_dev.columns and "pt" in df_dev.columns:
    df_dev  = df_dev.rename(columns={"br": "text_pt_br", "pt": "text_pt_pt"})
    df_test = df_test.rename(columns={"br": "text_pt_br", "pt": "text_pt_pt"})

# Drop old tables and create new ones
con.execute("DROP TABLE IF EXISTS frmt_dev;")
con.execute("DROP TABLE IF EXISTS frmt_test;")

con.execute("CREATE TABLE frmt_dev  AS SELECT * FROM df_dev;")
con.execute("CREATE TABLE frmt_test AS SELECT * FROM df_test;")

print("FRMT dev rows:",  con.execute("SELECT COUNT(*) FROM frmt_dev").fetchone()[0])
print("FRMT test rows:", con.execute("SELECT COUNT(*) FROM frmt_test").fetchone()[0])

# Split FRMT dev into train/valid (1% valid)
con.execute("DROP VIEW IF EXISTS frmt_dev_split_v;")
con.execute("""
CREATE VIEW frmt_dev_split_v AS
WITH base AS (
  SELECT
    *,
    hash(lower(coalesce(text_pt_br,'')), lower(coalesce(text_pt_pt,''))) AS h
  FROM frmt_dev
)
SELECT
  CASE
    WHEN (h % 100) = 0 THEN 'valid'  -- 1% of dev
    ELSE 'train'
  END AS split,
  *
FROM base;
""")

print("FRMT dev split counts:")
print(con.execute("""
    SELECT split, COUNT(*) AS n
    FROM frmt_dev_split_v
    GROUP BY 1
    ORDER BY n DESC
""").df())

# ===========================
#  OPUS / OpenSubtitles (train only)
# ===========================
def has_cols(q: str, want=("sent_pt_br","sent_pt_pt")) -> bool:
    try:
        cols = con.execute(f"DESCRIBE {q}").df()["column_name"].str.lower().tolist()
        return all(w in cols for w in want)
    except Exception:
        return False

candidates = [
    "src.main.opus_moses_filtered", "main.opus_moses_filtered",
    "src.main.opus_filtered",       "main.opus_filtered",
    "src.main.opus_filter_simple",  "main.opus_filter_simple",
    "src.main.opus_moses",          "main.opus_moses",
]
chosen = None
for q in candidates:
    if tbl_exists(q) and has_cols(q):
        chosen = q
        break

con.execute("DROP VIEW IF EXISTS opus_source;")
if chosen:
    con.execute(f"CREATE VIEW opus_source AS SELECT * FROM {chosen};")
    print(f"✓ opus_source -> {chosen}")
else:
    con.execute("CREATE VIEW opus_source AS SELECT NULL::TEXT AS sent_pt_br, NULL::TEXT AS sent_pt_pt WHERE 1=0;")
    print("! No OPUS table found with (sent_pt_br, sent_pt_pt)")

# ===========================
#  TEST PAIRS GUARD (FRMT test + Gold)
# ===========================
con.execute("DROP VIEW IF EXISTS test_pairs_guard;")
have_frmt = tbl_exists("frmt_test")
have_gold = tbl_exists("gold_test")

if have_frmt and have_gold:
    con.execute(f"""
      CREATE VIEW test_pairs_guard AS
      SELECT DISTINCT n_br, n_pt FROM (
        SELECT {lo('text_pt_br')} AS n_br, {lo('text_pt_pt')} AS n_pt FROM frmt_test
        UNION ALL
        SELECT {lo('text_pt_br')} AS n_br, {lo('ref_pt_pt_manual')} AS n_pt FROM gold_test
      );
    """)
elif have_frmt:
    con.execute(f"""
      CREATE VIEW test_pairs_guard AS
      SELECT DISTINCT {lo('text_pt_br')} AS n_br, {lo('text_pt_pt')} AS n_pt FROM frmt_test;
    """)
elif have_gold:
    con.execute(f"""
      CREATE VIEW test_pairs_guard AS
      SELECT DISTINCT {lo('text_pt_br')} AS n_br, {lo('ref_pt_pt_manual')} AS n_pt FROM gold_test;
    """)
else:
    con.execute("CREATE VIEW test_pairs_guard AS SELECT ''::TEXT AS n_br, ''::TEXT AS n_pt WHERE 1=0;")

print("test_pairs_guard rows:",
      con.execute("SELECT COUNT(*) FROM test_pairs_guard").fetchone()[0])

# ===========================
#  UNIFIED VIEWS: train_data / test_data
#  Columns:
#    dataset, split, source, bucket, theme, label,
#    text_pt_br, text_pt_pt, ref_pt_pt_manual, ref_pt_pt_deepl
# ===========================

con.execute("DROP VIEW IF EXISTS train_data;")
con.execute("""
CREATE VIEW train_data AS

-- OPUS (train only, anti-join against known test pairs)
SELECT
  'OpenSubs' AS dataset,
  'train'    AS split,
  'opus_source' AS source,
  'n/a' AS bucket,
  'n/a' AS theme,
  CAST(NULL AS TEXT) AS label,
  o.sent_pt_br AS text_pt_br,
  o.sent_pt_pt AS text_pt_pt,
  CAST(NULL AS TEXT) AS ref_pt_pt_manual,
  CAST(NULL AS TEXT) AS ref_pt_pt_deepl
FROM opus_source o
LEFT JOIN test_pairs_guard g
  ON lower(coalesce(o.sent_pt_br,'')) = g.n_br
 AND lower(coalesce(o.sent_pt_pt,'')) = g.n_pt
WHERE g.n_br IS NULL

UNION ALL

-- PtBrVarId (train + valid)
SELECT
  'PtBrVarId' AS dataset,
  p.split     AS split,      -- 'train' or 'valid'
  'liaad/PtBrVId' AS source,
  'n/a' AS bucket,
  'n/a' AS theme,
  p.label,
  p.text_pt_br,
  p.text_pt_pt,
  CAST(NULL AS TEXT) AS ref_pt_pt_manual,
  CAST(NULL AS TEXT) AS ref_pt_pt_deepl
FROM ptbr_split_assign_v p
WHERE lower(p.split) IN ('train','valid')

UNION ALL

-- FRMT dev: train + valid
SELECT
  'FRMT' AS dataset,
  d.split AS split,          -- 'train' or 'valid'
  'hugosousa/frmt' AS source,
  d.bucket,
  'n/a' AS theme,
  CAST(NULL AS TEXT) AS label,
  d.text_pt_br,
  d.text_pt_pt,
  CAST(NULL AS TEXT) AS ref_pt_pt_manual,
  CAST(NULL AS TEXT) AS ref_pt_pt_deepl
FROM frmt_dev_split_v d;
""")

con.execute("DROP VIEW IF EXISTS test_data;")
con.execute("""
CREATE VIEW test_data AS

-- PtBrVarId test
SELECT
  'PtBrVarId' AS dataset,
  'test'      AS split,
  'liaad/PtBrVId' AS source,
  'n/a' AS bucket,
  'n/a' AS theme,
  p.label,
  p.text_pt_br,
  p.text_pt_pt,
  CAST(NULL AS TEXT) AS ref_pt_pt_manual,
  CAST(NULL AS TEXT) AS ref_pt_pt_deepl
FROM ptbr_split_assign_v p
WHERE lower(p.split) = 'test'

UNION ALL

-- FRMT test
SELECT
  'FRMT' AS dataset,
  'test' AS split,
  'hugosousa/frmt' AS source,
  f.bucket,
  'n/a' AS theme,
  CAST(NULL AS TEXT) AS label,
  f.text_pt_br,
  f.text_pt_pt,
  CAST(NULL AS TEXT) AS ref_pt_pt_manual,
  CAST(NULL AS TEXT) AS ref_pt_pt_deepl
FROM frmt_test f

UNION ALL

-- Gold Collection (test only)
SELECT
  'Gold' AS dataset,
  'test' AS split,
  'joaosanches/golden_collection' AS source,
  COALESCE(g.bucket,'n/a'),
  COALESCE(g.theme,'n/a'),
  'n/a' AS label,
  g.text_pt_br,
  CAST(NULL AS TEXT) AS text_pt_pt,
  g.ref_pt_pt_manual,
  g.ref_pt_pt_deepl
FROM gold_test g;
""")

# Quick summary
print("\n=== Unified view sizes ===")
print("train_data rows:",
      con.execute("SELECT COUNT(*) FROM train_data").fetchone()[0])
print("test_data rows :",
      con.execute("SELECT COUNT(*) FROM test_data").fetchone()[0])

print("\ntrain_data by dataset/split:")
print(con.execute("""
    SELECT dataset, split, COUNT(*) AS n
    FROM train_data
    GROUP BY dataset, split
    ORDER BY dataset, split
""").df())

print("\ntest_data by dataset/split:")
print(con.execute("""
    SELECT dataset, split, COUNT(*) AS n
    FROM test_data
    GROUP BY dataset, split
    ORDER BY dataset, split
""").df())


✓ Connected. src attached: True
Gold rows: 500
Using ptbrvarid source: src.main.ptbrvarid


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

PtBrVId repaired rows: 2991704


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

PtBrVId unique rows: 2990689
PtBrVId split counts:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

   split        n
0  train  2959559
1  valid    29680
2   test     1450


README.md:   0%|          | 0.00/678 [00:00<?, ?B/s]

data/exemplars-00000-of-00001.parquet:   0%|          | 0.00/232k [00:00<?, ?B/s]

data/dev-00000-of-00001.parquet:   0%|          | 0.00/1.56M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

Generating exemplars split:   0%|          | 0/613 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/3935 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3987 [00:00<?, ? examples/s]

FRMT dev rows: 3935
FRMT test rows: 3987
FRMT dev split counts:
   split     n
0  train  3915
1  valid    20
✓ opus_source -> src.main.opus_moses_filtered
test_pairs_guard rows: 3116

=== Unified view sizes ===


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

train_data rows: 13341057


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

test_data rows : 5937

train_data by dataset/split:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

     dataset  split         n
0       FRMT  train      3915
1       FRMT  valid        20
2   OpenSubs  train  10347883
3  PtBrVarId  train   2959559
4  PtBrVarId  valid     29680

test_data by dataset/split:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

     dataset split     n
0       FRMT  test  3987
1       Gold  test   500
2  PtBrVarId  test  1450
