In [2]:
import duckdb
import pandas as pd
from pathlib import Path

BASE_DIR = Path.home() / "repos" / "Thesis"  # adjust if needed
PROJECT_DB_PATH = BASE_DIR / "data" / "duckdb" / "subs_project.duckdb"
SOURCE_DB_PATH  = BASE_DIR / "data" / "duckdb" / "subs.duckdb"

con = duckdb.connect(PROJECT_DB_PATH.as_posix(), read_only=True)

# Attach src if your view references it
dbl = con.execute("PRAGMA database_list").df()
if not (dbl["name"] == "src").any():
    con.execute(f"ATTACH '{SOURCE_DB_PATH.as_posix()}' AS src")

# IMPORTANT: make DuckDB less memory-hungry for bookkeeping
con.execute("SET preserve_insertion_order=false")
con.execute("SET threads=1")  # start low; increase later if stable

# If you can afford it, give DuckDB more headroom (optional)
# con.execute("SET memory_limit='6GB'")

datasets = ["PtBrVId", "FRMT", "Gold"]  # <- adjust names to match exactly what's in your view
dfs = []
for d in datasets:
    df_d = con.execute("SELECT * FROM test_data WHERE dataset = ? LIMIT 15", [d]).df()
    print(d, "rows:", len(df_d))
    dfs.append(df_d)

df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
df


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

PtBrVId rows: 15
FRMT rows: 15
Gold rows: 15


Unnamed: 0,dataset,split,source,bucket,theme,label,text_pt_br,text_pt_pt,ref_pt_pt_manual,ref_pt_pt_deepl
0,PtBrVId,test,liaad/PtBrVId,,journalistic,pt-PT,,Nas cidades dos países em vias de desenvolvime...,,
1,PtBrVId,test,liaad/PtBrVId,,journalistic,pt-PT,,"Na carta, Teresa d'Ávila diz que é ""pressupost...",,
2,PtBrVId,test,liaad/PtBrVId,,journalistic,pt-PT,,"Num comunicado, o ministro da Economia, Edmond...",,
3,PtBrVId,test,liaad/PtBrVId,,journalistic,pt-PT,,"""Em sintonia com o povo"" Falando em nome dos l...",,
4,PtBrVId,test,liaad/PtBrVId,,journalistic,pt-BR,Embora as regras não se refiram à correção diá...,,,
5,PtBrVId,test,liaad/PtBrVId,,journalistic,pt-PT,,XP-PCPlistaPE-SJA 2332 caracteres Carvalhas nã...,,
6,PtBrVId,test,liaad/PtBrVId,,journalistic,pt-PT,,"Até final do ano, o projecto mandado elaborar ...",,
7,PtBrVId,test,liaad/PtBrVId,,journalistic,pt-BR,"Pedro, segundo os policiais, matou o irmão, Ag...",,,
8,PtBrVId,test,liaad/PtBrVId,,journalistic,pt-PT,,"No primeiro semestre deste ano, as autoridades...",,
9,PtBrVId,test,liaad/PtBrVId,,journalistic,pt-BR,Técnicos reprovam ônibus no minhocão Da Report...,,,


In [3]:
df_by_split_train = con.execute("""
    SELECT
        split,
        COUNT(*) AS total_frmt,
        SUM(
            CASE WHEN
                (text_pt_br IS NULL OR length(trim(text_pt_br)) = 0)
                AND
                (text_pt_pt IS NULL OR length(trim(text_pt_pt)) = 0)
            THEN 1 ELSE 0 END
        ) AS both_missing
    FROM train_data
    WHERE dataset = 'FRMT'
    GROUP BY split
    ORDER BY split;
""").df()

df_by_split_test = con.execute("""
    SELECT
        split,
        COUNT(*) AS total_frmt,
        SUM(
            CASE WHEN
                (text_pt_br IS NULL OR length(trim(text_pt_br)) = 0)
                AND
                (text_pt_pt IS NULL OR length(trim(text_pt_pt)) = 0)
            THEN 1 ELSE 0 END
        ) AS both_missing
    FROM test_data
    WHERE dataset = 'FRMT'
    GROUP BY split
    ORDER BY split;
""").df()

df_by_split_train, df_by_split_test


(   split  total_frmt  both_missing
 0  train        3915        1409.0
 1  valid          20           0.0,
   split  total_frmt  both_missing
 0  test        3987        1372.0)