**// IMPORTS**

**// CONFIGS**

In [1]:
import duckdb, pathlib

PROJECT_DB_PATH = pathlib.Path("../data/duckdb/subs_project.duckdb")
SOURCE_DB_PATH  = pathlib.Path("../data/duckdb/subs.duckdb")

con = duckdb.connect(PROJECT_DB_PATH.as_posix())
con.execute("PRAGMA threads=1;")
con.execute("PRAGMA preserve_insertion_order=false;")
con.execute("PRAGMA memory_limit='4GB';")  # <-- increase this if your machine has the RAM
con.execute("PRAGMA temp_directory='/tmp/duckdb_tmp';")  # make sure this dir exists

dbl = con.execute("PRAGMA database_list").df()
if not (dbl["name"] == "src").any():
    con.execute(f"ATTACH '{SOURCE_DB_PATH.as_posix()}' AS src;")


**// MAIN CODE**

In [2]:
# print(con.execute("SELECT COUNT(*) FROM train_data;").df())


In [9]:
def char_stats_for(
    table: str,
    col: str,
    split: str | None = None,
    dataset: str | None = None,
):
    where_clauses = [f"{col} IS NOT NULL"]
    if split is not None:
        where_clauses.append(f"split = '{split}'")
    if dataset is not None:
        where_clauses.append(f"dataset = '{dataset}'")
    where_sql = " AND ".join(where_clauses)

    sql = f"""
        SELECT
            COUNT(*)                                   AS n,
            AVG(LENGTH({col}))                         AS mean,
            MIN(LENGTH({col}))                         AS min,
            MAX(LENGTH({col}))                         AS max,
            approx_quantile(LENGTH({col}), 0.50)       AS median,
            approx_quantile(LENGTH({col}), 0.90)       AS p90,
            approx_quantile(LENGTH({col}), 0.95)       AS p95,
            approx_quantile(LENGTH({col}), 0.99)       AS p99
        FROM {table}
        WHERE {where_sql};
    """
    return con.execute(sql).df()



In [13]:
# train vs valid, all datasets
train_br  = char_stats_for("train_data", "text_pt_br", split="train").assign(split="train", side="pt_br")
train_pt  = char_stats_for("train_data", "text_pt_pt", split="train").assign(split="train", side="pt_pt")

valid_br  = char_stats_for("train_data", "text_pt_br", split="valid").assign(split="valid", side="pt_br")
valid_pt  = char_stats_for("train_data", "text_pt_pt", split="valid").assign(split="valid", side="pt_pt")

test_br   = char_stats_for("test_data",  "text_pt_br").assign(split="test", side="pt_br")
test_pt   = char_stats_for("test_data",  "text_pt_pt").assign(split="test", side="pt_pt")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [14]:
import pandas as pd
char_len_stats = pd.concat([train_br, train_pt, valid_br, valid_pt, test_br, test_pt], ignore_index=True)
char_len_stats

Unnamed: 0,n,mean,min,max,median,p90,p95,p99,split,side
0,10684022,52.870606,1,4087,32,86,128,482,train,pt_br
1,12976311,136.413371,1,4523,39,413,679,1158,train,pt_pt
2,3330,452.204805,27,3805,389,725,861,1398,valid,pt_br
3,26390,515.622205,35,4045,409,974,1149,1456,valid,pt_pt
4,3288,162.168796,4,3109,138,276,347,603,test,pt_br
5,3888,278.699588,5,3947,189,622,899,1293,test,pt_pt


In [12]:
# train_data has both train + valid, distinguished by `split`
per_dataset_train = con.execute("""
    SELECT
        split,               -- 'train' or 'valid'
        dataset,
        COUNT(*)                          AS n_rows,
        SUM(text_pt_br IS NOT NULL)       AS n_br_not_null,
        SUM(text_pt_pt IS NOT NULL)       AS n_pt_not_null
    FROM train_data
    GROUP BY split, dataset
    ORDER BY split, dataset;
""").df()

# test_data usually only has test, so we inject split='test'
per_dataset_test = con.execute("""
    SELECT
        'test' AS split,
        dataset,
        COUNT(*)                          AS n_rows,
        SUM(text_pt_br IS NOT NULL)       AS n_br_not_null,
        SUM(text_pt_pt IS NOT NULL)       AS n_pt_not_null
    FROM test_data
    GROUP BY dataset
    ORDER BY dataset;
""").df()

import pandas as pd
per_dataset = pd.concat([per_dataset_train, per_dataset_test], ignore_index=True)
per_dataset


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,split,dataset,n_rows,n_br_not_null,n_pt_not_null
0,train,FRMT,3915,2503.0,2505.0
1,train,OpenSubs,10347883,10347883.0,10347883.0
2,train,PtBrVarId,2959559,333636.0,2625923.0
3,valid,FRMT,20,20.0,20.0
4,valid,PtBrVarId,29680,3310.0,26370.0
5,test,FRMT,3987,2612.0,2614.0
6,test,Gold,500,500.0,0.0
7,test,PtBrVarId,1450,176.0,1274.0


**// TOKENS COUNT PER SPLIT**

In [None]:
from collections import defaultdict
import pandas as pd
from transformers import AutoTokenizer

MODEL_NAME = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def count_tokens(text: str) -> int:
    if text is None:
        return 0
    # no padding, no truncation, no special tokens -> dataset "raw" token length
    return len(tokenizer(text, add_special_tokens=False).input_ids)



def token_stats_for_table(table: str, has_split_col: bool) -> pd.DataFrame:
    """
    table: 'train_data' or 'test_data'
    has_split_col: True for train_data, False for test_data (we inject split='test')
    """
    
    sql = f"""
        SELECT split, text_pt_br, text_pt_pt
        FROM {table}
        WHERE text_pt_br IS NOT NULL OR text_pt_pt IS NOT NULL
    """
    rel = con.execute(sql)

    # stats[split] = dict with counters
    stats = defaultdict(lambda: {
        "n_rows": 0,
        "tokens_br": 0,
        "tokens_pt": 0,
    })

    BATCH_SIZE = 50_000 # adjust if you want more/less speed vs memory

    while True:
        rows = rel.fetchmany(BATCH_SIZE)  # list of tuples (split, text_pt_br, text_pt_pt)
        if not rows:
            break

        for split, text_pt_br, text_pt_pt in rows:
            s = stats[split]
            s["n_rows"] += 1
            if text_pt_br is not None:
                s["tokens_br"] += count_tokens(text_pt_br)
            if text_pt_pt is not None:
                s["tokens_pt"] += count_tokens(text_pt_pt)

    # convert to DataFrame
    out_rows = []
    for split, s in stats.items():
        n = s["n_rows"] or 1  # avoid division by zero
        out_rows.append({
            "split": split,
            "n_rows": s["n_rows"],
            "total_tokens_pt_br": s["tokens_br"],
            "mean_tokens_pt_br": s["tokens_br"] / n,
            "total_tokens_pt_pt": s["tokens_pt"],
            "mean_tokens_pt_pt": s["tokens_pt"] / n,
        })

    return pd.DataFrame(out_rows)

train_valid_stats = token_stats_for_table("train_data", has_split_col=True)
test_stats        = token_stats_for_table("test_data",  has_split_col=False)

token_len_stats = pd.concat([train_valid_stats, test_stats], ignore_index=True) \
                    .sort_values("split")

token_len_stats
