In [1]:
from pathlib import Path
import duckdb, itertools, pandas as pd
from tqdm import tqdm
from pathlib import Path;  import duckdb, itertools, tqdm, mmap, math

# -------------------------------------------------------------------------
# CONFIG
# -------------------------------------------------------------------------
BATCH_SIZE = 50_000           # rows buffered per INSERT; tune to your RAM/SSD speed
DB_PATH    = Path("../data/duckdb/subs.duckdb")   # DuckDB database file
CACHE_DIR  = Path("../data/opus_cache")           # folder holding the OPUS Moses files

# Show full cell text (no "..." truncation) when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

# -------------------------------------------------------------------------
def find_files(cache_dir=None):
    """
    Locate the Brazilian ('.pt_BR') and European ('.pt') corpus files.

    Parameters
    ----------
    cache_dir : path-like, optional
        Directory to scan. Defaults to the module-level ``CACHE_DIR``.

    Returns
    -------
    (br, pt) : tuple[Path, Path]
        The single file ending in '.pt_BR' and the single file ending in
        plain '.pt'.

    Raises
    ------
    RuntimeError
        If either side is missing, or if more than one candidate exists for
        a side (picking one arbitrarily would pair the wrong files).
    """
    root = CACHE_DIR if cache_dir is None else Path(cache_dir)

    br_hits, pt_hits = [], []
    # sorted() makes the scan (and the error message) deterministic;
    # iterdir() yields entries in arbitrary order.
    for entry in sorted(root.iterdir()):
        if not entry.is_file():
            continue
        if entry.name.endswith(".pt_BR"):      # brazilian sentences
            br_hits.append(entry)
        elif entry.name.endswith(".pt"):       # plain '.pt' → european sentences
            pt_hits.append(entry)

    if len(br_hits) != 1 or len(pt_hits) != 1:
        raise RuntimeError(
            "Could not uniquely identify the two files.\n"
            "Expected one file whose name ends with '.pt_BR' (br side) and\n"
            "one that ends with plain '.pt' (pt-PT side). Check CACHE_DIR!\n"
            f"Found '.pt_BR': {[p.name for p in br_hits]}; "
            f"'.pt': {[p.name for p in pt_hits]}"
        )
    return br_hits[0], pt_hits[0]


# Resolve both corpus files once, when this cell runs; find_files() raises
# RuntimeError if either side cannot be found.
BR_PATH, PT_PATH = find_files()
print("Brazilian file :", BR_PATH.name)
print("European file  :", PT_PATH.name)

# -------------------------------------------------------------------------
# 1) DuckDB schema  (sequence + table   → works in every DuckDB version)
# -------------------------------------------------------------------------
# pair_id is filled from seq_opus_moses through the column DEFAULT, so an
# INSERT only has to supply the two sentence columns.
# NOTE(review): a later cell rebinds DDL with a different opus_moses layout
# (line_no as PRIMARY KEY). Because both use CREATE TABLE IF NOT EXISTS, the
# layout that was created first is the one that stays in the database.
DDL = """
CREATE SEQUENCE IF NOT EXISTS seq_opus_moses START 1;

CREATE TABLE IF NOT EXISTS opus_moses (
    pair_id     BIGINT DEFAULT nextval('seq_opus_moses') PRIMARY KEY,
    sent_pt_br  TEXT,
    sent_pt_pt  TEXT
);
"""

def get_connection():
    """
    Open the DuckDB file at DB_PATH and make sure the schema exists.

    The parent directory is created when missing, then DDL is executed on
    the fresh connection before it is handed back to the caller.
    """
    db_dir = DB_PATH.parent
    db_dir.mkdir(parents=True, exist_ok=True)

    connection = duckdb.connect(str(DB_PATH))
    connection.execute(DDL)
    return connection

def insert_batch(con, rows):
    """
    Insert a batch of (sent_pt_br, sent_pt_pt) tuples into opus_moses.

    Parameters
    ----------
    con : connection object exposing ``executemany(sql, params)``
    rows : sequence of 2-tuples
        One ``(brazilian, european)`` pair per row.

    An empty batch is skipped: there is nothing to insert, and DuckDB's
    ``executemany`` rejects an empty parameter list.
    """
    if not rows:
        return
    con.executemany(
        "INSERT INTO opus_moses (sent_pt_br, sent_pt_pt) VALUES (?, ?)",
        rows
    )

# -------------------------------------------------------------------------
# 2) stream the two files line-by-line and insert
# -------------------------------------------------------------------------
def sentence_pairs():
    """
    Yield aligned (brazilian, european) sentence tuples.

    Both files are read in lock-step as UTF-8 text; the trailing newline of
    each line is removed. Iteration stops as soon as either file ends.
    """
    with BR_PATH.open('r', encoding='utf-8') as br_f:
        with PT_PATH.open('r', encoding='utf-8') as pt_f:
            for line_pair in zip(br_f, pt_f):
                yield tuple(line.rstrip("\n") for line in line_pair)



Brazilian file : OpenSubtitles.pt-pt_BR.pt_BR
European file  : OpenSubtitles.pt-pt_BR.pt


In [2]:
import pandas as pd

def show_context(df: pd.DataFrame,
                 id_value,
                 id_col: str = "pair_id",
                 n: int = 2) -> None:
    """
    Print *n* rows before and after the row whose `id_col` equals `id_value`.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to search.
    id_value : Any
        The value to match in `id_col`.
    id_col : str, default "pair_id"
        Which column contains the identifier.
    n : int, default 2
        How many rows of context to show on each side.

    Raises
    ------
    ValueError
        If no row has `id_value` in `id_col`.
    """
    # Work with *positions*, not index labels: the slice below uses .iloc,
    # so a label taken from df.index would point at the wrong rows whenever
    # the frame does not carry a default 0..N-1 index (filtered, sorted,
    # or re-indexed frames).
    hit_positions = (df[id_col] == id_value).to_numpy().nonzero()[0]
    if hit_positions.size == 0:
        raise ValueError(f"{id_value!r} not found in column {id_col!r}")

    pos = int(hit_positions[0])          # position of the first match
    start = max(0, pos - n)              # clamp to frame boundaries
    end   = min(len(df), pos + n + 1)

    print(df.iloc[start:end].to_string(index=False))


In [3]:
# import duckdb, pathlib, time

# BAK = pathlib.Path("../data/duckdb/subs.duckdb.bak").resolve()

# con = duckdb.connect()                       # open an in-memory connection
# con.execute("SET enable_progress_bar=true")  # show progress for slow steps

# # 1. attach the backup => DuckDB replays the 4 GB WAL (be patient: minutes)
# t0 = time.time()
# con.execute(f"ATTACH '{BAK}' AS bak")
# print("WAL replay finished in", round(time.time()-t0, 1), "s")

# # 2. drop table, index & sequence that belong to opus_moses
# con.execute("DROP TABLE IF EXISTS bak.opus_moses")
# con.execute("DROP SEQUENCE IF EXISTS bak.seq_opus_moses")




In [4]:
# # 3. checkpoint so the 4 GB WAL is folded into the main file  (now 0 B)
# con.execute("PRAGMA force_checkpoint;")
# con.execute("CHECKPOINT;")

# # 4. vacuum to reclaim the table’s disk space
# con.execute("VACUUM")                        # rewrites the file compactly

# # 5. detach and close
# con.execute("DETACH bak")
# con.close()

In [5]:
# from pathlib import Path
# import os, duckdb, time             # ‹-- getsize is in os.path

# DB_PATH = Path("../data/duckdb/subs.duckdb")

# print("DB:", DB_PATH, "size =", os.path.getsize(DB_PATH)/1e6, "MB")
# wal = DB_PATH.with_suffix(".duckdb.wal")
# if wal.exists():
#     print("WAL:", wal, "size =", os.path.getsize(wal)/1e6, "MB")

# t0 = time.time()
# print("Connecting…", flush=True)
# con = duckdb.connect(DB_PATH)
# print("Connected in", round(time.time()-t0,2), "s")

# con.execute("PRAGMA database_size;").show()   # quick sanity check
# con.close()


In [6]:
# con = duckdb.connect("../data/duckdb/subs.duckdb")
# con.execute("PRAGMA force_checkpoint;")
# con.execute("CHECKPOINT;")
# con.close()


In [7]:

# -------------------------------------------------------------------------
# paths to the two files you copied by hand
# -------------------------------------------------------------------------
BATCH_SIZE = 50_000        # change freely

DB_PATH   = Path("../data/duckdb/subs.duckdb")

CACHE_DIR = Path("../data/opus_cache")
PT_PATH   = CACHE_DIR / "OpenSubtitles.pt-pt_BR.pt"       # european side
BR_PATH   = CACHE_DIR / "OpenSubtitles.pt-pt_BR.pt_BR"    # brazilian side

In [8]:


# -------------------------------------------------------------------------
# helper – fast line-count (used only for nice tqdm total)
# -------------------------------------------------------------------------
def count_lines(fp: Path) -> int:
    """
    Count the lines in *fp* without loading the whole file into memory.

    The file is read in fixed-size binary chunks and the newline bytes are
    summed. A final line that is not terminated by a newline is counted
    too, so the result matches the number of lines a line-by-line read
    yields. An empty file returns 0.
    """
    chunk_size = 1 << 20      # 1 MiB per read keeps memory use flat
    newlines = 0
    last_byte = b""
    with fp.open('rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            newlines += chunk.count(b'\n')
            last_byte = chunk[-1:]
    # non-empty file whose last line lacks the trailing '\n'
    if last_byte and last_byte != b'\n':
        newlines += 1
    return newlines
    
print("Counting lines in", BR_PATH.name)

# Line count of the Brazilian file; used further down only as the tqdm total.
TOTAL_LINES = count_lines(BR_PATH)

# -------------------------------------------------------------------------
# schema: de-dupe + unique guard + checkpoint
# -------------------------------------------------------------------------
# This script is executed every time the cell runs, so the DELETE below
# re-scans opus_moses for duplicate text pairs on each run.
# The unique index only *rejects* duplicate (sent_pt_br, sent_pt_pt) pairs;
# the silent skipping comes from the INSERT OR IGNORE used by the load loop.
# NOTE(review): CREATE TABLE IF NOT EXISTS does not alter an existing
# opus_moses created with another layout (e.g. without line_no) — confirm
# the table on disk matches before relying on line_no.
DDL = """
CREATE SEQUENCE IF NOT EXISTS seq_opus_moses START 1;

CREATE TABLE IF NOT EXISTS opus_moses (
     line_no     BIGINT PRIMARY KEY,                 -- 1-based position
     pair_id     BIGINT DEFAULT nextval('seq_opus_moses'),
     sent_pt_br  TEXT,
     sent_pt_pt  TEXT
);

-- 1) remove earlier duplicates, keep the lowest line_no
DELETE FROM opus_moses
USING (
    SELECT line_no,
           ROW_NUMBER() OVER (
               PARTITION BY sent_pt_br, sent_pt_pt
               ORDER BY line_no
           ) AS dup
    FROM opus_moses
) AS tmp
WHERE opus_moses.line_no = tmp.line_no
  AND tmp.dup > 1;

-- 2) future inserts may *try* to add a duplicate: stop them silently
CREATE UNIQUE INDEX IF NOT EXISTS uq_text_pair
    ON opus_moses(sent_pt_br, sent_pt_pt);
"""

# -------------------------------------------------------------------------
# open DB + run the DDL
# -------------------------------------------------------------------------
print("Connecting to DuckDB database:", DB_PATH)
# Read-write connection; it stays open in the module-level name `con` and
# is closed at the end of the load loop.
con = duckdb.connect(DB_PATH)
con.execute("SET checkpoint_threshold='100MB'")
con.execute("SET enable_progress_bar=true")
print("Creating schema if not exists...")
con.execute(DDL)
print("Schema created or already exists.")

# -------------------------------------------------------------------------
# find how many lines are already in the table  (= checkpoint)
# -------------------------------------------------------------------------
# MAX(line_no) is NULL on an empty table, hence the `or 0` fallback.
done = con.execute("SELECT MAX(line_no) FROM opus_moses").fetchone()[0] or 0
print(f"⏩ resuming after line {done:,}")

# -------------------------------------------------------------------------
# iterate over the *remaining* lines only
# -------------------------------------------------------------------------
def sentence_pairs(start_at: int, br_path=None, pt_path=None):
    """
    Yield ``(line_no, brazilian, european)`` for every aligned line pair
    after the first *start_at* pairs.

    Parameters
    ----------
    start_at : int
        Number of leading line pairs to skip (already imported).
    br_path, pt_path : path-like, optional
        Files to read. Default to the module-level ``BR_PATH`` / ``PT_PATH``.

    Yields
    ------
    tuple[int, str, str]
        1-based line number and both sentences without the trailing newline.

    If *start_at* is at or beyond the end of the files, nothing is yielded.
    (Calling ``next()`` on an exhausted file inside a generator would surface
    as ``RuntimeError`` instead.)
    """
    br_path = BR_PATH if br_path is None else Path(br_path)
    pt_path = PT_PATH if pt_path is None else Path(pt_path)

    with br_path.open('r', encoding='utf-8') as br_f, \
         pt_path.open('r', encoding='utf-8') as pt_f:
        # islice consumes and drops the already-imported prefix lazily
        remaining = itertools.islice(zip(br_f, pt_f), start_at, None)
        for ln, (br, pt) in enumerate(remaining, start_at + 1):
            yield ln, br.rstrip("\n"), pt.rstrip("\n")

# -------------------------------------------------------------------------
# main load loop
# -------------------------------------------------------------------------
def _flush_batch(con, rows):
    """
    Bulk-insert buffered (line_no, sent_pt_br, sent_pt_pt) tuples.

    The rows are handed to DuckDB as one pandas DataFrame and inserted with
    a single INSERT ... SELECT, instead of one prepared-statement execution
    per row via executemany. Text pairs repeated inside the batch are dropped
    first, keeping the earliest line_no, so the statement can only conflict
    with rows that are already stored. An empty buffer is a no-op.
    """
    if not rows:
        return
    batch_df = pd.DataFrame(rows, columns=["line_no", "sent_pt_br", "sent_pt_pt"])
    batch_df = batch_df.drop_duplicates(subset=["sent_pt_br", "sent_pt_pt"], keep="first")
    con.register("opus_batch", batch_df)
    try:
        con.execute(
            "INSERT OR IGNORE INTO opus_moses (line_no, sent_pt_br, sent_pt_pt) "
            "SELECT line_no, sent_pt_br, sent_pt_pt FROM opus_batch")
    finally:
        con.unregister("opus_batch")


batch = []
try:
    for ln, src, tgt in tqdm.tqdm(
            sentence_pairs(done),
            total=max(0, TOTAL_LINES - done),   # never hand tqdm a negative total
            desc="Importing", unit="pairs"):

        batch.append((ln, src, tgt))

        if len(batch) >= BATCH_SIZE:
            _flush_batch(con, batch)
            batch.clear()

    # tail
    _flush_batch(con, batch)
    batch.clear()

    con.execute("PRAGMA force_checkpoint;")   # folds WAL pages into the DB
    con.execute("CHECKPOINT;")
    print("✔ import complete; duplicates prevented going forward.")
finally:
    # Also runs on KeyboardInterrupt/errors, so the database file is not
    # left held open by a dangling read-write connection.
    con.close()


Counting lines in OpenSubtitles.pt-pt_BR.pt_BR
Connecting to DuckDB database: ../data/duckdb/subs.duckdb
Creating schema if not exists...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Schema created or already exists.
⏩ resuming after line 16,031,951


Importing:  33%|███▎      | 13849999/42508871 [37:37:47<77:51:53, 102.24pairs/s]


KeyboardInterrupt: 

In [None]:
# Release the connection held in `con` (the import cell above was interrupted
# before it reached its own con.close()).
con.close()

In [None]:
# Re-open the database without write access for the inspection cells below.
con = duckdb.connect(DB_PATH, read_only=True)

In [None]:
# First 1000 raw pairs, ordered by pair_id.
df_raw = con.execute("""
    SELECT pair_id, sent_pt_br, sent_pt_pt, line_no
    FROM opus_moses
    ORDER BY pair_id
    LIMIT 1000
""").df()

# Up to 1000 processed pairs with a score strictly between 0.75 and 0.9.
# NOTE(review): there is no ORDER BY here, so which 1000 rows come back is
# not guaranteed to be the same on every run.
df_proc = con.execute("""
    SELECT pair_id, sent_pt_br, sent_pt_pt, score
    FROM subtitle_pairs_2
    WHERE score > 0.75 AND score < 0.9
    LIMIT 1000
""").df()


In [None]:
# Display the raw opus_moses sample loaded above.
df_raw

Unnamed: 0,pair_id,sent_pt_br,sent_pt_pt,line_no
0,1,"o diretor mueller acaba de nomear nos um número de caso principal, investigações foi oficialmente dublado amerithrax, quemosenviou cartas receberam antraz de um laboratório americano, nós não estaríamos aqui se nós não tinha provas nos conduzindo de volta para usamriid,","Em episódios anteriores... O diretor Mueller atribuiu-nos um caso importante. Oficialmente, a investigação chama-se Amerithrax.",1
1,2,"bruce ivins é o nosso melhor homem com antraz, então eu quero falar com ele, eles estão nos tratando como se fôssemos o inimigo, nós somos os heróis, a entrada tem que ser através do complexo de seu herói,",- Bruce Ivins é o perito em antraz. - Então quero falar com ele. Tratam-nos como o inimigo e somos os heróis.,2
2,3,"eu conheço a maioria dos americanos não estão prestando atenção, mas vocês tiveram sinais que o 11 de setembro iria acontecer, eagoravocênão consegueentender quem enviou essas cartas, então, em vez disso, você está assediando patriotas que trabalham duro como eu,","Temos de entrar através do complexo de herói dele. A maioria dos americanos não presta atenção, mas vocês tiveram sinais de que o 11 de Setembro ia acontecer. E agora não descobrem quem enviou as cartas.",3
3,4,"bruce,vocêé considerando fazer fisicamente mal a alguém?","E, em vez disso, incomodam patriotas trabalhadores como eu. Bruce, está a pensar em magoar alguém?",4
4,5,"você fez isso, dr, ivins?","Fez isto, Dr. Ivins?",5
...,...,...,...,...
995,1010,"Morro, de verdade, de vontades de ver-te.","Morro, de verdade, de vontades de ver-te.",1010
996,1011,Eu também cresço.,Eu também cresço.,1011
997,1012,Dizem que me estou tão grande que agora não me reconheceria.,Dizem que me estou tão grande que agora não me reconheceria.,1012
998,1013,Beijos.,Beijos.,1013


In [None]:
# Display the processed sample (empty in the saved output).
df_proc

Unnamed: 0,pair_id,sent_pt_br,sent_pt_pt,score


In [None]:
# NOTE(review): `df` is not defined by any cell above (the saved output is a
# NameError); pass one of the loaded frames instead — TODO confirm which one.
show_context(df, 12269161)  

NameError: name 'df' is not defined

In [None]:
# Close the read-only inspection connection before re-opening the file below.
con.close()

In [None]:
import duckdb
# Short-lived connection used only to list the tables and the opus_moses
# columns; it is closed at the end of this cell.
con = duckdb.connect("../data/duckdb/subs.duckdb")

# all user tables that live in the file itself (“main” schema)
tables = con.execute("""
    SELECT table_name
    FROM   information_schema.tables
    WHERE  table_schema = 'main'      -- skip temp tables & ATTACH-ed DBs
""").fetchall()

print("tables:", [t[0] for t in tables])


# One row per opus_moses column; only the 'name' field is printed.
info = con.execute("PRAGMA table_info('opus_moses')").df()
print(info['name'].tolist())      # just the column names
# or peek everything:
# print(info)

con.close()


tables: ['movies', 'opus_moses', 'ptbrvarid', 'subtitle_pairs', 'subtitle_pairs_2', 'test_data', 'train_data']
['line_no', 'pair_id', 'sent_pt_br', 'sent_pt_pt']


In [None]:
import re, duckdb, pathlib
import pandas as pd
from fuzzywuzzy import fuzz

# Database file shared by both passes below.
DB = pathlib.Path("../data/duckdb/subs.duckdb")

# keep your scoring surface: score = WEIGHT_TIME + WEIGHT_TEXT * text_similarity
WEIGHT_TIME = 0.3
WEIGHT_TEXT = 0.7

# ---------- helpers ----------
# whitespace that follows sentence-final punctuation (. ? ! …)
SPLIT_RE = re.compile(r'(?<=[\.\?\!…])\s+')
# <markup> tags and {curly} formatting codes
tag_re   = re.compile(r'<[^>]+>|\{[^}]+\}')
# a newline together with the whitespace around it
nl_re    = re.compile(r'\s*\n\s*')

def clean_text(s: str) -> str:
    """
    Flatten newlines to single spaces, drop <...>/{...} tags, and trim.

    Falsy input (None or "") yields "".
    """
    if s:
        single_line = nl_re.sub(' ', s)
        return tag_re.sub('', single_line).strip()
    return ""

def sim_token_set(a: str, b: str) -> float:
    """
    Token-set similarity of two cleaned strings, damped for short fragments.

    The fuzzywuzzy token_set_ratio (scaled to 0..1) is multiplied by one
    length weight per side. A weight is 0.25 for up to 6 word characters,
    1.0 from 24 characters on, and rises linearly in between.
    """
    def length_weight(text):
        # count only "core" characters: strip non-word chars and underscores
        core_len = len(re.sub(r"[\W_]+", "", text, flags=re.UNICODE))
        ramp = 0.25 + (core_len - 6) / 18.0 * (1.0 - 0.25)
        return min(1.0, max(0.25, ramp))

    left = clean_text(a)
    right = clean_text(b)
    similarity = fuzz.token_set_ratio(left, right) / 100.0
    return similarity * length_weight(left) * length_weight(right)

def score_text_only(a: str, b: str) -> float:
    """
    Alignment score with no timing difference (the Δt=0 "Hungarian-like" score).

    Returns WEIGHT_TIME plus WEIGHT_TEXT times the fuzz.ratio similarity
    (0..1) of the two cleaned strings.
    """
    left, right = clean_text(a), clean_text(b)
    text_similarity = fuzz.ratio(left, right) / 100.0
    return WEIGHT_TIME + WEIGHT_TEXT * text_similarity

def split_tail_sentence(text: str):
    """
    Split *text* into (everything before the last sentence, last sentence).

    Returns (None, None) when the text has fewer than two sentences or
    either part would be empty.
    """
    sentences = SPLIT_RE.split((text or "").strip())
    if len(sentences) >= 2:
        *leading, last = sentences
        head = ' '.join(leading).strip()
        tail = last.strip()
        if head and tail:
            return head, tail
    return None, None

def split_head_sentence(text: str):
    """
    Split *text* into (first sentence, everything after it).

    Returns (None, None) when the text has fewer than two sentences or
    either part would be empty.
    """
    sentences = SPLIT_RE.split((text or "").strip())
    if len(sentences) >= 2:
        first, *following = sentences
        head = first.strip()
        rest = ' '.join(following).strip()
        if head and rest:
            return head, rest
    return None, None

# ---------- row-level transforms (pure Python) ----------
def push_tails_forward_rows(rows, margin=0.05):
    """
    Shift the last PT sentence of row i to the front of row i+1 when that
    raises the summed similarity against BR(i) and BR(i+1) by more than
    *margin*.

    rows: list of {line_no, pair_id, sent_pt_br, sent_pt_pt}
    Returns a new list of copied rows; the input dicts are not modified.
    A shift applied to row i+1 is visible when that row is processed next.
    """
    shifted = [row.copy() for row in rows]
    # consecutive pairs share the same dict objects, so edits carry forward
    for cur, nxt in zip(shifted, shifted[1:]):
        pt_cur, br_cur = cur["sent_pt_pt"], cur["sent_pt_br"]
        pt_nxt, br_nxt = nxt["sent_pt_pt"], nxt["sent_pt_br"]

        head, tail = split_tail_sentence(pt_cur)
        if tail:
            score_keep = sim_token_set(pt_cur, br_cur) + sim_token_set(pt_nxt, br_nxt)
            nxt_with_tail = (tail + " " + (pt_nxt or "")).strip()
            score_move = sim_token_set(head, br_cur) + sim_token_set(nxt_with_tail, br_nxt)
            if score_move > score_keep + margin:
                cur["sent_pt_pt"] = head
                nxt["sent_pt_pt"] = nxt_with_tail
    return shifted

def pull_heads_back_rows(rows, margin=0.05):
    """
    Append the first PT sentence of row i+1 to the end of row i when that
    raises the summed similarity against BR(i) and BR(i+1) by more than
    *margin*.

    Returns a new list of copied rows; the input dicts are not modified.
    """
    pulled = [row.copy() for row in rows]
    # consecutive pairs share the same dict objects, so edits carry forward
    for cur, nxt in zip(pulled, pulled[1:]):
        pt_cur, br_cur = cur["sent_pt_pt"], cur["sent_pt_br"]
        pt_nxt, br_nxt = nxt["sent_pt_pt"], nxt["sent_pt_br"]

        head, rest = split_head_sentence(pt_nxt)
        if head:
            score_keep = sim_token_set(pt_cur, br_cur) + sim_token_set(pt_nxt, br_nxt)
            separator = " " if pt_cur else ""
            cur_with_head = ((pt_cur or "").rstrip() + separator + head).strip()
            score_move = sim_token_set(cur_with_head, br_cur) + sim_token_set(rest, br_nxt)
            if score_move > score_keep + margin:
                cur["sent_pt_pt"] = cur_with_head
                nxt["sent_pt_pt"] = rest
    return pulled

# ---------- PASS 1: forward-only (rebuild subtitle_pairs_2 from opus_moses) ----------
def forward_pass_rebuild_pairs(margin=0.05):
    """
    Rebuild subtitle_pairs_2 from opus_moses using the forward (tail-push) pass.

    Steps: read every opus_moses row ordered by line_no, apply
    push_tails_forward_rows, clean and score each pair, then replace the
    contents of subtitle_pairs_2 with the result.

    Parameters
    ----------
    margin : float, default 0.05
        Minimum similarity gain required before a sentence is moved.

    Notes
    -----
    * The old table contents are deleted only after the new rows have been
      computed, so a failure during the long Python phase leaves the
      existing table intact.
    * Rows are written with one INSERT ... SELECT from a pandas DataFrame
      instead of a per-row executemany.
    """
    out_columns = ["line_no", "pair_id", "sent_pt_br", "sent_pt_pt", "score"]

    with duckdb.connect(DB) as con:
        con.execute("""
            CREATE TABLE IF NOT EXISTS subtitle_pairs_2(
                line_no     INTEGER,
                pair_id     BIGINT,
                sent_pt_br  VARCHAR,
                sent_pt_pt  VARCHAR,
                score       DOUBLE
            )
        """)

        df = con.execute("""
            SELECT line_no, pair_id, sent_pt_br, sent_pt_pt
            FROM opus_moses
            ORDER BY line_no
        """).df()
        if df.empty:
            # rebuilding from an empty source still clears the target
            con.execute("DELETE FROM subtitle_pairs_2;")
            print("opus_moses is empty.")
            return

        rows = push_tails_forward_rows(df.to_dict("records"), margin=margin)

        out = []
        for r in rows:
            br = clean_text(r["sent_pt_br"])
            pt = clean_text(r["sent_pt_pt"])
            sc = float(score_text_only(br, pt))
            out.append((int(r["line_no"]), int(r["pair_id"]), br, pt, sc))

        out_df = pd.DataFrame(out, columns=out_columns)
        con.register("pairs_out", out_df)
        try:
            con.execute("DELETE FROM subtitle_pairs_2;")
            con.execute("""
                INSERT INTO subtitle_pairs_2(line_no, pair_id, sent_pt_br, sent_pt_pt, score)
                SELECT line_no, pair_id, sent_pt_br, sent_pt_pt, score
                FROM pairs_out
            """)
        finally:
            con.unregister("pairs_out")

        con.execute("PRAGMA force_checkpoint;")
        print(f"✓ forward pass wrote {len(out):,} rows to subtitle_pairs_2")

# ---------- PASS 2: backward-only (read/modify subtitle_pairs_2 in place) ----------
def backward_pass_update_pairs(margin=0.05):
    """
    Apply the backward (head-pull) pass to subtitle_pairs_2 in place.

    Steps: read subtitle_pairs_2 ordered by line_no, apply
    pull_heads_back_rows, re-clean and re-score every pair, then replace
    the table contents inside one transaction.

    Parameters
    ----------
    margin : float, default 0.05
        Minimum similarity gain required before a sentence is moved.

    Notes
    -----
    * The transaction is rolled back explicitly when the rewrite fails or
      is interrupted, so the previous contents stay in place.
    * Rows are written with one INSERT ... SELECT from a pandas DataFrame
      instead of a per-row executemany.
    """
    out_columns = ["line_no", "pair_id", "sent_pt_br", "sent_pt_pt", "score"]

    with duckdb.connect(DB) as con:
        df = con.execute("""
            SELECT line_no, pair_id, sent_pt_br, sent_pt_pt
            FROM subtitle_pairs_2
            ORDER BY line_no
        """).df()
        if df.empty:
            print("subtitle_pairs_2 is empty — run forward_pass_rebuild_pairs() first.")
            return

        rows = pull_heads_back_rows(df.to_dict("records"), margin=margin)

        out = []
        for r in rows:
            br = clean_text(r["sent_pt_br"])
            pt = clean_text(r["sent_pt_pt"])
            sc = float(score_text_only(br, pt))
            out.append((int(r["line_no"]), int(r["pair_id"]), br, pt, sc))

        # rewrite table in one go (fast + atomic)
        out_df = pd.DataFrame(out, columns=out_columns)
        con.register("pairs_out", out_df)
        try:
            con.execute("BEGIN;")
            try:
                con.execute("DELETE FROM subtitle_pairs_2;")
                con.execute("""
                    INSERT INTO subtitle_pairs_2(line_no, pair_id, sent_pt_br, sent_pt_pt, score)
                    SELECT line_no, pair_id, sent_pt_br, sent_pt_pt, score
                    FROM pairs_out
                """)
                con.execute("COMMIT;")
            except BaseException:
                # includes KeyboardInterrupt: never leave a half-applied rewrite
                con.execute("ROLLBACK;")
                raise
        finally:
            con.unregister("pairs_out")

        con.execute("PRAGMA force_checkpoint;")
        print(f"✓ backward pass updated {len(out):,} rows in subtitle_pairs_2")


In [None]:
# 1) Do the forward pass once (rebuilds the table from opus_moses).
#    This deletes and rewrites every row of subtitle_pairs_2.
forward_pass_rebuild_pairs(margin=0.05)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:
# 2) If/when you want, run the backward pass on the saved result.
#    It reads subtitle_pairs_2, so the forward pass must have filled it first.
backward_pass_update_pairs(margin=0.05)

In [None]:
from fuzzywuzzy import fuzz                 # pip install fuzzywuzzy python-Levenshtein
import re, duckdb, pandas as pd, pathlib

# Database file used by the cells below.
DB = pathlib.Path("../data/duckdb/subs.duckdb")

# same as in your aligner
WEIGHT_TIME = 0.3
WEIGHT_TEXT = 0.7

# --- in-memory text cleaners -------------------------------------------
tag_re   = re.compile(r'<[^>]+>|\{[^}]+\}')   # <tags> and {tags}
nl_re    = re.compile(r'\s*\n\s*')            # newline plus surrounding whitespace

def clean_text(s: str) -> str:
    """
    Flatten newlines to single spaces, drop <...>/{...} tags, and trim.

    Falsy input (None or "") yields "". This definition replaces the earlier
    clean_text in this notebook, whose callers pass values that may be None,
    so it keeps the same guard instead of raising TypeError inside re.sub.
    """
    if not s:
        return ""
    s = nl_re.sub(' ', s)          # eliminate_new_lines
    s = tag_re.sub('', s).strip()  # strip <tags> or {tags}
    return s

def hungarian_like_score(a: str, b: str) -> float:
    """
    Aligner score for Δt = 0: WEIGHT_TIME plus WEIGHT_TEXT times the
    fuzz.ratio similarity (0..1) of *a* and *b*. Inputs are used as given.
    """
    text_similarity = fuzz.ratio(a, b) / 100.0
    return WEIGHT_TIME + WEIGHT_TEXT * text_similarity


In [None]:
from duckdb.typing import VARCHAR, FLOAT   # import the constants

def materialise_new_pairs():
    """
    Clean + score the unseen opus_moses rows and append to subtitle_pairs_2.

    A row is "unseen" when no subtitle_pairs_2 row carries its pair_id.

    Notes
    -----
    * The INSERT names its target columns, so values land in the right
      column whatever the physical column order of subtitle_pairs_2 is.
    * NOT EXISTS replaces NOT IN: a single NULL pair_id in subtitle_pairs_2
      would make NOT IN match nothing.
    * Each sentence is cleaned once in a subquery and reused for the score.
    * The reported number is the actual row-count difference, because
      INSERT OR IGNORE may skip rows.
    """
    with duckdb.connect(DB) as con:
        print("Connecting to", DB)

        # ── 1. how many new rows are there? ────────────────────────────
        todo = con.execute("""
            SELECT COUNT(*)
            FROM opus_moses AS m
            WHERE NOT EXISTS (
                SELECT 1 FROM subtitle_pairs_2 AS s WHERE s.pair_id = m.pair_id
            )
        """).fetchone()[0]

        if todo == 0:
            print("✓ nothing new to process")
            return
        print(f"Found {todo:,} new rows to process")

        # ── 2. register the helpers for this session (DuckDB infers types) ─
        con.create_function("clean_text",           clean_text)
        con.create_function("hungarian_like_score", hungarian_like_score)

        rows_before = con.execute("SELECT COUNT(*) FROM subtitle_pairs_2").fetchone()[0]

        # ── 3. stream-insert, all work done inside DuckDB ──────────────
        con.execute("""
            INSERT OR IGNORE INTO subtitle_pairs_2
                (line_no, pair_id, sent_pt_br, sent_pt_pt, score)
            SELECT  line_no,
                    pair_id,
                    br_clean,
                    pt_clean,
                    hungarian_like_score(br_clean, pt_clean)
            FROM (
                SELECT  m.line_no,
                        m.pair_id,
                        clean_text(m.sent_pt_br) AS br_clean,
                        clean_text(m.sent_pt_pt) AS pt_clean
                FROM   opus_moses AS m
                WHERE  NOT EXISTS (
                    SELECT 1 FROM subtitle_pairs_2 AS s WHERE s.pair_id = m.pair_id
                )
            ) AS cleaned
        """)

        rows_after = con.execute("SELECT COUNT(*) FROM subtitle_pairs_2").fetchone()[0]

        con.execute("PRAGMA force_checkpoint;")     # folds & deletes .wal
        print(f"✓ inserted {rows_after - rows_before:,} rows into subtitle_pairs_2")


In [None]:
# Append the cleaned + scored rows that are not yet in subtitle_pairs_2.
materialise_new_pairs()

Connecting to ../data/duckdb/subs.duckdb
Found 1,497,223 new rows to process


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✓ inserted 1,497,223 rows into subtitle_pairs_2


In [None]:
# Sanity check: total number of rows currently stored in opus_moses.
with duckdb.connect('../data/duckdb/subs.duckdb') as con:
    n_rows = con.execute("SELECT COUNT(*) FROM opus_moses").fetchone()[0]

print("opus_moses has", n_rows, "rows")


opus_moses has 13262988 rows


In [None]:
# Read-only peek at up to 500,000 processed rows.
# NOTE(review): there is no ORDER BY, so which rows are returned is not
# guaranteed to be stable between runs.
con = duckdb.connect(DB_PATH, read_only=True)
df2 = con.execute("""
    SELECT *
    FROM subtitle_pairs_2
    LIMIT 500000
""").df()


In [None]:
# Display the subtitle_pairs_2 sample loaded above.
df2

Unnamed: 0,pair_id,line_no,sent_pt_br,sent_pt_pt,score
0,1667388,1667388,Mas ainda bem que está comigo.,Mas ainda bem que está comigo.,1.000
1,1667389,1667389,"Quando chegarmos ao campo de prisioneiros, vão...",Quando chegarmos ao campo de prisioneiros de g...,0.916
2,1667391,1667391,"Vou sentir a sua falta, Sargento.","Vou sentir a sua falta, Sargento.",1.000
3,1667393,1667393,Não pare!,Não parem!,0.965
4,1667396,1667396,- Ela está aqui?,- Ela está aqui? - Não.,0.874
...,...,...,...,...,...
499995,9528214,9528215,"Vá. Se não for, não funcionará.","Se não fores, a festa não começa.",0.734
499996,9528215,9528216,Conto contigo. Ah...,Vai lá!,0.405
499997,9528216,9528217,Os estúpidos nunca aprendem até que morrem.,Só a morte cura a estupidez!,0.517
499998,9528217,9528218,Mh?,O Exército.,0.300


In [None]:
# NOTE(review): `df` is not assigned in any visible cell; the saved output
# (ValueError) shows the frame it referred to held no row with this pair_id.
# Presumably `df2` was intended — TODO confirm.
show_context(df, 12269161)  

ValueError: 12269161 not found in column 'pair_id'

In [None]:
# Consistency check: take the first SAMPLE_N opus_moses rows (by line_no),
# join them to subtitle_pairs_2 on pair_id, and count how many have both
# sentences equal to the stored text after clean_text().
SAMPLE_N = 200_000              # adjust down if you still hit the limit

with duckdb.connect(DB) as con:
    # expose the Python cleaner to SQL for this connection only
    con.create_function("clean_text", clean_text)

    # SAMPLE_N is an int constant defined above, so interpolating it into
    # the SQL text is safe here.
    equal_rows = con.execute(f"""
        SELECT COUNT(*)
        FROM (
            SELECT
                clean_text(m.sent_pt_br) = s.sent_pt_br  AS br_ok,
                clean_text(m.sent_pt_pt) = s.sent_pt_pt  AS pt_ok
            FROM (
                SELECT * FROM opus_moses
                ORDER  BY line_no
                LIMIT  {SAMPLE_N}
            ) AS m
            JOIN subtitle_pairs_2 AS s USING(pair_id)
        )
        WHERE br_ok AND pt_ok
    """).fetchone()[0]

print(f"{equal_rows:,} of the first {SAMPLE_N:,} opus_moses rows match "
      "subtitle_pairs_2 after cleaning")


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

200,000 of the first 200,000 opus_moses rows match subtitle_pairs_2 after cleaning
