**//IMPORTS**

In [1]:
import duckdb
import pandas as pd, pathlib, itertools, textwrap, re, gc
import numpy as np
import random
import unicodedata


from fuzzywuzzy import fuzz
from rapidfuzz import fuzz, distance
from simalign import SentenceAligner
from typing import Optional, Dict, Any, Tuple, List, Iterable, Union



**//CONFIGS**

In [2]:
DB = '../data/duckdb/subs.duckdb'
pd.set_option("display.max_colwidth", None)

TAG_RE = re.compile(r'<[^>]+>|\{[^}]+\}')
NL_RE  = re.compile(r'\s*\n\s*')
SENT_SPLIT_RE = re.compile(r'(?<=[\.\?\!…])\s+')

ABBREVS = {"dr","dra","sr","sra","srta","prof","profa","etc","av","nº","n.º","vs","p.ex"}
ABBR_RX = re.compile(r'\b(' + '|'.join(re.escape(x) for x in ABBREVS) + r')\.', re.IGNORECASE)

# PREPROCESSING
HARD_STOPS = ".?!…"
SOFT_TAILS = ",;:—–-"
DASHES     = "-–—"

LETTER      = r"[^\W\d_]"                                   # any letter, no digits/underscore
NAME_TOKEN  = rf"{LETTER}(?:{LETTER}|[.'\-])*"              # e.g., Steve, O'Neill, João-Pedro
NAME_PHRASE = rf"{NAME_TOKEN}(?:\s+{NAME_TOKEN}){{0,2}}"    # up to 3-word names

WS = r"(?:\s|\u00A0|\u202F)*"                               # normal/narrow/nbsp
SPEAKER_LABEL_DROP = re.compile(
    rf"(^|[^\w]){NAME_PHRASE}{WS}[:\uFF1A]{WS}",            # keep boundary, drop label
    re.UNICODE
)

WORD = re.compile(r"[^\W\d_]+", re.UNICODE)
# Lightweight PT stopword set for "content-token" accounting
PT_PREPS = {"de","do","da","dos","das","em","no","na","nos","nas","com","para","por","a","ao","à","às","aos"}
PT_DETS  = {"o","a","os","as","um","uma","uns","umas","este","esta","estes","estas","esse","essa","esses","essas","aquele","aquela","aqueles","aquelas"}
PT_CLITICS={"me","te","se","lhe","nos","vos","lhes"}
PT_CONJ  = {"e","ou","mas","nem","que","porque","pois","porém","porem"}
PT_NEG   = {"não","nao"}
PT_STOPWORDS = (PT_PREPS | PT_DETS | PT_CLITICS | PT_CONJ | PT_NEG)

# sentence splitter (no look-behind) for light stats only
_SENT_RE = re.compile(r'.*?[.!?…]+(?:["”»\'\)\]\}]+)?(?=\s|$)|.+?(?=\s|$)', re.UNICODE)

ALIGN_METHOD = "inter"  # alternatives: "inter" (↑recall), "mwmf", "itermax", "union"

# === alignment tokenizer (use ONLY for SimAlign) ===
WS_TOKEN = re.compile(r"\S+", re.UNICODE)



In [None]:
# ---------- cleaning + similarity ----------
def clean_text(s: str) -> str:
    if not s: return ""
    return TAG_RE.sub('', NL_RE.sub(' ', s)).strip()

def sim(a: str, b: str) -> float:
    a = clean_text(a); b = clean_text(b)
    if not a and not b:
        return 1.0
    if not a or not b:
        return 0.0

    # edit-distance core signals (all penalize insertions/deletions)
    s_edit  = fuzz.ratio(a, b) / 100.0
    s_sort  = fuzz.token_sort_ratio(a, b) / 100.0         # order-insensitive but still length-aware
    s_lev   = distance.Levenshtein.normalized_similarity(a, b)  # 0..1

    # penalize big length mismatches (e.g., a is much longer than b)
    lp = min(len(a), len(b)) / max(len(a), len(b))  # 0..1

    # blend; weights are tame and easy to tune
    base = 0.5*s_edit + 0.2*s_sort + 0.3*s_lev
    return base * (0.5 + 0.5*lp)   # shrink score when lengths differ a lot

# ---------- clause split (sentences first, comma/dash fallback) ----------
def mask_abbrevs(t: str) -> str: return ABBR_RX.sub(lambda m: m.group(1)+"§", t or "")
def unmask_abbrevs(t: str) -> str: return (t or "").replace("§",".")

def sentence_split(t: str):
    tt = mask_abbrevs(t or "")
    parts = [unmask_abbrevs(p).strip() for p in SENT_SPLIT_RE.split(tt.strip()) if p.strip()]
    return parts

def split_tail_clause(text: str, max_tail_chars=60):
    parts = sentence_split(text)
    if len(parts) >= 2:
        head = ' '.join(parts[:-1]).strip(); tail = parts[-1].strip()
        if head and tail: return head, tail
    t = (text or "").strip()
    for token in [",", " - ", " – ", " — "]:
        k = t.rfind(token)
        if k != -1 and 1 <= len(t) - (k+len(token)) <= max_tail_chars:
            return t[:k].strip(), t[k+len(token):].strip()
    return None, None

def split_head_clause(text: str, max_head_chars=60):
    parts = sentence_split(text)
    if len(parts) >= 2:
        head = parts[0].strip(); rest = ' '.join(parts[1:]).strip()
        if head and rest: return head, rest
    t = (text or "").strip()
    for token in [",", " - ", " – ", " — "]:
        k = t.find(token)
        if k != -1 and 1 <= k+1 <= max_head_chars:
            return t[:k+len(token)].strip(), t[k+len(token):].strip()
    return None, None

def ok_piece(seg: str, min_chars=6, min_tokens=2):
    toks = [w for w in re.findall(r'\b\w+\b', seg or "", flags=re.UNICODE) if any(c.isalpha() for c in w)]
    return bool(seg) and len(seg) >= min_chars and len(toks) >= min_tokens

def _py_int(x):
    # robust cast for numpy/pandas scalars and plain ints
    if isinstance(x, (np.generic,)):  # np.int64, np.int32, etc.
        return int(x.item())
    return int(x)

def load_opus_window(start_line: int, window: int = 600) -> pd.DataFrame:
    start_line = _py_int(start_line)
    window     = _py_int(window)
    with duckdb.connect(str(DB)) as con:
        df = con.execute("""
            SELECT line_no, pair_id, sent_pt_br, sent_pt_pt
            FROM opus_moses
            WHERE line_no BETWEEN ? AND ?
            ORDER BY line_no
        """, [start_line, start_line + window - 1]).df()
    return df.fillna("")


# ---------- PASS A: neighbor MOVES (choose tail→next or head←next if it increases sum) ----------
def apply_neighbor_moves(df: pd.DataFrame, margin=0.04, max_clause_chars=60):
    df2 = df.copy()
    log = []
    n = len(df2)
    for i in range(n-1):
        for lang, other in (("sent_pt_pt","sent_pt_br"), ("sent_pt_br","sent_pt_pt")):
            L_i,  L_ip1  = df2.at[i,lang],     df2.at[i+1,lang]
            R_i,  R_ip1  = df2.at[i,other],    df2.at[i+1,other]
            keep_sum = sim(L_i,R_i) + sim(L_ip1,R_ip1)

            # option 1: move tail of i -> front of i+1
            head, tail = split_tail_clause(L_i, max_tail_chars=max_clause_chars)
            gain1 = -1e9
            if ok_piece(tail) and ok_piece(head, min_chars=4):
                move_sum1 = sim(head, R_i) + sim((tail + " " + (L_ip1 or "")).strip(), R_ip1)
                gain1 = move_sum1 - keep_sum

            # option 2: move head of i+1 -> end of i
            head2, rest2 = split_head_clause(L_ip1, max_head_chars=max_clause_chars)
            gain2 = -1e9
            if ok_piece(head2) and ok_piece(rest2, min_chars=4):
                move_sum2 = sim(((L_i or "") + (" " if L_i else "") + head2).strip(), R_i) + sim(rest2, R_ip1)
                gain2 = move_sum2 - keep_sum

            # apply the better positive option
            if gain1 > margin and gain1 >= gain2:
                df2.at[i,lang]     = head
                df2.at[i+1,lang]   = (tail + " " + (L_ip1 or "")).strip()
                log.append({"i": i, "lang": lang, "op": "tail_to_next", "gain": float(gain1)})
            elif gain2 > margin and gain2 > gain1:
                df2.at[i,lang]     = (((L_i or "") + (" " if L_i else "") + head2).strip())
                df2.at[i+1,lang]   = rest2
                log.append({"i": i, "lang": lang, "op": "head_from_next", "gain": float(gain2)})
            # else: no move
    return df2, pd.DataFrame(log)

# # ---------- example run on a tiny window ----------
# df  = load_opus_window(start_line=13580016, window=10)

# # A) move commas/clauses across neighbors when it helps the two-row sum
# moved_df, move_log = apply_neighbor_moves(df, margin=0.04, max_clause_chars=60)

# print("Moves:", len(move_log))
# print(move_log.head(10))


In [4]:
def _moved_piece(bi, ai, bip1, aip1, op):
    """Best-effort extract of the moved fragment from before/after strings."""
    if op == "tail_to_next":
        # ai = head; piece = suffix removed from bi
        if bi.startswith(ai):
            return bi[len(ai):].strip()
        # fallback: prefix added to next
        added = max(0, len(aip1) - len(bip1))
        return aip1[:added].strip()
    else:  # "head_from_next"
        # aip1 = rest; piece = prefix removed from bip1
        if len(bip1) > len(aip1):
            return bip1[:len(bip1) - len(aip1)].strip()
        # fallback: suffix added to i
        if ai.startswith(bi):
            return ai[len(bi):].strip()
        return ""

def preview_moves(df_before, df_after, move_log, k=8):
    """
    Show top-k moves with before/after texts, moved fragment, and score deltas.
    Assumes df_before/df_after are the same window (same line_no order).
    """
    if move_log is None or move_log.empty:
        print("No moves to preview.")
        return

    log = move_log.sort_values("gain", ascending=False).head(k)

    for _, r in log.iterrows():
        i   = int(r["i"])
        op  = r["op"]
        lang = r["lang"]
        other = "sent_pt_pt" if lang == "sent_pt_br" else "sent_pt_br"

        # pull rows
        bi   = df_before.at[i,   lang]
        bip1 = df_before.at[i+1, lang]
        ai   = df_after.at[i,    lang]
        aip1 = df_after.at[i+1,  lang]

        Ri   = df_before.at[i,   other]
        Rip1 = df_before.at[i+1, other]  # other side doesn't change during move

        piece = _moved_piece(bi, ai, bip1, aip1, op)

        keep_sum = sim(bi, Ri) + sim(bip1, Rip1)
        new_sum  = sim(ai, Ri) + sim(aip1, Rip1)
        d_i   = sim(ai, Ri)   - sim(bi, Ri)
        d_ip1 = sim(aip1, Rip1) - sim(bip1, Rip1)

        line_i   = int(df_before.at[i,   "line_no"])
        line_ip1 = int(df_before.at[i+1, "line_no"])

        print("\n────────────────────────────────────────")
        print(f"lines {line_i} → {line_ip1} | {lang} | {op} | gain {float(r['gain']):.3f}")
        print(f"moved piece: [{piece}]")
        print(f"sum sim: {keep_sum:.3f} → {new_sum:.3f}  (Δi={d_i:+.3f}, Δi+1={d_ip1:+.3f})")

        print("\n— BEFORE —")
        print(f"i   ({lang}): {bi}")
        print(f"i+1 ({lang}): {bip1}")
        print(f"i   ({other}): {Ri}")
        print(f"i+1 ({other}): {Rip1}")

        print("\n— AFTER —")
        print(f"i   ({lang}): {ai}")
        print(f"i+1 ({lang}): {aip1}")

# # Usage example (with what you already computed):
# df = load_opus_window(start_line=1750340, window=50)
# moved_df, move_log = apply_neighbor_moves(df, margin=0.04, max_clause_chars=60)
# preview_moves(df, moved_df, move_log, k=10)


In [5]:
def preview_window_final_df(
    start_line: int,
    window: int = 50,
    margin: float = 0.04,
    max_clause_chars: int = 60,
    max_iters: int = 5,
) -> pd.DataFrame:
    """
    Run apply_neighbor_moves repeatedly (moves only) on a window until no more moves
    or max_iters is reached. Return a DF with the same columns as opus_moses
    (line_no, pair_id, sent_pt_br, sent_pt_pt) reflecting the FINAL subtitles.
    """
    # load the original window
    cur = load_opus_window(start_line=start_line, window=window)

    # iterate moves to convergence
    for _ in range(max_iters):
        nxt, log = apply_neighbor_moves(cur, margin=margin, max_clause_chars=max_clause_chars)
        if log is None or log.empty:
            break
        cur = nxt

    # return only the opus_moses columns, in order
    return cur.loc[:, ["line_no", "pair_id", "sent_pt_br", "sent_pt_pt"]].copy()

final_df = preview_window_final_df(start_line=6766355, window=20, margin=0.04, max_clause_chars=60, max_iters=5)
final_df

Unnamed: 0,line_no,pair_id,sent_pt_br,sent_pt_pt
0,6766357,6766358,"Muito bem, OK. Ok? Ok.","Está. Bem, tudo bem. Está bem?"
1,6766360,6766361,Adeus.,Tudo bem.
2,6766361,6766362,"Não, não. Rosa, escute.","Adeusinho. Não, não. Rosa, ouça."
3,6766363,6766364,Preciso encontrar a senhora Lieberman. OK.,Preciso de encontrar a Sra. Lieberman.
4,6766365,6766366,"Se não encontrá-la, posso perder meu emprego. Se não entender, diga ""OK"". OK.","Está. Se não a encontrar, posso perder o meu emprego. Se não entender, diga ""Está bem""."
5,6766368,6766369,Ok.,Pronto.
6,6766369,6766370,Adeus.,Pronto.
7,6766370,6766371,Gracias.,Adeusinho.
8,6766371,6766372,Adeus.,Gracias.
9,6766372,6766373,Devolva a bola de gude!,Dá-me esse berlinde.


**//PREPROCESSING**

In [2]:
def _drop_speaker_labels_keep_content(s: str) -> str:
    if not s:
        return s
    # normalize space variants first
    s = (s.replace("\u00A0", " ")
           .replace("\u202F", " ")
           .replace("\u2007", " ")
           .replace("\u2009", " "))

    # keep the boundary, drop the label
    s = SPEAKER_LABEL_DROP.sub(r"\1", s)

    # tidy spacing
    s = re.sub(r"\s+([,.;:!?…)\]\}}])", r"\1", s)
    s = re.sub(r"([(\[\{{«“\"'])\s+", r"\1", s)
    s = re.sub(r"\s{2,}", " ", s).strip()
    return s

def _rstrip_quotes(s): return re.sub(r'[\s"\']+$', '', s or "")
def _last_char(s): 
    t = _rstrip_quotes(s or "").rstrip()
    return t[-1:] if t else ""
def _first_alpha_case(s):
    for ch in (s or ""):
        if ch.isalpha(): return "upper" if ch.isupper() else "lower"
    return None

def _starts_with_dash(s): return bool(re.match(r'^\s*['+re.escape(DASHES)+r']\s*', s or ""))
def _strip_leading_dash(s): return re.sub(r'^\s*['+re.escape(DASHES)+r']\s*', '', s or "")
def _remove_dash_after_punct(s): return re.sub(r'([,\.!\?])\s*['+re.escape(DASHES)+r']\s*', r'\1 ', s or "")

def _normalize_spaces(s):
    s = (s or "").replace("\u00A0", " ")
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'\s+([,.;:?!…])', r'\1', s)
    return s

def _is_all_caps_alpha(s):
    letters = [c for c in (s or "") if c.isalpha()]
    return bool(letters) and all(c.isupper() for c in letters)

def _sentence_case_from_lower(s):
    t, out, cap = (s or "").lower(), [], True
    for ch in t:
        if cap and ch.isalpha(): out.append(ch.upper()); cap=False
        else: out.append(ch)
        if ch in HARD_STOPS: cap=True
    return "".join(out)

def _capitalize_first_alpha(s):
    if not s: return s
    chars=list(s)
    for i,ch in enumerate(chars):
        if ch.isalpha(): chars[i]=ch.upper(); break
    return "".join(chars)

def _normalize_line_text(s):
    if not s: return ""
    s = _strip_leading_dash(s)
    s = _drop_speaker_labels_keep_content(s)
    s = _remove_dash_after_punct(s)
    s = _normalize_spaces(s)
    if _is_all_caps_alpha(s):
        s = _sentence_case_from_lower(s)
    return s

def _join_text(a,b):
    b2 = _strip_leading_dash(b).lstrip()
    if not a: return b2
    if not b2: return a
    return (a.rstrip() + " " + b2).strip()

def _has_inner_hard_stop(s):  # two sentences in one row
    return bool(re.search(r'[.?!…].+\S.*[.?!…]', s or ""))

def _should_merge_pair(br_a, br_b, pt_a, pt_b):
    a_end_br = _last_char(br_a); a_end_pt = _last_char(pt_a)
    b_head_br = _first_alpha_case(br_b); b_head_pt = _first_alpha_case(pt_b)
    hard_br = a_end_br in HARD_STOPS; hard_pt = a_end_pt in HARD_STOPS

    cont_br = ((a_end_br in SOFT_TAILS) or (len(br_a) < 40)) and (b_head_br=="lower" or _starts_with_dash(br_b))
    cont_pt = ((a_end_pt in SOFT_TAILS) or (len(pt_a) < 40)) and (b_head_pt=="lower" or _starts_with_dash(pt_b))

    underseg = (_has_inner_hard_stop(pt_a) and not _has_inner_hard_stop(br_a)) or \
               (_has_inner_hard_stop(br_a) and not _has_inner_hard_stop(pt_a))

    if hard_br and hard_pt and (b_head_br=="upper") and (b_head_pt=="upper") and not underseg:
        return False
    return bool(cont_br or cont_pt or underseg)



def plan_ops_over_corpus(block_size=50_000, reset=False):
    with duckdb.connect(str(DB)) as con:
        lo, hi = con.execute("SELECT MIN(line_no), MAX(line_no) FROM opus_moses").fetchone()
        lo, hi = int(lo), int(hi)

        # reset this run (only when you want a fresh start)
        if reset:
            con.execute("DELETE FROM opus_ops_update")
            con.execute("DELETE FROM opus_ops_delete")
            con.execute("DELETE FROM opus_ops_progress")
            con.execute("INSERT INTO opus_ops_progress VALUES (0)")

        # resume point
        done = int(con.execute("SELECT done_through FROM opus_ops_progress").fetchone()[0])
        cur  = max(lo, done + 1)

        carry = None
        last_br, last_pt = None, None

        while cur <= hi:
            win = min(block_size, hi - cur + 1)
            df = con.execute("""
                SELECT line_no, pair_id, sent_pt_br, sent_pt_pt
                FROM opus_moses
                WHERE line_no BETWEEN ? AND ?
                ORDER BY line_no
            """, [cur, cur+win-1]).df()

            rows = df.to_dict("records")
            if carry is not None:
                rows = [carry] + rows
                carry = None

            updates, deletes = [], []
            i, n = 0, len(rows)
            while i < n:
                base = rows[i]; i += 1
                br = _normalize_line_text(base["sent_pt_br"])
                pt = _normalize_line_text(base["sent_pt_pt"])
                group_lines = [int(base["line_no"])]

                while i < n:
                    nxt = rows[i]
                    br2 = _normalize_line_text(nxt["sent_pt_br"])
                    pt2 = _normalize_line_text(nxt["sent_pt_pt"])
                    if _should_merge_pair(br, br2, pt, pt2):
                        br = _join_text(br, br2)
                        pt = _join_text(pt, pt2)
                        group_lines.append(int(nxt["line_no"]))
                        i += 1
                    else:
                        break

                if i >= n:
                    carry = {"line_no": group_lines[0], "pair_id": int(base["pair_id"]),
                             "sent_pt_br": br, "sent_pt_pt": pt}
                    break

                if last_br and _last_char(last_br) in HARD_STOPS:
                    br = _capitalize_first_alpha(br)
                if last_pt and _last_char(last_pt) in HARD_STOPS:
                    pt = _capitalize_first_alpha(pt)

                head = group_lines[0]
                if br != base["sent_pt_br"] or pt != base["sent_pt_pt"] or len(group_lines) > 1:
                    updates.append({"line_no": head, "sent_pt_br": br, "sent_pt_pt": pt})
                for ln in group_lines[1:]:
                    deletes.append({"line_no": ln})

                last_br, last_pt = br, pt

            if updates:
                con.register("upd", pd.DataFrame(updates))
                con.execute("""
                    INSERT INTO opus_ops_update (line_no, sent_pt_br, sent_pt_pt)
                    SELECT line_no, sent_pt_br, sent_pt_pt FROM upd
                    ON CONFLICT(line_no) DO UPDATE SET
                        sent_pt_br = EXCLUDED.sent_pt_br,
                        sent_pt_pt = EXCLUDED.sent_pt_pt
                """)

                con.unregister("upd")
            if deletes:
                con.register("del", pd.DataFrame(deletes))
                con.execute("""
                    INSERT INTO opus_ops_delete (line_no)
                    SELECT DISTINCT line_no FROM del
                    ON CONFLICT(line_no) DO NOTHING
                """)

                con.unregister("del")

            del df, rows, updates, deletes
            gc.collect()

            # advance + persist resume point
            cur += win
            con.execute("UPDATE opus_ops_progress SET done_through = ?", [cur - 1])

        # flush final carry (on the same connection)
        if carry is not None:
            if last_br and _last_char(last_br) in HARD_STOPS:
                carry["sent_pt_br"] = _capitalize_first_alpha(carry["sent_pt_br"])
            if last_pt and _last_char(last_pt) in HARD_STOPS:
                carry["sent_pt_pt"] = _capitalize_first_alpha(carry["sent_pt_pt"])
            con.register("tail_upd", pd.DataFrame([{
                "line_no": int(carry["line_no"]),
                "sent_pt_br": carry["sent_pt_br"],
                "sent_pt_pt": carry["sent_pt_pt"],
            }]))
            con.execute("""
                INSERT INTO opus_ops_update (line_no, sent_pt_br, sent_pt_pt)
                SELECT line_no, sent_pt_br, sent_pt_pt FROM tail_upd
                ON CONFLICT(line_no) DO UPDATE SET
                    sent_pt_br = EXCLUDED.sent_pt_br,
                    sent_pt_pt = EXCLUDED.sent_pt_pt
            """)
            con.unregister("tail_upd")


# def apply_ops_to_opus_moses():
#     with duckdb.connect(str(DB)) as con:
#         # sanity: no overlap between updates and deletes
#         overlap = con.execute("""
#             SELECT COUNT(*) FROM opus_ops_update u
#             INNER JOIN opus_ops_delete d USING (line_no)
#         """).fetchone()[0]
#         if overlap:
#             raise RuntimeError(f"{overlap} lines in BOTH update & delete; fix plan_ops first.")

#         con.execute("BEGIN")

#         # 1) delete merged-away tails FIRST (avoids transient duplicates)
#         con.execute("""
#             DELETE FROM opus_moses
#             WHERE line_no IN (SELECT line_no FROM opus_ops_delete)
#         """)

#         # 2) then update heads with their merged/cleaned text
#         con.execute("""
#             UPDATE opus_moses AS o
#             SET sent_pt_br = u.sent_pt_br,
#                 sent_pt_pt = u.sent_pt_pt
#             FROM opus_ops_update AS u
#             WHERE o.line_no = u.line_no
#         """)

#         con.execute("COMMIT")
#         con.execute("CHECKPOINT")

def apply_ops_ctas_swap(force_checkpoint=True):
    with duckdb.connect(str(DB)) as con:
        # If a previous tx is half-open on this connection, close it
        try:
            con.execute("ROLLBACK")
        except:
            pass

        # updates win over deletes
        con.execute("""
            DELETE FROM opus_ops_delete
            WHERE line_no IN (SELECT line_no FROM opus_ops_update)
        """)

        con.execute("BEGIN")
        try:
            con.execute("DROP TABLE IF EXISTS opus_moses_new")
            con.execute("""
                CREATE TABLE opus_moses_new AS
                SELECT
                    o.line_no,
                    o.pair_id,
                    COALESCE(u.sent_pt_br, o.sent_pt_br) AS sent_pt_br,
                    COALESCE(u.sent_pt_pt, o.sent_pt_pt) AS sent_pt_pt
                FROM opus_moses o
                LEFT JOIN opus_ops_update u USING (line_no)
                WHERE o.line_no NOT IN (SELECT line_no FROM opus_ops_delete)
                ORDER BY o.line_no
            """)

            con.execute("DROP TABLE opus_moses")
            con.execute("ALTER TABLE opus_moses_new RENAME TO opus_moses")
            con.execute("CREATE UNIQUE INDEX IF NOT EXISTS opus_moses_line_pk ON opus_moses(line_no)")
            con.execute("CREATE UNIQUE INDEX IF NOT EXISTS opus_moses_pair_uq  ON opus_moses(pair_id)")
            con.execute("COMMIT")
        except:
            con.execute("ROLLBACK")
            raise

        if force_checkpoint:
            # waits for other write transactions to finish
            con.execute("FORCE CHECKPOINT")



In [3]:
# plan_ops_over_corpus(block_size=50_000, reset=True)
# apply_ops_ctas_swap()            

**//AFTER PREPROCESSING, RUN THE ALIGNER THROUGH THE WHOLE CORPUS**

In [9]:
import duckdb, pandas as pd, gc

def _run_moves_df(df: pd.DataFrame, margin=0.04, max_clause_chars=60, max_iters=5) -> pd.DataFrame:
    """Repeat apply_neighbor_moves on a DataFrame until no more moves or max_iters."""
    cur = df.loc[:, ["line_no","pair_id","sent_pt_br","sent_pt_pt"]].copy()
    for _ in range(int(max_iters)):
        nxt, log = apply_neighbor_moves(cur, margin=margin, max_clause_chars=max_clause_chars)
        if log is None or log.empty:
            break
        cur = nxt
    return cur

def apply_neighbor_moves_corpus_inplace(
    block_size: int = 50_000,
    overlap: int = 3,                 # rows kept between blocks so moves can cross the seam
    margin: float = 0.04,
    max_clause_chars: int = 60,
    max_iters: int = 5,
):
    """
    Stream over opus_moses and apply your neighbor-move heuristic in-place.
    - Processes in blocks with 'overlap' rows carried forward.
    - Only updates rows that actually changed.
    - No row-count changes (this pass only moves clauses).
    """
    with duckdb.connect(str(DB)) as con:
        lo, hi = con.execute("SELECT min(line_no), max(line_no) FROM opus_moses").fetchone()
        lo, hi = int(lo), int(hi)

        cur_start = lo
        carry_df = None  # last 'overlap' rows of the previous processed block (already moved)

        while cur_start <= hi:
            # choose fetch start/count so we include the carried rows
            if carry_df is None:
                fetch_start = cur_start
                fetch_count = min(block_size, hi - fetch_start + 1)
            else:
                fetch_start = int(carry_df["line_no"].iloc[0])
                fetch_count = min(block_size + overlap, hi - fetch_start + 1)

            # load from DB
            df = con.execute("""
                SELECT line_no, pair_id, sent_pt_br, sent_pt_pt
                FROM opus_moses
                WHERE line_no BETWEEN ? AND ?
                ORDER BY line_no
            """, [fetch_start, fetch_start + fetch_count - 1]).df()

            # overlay carried texts onto the front (so we start from the already-moved boundary)
            if carry_df is not None and not carry_df.empty:
                df = df.merge(carry_df[["line_no","sent_pt_br","sent_pt_pt"]],
                              on="line_no", how="left", suffixes=("","_car"))
                for col in ("sent_pt_br","sent_pt_pt"):
                    rep = df[col + "_car"]
                    df[col] = rep.where(rep.notna(), df[col])
                    df.drop(columns=[col + "_car"], inplace=True)

            # keep a copy for diffing
            orig = df.loc[:, ["line_no","sent_pt_br","sent_pt_pt"]].copy()

            # run your move heuristic on this combined block
            moved = _run_moves_df(df, margin=margin, max_clause_chars=max_clause_chars, max_iters=max_iters)

            # decide how many rows to flush now (keep the last 'overlap' rows for the next block)
            is_last_block = (fetch_start + len(df) - 1) >= hi
            flush_n = len(moved) if is_last_block else max(0, len(moved) - overlap)

            if flush_n:
                out = moved.iloc[:flush_n]
                base = orig.iloc[:flush_n]

                # diffs → only update changed rows
                changed = (out["sent_pt_br"] != base["sent_pt_br"]) | (out["sent_pt_pt"] != base["sent_pt_pt"])
                upd = out.loc[changed, ["line_no","sent_pt_br","sent_pt_pt"]]

                if not upd.empty:
                    con.register("upd", upd)
                    con.execute("""
                        UPDATE opus_moses AS o
                        SET sent_pt_br = u.sent_pt_br,
                            sent_pt_pt = u.sent_pt_pt
                        FROM upd AS u
                        WHERE o.line_no = u.line_no
                    """)
                    con.unregister("upd")

                # next fetch should begin right after the last flushed line
                cur_start = int(out["line_no"].iloc[-1]) + 1
            else:
                # nothing flushed (tiny last block)
                cur_start = fetch_start + len(df)

            # carry the tail (overlap) forward (already moved)
            carry_df = moved.iloc[flush_n:].copy()

            # tidy memory
            del df, orig, moved
            gc.collect()

        # flush any leftover carried rows (end of file)
        if carry_df is not None and not carry_df.empty:
            con.register("upd_tail", carry_df.loc[:, ["line_no","sent_pt_br","sent_pt_pt"]])
            con.execute("""
                UPDATE opus_moses AS o
                SET sent_pt_br = u.sent_pt_br,
                    sent_pt_pt = u.sent_pt_pt
                FROM upd_tail AS u
                WHERE o.line_no = u.line_no
            """)
            con.unregister("upd_tail")

        # reclaim disk space
        con.execute("CHECKPOINT")


In [10]:
# apply_neighbor_moves_corpus_inplace(
#     block_size=50_000,   # tune for your RAM
#     overlap=3,           # 2–3 is plenty for neighbor moves
#     margin=0.04,
#     max_clause_chars=60,
#     max_iters=5
# )


In [11]:
# ==============================
# B) REPEATED-PREFIX CLEANER
# ==============================
def _split_sents(s: str) -> List[str]:
    s = (s or "").strip()
    return [m.group(0).strip() for m in _SENT_RE.finditer(s)]

def _strip_accents(s: str) -> str:
    s = unicodedata.normalize("NFD", s)
    return "".join(ch for ch in s if unicodedata.category(ch) != "Mn")

def _norm_for_match(s: str) -> str:
    s = _strip_accents(s.lower())
    s = re.sub(r"[^\w]+", " ", s, flags=re.UNICODE)
    return re.sub(r"\s+", " ", s).strip()

def _tokens(s: str) -> List[str]:
    return _norm_for_match(s).split()

def _jaccard(a: set, b: set) -> float:
    return len(a & b) / max(1, len(a | b))

def _looks_like_sentence_start(s: str) -> bool:
    t = (s or "").lstrip()
    while t and t[0] in "«“\"([{'’”»": t = t[1:].lstrip()
    return (not t) or t[0].isupper()

def _adjacent_dedup(sents: List[str], jacc=0.96) -> List[str]:
    out: List[str] = []
    for s in sents:
        if out:
            a = set(_tokens(out[-1])); b = set(_tokens(s))
            if _jaccard(a, b) >= jacc:
                continue
        out.append(s)
    return out

def dedup_repeated_prefix_block(
    br_prev: str, pt_prev: str,
    br_here: str, pt_here: str,
    *,
    prev_window: int = 6,
    min_prefix_tokens: int = 6,
    coverage_thresh: float = 0.92,
    require_both: bool = False,
    collapse_adjacent_dups: bool = True
) -> Tuple[str, str, bool, int]:
    """
    Trim from the START of (br_here, pt_here) the longest sentence-aligned prefix
    whose tokens are largely contained in the TAIL of (br_prev, pt_prev).
    Returns: (br_trimmed, pt_trimmed, applied, n_sentences_removed)
    """
    def _count_to_remove(prev: str, nxt: str) -> int:
        prev_s = _split_sents(prev); nxt_s = _split_sents(nxt)
        if not prev_s or not nxt_s: return 0
        tail = " ".join(prev_s[-prev_window:]) if prev_window > 0 else " ".join(prev_s)
        tail_tok = set(_tokens(tail))
        best_k = 0
        for k in range(1, len(nxt_s) + 1):
            pref = " ".join(nxt_s[:k])
            toks = _tokens(pref)
            if len(toks) < min_prefix_tokens: continue
            cov = len(set(toks) & tail_tok) / max(1, len(set(toks)))
            if cov >= coverage_thresh: best_k = k
        return best_k

    k_br = _count_to_remove(br_prev, br_here)
    k_pt = _count_to_remove(pt_prev, pt_here)
    k = min(k_br, k_pt) if require_both else max(k_br, k_pt)
    if k <= 0: return br_here, pt_here, False, 0

    br_s = _split_sents(br_here)[k:]; pt_s = _split_sents(pt_here)[k:]
    if collapse_adjacent_dups:
        br_s = _adjacent_dedup(br_s); pt_s = _adjacent_dedup(pt_s)

    br_out = " ".join(br_s).strip(); pt_out = " ".join(pt_s).strip()
    if br_out and not _looks_like_sentence_start(br_out): return br_here, pt_here, False, 0
    if pt_out and not _looks_like_sentence_start(pt_out): return br_here, pt_here, False, 0
    return br_out, pt_out, True, k

def run_repeated_prefix_cleaner_chunked(
    *,
    db_path=DB,
    table: str = "opus_moses",
    order_col: str = "line_no",
    text_br_col: str = "sent_pt_br",
    text_pt_col: str = "sent_pt_pt",
    id_pair_col: str = "pair_id",
    # knobs (forwarded)
    prev_window: int = 6,
    min_prefix_tokens: int = 6,
    coverage_thresh: float = 0.92,
    require_both: bool = False,
    collapse_adjacent_dups: bool = True,
    # deletion policy
    delete_on_trigger: bool = True,
    delete_if_empty_only: bool = False,
    # execution
    chunk_size: int = 50_000,
    start_line: Optional[int] = None,
    end_line: Optional[int] = None,
    apply_changes: bool = False,
    print_updates: bool = False,
    trace_lines: Optional[Iterable[int]] = None
):
    """
    Walk rows; if dedup applies (either language unless require_both=True):
      - delete whole row (default), or
      - update with trimmed text.
    Prints deleted (line_no, pair_id). Processes in chunks.
    """
    assert not (delete_on_trigger and delete_if_empty_only), \
        "Choose delete_on_trigger=True OR delete_if_empty_only=True (not both)."

    where = []; args = []
    if start_line is not None: where.append(f"{order_col} >= ?"); args.append(int(start_line))
    if end_line   is not None: where.append(f"{order_col} <= ?"); args.append(int(end_line))
    WHERE = ("WHERE " + " AND ".join(where)) if where else ""

    with duckdb.connect(str(db_path)) as con:
        mn, mx = con.execute(
            f"SELECT min({order_col}), max({order_col}) FROM {table} {WHERE}", args
        ).fetchone()
        if mn is None or mx is None:
            print("No rows match selection."); return

        prev_br, prev_pt = "", ""
        cur = int(mn)
        while cur <= int(mx):
            hi = min(cur + int(chunk_size) - 1, int(mx))
            df = con.execute(f"""
                SELECT {order_col} AS line_no,
                       {id_pair_col} AS pair_id,
                       {text_br_col} AS br,
                       {text_pt_col} AS pt
                FROM {table}
                WHERE {order_col} BETWEEN ? AND ?
                ORDER BY {order_col}
            """, [cur, hi]).df()

            updates = []; deletes = []
            deleted_ids_print = []; updated_ids_print = []

            for i in range(len(df)):
                line_no = int(df.line_no.iloc[i])
                pair_id = int(df.pair_id.iloc[i]) if "pair_id" in df.columns else None
                br_here = df.br.iloc[i] or ""; pt_here = df.pt.iloc[i] or ""

                br_new, pt_new, applied, k = dedup_repeated_prefix_block(
                    prev_br, prev_pt, br_here, pt_here,
                    prev_window=prev_window,
                    min_prefix_tokens=min_prefix_tokens,
                    coverage_thresh=coverage_thresh,
                    require_both=require_both,
                    collapse_adjacent_dups=collapse_adjacent_dups
                )

                if trace_lines and (line_no in set(trace_lines)):
                    print(f"[trace {line_no}] applied={applied} k={k}")

                will_delete = False
                if applied:
                    if delete_on_trigger:
                        will_delete = True
                    elif delete_if_empty_only and (not br_new.strip() and not pt_new.strip()):
                        will_delete = True

                if will_delete:
                    deletes.append((line_no,))
                    deleted_ids_print.append((line_no, pair_id))
                    # don't advance prev_* on deletion (use last kept row)
                else:
                    if applied and (br_new != br_here or pt_new != pt_here):
                        updates.append((br_new, pt_new, line_no))
                        if print_updates: updated_ids_print.append((line_no, pair_id))
                        prev_br, prev_pt = br_new, pt_new
                    else:
                        prev_br, prev_pt = br_here, pt_here

            if apply_changes and (updates or deletes):
                con.execute("BEGIN TRANSACTION")
                if updates:
                    con.executemany(
                        f"UPDATE {table} SET {text_br_col} = ?, {text_pt_col} = ? WHERE {order_col} = ?",
                        updates
                    )
                if deletes:
                    con.executemany(
                        f"DELETE FROM {table} WHERE {order_col} = ?",
                        deletes
                    )
                con.execute("COMMIT")

            print(f"[{cur}..{hi}] updates={len(updates)} deletes={len(deletes)}")
            if deleted_ids_print:
                print("  Deleted rows (line_no, pair_id):")
                for j in range(0, len(deleted_ids_print), 1000):
                    block = deleted_ids_print[j:j+1000]
                    print("   ", ", ".join(f"({ln},{pid})" for ln,pid in block))
            if print_updates and updated_ids_print:
                print("  Updated rows (line_no, pair_id):")
                for j in range(0, len(updated_ids_print), 1000):
                    block = updated_ids_print[j:j+1000]
                    print("   ", ", ".join(f"({ln},{pid})" for ln,pid in block))

            cur = hi + 1


In [12]:
# 2) repeated-prefix dedup only
# run_repeated_prefix_cleaner_chunked(
#     db_path=DB,
#     prev_window=6,
#     min_prefix_tokens=6,
#     coverage_thresh=0.92,
#     require_both=False,             # either side may trigger
#     collapse_adjacent_dups=True,
#     delete_on_trigger=True,         # delete whenever dedup applies
#     delete_if_empty_only=False,
#     chunk_size=50_000,
#     start_line=None, end_line=None,
#     apply_changes=True,            # DRY RUN
#     print_updates=False,
#     trace_lines= {18}               # e.g., {18} to debug that row
# )

**//SIMALIGN**

In [None]:
# ==============================
# PURE SimAlign LINKS — Filter & Preview (self-contained)
# ==============================
# What this provides:
#  - SimAlign setup (XLM-R, word-level)
#  - Feature extractor using ONLY raw SimAlign word links (optionally with prev/here/next window)
#  - Flag policy (threshold-based; tuneable)
#  - Preview helpers (numbers-only + [[interior]] / <edge> highlights)
#  - No trimming/mutation logic included
# ==============================

# ---------- tokenization / basics ----------
def tokenize(s: str) -> list[str]:
    return WORD.findall(s or "")

def ali_tokenize(s: str) -> list[str]:
    """Tokens fed to SimAlign (word-level). Keep consistent with ali_char_spans()."""
    return tokenize(s)

def ali_char_spans(text: str) -> list[tuple[int, int]]:
    """Char spans aligned with ali_tokenize()."""
    return [m.span() for m in WORD.finditer(text or "")]

def token_overlap(a: str, b: str) -> float:
    A = set(w.lower() for w in WORD.findall(a or ""))
    B = set(w.lower() for w in WORD.findall(b or ""))
    return (len(A & B) / len(A | B)) if (A and B) else 0.0


def sim2(a: str, b: str, *, method: str = ALIGN_METHOD,
        smooth_small_gaps: int = 1, content_only: bool = False) -> float:
    """
    Similarity = average coverage of aligned tokens on both sides.
    - method: 'itermax' | 'mwmf' | 'inter' (you set ALIGN_METHOD outside)
    - smooth_small_gaps: fill 1-token pinholes if >0
    - content_only: measure coverage over content tokens only (ignores stopwords)
    """
    a = (a or "").strip()
    b = (b or "").strip()
    if not a and not b:
        return 1.0
    if not a or not b:
        return 0.0

    L = ali_tokenize(a)
    R = ali_tokenize(b)
    if not L and not R:
        return 1.0
    if not L or not R:
        return 0.0

    # raw SimAlign links
    out = aligner.get_word_aligns(L, R)
    pairs = out.get(method, out.get("inter", []))  # fall back if needed

    covered_L = {i for i, _ in pairs}
    covered_R = {j for _, j in pairs}

    covL = len(covered_L) / max(1, len(L))
    covR = len(covered_R) / max(1, len(R))

    return 0.5 * (covL + covR)

def is_content_token(tok: str) -> bool:
    t = (tok or "").lower()
    return (t not in PT_STOPWORDS) and (len(t) > 1)


def _split_sents(s: str) -> list[str]:
    s = (s or "").strip()
    return [m.group(0).strip() for m in _SENT_RE.finditer(s)]

# ---------- SimAlign setup ----------
aligner = SentenceAligner(model="xlmr", token_type="word", matching_methods="a")

# ---------- raw pairs + utilities ----------
def _raw_pairs(l_tokens, r_tokens, method=ALIGN_METHOD):
    out = aligner.get_word_aligns(l_tokens, r_tokens)  # keys: "inter", "itermax", "mwmf"
    if method == "itermax":
        return out["itermax"]
    elif method == "union":  # inter ∪ itermax (often a sweet spot)
        return list({*out["inter"], *out["itermax"]})
    else:  # "inter" or "mwmf"
        return out[method]

def spans_from_uncovered(tokens: list[str], covered_idx: set[int]) -> list[tuple[int,int]]:
    spans, cur = [], []
    for i in range(len(tokens)):
        if i not in covered_idx:
            cur.append(i)
        elif cur:
            spans.append((cur[0], cur[-1])); cur = []
    if cur:
        spans.append((cur[0], cur[-1]))
    return spans

def _smooth_small_gaps(covered: set[int], n_tokens: int, max_gap: int = 1) -> set[int]:
    """Fill tiny uncovered holes (≤ max_gap) surrounded by covered tokens (function-word pinholes)."""
    C = set(covered)
    i = 0
    while i < n_tokens:
        if i not in C:
            j = i
            while j < n_tokens and j not in C:
                j += 1
            gap = j - i
            if 0 < gap <= max_gap and i > 0 and j < n_tokens:
                for k in range(i, j):
                    C.add(k)
            i = j
        else:
            i += 1
    return C

def _interior_uncovered(tokens: list[str], covered: set[int]) -> list[tuple[int,int]]:
    """Uncovered runs strictly inside (not touching edges)."""
    runs = spans_from_uncovered(tokens, covered)
    n = len(tokens)
    return [(i0, i1) for (i0, i1) in runs if i0 > 0 and i1 < n - 1]

def _content_count(tokens: list[str]) -> int:
    return sum(1 for t in tokens if is_content_token(t))

# ---------- highlighting ----------
def _split_edge_vs_interior(runs: list[tuple[int,int]], n_tokens: int):
    interior, edges = [], []
    for i0, i1 in runs:
        if i0 > 0 and i1 < n_tokens - 1:
            interior.append((i0, i1))
        else:
            edges.append((i0, i1))
    return interior, edges

def _to_char_spans(token_runs: list[tuple[int,int]], token_char: list[tuple[int,int]]):
    char_runs = []
    for i0, i1 in token_runs:
        if not token_char:
            continue
        i0 = max(0, min(i0, len(token_char) - 1))
        i1 = max(0, min(i1, len(token_char) - 1))
        L = token_char[i0][0]; R = token_char[i1][1]
        char_runs.append((L, R))
    # merge
    char_runs.sort()
    merged = []
    for L, R in char_runs:
        if not merged or L > merged[-1][1]:
            merged.append([L, R])
        else:
            merged[-1][1] = max(merged[-1][1], R)
    return [(L, R) for L, R in merged]

def _apply_highlights(text: str,
                      interior_char: list[tuple[int,int]],
                      edge_char: list[tuple[int,int]],
                      marks=("[[", "]]"), edge_marks=("<", ">")) -> str:
    """Insert [[...]] (interior) and <...> (edge) highlights without breaking indices."""
    tags = []
    for L, R in interior_char:
        tags.append((L, "open_i")); tags.append((R, "close_i"))
    for L, R in edge_char:
        tags.append((L, "open_e")); tags.append((R, "close_e"))
    tags.sort(key=lambda x: (x[0], x[1].startswith("close")))  # close before open at same pos

    out, last = [], 0
    stack = []
    for pos, kind in tags:
        pos = max(0, min(pos, len(text)))
        if pos > last:
            out.append(text[last:pos])
            last = pos
        if kind == "open_i":
            out.append(marks[0]); stack.append("i")
        elif kind == "close_i":
            if stack and stack[-1] == "i":
                stack.pop()
                out.append(marks[1])
        elif kind == "open_e":
            out.append(edge_marks[0]); stack.append("e")
        elif kind == "close_e":
            if stack and stack[-1] == "e":
                stack.pop()
                out.append(edge_marks[1])
    if last < len(text):
        out.append(text[last:])
    return "".join(out)

def _alignment_uncovered_highlights(
    left_text: str, right_prev: str, right_here: str, right_next: str,
    *, use_window: bool
) -> tuple[str, float, float]:
    """
    Highlight uncovered tokens on the left_text using raw SimAlign links.
    Returns (highlighted_text, coverage_ratio, interior_content_ratio).
    """
    left_toks = ali_tokenize(left_text)
    if use_window:
        right_win = ali_tokenize(" ".join(x for x in [right_prev, right_here, right_next] if x))
        pairs = _raw_pairs(left_toks, right_win)
    else:
        pairs = _raw_pairs(left_toks, ali_tokenize(right_here))

    covered = {i for i, _ in pairs}
    covered = _smooth_small_gaps(covered, len(left_toks), max_gap=1)

    n = len(left_toks)
    cov = len(covered) / max(1, n)

    runs_all = spans_from_uncovered(left_toks, covered)
    interior_runs, edge_runs = _split_edge_vs_interior(runs_all, n)

    # interior content ratio
    total_content = sum(1 for t in left_toks if is_content_token(t))
    interior_content = sum(
        1 for i0, i1 in interior_runs for t in left_toks[i0:i1+1] if is_content_token(t)
    )
    interior_content_ratio = interior_content / max(1, total_content)

    token_chars = ali_char_spans(left_text)
    interior_char = _to_char_spans(interior_runs, token_chars)
    edge_char     = _to_char_spans(edge_runs, token_chars)
    hi = _apply_highlights(left_text, interior_char, edge_char)
    return hi, cov, interior_content_ratio

# ---------- feature extractor (pure SimAlign) ----------
def alignment_quality_features(
    br_prev: str, br_here: str, br_next: str,
    pt_prev: str, pt_here: str, pt_next: str,
    *, use_window: bool = True, sim_fn=None
) -> dict:
    """
    Compute alignment metrics using ONLY raw SimAlign word links (+ optional prev/next window).
    """
    if sim_fn is None:
        sim_fn = sim

    br_toks = ali_tokenize(br_here)
    pt_toks = ali_tokenize(pt_here)

    if use_window:
        pt_win = ali_tokenize(" ".join(x for x in [pt_prev, pt_here, pt_next] if x))
        br_pairs = _raw_pairs(br_toks, pt_win)
        br_cov = {i for i, _ in br_pairs}

        br_win = ali_tokenize(" ".join(x for x in [br_prev, br_here, br_next] if x))
        pt_pairs = _raw_pairs(pt_toks, br_win)
        pt_cov = {i for i, _ in pt_pairs}  # left indices of PT tokens
    else:
        pairs = _raw_pairs(br_toks, pt_toks)
        br_cov = {i for i, _ in pairs}
        pt_cov = {j for _, j in pairs}     # approximate right coverage

    # smooth 1-token pinholes
    br_cov = _smooth_small_gaps(br_cov, len(br_toks), max_gap=1)
    pt_cov = _smooth_small_gaps(pt_cov, len(pt_toks), max_gap=1)

    br_cov_ratio = len(br_cov) / max(1, len(br_toks))
    pt_cov_ratio = len(pt_cov) / max(1, len(pt_toks))
    cov_min = min(br_cov_ratio, pt_cov_ratio)
    cov_gap = abs(br_cov_ratio - pt_cov_ratio)

    # interior uncovered runs + content ratio
    br_int_spans = _interior_uncovered(br_toks, br_cov)
    pt_int_spans = _interior_uncovered(pt_toks, pt_cov)

    def _content_in_runs(tokens, runs):
        return sum(1 for i0, i1 in runs for t in tokens[i0:i1+1] if is_content_token(t))

    br_content_total = _content_count(br_toks)
    pt_content_total = _content_count(pt_toks)
    br_int_content = _content_in_runs(br_toks, br_int_spans)
    pt_int_content = _content_in_runs(pt_toks, pt_int_spans)

    br_int_content_ratio = br_int_content / max(1, br_content_total)
    pt_int_content_ratio = pt_int_content / max(1, pt_content_total)

    br_max_int = max((j - i + 1) for i, j in br_int_spans) if br_int_spans else 0
    pt_max_int = max((j - i + 1) for i, j in pt_int_spans) if pt_int_spans else 0

    # “spillover” vs extra-info (same-language neighbors)
    def _span_text(tokens, sp): i0, i1 = sp; return " ".join(tokens[i0:i1+1])
    br_int_text = " ".join(_span_text(br_toks, sp) for sp in br_int_spans)
    pt_int_text = " ".join(_span_text(pt_toks, sp) for sp in pt_int_spans)
    br_spill = max(token_overlap(br_int_text, br_prev), token_overlap(br_int_text, br_next)) if br_int_text else 0.0
    pt_spill = max(token_overlap(pt_int_text, pt_prev), token_overlap(pt_int_text, pt_next)) if pt_int_text else 0.0

    sent_diff = abs(len(_split_sents(br_here)) - len(_split_sents(pt_here)))
    base_sim = sim_fn(br_here, pt_here)

    return {
        "br_cov": br_cov_ratio, "pt_cov": pt_cov_ratio,
        "cov_min": cov_min, "cov_gap": cov_gap,
        "br_int_content_ratio": br_int_content_ratio,
        "pt_int_content_ratio": pt_int_content_ratio,
        "br_max_int": br_max_int, "pt_max_int": pt_max_int,
        "br_spill": br_spill, "pt_spill": pt_spill,
        "sent_diff": sent_diff,
        "base_sim": float(base_sim),
        "br_content_total": br_content_total,
        "pt_content_total": pt_content_total,
    }

# ---------- flag policy (tune to taste) ----------
def alignment_quality_flag(
    feats: dict,
    *,
    min_cov_ok: float = 0.50,
    max_cov_gap: float = 0.35,
    max_int_ratio: float = 0.33,
    max_max_int: int = 9,
    max_sent_diff: int = 1,
    min_sim_ok: float = 0.30,
    spill_tolerance: float = 0.60,
    min_row_content: int = 6,
    min_interior_content_for_flag: int = 3
) -> tuple[bool, str]:
    """
    Decide whether to activate the "alignment-quality" filter.
    Uses only SimAlign-derived features (+ same-language spill check).
    """
    # guards: short rows or already good-enough
    if feats["base_sim"] >= 0.70 and feats["cov_min"] >= 0.60:
        return False, "ok"
    if feats["br_content_total"] < min_row_content and feats["pt_content_total"] < min_row_content:
        return False, "too_short"

    reasons = []
    if feats["cov_min"] < min_cov_ok:
        reasons.append("low_coverage")
    if feats["cov_gap"] > max_cov_gap:
        reasons.append("coverage_asymmetry")
    if feats["br_int_content_ratio"] > max_int_ratio or feats["pt_int_content_ratio"] > max_int_ratio:
        reasons.append("big_interior_unaligned_content")
    if feats["br_max_int"] >= max_max_int or feats["pt_max_int"] >= max_max_int:
        reasons.append("long_interior_gap")
    if feats["sent_diff"] > max_sent_diff:
        reasons.append("sentence_mismatch")
    if feats["base_sim"] < min_sim_ok:
        reasons.append("low_similarity")

    spillish = (feats["br_spill"] >= spill_tolerance) or (feats["pt_spill"] >= spill_tolerance)
    strong = {"low_coverage","coverage_asymmetry","big_interior_unaligned_content","long_interior_gap"}
    strong_hits = len([r for r in reasons if r in strong])

    # need some actual interior content if we accuse "content" reasons
    if {"big_interior_unaligned_content","long_interior_gap"} & set(reasons):
        enough_interior = (feats["br_int_content_ratio"]*feats["br_content_total"] >= min_interior_content_for_flag) or \
                          (feats["pt_int_content_ratio"]*feats["pt_content_total"] >= min_interior_content_for_flag)
        if not enough_interior:
            reasons = [r for r in reasons if r not in {"big_interior_unaligned_content","long_interior_gap"}]

    activate = False
    if strong_hits >= 2:
        activate = True
    elif strong_hits >= 1 and not spillish:
        activate = True
    elif len(reasons) >= 3 and not spillish:
        activate = True

    return bool(activate), (",".join(reasons) if reasons else "ok")

# ---------- previews ----------
def preview_alignment_quality_window(
    start_line: int,
    window: int = 40,
    *,
    db_path=DB,
    use_window: bool = True,
    thresholds: dict | None = None,
    sim_fn=None
) -> pd.DataFrame:
    """Numbers-only preview (no highlights). No DB writes."""
    if thresholds is None:
        thresholds = {}
    with duckdb.connect(str(db_path)) as con:
        df = con.execute("""
            SELECT line_no, pair_id, sent_pt_br, sent_pt_pt
            FROM opus_moses
            WHERE line_no BETWEEN ? AND ?
            ORDER BY line_no
        """, [int(start_line), int(start_line + window - 1)]).df()

    rows = []
    n = len(df)
    for i in range(n):
        br_prev = df.sent_pt_br.iloc[i-1] if i > 0 else ""
        br_here = df.sent_pt_br.iloc[i]
        br_next = df.sent_pt_br.iloc[i+1] if i+1 < n else ""

        pt_prev = df.sent_pt_pt.iloc[i-1] if i > 0 else ""
        pt_here = df.sent_pt_pt.iloc[i]
        pt_next = df.sent_pt_pt.iloc[i+1] if i+1 < n else ""

        feats = alignment_quality_features(
            br_prev, br_here, br_next,
            pt_prev, pt_here, pt_next,
            use_window=use_window, sim_fn=sim_fn or sim
        )
        activate, reason = alignment_quality_flag(feats, **thresholds)

        rows.append({
            "line_no": int(df.line_no.iloc[i]),
            "pair_id": int(df.pair_id.iloc[i]),
            "activate_filter": bool(activate),
            "reason": reason,
            **feats,
        })
    return pd.DataFrame(rows)

def preview_alignment_quality_window_with_highlights(
    start_line: int,
    window: int = 40,
    *,
    db_path=DB,
    use_window: bool = True,
    thresholds: dict | None = None,
    show_when: str = "flagged",   # "flagged" | "all"
    sim_fn=None
) -> pd.DataFrame:
    """Preview with [[INTERIOR]] and <EDGE> highlights using pure SimAlign links."""
    if thresholds is None:
        thresholds = dict(
            min_cov_ok=0.50,
            max_cov_gap=0.35,
            max_int_ratio=0.33,
            max_max_int=9,
            max_sent_diff=1,
            min_sim_ok=0.30,
            spill_tolerance=0.60
        )

    with duckdb.connect(str(db_path)) as con:
        df = con.execute("""
            SELECT line_no, pair_id, sent_pt_br, sent_pt_pt
            FROM opus_moses
            WHERE line_no BETWEEN ? AND ?
            ORDER BY line_no
        """, [int(start_line), int(start_line + window - 1)]).df()

    rows = []
    n = len(df)
    for i in range(n):
        br_prev = df.sent_pt_br.iloc[i-1] if i > 0 else ""
        br_here = df.sent_pt_br.iloc[i]
        br_next = df.sent_pt_br.iloc[i+1] if i+1 < n else ""

        pt_prev = df.sent_pt_pt.iloc[i-1] if i > 0 else ""
        pt_here = df.sent_pt_pt.iloc[i]
        pt_next = df.sent_pt_pt.iloc[i+1] if i+1 < n else ""

        feats = alignment_quality_features(
            br_prev, br_here, br_next,
            pt_prev, pt_here, pt_next,
            use_window=use_window, sim_fn=sim_fn or sim
        )
        activate, reason = alignment_quality_flag(feats, **thresholds)
        if show_when == "flagged" and not activate:
            continue

        br_hi, _, _ = _alignment_uncovered_highlights(
            br_here, pt_prev, pt_here, pt_next, use_window=use_window
        )
        pt_hi, _, _ = _alignment_uncovered_highlights(
            pt_here, br_prev, br_here, br_next, use_window=use_window
        )

        rows.append({
            "line_no": int(df.line_no.iloc[i]),
            "pair_id": int(df.pair_id.iloc[i]),
            "activate_filter": bool(activate),
            "reason": reason,
            "base_sim": feats["base_sim"],
            "br_cov": feats["br_cov"], "pt_cov": feats["pt_cov"],
            "cov_gap": feats["cov_gap"],
            "br_int_content_ratio": feats["br_int_content_ratio"],
            "pt_int_content_ratio": feats["pt_int_content_ratio"],
            "spillish": max(feats["br_spill"], feats["pt_spill"]),
            "br_highlight": br_hi,   # [[INTERIOR]] and <EDGE>
            "pt_highlight": pt_hi,
        })
    return pd.DataFrame(rows)


2025-09-05 13:05:28,341 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: xlm-roberta-base


In [None]:

hq = preview_alignment_quality_window_with_highlights(
    start_line=15, window=30,
    use_window=True,           # align against prev+here+next window
    show_when="all",
    thresholds=dict(
        min_cov_ok=0.50,
        max_cov_gap=0.35,
        max_int_ratio=0.25,
        max_max_int=9,
        max_sent_diff=1,
        min_sim_ok=0.30,
        spill_tolerance=0.60
    )
)


In [43]:
hq

Unnamed: 0,line_no,pair_id,activate_filter,reason,base_sim,br_cov,pt_cov,cov_gap,br_int_content_ratio,pt_int_content_ratio,spillish,br_highlight,pt_highlight
0,21,21,True,"big_interior_unaligned_content,long_interior_gap,sentence_mismatch",0.388742,0.650794,0.8,0.149206,0.333333,0.207547,0.081395,"Não, mas esses buracos são grandes o suficiente para as mãos deslizarem, com luvas e [[punho anéis afixados a]] ele, pode ser hermético, uma caixa [[de biossegurança improvisada, certifique-se de que]] entra [[em um caso]] biológico, se houver algum esporos deixados lá, eu quero [[ter certeza]] que eles não ser sacudido durante um acidentado carona para [[o laboratório]], sim senhor, entendido, [[nós cuidaremos disso, tome cuidado, certo, k-6, vamos para k]]-6, odeio diga que eu disse [[a você]], bem, nós vamos conseguir para testá-[[lo imediatamente]], saberemos [[em breve, você foi]] puxado na equipe de sykes também? [[Parecia evitar]] o sétimo círculo do inferno por mais um dia, eu pensei [[em você estavam]] seguindo ivins, eu sou, o que [[que diabos ele]] está fazendo aqui?","Não. Mas esses buracos têm tamanho suficiente para mãos. Com luvas e [[material próprio]], podia ser estanque. [[Ponham isso]] numa caixa biológica. Se tiver esporos, não quero que se soltem durante a viagem. Sim, entendido. [[Para trás. 86436. Detesto dizer-te]] ""eu bem te disse"". Vamos testá-la e ficaremos [[a saber]]. Também vieste para a equipa [[do Sykes? Evitei]] o Sétimo Círculo do Inferno por mais um dia. Não [[andavas a]] seguir o Ivins? Ando. Que raio faz ele aqui?"
1,27,27,True,"big_interior_unaligned_content,sentence_mismatch",0.402956,0.602273,0.811321,0.209048,0.423729,0.194444,0.075472,"Não pode ser coincidência, temos um não seguro pessoa na tenda do voluntário, casaco marrom, chapéu preto, remova-[[o imediatamente]], entendido, reabastecer para um café? Por aqui, não [[não, isto é um erro]], isso, [[uh, eu não entendo]], eu [[tenho o, eu tenho o alerta]] de [[alta liberação, isso significa]] que eu trabalho [[para o governo, eu trabalho]] para o exército, O que você está filmando? Isto é um erro, [[eu estou, senhor, eu]] preciso te colocar no [[carro, senhor, no]] carro [[agora, ok]], então você estava lá?","Não pode ser coincidência. Temos uma pessoa não autorizada na tenda de voluntários. Casaco castanho, chapéu preto. [[Querem um]] café? Isto é um engano. Não [[percebo. Tenho autorização]] elevada. O que está a filmar? [[Isto é]] um engano, eu... Preciso que entre no carro. [[Entre já no]] carro. Está bem. Então esteve lá?"
2,36,36,False,ok,0.40488,0.852941,1.0,0.147059,0.153846,0.0,0.010638,"Não posso [[divulgue quaisquer]] detalhes, mas eles me trouxeram para ajudar por causa da minha experiência, isso foi [[tudo sobre]] as notícias de hoje, uh, você viu alguma coisa? Eles tinham alguma nova <evidência>?","Não posso falar de pormenores, mas pediram-me ajuda devido às minhas habilitações. Passou nas notícias. Viu alguma coisa, têm provas novas?"
3,38,38,True,"big_interior_unaligned_content,long_interior_gap,sentence_mismatch",0.388779,0.542636,0.807692,0.265057,0.387097,0.134615,0.075758,"<Eu> não posso divulgar isso, não alcancei [[o nível de]] liberação eu tenho por sendo um tagarela, [[soa como você estava realmente]] essencial, [[bruce, você se]] sentiu [[necessário? Você]] sabe, dr, halstrom, [[eu comecei]] a perceber no meu vida que existem dois tipos de pessoas, [[há pessoas cujo trabalho é é sentar em uma cadeira]] o dia [[todo e]] fazer perguntas, e então há alguns poucos habilidosos de nós que encontram respostas para eles, um é claramente mais necessário do que o outro, [[bom para]] ir, obrigada, [[eu aprecio isso, meu]] amigo, vejo [[você em]] breve, bem aqui, por favor, sim [[senhor, obrigada, tudo bem]], pare [[aí, ok, faça isso novamente]] com o outro, [[sim, use]] a <luva, sim, uh-hmm, sim, o que você quiser, uma caneta ou>,?","<Não> posso dizer. Não tenho uma autorização elevada por [[falar de mais. Parece que]] foi essencial, Bruce. Sentiu [[que precisavam]] de si? Sabe, Dra. Halstrom? Já percebi que há dois tipos de pessoas. [[Há as]] que passam o dia sentadas a fazer perguntas e há alguns de nós, mais habilitados, [[que lhes arranjamos]] respostas. Um [[dos tipos]] é mais necessário que o outro. Podemos ir. Obrigado, amigo. Até breve. Aqui, por favor. Pare. Repita com a outra. Sim."


In [6]:
def ali_tokenize(s: str) -> list[str]:
    return [m.group(0) for m in WS_TOKEN.finditer(s or "")]

def ali_char_spans(s: str) -> list[tuple[int,int]]:
    return [m.span() for m in WS_TOKEN.finditer(s or "")]

def _raw_pairs(left_tokens: list[str], right_tokens: list[str], method: str = ALIGN_METHOD):
    """Call SimAlign safely; fall back on truncation and available methods."""
    if not left_tokens or not right_tokens:
        return []
    try:
        out = aligner.get_word_aligns(left_tokens, right_tokens)
    except Exception:
        lt, rt = left_tokens[:300], right_tokens[:300]  # rare long-line guard
        try:
            out = aligner.get_word_aligns(lt, rt)
        except Exception:
            return []
    if method in out:
        return out[method]
    for m in ("itermax","mwmf","inter"):
        if m in out:
            return out[m]
    return []


# -------- reservoir sampling (keeps a bounded, uniform-ish sample) -------
def _reservoir_add(reservoir: list, item: dict, cap: int, seen_counter: int):
    if cap <= 0:
        return seen_counter + 1
    if len(reservoir) < cap:
        reservoir.append(item)
    else:
        j = random.randint(0, seen_counter)
        if j < cap:
            reservoir[j] = item
    return seen_counter + 1


# -------- main audit: fixed thresholds, no adaptive policy ----------------
def audit_alignment_filter_chunked_fixed(
    *,
    db_path=DB,
    table: str = "opus_moses",
    order_col: str = "line_no",
    id_col: str = "pair_id",
    br_col: str = "sent_pt_br",
    pt_col: str = "sent_pt_pt",
    start_line: Optional[int] = None,
    end_line:   Optional[int] = None,
    chunk_size: int = 50_000,
    # filter policy (FIXED per run)
    thresholds: Dict[str, Any] = dict(
        min_cov_ok=0.50,
        max_cov_gap=0.35,
        max_int_ratio=0.33,
        max_max_int=9,
        max_sent_diff=1,
        min_sim_ok=0.30,
        spill_tolerance=0.60
    ),
    use_window: bool = True,
    sim_fn=None,
    # how many examples to keep in memory
    max_store_flagged: int = 1000,
    max_store_passed: int  = 1000,
    seed: int = 13,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Iterate in chunks, compute alignment_quality_features + alignment_quality_flag
    with FIXED thresholds. Print per-chunk stats.
    Returns (flagged_df, passed_df, summary_df). No highlights, no DB writes.
    """
    random.seed(seed)

    with duckdb.connect(str(db_path)) as con:
        # build range
        where, args = [], []
        if start_line is not None:
            where.append(f"{order_col} >= ?"); args.append(int(start_line))
        if end_line is not None:
            where.append(f"{order_col} <= ?"); args.append(int(end_line))
        WHERE = ("WHERE " + " AND ".join(where)) if where else ""

        mn, mx = con.execute(
            f"SELECT MIN({order_col}), MAX({order_col}) FROM {table} {WHERE}", args
        ).fetchone()
        if mn is None or mx is None:
            print("No rows to audit.")
            return (pd.DataFrame(), pd.DataFrame(), pd.DataFrame())

        total_rows = 0
        total_flagged = 0
        summaries = []

        # sample reservoirs
        flagged_res, passed_res = [], []
        seen_flagged = seen_passed = 0

        cur = int(mn)
        while cur <= int(mx):
            hi = min(cur + int(chunk_size) - 1, int(mx))
            df = con.execute(f"""
                SELECT {order_col} AS line_no, {id_col} AS pair_id,
                       {br_col} AS br, {pt_col} AS pt
                FROM {table}
                WHERE {order_col} BETWEEN ? AND ?
                ORDER BY {order_col}
            """, [cur, hi]).df()

            if df.empty:
                cur = hi + 1
                continue

            chunk_rows = len(df)
            chunk_flagged = 0

            for i in range(chunk_rows):
                br_prev = df.br.iloc[i-1] if i > 0 else ""
                br_here = df.br.iloc[i]
                br_next = df.br.iloc[i+1] if i+1 < chunk_rows else ""

                pt_prev = df.pt.iloc[i-1] if i > 0 else ""
                pt_here = df.pt.iloc[i]
                pt_next = df.pt.iloc[i+1] if i+1 < chunk_rows else ""

                feats = alignment_quality_features(
                    br_prev, br_here, br_next,
                    pt_prev, pt_here, pt_next,
                    use_window=use_window, sim_fn=sim_fn or sim
                )

                # alignment_quality_flag may return (activate, reason) OR (activate, reason, flags)
                res = alignment_quality_flag(feats, **thresholds)
                if isinstance(res, tuple) and len(res) == 3:
                    activate, reason_str, flags = res
                else:
                    activate, reason_str = res
                    flags = {}

                row_info = {
                    "line_no": int(df.line_no.iloc[i]),
                    "pair_id": int(df.pair_id.iloc[i]),
                    "activate_filter": bool(activate),
                    "reason": reason_str,
                    "base_sim": feats["base_sim"],
                    "br_cov": feats["br_cov"], "pt_cov": feats["pt_cov"],
                    "cov_gap": feats["cov_gap"],
                    "br_int_content_ratio": feats["br_int_content_ratio"],
                    "pt_int_content_ratio": feats["pt_int_content_ratio"],
                    "br_max_int": feats["br_max_int"], "pt_max_int": feats["pt_max_int"],
                    "spillish": max(feats["br_spill"], feats["pt_spill"]),
                    # include sentence mismatch if available
                    "sent_diff": feats.get("sent_diff", None),
                }
                # keep reason flags if your policy returns them
                row_info.update({k: v for k, v in flags.items()})

                if activate:
                    chunk_flagged += 1
                    seen_flagged = _reservoir_add(flagged_res, row_info, max_store_flagged, seen_flagged)
                else:
                    seen_passed = _reservoir_add(passed_res, row_info, max_store_passed, seen_passed)

            total_rows    += chunk_rows
            total_flagged += chunk_flagged
            frac = (chunk_flagged / chunk_rows) if chunk_rows else 0.0
            print(f"[{cur}..{hi}] rows={chunk_rows} flagged={chunk_flagged} ({frac:.1%})")

            summaries.append({
                "chunk_start": cur, "chunk_end": hi,
                "rows": chunk_rows, "flagged": chunk_flagged, "ratio": frac
            })

            cur = hi + 1

        overall = (total_flagged / total_rows) if total_rows else 0.0
        print(f"\nTOTAL rows={total_rows} flagged={total_flagged} ({overall:.1%})")

        flagged_df = pd.DataFrame(flagged_res)
        passed_df  = pd.DataFrame(passed_res)
        summary_df = pd.DataFrame(summaries)

        return flagged_df, passed_df, summary_df


In [None]:
flagged_df, passed_df, summary = audit_alignment_filter_chunked_fixed(
    start_line=1,            # or None to scan all
    end_line=200000,
    chunk_size=50_000,
    thresholds=dict(         # your fixed policy
        min_cov_ok=0.50,
        max_cov_gap=0.35,
        max_int_ratio=0.25,
        max_max_int=9,
        max_sent_diff=1,
        min_sim_ok=0.30,
        spill_tolerance=0.60
    ),
    use_window=True,
    max_store_flagged=1500,  # how many examples to keep for inspection
    max_store_passed=1500
)

[1..50000] rows=41582 flagged=3583 (8.6%)
[50001..100000] rows=40318 flagged=3655 (9.1%)
[100001..150000] rows=40464 flagged=3481 (8.6%)
[150001..200000] rows=39129 flagged=4551 (11.6%)

TOTAL rows=161493 flagged=15270 (9.5%)


In [None]:
summary.head(), summary["ratio"].describe()

# samples of IDs that flagged vs passed
passed_df.head(20)[["line_no","pair_id","br_cov","pt_cov","cov_gap","base_sim"]]
flagged_df.head(20)[["line_no","pair_id","reason","br_cov","pt_cov","cov_gap","base_sim"]]


Unnamed: 0,line_no,pair_id,reason,br_cov,pt_cov,cov_gap,base_sim
0,118086,118086,coverage_asymmetry,1.0,0.571429,0.428571,0.585207
1,147682,147682,"low_coverage,coverage_asymmetry,big_interior_unaligned_content",0.46875,1.0,0.53125,0.360213
2,71103,71103,low_coverage,0.428571,0.625,0.196429,0.443878
3,92292,92292,"low_coverage,coverage_asymmetry,big_interior_unaligned_content",1.0,0.4,0.6,0.355494
4,101935,101935,coverage_asymmetry,1.0,0.625,0.375,0.39444
5,168486,168486,"low_coverage,coverage_asymmetry,big_interior_unaligned_content,sentence_mismatch",0.368421,1.0,0.631579,0.477645
6,33603,33603,"low_coverage,sentence_mismatch,low_similarity",0.555556,0.25,0.305556,0.291908
7,85756,85756,"low_coverage,big_interior_unaligned_content,sentence_mismatch",0.363636,0.3,0.063636,0.300416
8,167771,167771,"low_coverage,big_interior_unaligned_content,sentence_mismatch,low_similarity",0.263158,0.3125,0.049342,0.281618
9,162093,162093,big_interior_unaligned_content,0.636364,0.6,0.036364,0.610884


In [None]:
hq = preview_alignment_quality_window_with_highlights(
    start_line=118115, window=30,
    use_window=True,           # align against prev+here+next window
    show_when="all",
    thresholds=dict(
        min_cov_ok=0.50,
        max_cov_gap=0.35,
        max_int_ratio=0.25,
        max_max_int=9,
        max_sent_diff=1,
        min_sim_ok=0.30,
        spill_tolerance=0.60
    )
)

hq

Unnamed: 0,line_no,pair_id,activate_filter,reason,base_sim,br_cov,pt_cov,cov_gap,br_int_content_ratio,pt_int_content_ratio,spillish,br_highlight,pt_highlight
0,118115,118115,False,ok,0.63628,1.0,0.875,0.125,0.0,0.0,0.0,Sinto muito não poder compartilhar da sua fé na...,"<Me> desculpe, não posso compartilhar sua fé em..."
1,118116,118116,False,ok,1.0,1.0,1.0,0.0,0.0,0.0,0.0,Wells e Fargo.,Wells e Fargo.
2,118117,118117,False,ok,1.0,1.0,1.0,0.0,0.0,0.0,0.0,"Sim, sim, Wells e Fargo.","Sim, sim, Wells e Fargo."
3,118118,118118,False,ok,0.743698,0.833333,0.842105,0.008772,0.083333,0.0,0.0,"<Realmente,> não vejo uma maneira de servi-lo, senhor Pryor. Bem, pensei que fossem [[capazes de]] abrir uma exceção.","<Mas, eu realmente> não vejo, uma maneira de atendê-lo, Sr. Pryor. Bem, pensei que você pudesse abrir uma exceção."
4,118120,118120,False,ok,0.984487,1.0,1.0,0.0,0.0,0.0,0.0,Será um empréstimo tão pequeno e minha reputação aqui nesta comunidade...,"Será um empréstimo tão pequeno e minha reputação, aqui nesta comunidade..."
5,118121,118121,False,ok,0.644963,0.833333,1.0,0.166667,0.0,0.0,0.0,"Desculpe, senhor Pryor. Bom dia, <cavalheiros.>","Desculpe, Sr. Pryor. Bom Dia senhores."
6,118125,118125,False,ok,1.0,1.0,1.0,0.0,0.0,0.0,0.0,Senhor Pryor?,Senhor Pryor?
7,118126,118126,False,too_short,1.0,0.5,0.5,0.0,0.0,0.0,0.0,"<Sim,> Hank.","<Sim,> Hank."
8,118127,118127,False,ok,0.53106,0.769231,1.0,0.230769,0.111111,0.0,0.0,Eu tenho recusado [[uma porção de]] empregos esperando a saída de sua expedição.,"Recusei muitos empregos que aguardavam, a partida de sua expedição."
9,118128,118128,False,ok,0.812936,1.0,1.0,0.0,0.0,0.0,0.0,Eu sabia que o senhor iria quer que eu e Pawnee fossemos como no ano passado.,Eu sabia que você gostaria que eu e Pawnee fôssemos como no ano passado.


In [24]:
flagged_df[flagged_df['line_no'] == 97231]

Unnamed: 0,line_no,pair_id,activate_filter,reason,base_sim,br_cov,pt_cov,cov_gap,br_int_content_ratio,pt_int_content_ratio,br_max_int,pt_max_int,spillish,sent_diff
309,97231,97231,True,ok,0.437037,0.727273,0.75,0.022727,0.285714,0.142857,3,3,0.1,1


**//GIZA++**

In [None]:
# ==== BOTH-SIDE PREVIEW with DIAGONAL-BAND FILTER (catches edge spillovers) ====
import io, html, duckdb, pandas as pd
from pathlib import Path
from IPython.display import display, HTML

# paths
BR_PATH    = "../data/corpus.clean.br"
PT_PATH    = "../data/corpus.clean.pt"
ALIGN_PATH = "../work_clean/model/aligned.intersect"   # stricter file recommended
PAIR2IDX   = "../data/pairid_to_idx.tsv"               # optional
KEEP_IDX   = "../data/keep.idx"                        # optional
DB_PATH    = "../data/duckdb/subs.duckdb"
DB_TABLE   = "opus_moses"

PAIR_ID = 123456   # <--- set your pair_id
WINDOW  = 20

# content heuristics
PT_STOP = {"de","do","da","dos","das","em","no","na","nos","nas","com","para","por","a","ao","à","às","aos",
           "o","a","os","as","um","uma","uns","umas","este","esta","estes","estas","esse","essa","esses","essas",
           "aquele","aquela","aqueles","aquelas","me","te","se","lhe","nos","vos","lhes","e","ou","mas","nem",
           "que","porque","pois","porém","porem","não","nao"}
DISFL = {"uh","uhm","uhmm","hã","aham","é","uh-hmm","uh-hum","hum","humm"}
def is_punct(t): return all(ch in ",.;:!?…—–-()[]{}\"'«»“”" for ch in t)
def is_content(t):
    t=t.lower(); return (t not in PT_STOP) and (t not in DISFL) and (not is_punct(t)) and (len(t)>1)

def parse_links(line):
    out=set()
    for z in (line or "").split():
        if "-" in z:
            i,j=z.split("-"); out.add((int(i),int(j)))
    return out

def get_line(path, k):
    with io.open(path,"r",encoding="utf8") as f:
        for i,ln in enumerate(f):
            if i==k: return ln.rstrip("\n")
    return ""

# diagonal-band gating: keep only links close to the diagonal
def band_filter_links(links, len_s, len_t, frac=0.20, abs_px=3):
    """Keep (i,j) if | j - (i*len_t/len_s) | <= max(abs_px, frac*len_t)."""
    if len_s==0 or len_t==0: return set()
    out=set()
    band = max(abs_px, int(round(frac*len_t)))
    ratio = len_t/len_s
    for i,j in links:
        target_on_diag = i*ratio
        if abs(j - target_on_diag) <= band:
            out.add((i,j))
    return out

# mappings
pair2orig={}
if Path(PAIR2IDX).exists():
    with io.open(PAIR2IDX,"r",encoding="utf8") as f:
        for ln in f:
            pid, idx = ln.rstrip("\n").split("\t")
            pair2orig[int(pid)] = int(idx)
orig0_to_clean=None; clean_to_orig0=None
if Path(KEEP_IDX).exists():
    keep=[int(x.strip()) for x in io.open(KEEP_IDX,"r",encoding="utf8")]
    orig0_to_clean={orig1-1: c for c,orig1 in enumerate(keep)}
    clean_to_orig0=[orig1-1 for orig1 in keep]

def map_pair_to_clean(pid):
    orig0 = pair2orig.get(pid, pid)
    if orig0_to_clean is None: return orig0, orig0
    return orig0_to_clean.get(orig0), orig0

def highlight(tokens, covered_idx):
    bits=[]
    for i,tok in enumerate(tokens):
        txt=html.escape(tok)
        if (i not in covered_idx) and is_content(tok):
            bits.append(f'<span style="background:#ffe08a">{txt}</span>')
        else:
            bits.append(txt)
    return " ".join(bits)

def edge_runs(tokens, covered_idx):
    n=len(tokens)
    pre=suf=0
    for i in range(n):
        if (i not in covered_idx) and is_content(tokens[i]): pre+=1
        else: break
    for i in range(n-1,-1,-1):
        if (i not in covered_idx) and is_content(tokens[i]): suf+=1
        else: break
    return pre, suf

# build the window
clean_idx, orig0 = map_pair_to_clean(PAIR_ID)
assert clean_idx is not None, f"Original {orig0} missing in clean corpus"
half=max(1,WINDOW//2); start=max(0,clean_idx-half); end=clean_idx+(WINDOW-half-1)

# optional DB slice
db_map={}
if Path(DB_PATH).exists():
    con=duckdb.connect(DB_PATH, read_only=True)
    o_start = clean_to_orig0[start] if clean_to_orig0 else start
    o_end   = clean_to_orig0[min(end,len(clean_to_orig0)-1)] if clean_to_orig0 else end
    q=f"""
    WITH t AS (SELECT row_number() OVER ()-1 AS orig0, sent_pt_br, sent_pt_pt FROM {DB_TABLE})
    SELECT * FROM t WHERE orig0 BETWEEN ? AND ? ORDER BY orig0
    """
    df_db=con.execute(q,[int(o_start),int(o_end)]).df()
    db_map={int(r.orig0):(r.sent_pt_br, r.sent_pt_pt) for _,r in df_db.iterrows()}

rows=[]
for k in range(start, end+1):
    br = (get_line(BR_PATH,k) or "").split()
    pt = (get_line(PT_PATH,k) or "").split()
    raw_links = parse_links(get_line(ALIGN_PATH,k))
    links = band_filter_links(raw_links, len(br), len(pt), frac=0.20, abs_px=3)
    br_cov = {i for (i,_) in links}
    pt_cov = {j for (_,j) in links}
    o0=clean_to_orig0[k] if clean_to_orig0 else k
    raw_br, raw_pt = db_map.get(o0, ("",""))
    pre_br, suf_br = edge_runs(br, br_cov)
    pre_pt, suf_pt = edge_runs(pt, pt_cov)
    suggest=[]
    if pre_br>=3: suggest.append(f"cut BR prefix {pre_br}T")
    if suf_br>=3: suggest.append(f"cut BR suffix {suf_br}T")
    if pre_pt>=3: suggest.append(f"cut PT prefix {pre_pt}T")
    if suf_pt>=3: suggest.append(f"cut PT suffix {suf_pt}T")

    rows.append({
        "clean_idx": k,
        "orig0_idx": o0 if k==clean_idx else "",
        "★": "★" if k==clean_idx else "",
        "BR (clean)": " ".join(br),
        "BR highlight (unaligned near-diagonal)": highlight(br, br_cov),
        "PT (clean)": " ".join(pt),
        "PT highlight (unaligned near-diagonal)": highlight(pt, pt_cov),
        "suggest": " | ".join(suggest),
        "DB BR (raw)": raw_br,
        "DB PT (raw)": raw_pt
    })

display(HTML(f"<p><b>pair_id</b>={PAIR_ID} → <b>orig0</b>={orig0} → <b>clean_idx</b>={clean_idx} | window [{start}..{end}]<br>"
             f"Align file: <code>{ALIGN_PATH}</code> (diagonal band active)</p>"))
display(HTML(pd.DataFrame(rows).to_html(escape=False, index=False)))


clean_idx,orig0_idx,★,BR (clean),BR highlight (unaligned near-diagonal),PT (clean),PT highlight (unaligned near-diagonal),suggest,DB BR (raw),DB PT (raw)
101597,,,Deixa ao Soapy e aos outros . Não volte a lhes dar dinheiro .,Deixa ao Soapy e aos outros . Não volte a lhes dar dinheiro .,"Soapy e os outros miúdos ... Afasta-te deles , sim ? Não lhes oferecas mais dinheiro . ,","Soapy e os outros miúdos ... Afasta-te deles , sim ? Não lhes oferecas mais dinheiro . ,",,Deixa ao Soapy e aos outros. Não volte a lhes dar dinheiro.,"Soapy e os outros miúdos... Afasta-te deles, sim? Não lhes oferecas mais dinheiro.,"
101598,,,Não os faça ...,Não os faça ...,Não os encorajes ...,Não os encorajes ...,,Não os faça...,Não os encorajes...
101599,,,O admirar .,O admirar .,A admirar-te .,A admirar-te .,,O admirar.,A admirar-te.
101600,,,Está bem . Vou fazer isso . Claro que o fará .,Está bem . Vou fazer isso . Claro que o fará .,"Está bem , garanto-te . Claro que garantes .","Está bem , garanto-te . Claro que garantes .",,Está bem. Vou fazer isso. Claro que o fará.,"Está bem, garanto-te. Claro que garantes."
101601,,,Editor executivo de o boletim,Editor executivo de o boletim,Editor-chefe,Editor-chefe,,Editor executivo de o boletim,Editor-chefe
101602,,,"Padre Connelly , queria lhe ajudar , sério . Mas não podemos .","Padre Connelly , queria lhe ajudar , sério . Mas não podemos .","Padre Connelly , gostaria de ajudá-lo . Acredite que sim . Mas é impossível .","Padre Connelly , gostaria de ajudá-lo . Acredite que sim . Mas é impossível .",,"Padre Connelly, queria lhe ajudar, sério. Mas não podemos.","Padre Connelly, gostaria de ajudá-lo. Acredite que sim. Mas é impossível."
101603,,,É uma organização muito forte e muito dura .,É uma organização muito forte e muito dura .,Não se luta com uma organização tão poderosa .,Não se luta com uma organização tão poderosa .,,É uma organização muito forte e muito dura.,Não se luta com uma organização tão poderosa.
101604,,,Norton J. WHITE EDITOR - IMPRENSA MATUTINA,Norton J. WHITE EDITOR - IMPRENSA MATUTINA,Editor,Editor,,Norton J. WHITE EDITOR - IMPRENSA MATUTINA,Editor
101605,,,Sabe o que me está pedindo ?,Sabe o que me está pedindo ?,Sabe o que está a pedir-me ?,Sabe o que está a pedir-me ?,,Sabe o que me está pedindo?,Sabe o que está a pedir-me?
101606,,,"Sim , Sr. White .","Sim , Sr. White .","Sim , Mr . White .","Sim , Mr . White .",,"Sim, Sr. White.","Sim, Mr. White."


In [19]:
con = duckdb.connect(str(DB))
df = con.execute("""
  SELECT *
  FROM opus_moses
  WHERE pair_id >= ?
  ORDER BY pair_id
  LIMIT 15
""", [118130]).df()
con.close()
df

Unnamed: 0,line_no,pair_id,sent_pt_br,sent_pt_pt
0,118130,118130,"É possível que eu não mande ninguém este ano, é melhor você pegar o primeiro emprego que aparecer.","É possível que eu não envie ninguém este ano, é melhor você conseguir, o primeiro bom trabalho que surgir."
1,118131,118131,"Ao escritório, Sam.","Para o escritório, Sam."
2,118132,118132,Papai.,Papai.
3,118133,118133,Por que não está na cama?,Por que você não está na cama?
4,118134,118134,Eu queria dizer boa noite para você.,Queria te dizer boa noite.
5,118135,118135,Boa noite.,Boa noite...
6,118136,118136,Minha querida.,"Boa noite, minha querida."
7,118137,118137,"Esperamos você para jantar durante quase uma hora, Nicholas. Papai... Papai, você comeu alguma coisa? Oh, sim.","Esperamos por você para jantar, por quase uma hora, Nicholas. Pai, você comeu alguma coisa? Oh sim."
8,118139,118139,Eu jantei no hotel.,Jantei no hotel.
9,118140,118140,"Como está, Talbot?","Como você está, Talbot?"
