In [1]:
from datasets import load_dataset, DatasetDict, concatenate_datasets
import pandas as pd

# Display full cell contents in DataFrame output instead of truncating long text columns.
pd.set_option("display.max_colwidth", None)


In [4]:
# !pip install -U datasets clean-text jusText nltk numpy beautifulsoup4 lxml
from datasets import load_dataset
import numpy as np
from cleantext import clean
import nltk

# Fetch the "punkt" tokenizer package with download logging suppressed.
nltk.download("punkt", quiet=True)

import os, nltk, re
from bs4 import BeautifulSoup
import hashlib

# Make sure NLTK looks in your user directory.
# The directory is appended, so it is searched after every path already present
# in nltk.data.path.
NLTK_USER_DIR = os.path.expanduser("~/nltk_data")
if NLTK_USER_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_USER_DIR)

def ensure_punkt():
    """
    Ensure a Portuguese Punkt model is available and visible to this kernel.

    Two on-disk layouts are accepted, because different NLTK releases ship the
    model differently:
      * ``tokenizers/punkt_tab/portuguese/``   (the 'punkt_tab' package)
      * ``tokenizers/punkt/portuguese.pickle`` (the legacy 'punkt' package)

    If neither is found, 'punkt' and then 'punkt_tab' are downloaded into
    NLTK_USER_DIR, re-checking after each download.

    Raises:
        LookupError: if no Portuguese Punkt resource can be located even after
            both download attempts.

    Note: this only verifies that a resource is present; it does not check
    which of the two layouts the installed NLTK version actually loads.
    """
    candidates = (
        "tokenizers/punkt_tab/portuguese/",
        "tokenizers/punkt/portuguese.pickle",
    )

    def _model_present() -> bool:
        # nltk.data.find raises LookupError when the resource is not on any search path.
        for resource in candidates:
            try:
                nltk.data.find(resource)
                return True
            except LookupError:
                continue
        return False

    if _model_present():
        return

    # Download into the same directory that was added to nltk.data.path.
    for package in ("punkt", "punkt_tab"):
        nltk.download(package, download_dir=NLTK_USER_DIR, quiet=True)
        if _model_present():
            return

    raise LookupError(
        "Portuguese Punkt model not found after downloading 'punkt' and 'punkt_tab' "
        f"into {NLTK_USER_DIR}"
    )

# ----------------------------
# Author's regex & helpers (verbatim)
# ----------------------------
import re
from bs4 import BeautifulSoup
import hashlib

# Tag-like span: "<", one or more non-">" characters, ">".
# Not referenced by the functions visible in this notebook (remove_html_tags uses BeautifulSoup).
HTML_RE = re.compile(r"<[^>]+>")
# URL-like span. The "http(s)://" scheme is optional, so any "<chars>.<2-6 letters>" run
# matches as well (e.g. two words joined by a full stop with no space).
URL_RE = re.compile(r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#…])*")
# "#" followed by word characters.
HASHTAG_RE = re.compile(r"#(\w+)")
# A double quote followed by one whitespace character at the very start of the string.
QUOTE_SPACE_START_RE = re.compile(r"^\"\s")
# One whitespace character followed by a double quote at the very end of the string.
# Not used by remove_quote_space_end, which checks for a literal ' "' suffix instead.
QUOTE_SPACE_END_RE = re.compile(r"\s\"$")
# "@" followed by word characters.
MENTION_RE = re.compile(r"@(\w+)")
# Retweet prefix of the form "RT @name:".
RETWEET_RE = re.compile(r"RT @(\w+):")
# Literal "COD _ <word> " marker (note the trailing space in the pattern).
COD_RE = re.compile(r"COD _ (\w+) ")
# Leading digits, then ANY single character (the "." is unescaped), then one whitespace character.
BULLET_RE = re.compile(r"^(\d)+.\s")
# Everything from the first "---" to the last "---" on a line (".*" is greedy).
THREE_DASH_RE = re.compile(r"---.*---")
# A run of four or more consecutive dots.
MORE_THAN_THREE_POINTS_RE = re.compile(r"\.{4,}")

# Characters accepted by has_invalid_character. Only lowercase letters are listed because
# that function lowercases each character before testing membership.
VALID_CHARS = "0123456789abcdefghijklmnopqrstuvwxyzàáâãåāèéêëěėēîïíìįīĵłñńôöòóōõšśûüùúūÿýźçćčñń!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~«»“”ºª€ \t\n\r\x0b\x0c"

# Prefixes that make has_invalid_start reject a text: boilerplate phrases plus
# most punctuation characters.
INVALID_START = [
    "List of recent changes","Sort by","Home |","> Home","useful tips","Licenses:","Search in: ",
    "Terms of Use - ","Home page","Home Page","Copyright","Results/Page",
    "!","#","$","%","&","*","+",
    ",","-",".","/",":",";","<","=",
    ">","?","@","[","\\","]","^","_","`","{","|","}","~",
]
# Substrings that make has_invalid_middle reject a text.
INVALID_MIDDLE = [" @ ", " / ", " | ", "[...]", "(...)"]
# Suffixes that make has_invalid_end reject a text.
INVALID_END = [" ("]

# Lowercase English month names; starts_with_month lowercases the text before comparing.
MONTHS = ["january","february","march","april","may","june","july","august","september","october","november","december"]

def remove_html_tags(text):
    """Parse `text` with BeautifulSoup's "html.parser" and return only its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

def _delete_matches(pattern, text):
    """Remove every match of `pattern` from `text`, then strip surrounding whitespace."""
    return pattern.sub("", text).strip()


def remove_hashtags(text):
    """Delete all HASHTAG_RE matches and strip the result."""
    return _delete_matches(HASHTAG_RE, text)


def remove_mentions(text):
    """Delete all MENTION_RE matches and strip the result."""
    return _delete_matches(MENTION_RE, text)


def remove_retweets(text):
    """Delete all RETWEET_RE matches and strip the result."""
    return _delete_matches(RETWEET_RE, text)


def remove_urls(text):
    """Delete all URL_RE matches and strip the result."""
    return _delete_matches(URL_RE, text)


def remove_cod_literature(text):
    """Delete all COD_RE matches and strip the result."""
    return _delete_matches(COD_RE, text)


def remove_bullets(text):
    """Delete a BULLET_RE match (anchored at the start of the text) and strip the result."""
    return _delete_matches(BULLET_RE, text)


def remove_three_dashes(text):
    """Delete all THREE_DASH_RE matches and strip the result."""
    return _delete_matches(THREE_DASH_RE, text)


def remove_quote_space_start(text):
    """Replace a QUOTE_SPACE_START_RE match with a bare double quote (no stripping)."""
    replacement = '"'
    return QUOTE_SPACE_START_RE.sub(replacement, text)
def remove_quote_space_end(text):
    """Turn a trailing space-plus-double-quote into just the double quote; otherwise return `text` as is."""
    suffix = ' "'
    if not text.endswith(suffix):
        return text
    return text[: -len(suffix)] + '"'

def has_more_than_three_points(text):
    """True when MORE_THAN_THREE_POINTS_RE matches anywhere in `text`."""
    return MORE_THAN_THREE_POINTS_RE.search(text) is not None


def starts_with_month(text):
    """True when the lowercased text begins with one of the MONTHS entries."""
    lowered = text.lower()
    return any(lowered.startswith(month) for month in MONTHS)
def has_too_long_word(text):
    """True when any chunk of `text`, split on single spaces, is longer than 20 characters."""
    for chunk in text.split(" "):
        if len(chunk) > 20:
            return True
    return False
def has_invalid_start(text):
    """True when `text` begins with any INVALID_START entry."""
    return any(text.startswith(prefix) for prefix in INVALID_START)


def has_invalid_middle(text):
    """True when any INVALID_MIDDLE entry occurs somewhere in `text`."""
    for fragment in INVALID_MIDDLE:
        if fragment in text:
            return True
    return False


def has_invalid_end(text):
    """True when `text` ends with any INVALID_END entry."""
    return any(text.endswith(suffix) for suffix in INVALID_END)
def has_valid_brackets(text):
    """True when (), [] and {} each have equal opening and closing counts (nesting order is not checked)."""
    bracket_pairs = (("(", ")"), ("[", "]"), ("{", "}"))
    for opener, closer in bracket_pairs:
        if text.count(opener) != text.count(closer):
            return False
    return True
def has_valid_quotes(text):
    """True when straight double quotes come in an even number and curly quotes “ ” are equally frequent."""
    if text.count('"') % 2 != 0:
        return False
    return text.count("“") == text.count("”")
def is_empty(text):
    """True when `text` has length zero."""
    return len(text) < 1
def has_invalid_character(text):
    """True when the lowercased form of any character of `text` is not found in VALID_CHARS."""
    return any(ch.lower() not in VALID_CHARS for ch in text)

def author_transform_chain(text: str) -> str:
    """
    Run the text-cleanup helpers in their fixed order and return the result.

    Order: retweets, mentions, hashtags, URLs, HTML tags, "COD _" markers,
    bullets, three-dash spans, quote-space at start, quote-space at end.
    """
    steps = (
        remove_retweets,
        remove_mentions,
        remove_hashtags,
        remove_urls,
        remove_html_tags,
        remove_cod_literature,
        remove_bullets,
        remove_three_dashes,
        remove_quote_space_start,
        remove_quote_space_end,
    )
    for step in steps:
        text = step(text)
    return text

# ----------------------------
# Paper-faithful helpers
# ----------------------------
def drop_nans_and_empties(ds):
    """Keep only rows whose "text" is not None and is non-empty after stripping whitespace."""
    def _has_content(row):
        value = row["text"]
        return value is not None and len(value.strip()) > 0

    return ds.filter(_has_content)

# Exact de-duplication: only MD5 digests of the texts are kept in memory, and the
# tagging pass runs in a single process so the digest set is shared across batches.
def drop_exact_duplicates(ds, batch_size: int = 1000):
    """
    Keep the first occurrence of every distinct "text" value and drop later repeats.

    A temporary boolean "__keep__" column is added by a batched map, used to
    filter, and removed again before returning.
    """
    seen_digests = set()

    def _flag_first_occurrences(batch):
        flags = []
        for text in batch["text"]:
            digest = hashlib.md5(text.encode("utf-8")).hexdigest()
            is_new = digest not in seen_digests
            if is_new:
                seen_digests.add(digest)
            flags.append(is_new)
        return {"__keep__": flags}

    flagged = ds.map(_flag_first_occurrences, batched=True, batch_size=batch_size, num_proc=1)
    kept = flagged.filter(lambda keep_flag: keep_flag, input_columns="__keep__")
    return kept.remove_columns(["__keep__"])

def apply_clean_text_ascii(s: str) -> str:
    """Run clean-text's `clean` with Unicode fixing and ASCII transliteration on; every other option shown is off."""
    options = dict(
        fix_unicode=True,
        to_ascii=True,
        lower=False,
        no_line_breaks=False,
        no_urls=False,
        no_emails=False,
        no_phone_numbers=False,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
    )
    return clean(s, **options)

# Regex fallback if punkt still isn't available (keeps the pipeline running):
# a token is either a run of word characters or a single non-space, non-word character.
_FALLBACK_TOKEN_RE = re.compile(r"\w+|[^\w\s]")

# Tri-state cache for the Punkt check: None = not checked yet, True = NLTK tokenizer
# usable, False = use the regex fallback. Without it, ensure_punkt() would run
# (and possibly attempt downloads) once per text.
_PUNKT_AVAILABLE = None


def pt_word_count(s: str) -> int:
    """
    Count the tokens of `s` with NLTK's Portuguese word tokenizer.

    The Punkt availability check is performed once per kernel and cached. If the
    model cannot be found, or tokenization raises LookupError, a single warning is
    emitted and this and all later calls use the regex fallback
    (not paper-perfect, but close). Re-run this cell to reset the cached state.

    Args:
        s: text to tokenize.

    Returns:
        Number of tokens.
    """
    import warnings

    global _PUNKT_AVAILABLE

    if _PUNKT_AVAILABLE is None:
        try:
            ensure_punkt()
            _PUNKT_AVAILABLE = True
        except LookupError:
            _PUNKT_AVAILABLE = False
            warnings.warn(
                "Portuguese Punkt model unavailable; token counts use the regex fallback."
            )

    if _PUNKT_AVAILABLE:
        try:
            return len(nltk.word_tokenize(s, language="portuguese"))
        except LookupError:
            # Stick to the fallback from here on so lengths are not computed by a
            # mix of tokenizers after the first failure.
            _PUNKT_AVAILABLE = False
            warnings.warn(
                "NLTK word_tokenize raised LookupError; token counts use the regex fallback."
            )

    # last-resort fallback
    return len(_FALLBACK_TOKEN_RE.findall(s))


def add_length_column(ds, batch_size: int = 1000):
    """
    Add a "__len__" column holding pt_word_count(text) for every row.

    ensure_punkt() is called up front; the batched map runs in a single process.
    """
    ensure_punkt()

    def _token_counts(batch):
        counts = []
        for text in batch["text"]:
            counts.append(pt_word_count(text))
        return {"__len__": counts}

    return ds.map(_token_counts, batched=True, batch_size=batch_size, num_proc=1)


def iqr_bounds_from_lengths(lengths):
    """Return (Q1 - 1.5*IQR, Q3 + 1.5*IQR) for `lengths`, where IQR = Q3 - Q1."""
    first_quartile, third_quartile = np.percentile(lengths, [25, 75])
    spread = third_quartile - first_quartile
    lower = first_quartile - 1.5 * spread
    upper = third_quartile + 1.5 * spread
    return (lower, upper)

def apply_iqr_filter_on_cached_lengths(ds, lo, hi):
    """Keep rows whose "__len__" lies in [lo, hi] (inclusive), then drop the "__len__" column."""
    def _within_bounds(n_tokens):
        return lo <= n_tokens <= hi

    in_range = ds.filter(_within_bounds, input_columns="__len__")
    return in_range.remove_columns(["__len__"])

# jusText boilerplate removal (paper: only for Web)
def web_justext(text: str) -> str:
    """
    Return the paragraphs jusText classifies as "good", joined by newlines.

    Uses the "Portuguese" stoplist. When no paragraph is classified as "good",
    the input text is returned unchanged.
    """
    import justext

    stoplist = justext.get_stoplist("Portuguese")
    kept = []
    for paragraph in justext.justext(text, stoplist):
        if paragraph.class_type == "good":
            kept.append(paragraph.text)
    if not kept:
        return text
    return "\n".join(kept)

def apply_clean_text_unicode_only(s: str) -> str:
    """Run clean-text's `clean` with Unicode fixing on and ASCII transliteration off, so accents are kept."""
    from cleantext import clean

    options = dict(
        fix_unicode=True,
        to_ascii=False,  # keep accents
        lower=False,
        no_line_breaks=False,
        no_urls=False,
        no_emails=False,
        no_phone_numbers=False,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
    )
    return clean(s, **options)

# ----------------------------
# Integrated pipeline
# ----------------------------
def clean_one_domain(
    domain: str,
    use_author_transforms: bool = True,
    run_web_justext: bool = True,
    apply_author_filters: bool = True,
    keep_accents: bool = True,
    num_proc: int = 1,
    batch_size: int = 1000,
):
    """
    Load the "train" split of liaad/PtBrVId-Raw for `domain` and clean it.

    Steps, in order:
      1) drop rows whose text is None or blank
      2) jusText boilerplate removal, only when `run_web_justext` and domain == "web"
      3) author's transform chain (URLs/HTML/etc.), when `use_author_transforms`
      4) clean-text normalization: Unicode fix only when `keep_accents`,
         otherwise Unicode fix + ASCII transliteration
      5) drop rows that steps 2-4 turned blank
      6) exact de-duplication (hash-based, single process)
      7) author filters (no tokenizer), when `apply_author_filters`
      8) add token lengths, compute IQR bounds, filter on them, drop the length column

    Args:
        domain: dataset configuration name passed to `load_dataset`.
        use_author_transforms: apply `author_transform_chain` to every text.
        run_web_justext: apply `web_justext` when the domain is "web".
        apply_author_filters: apply the has_*/starts_with_* rejection rules.
        keep_accents: choose the accent-preserving clean-text variant.
        num_proc: process count for the per-row map/filter steps.
        batch_size: batch size for de-duplication and length computation.

    Returns:
        The cleaned dataset. If no rows survive up to step 7, the (empty) dataset
        is returned without running the length/IQR step.
    """
    ds = load_dataset("liaad/PtBrVId-Raw", domain, split="train")

    ds = drop_nans_and_empties(ds)

    if run_web_justext and domain == "web":
        ds = ds.map(lambda x: {"text": web_justext(x["text"])}, num_proc=num_proc, batched=False)

    # 3) author's transform chain (pure text cleanup)
    if use_author_transforms:
        ds = ds.map(lambda x: {"text": author_transform_chain(x["text"])}, num_proc=num_proc, batched=False)

    # 4) clean-text step — choose whether to preserve accents
    normalize = apply_clean_text_unicode_only if keep_accents else apply_clean_text_ascii
    ds = ds.map(lambda x: {"text": normalize(x["text"])}, num_proc=num_proc, batched=False)

    # 5) the transforms can reduce a text to nothing (e.g. a row that was only a URL);
    # drop those rows here so they are removed even when the author filters are off.
    ds = drop_nans_and_empties(ds)

    ds = drop_exact_duplicates(ds, batch_size=batch_size)

    if apply_author_filters:
        ds = ds.filter(
            lambda t: (not starts_with_month(t))
                      and (not has_too_long_word(t))
                      and (not has_invalid_start(t))
                      and (not has_invalid_middle(t))
                      and (not has_invalid_end(t))
                      and (not has_more_than_three_points(t))
                      and (not is_empty(t))
                      and (not has_invalid_character(t))
                      and has_valid_brackets(t)
                      and has_valid_quotes(t),
            input_columns="text",
            num_proc=num_proc
        )

    # Nothing left: percentiles of an empty length list are undefined, so stop here.
    if len(ds) == 0:
        return ds

    # Cache token lengths once, then IQR-filter
    ds = add_length_column(ds, batch_size=batch_size)
    lo, hi = iqr_bounds_from_lengths(ds["__len__"])
    ds = apply_iqr_filter_on_cached_lengths(ds, lo, hi)

    return ds


In [2]:
import os, nltk, pathlib

# NOTE(review): this cell carries execution count 2 but sits below the cell with
# execution count 4, so the saved notebook was not executed top to bottom.

# 1) Pick a single directory and make NLTK look there
NLTK_USER_DIR = os.path.expanduser("~/nltk_data")
os.environ["NLTK_DATA"] = NLTK_USER_DIR  # ensure child processes see it too
if NLTK_USER_DIR not in nltk.data.path:
    # Inserted at index 0 so this directory is searched before the other paths.
    nltk.data.path.insert(0, NLTK_USER_DIR)

# 2) Force-reinstall punkt + punkt_tab into that directory
nltk.download("punkt", download_dir=NLTK_USER_DIR, force=True, quiet=False)
# Newer NLTK also needs this metadata package
try:
    nltk.download("punkt_tab", download_dir=NLTK_USER_DIR, force=True, quiet=False)
except Exception:
    # NOTE(review): every exception is swallowed here, including ones unrelated
    # to the package being unknown (e.g. network errors).
    pass  # older NLTK won't have it

# 3) Verify both the PACKAGE and the Portuguese model are visible
print("NLTK paths:", nltk.data.path)
print("Has package dir? ", pathlib.Path(NLTK_USER_DIR, "tokenizers", "punkt").exists())
# nltk.data.find raises LookupError when a resource is missing, which would stop the cell here.
nltk.data.find("tokenizers/punkt")                       # should NOT raise
nltk.data.find("tokenizers/punkt/portuguese.pickle")     # should NOT raise

# Quick smoke test
from nltk import word_tokenize
print(word_tokenize("Olá mundo! Isto é um teste.", language="portuguese"))



[nltk_data] Downloading package punkt to
[nltk_data]     /home/laiarodrigo/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/laiarodrigo/nltk_data...


NLTK paths: ['/home/laiarodrigo/nltk_data', '/home/laiarodrigo/repos/Thesis/thesis/nltk_data', '/home/laiarodrigo/repos/Thesis/thesis/share/nltk_data', '/home/laiarodrigo/repos/Thesis/thesis/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']
Has package dir?  True
['Olá', 'mundo', '!', 'Isto', 'é', 'um', 'teste', '.']


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [5]:
# Domain to process in this run.
domain = "journalistic"

# Load the untouched split only to report how many rows it has.
raw_ds = load_dataset("liaad/PtBrVId-Raw", domain, split="train")
print("Raw count:", len(raw_ds))

# Pipeline switches, spelled out explicitly.
cleaning_options = dict(
    use_author_transforms=True,
    run_web_justext=True,
    apply_author_filters=True,
    keep_accents=True,      # <-- important
    num_proc=1,
    batch_size=1000,
)
clean_ds = clean_one_domain(domain=domain, **cleaning_options)

print("Cleaned count:", len(clean_ds))

# Show the first three cleaned rows with their labels.
separator = "—"*60
for i in range(3):
    row = clean_ds[i]
    print(separator)
    print(row["text"])
    print("label:", row["label"])



Raw count: 1842804
Cleaned count: 1696469
————————————————————————————————————————————————————————————
Cardoso e Cunha «Insatisfatório» O resultado do referendo francês é insatisfatório, tímido e modesto. Os problemas da Comunidade Europeia exigiam, da parte de um país que sempre esteve na linha da frente, uma vitória mais clara. A vitória tangencial do «sim» dá que pensar. Esta margem tão pequena que deu a vitória ao «sim» torna imprevisível o que vai acontecer na Grã-Bretanha.
label: 0
————————————————————————————————————————————————————————————
Santo Tirso de fora Apesar da intenção das principais autarquias do Ave, para já Santo Tirso quer ficar de fora deste processo. Joaquim Couto, o presidente da edilidade tirsense, não está muito convencido da interesse das empresas e prefere concessionar alguns serviços a privados. É o caso da recolha e transporte dos lixos domésticos e da gestão e exploração do abastecimento de água. «A concessão a privados dá mais transparência à gestão da a

In [6]:
# If needed: from datasets import load_dataset, concatenate_datasets
from cleantext import clean
import random, html

# ASCII is used only to build the matching key; the text that is finally kept stays accented.
def to_ascii_like_paper(s: str) -> str:
    """Run clean-text's `clean` with Unicode fixing and ASCII transliteration on; every other option shown is off."""
    options = dict(
        fix_unicode=True,
        to_ascii=True,
        lower=False,
        no_line_breaks=False,
        no_urls=False,
        no_emails=False,
        no_phone_numbers=False,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
    )
    return clean(s, **options)

def ascii_key(s: str) -> str:
    """
    Build a whitespace-normalized ASCII key for `s`.

    HTML entities are unescaped, non-breaking spaces become plain spaces, the
    text is passed through `to_ascii_like_paper`, and all whitespace runs are
    collapsed to single spaces.
    """
    unescaped = html.unescape(s)
    without_nbsp = unescaped.replace("\xa0", " ")
    tokens = to_ascii_like_paper(without_nbsp).split()
    return " ".join(tokens)

def combine_splits_to_one(ds_dict, add_split_col=True):
    """
    Concatenate every split of `ds_dict` into a single dataset.

    When `add_split_col` is true, each split first gets a "__split__" column
    holding its split name.
    """
    from datasets import concatenate_datasets

    def _tag_with_split(split_ds, split_name):
        # The default argument binds the name for this particular split.
        return split_ds.map(lambda x, sn=split_name: {"__split__": sn})

    parts = [
        _tag_with_split(split_ds, split_name) if add_split_col else split_ds
        for split_name, split_ds in ds_dict.items()
    ]
    return concatenate_datasets(parts)

def build_needed_keys_for_sample(official_all, idxs, text_col="text", label_col="label"):
    """
    Build the "<ascii_key(text)>||<label>" lookup key for each sampled row.

    Rows are fetched one at a time (`official_all[i]`) rather than through
    `official_all[col][i]`, which reads an entire column to obtain one value.

    Args:
        official_all: dataset indexable by integer row position, returning a
            mapping from column name to value.
        idxs: iterable of integer row positions.
        text_col: name of the text column.
        label_col: name of the label column.

    Returns:
        (keys_in_order, key_set): the keys in the order of `idxs`, and the same
        keys as a set.
    """
    keys_in_order = []
    for i in idxs:
        row = official_all[i]
        keys_in_order.append(f"{ascii_key(row[text_col])}||{row[label_col]}")
    return keys_in_order, set(keys_in_order)

def build_mapping_from_cleaned_sharded(clean_ds, needed_keys,
                                       text_col="text", label_col="label",
                                       num_shards=100, batch_size=500):
    """
    Scan the preprocessed RAW dataset (with accents) shard by shard and keep the
    first hit for each needed key.

    For every shard a "__ckey__" column of "<ascii_key(text)>||<label>" keys is
    computed. That column is read once per shard and iterated, instead of being
    re-read for every row index. Scanning stops as soon as every key is matched.

    Args:
        clean_ds: dataset providing `shard(num_shards=..., index=...)`.
        needed_keys: iterable of keys to look for.
        text_col: name of the text column.
        label_col: name of the label column.
        num_shards: number of shards to split `clean_ds` into.
        batch_size: batch size for the key-building map.

    Returns:
        dict mapping each matched key to the accented, preprocessed text of the
        first row producing it. Keys with no match are absent.
    """
    def _mk(batch):
        # compute combined keys for one batch
        cks = []
        for t, lbl in zip(batch[text_col], batch[label_col]):
            cks.append(f"{ascii_key(t)}||{lbl}")
        return {"__ckey__": cks}

    mapping, remaining = {}, set(needed_keys)
    for shard_idx in range(num_shards):
        if not remaining:
            break
        shard = clean_ds.shard(num_shards=num_shards, index=shard_idx)
        shard_k = shard.map(_mk, batched=True, batch_size=batch_size, num_proc=1)
        # Read the key column once; indexing shard_k["__ckey__"][i] inside the loop
        # would fetch the full column again on each iteration.
        ckeys = shard_k["__ckey__"]
        for i, ck in enumerate(ckeys):
            if ck in remaining:
                mapping[ck] = shard[i][text_col]   # ACCENTED, preprocessed text
                remaining.remove(ck)
                if not remaining:
                    break
        del shard, shard_k, ckeys
    return mapping

from datasets import load_dataset

# Comparison settings: which domain to check, how many official rows to sample,
# and the seed that makes the sample repeatable.
DOMAIN   = "journalistic"
N_SAMPLES = 20
SEED      = 123

# 1) Load official (paper) dataset and flatten splits
official_dd  = load_dataset("liaad/PtBrVId", DOMAIN)   # ASCII text
official_all = combine_splits_to_one(official_dd, add_split_col=True)

# 2) Ensure you already have your preprocessed RAW (with accents)
# NOTE: `clean_ds` is not created in this cell; it must already exist in the kernel
# (the cell that calls clean_one_domain assigns it).
# If not already created earlier:
# clean_ds = clean_one_domain(domain=DOMAIN, keep_accents=True, num_proc=1, batch_size=1000)

# 3) Pick random official rows to compare
import random
random.seed(SEED)
# Sampling is without replacement and capped at the dataset size.
sample_idxs = random.sample(range(len(official_all)), k=min(N_SAMPLES, len(official_all)))

# 4) Build keys just for those samples and find their matches inside your cleaned set
keys_in_order, needed = build_needed_keys_for_sample(official_all, sample_idxs)
raw_map = build_mapping_from_cleaned_sharded(clean_ds, needed, num_shards=100, batch_size=500)

# 5) Pretty print comparisons
def has_accents(s):
    """True when any character of `s` has a code point above 127 (i.e. is non-ASCII)."""
    for ch in s:
        if ord(ch) > 127:
            return True
    return False

# Print each sampled official row next to its accented match (if any) and count hits.
hits = 0
print(f"\nComparing {len(sample_idxs)} random examples from '{DOMAIN}':\n")
for i, ck in zip(sample_idxs, keys_in_order):
    # Fetch the row once; official_all["col"][i] would read a whole column per access.
    official_row = official_all[i]
    off_txt = official_row["text"]      # ASCII (paper release)
    lbl     = official_row["label"]
    split   = official_row["__split__"]
    acc_txt = raw_map.get(ck)           # None when no cleaned row produced this key

    print("—"*100)
    print(f"[{split}] label={lbl}  |  idx={i}")
    print("OFFICIAL (ASCII):")
    print(off_txt[:600].replace("\n"," "))
    if acc_txt is not None:
        hits += 1
        print("\nCLEANED RAW (ACCENTED):")
        print(acc_txt[:600].replace("\n"," "))
        print(f"\nHas accents? {has_accents(acc_txt)}")
    else:
        print("\nCLEANED RAW (ACCENTED): <not found — your pipeline may have filtered it out>")
        # On a miss the accent check is run on the official text.
        print(f"Has accents? {has_accents(off_txt)}")

print("\nSummary:")
print(f"  Requested samples: {len(sample_idxs)}")
print(f"  Found accented matches: {hits}")
print(f"  Misses: {len(sample_idxs) - hits}")


Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16965 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/16964 [00:00<?, ? examples/s]


Comparing 20 random examples from 'journalistic':

————————————————————————————————————————————————————————————————————————————————————————————————————
[train] label=0  |  idx=109814
OFFICIAL (ASCII):
Siria desmente apelo oficial ao assassinio de Saddam "AL-THAWRA", um jornal oficial sirio, apelara sabado aos iraquianos para que assassinassem o Presidente Saddam Hussein, mas o ministro da Informacao, Mohamed Salman, disse de imediato que semelhante declaracao nao espelha uma posicao do seu Governo.

CLEANED RAW (ACCENTED):
Síria desmente apelo oficial ao assassínio de Saddam «AL-THAWRA», um jornal oficial sírio, apelara sábado aos iraquianos para que assassinassem o Presidente Saddam Hussein, mas o ministro da Informação, Mohamed Salman, disse de imediato que semelhante declaração não espelha uma posição do seu Governo.

Has accents? True
————————————————————————————————————————————————————————————————————————————————————————————————————
[train] label=0  |  idx=561359
OFFICIAL (ASCII)

In [46]:
# First ten rows of each label in the official training frame.
# NOTE(review): `journalistic_df_train` is not defined in any cell visible in this part
# of the notebook; this cell fails on a fresh kernel unless it is created elsewhere.
first_0 = journalistic_df_train[journalistic_df_train['label'] == 0].head(10)
first_1 = journalistic_df_train[journalistic_df_train['label'] == 1].head(10)

In [47]:
# Display the first ten label-0 rows.
first_0

Unnamed: 0,text,label
0,E preciso um alerta nacional e internacional p...,0
1,Bancos centrais garantem defesa do iene A reun...,0
2,L' Ile au Tresor Desenho de Michel Faure Texto...,0
3,Polemica sobre a chefia da forca de paz na Bos...,0
4,Assim sera mesmo que os resultados -- que so d...,0
5,"Hoje, realiza-se o leilao de 10 milhoes de con...",0
6,"Boa velocidade maxima, aceleracoes e recuperac...",0
7,Casas para os amigos Foi em Setembro de 1997 q...,0
8,"Assim, preve-se, ja para Maio, o inicio da pre...",0
9,Recorde-se que a criacao do cargo de vice-gove...,0


In [48]:
# Display the first ten label-1 rows.
first_1

Unnamed: 0,text,label
1442906,"Mesmo assim, o diretor reconhece ""a importanci...",1
1442907,Os funcionarios com mais de 40 anos sao proteg...,1
1442908,"De seu lado, a America Latina, movida a invest...",1
1442909,Os funcionarios que querem fundos de pensao co...,1
1442910,Gemeos nascem na Inglaterra com diferentes cor...,1
1442911,"""A defasagem e um custo a mais, que tera de se...",1
1442912,Nova filial A Cia. Ultragaz esta abrindo filia...,1
1442913,Nova novela alterna pequenas ousadias com gran...,1
1442914,"Muitos precos ficaram para tras, defasados, o ...",1
1442915,"VOTACAO COMECA COM DERROTA DE JOBIM, GABRIELA ...",1


Este dataset não é paralelo e tem apenas duas colunas: texto e label. Primeiro, removeu entradas nulas, vazias e duplicadas. Em seguida, utilizou a biblioteca clean-text para corrigir erros de Unicode e normalizar todo o texto para ASCII. No domínio Web, aplicou ainda o jusText para eliminar sentenças irrelevantes e código-base HTML. Por fim, identificou e removeu outliers em cada domínio: calculou o intervalo inter-quartil (IQR) do número de tokens, usando o tokenizer do NLTK para português, e descartou textos com comprimento inferior a $Q_1 - 1{,}5 \times \mathrm{IQR}$ ou superior a $Q_3 + 1{,}5 \times \mathrm{IQR}$. Este processo assegurou a exclusão de documentos demasiado curtos ou longos para o respetivo domínio, resultando num corpus mais limpo e consistente.

**//FRMT DATASET**

In [49]:
# Root of the FRMT dataset files as served raw from GitHub.
GITHUB_RAW = ("https://raw.githubusercontent.com/google-research/google-research/"
              "HEAD/frmt/dataset")

# bucket name -> filename prefix used inside that bucket's directory
buckets = {
    "lexical": "pt_lexical",
    "entity": "pt_entity",
    "random": "pt_random",
}

# the paper's three splits
splits = ["dev", "test", "exemplars"]
# Portuguese regions only; zh-* files are ignored
regions = ["pt-BR", "pt-PT"]


def urls(bucket):
    """Return the TSV URLs of `bucket`, ordered split-major and region-minor."""
    prefix = buckets[bucket]
    found = []
    for split in splits:
        for region in regions:
            found.append(
                f"{GITHUB_RAW}/{bucket}_bucket/"
                f"{prefix}_{split}_en_{region}.tsv"
            )
    return found


# sanity-check
print(urls("entity")[:3])


['https://raw.githubusercontent.com/google-research/google-research/HEAD/frmt/dataset/entity_bucket/pt_entity_dev_en_pt-BR.tsv', 'https://raw.githubusercontent.com/google-research/google-research/HEAD/frmt/dataset/entity_bucket/pt_entity_dev_en_pt-PT.tsv', 'https://raw.githubusercontent.com/google-research/google-research/HEAD/frmt/dataset/entity_bucket/pt_entity_test_en_pt-BR.tsv']


In [50]:
def split_key(bucket, split, region):
    """Build the "<bucket>_<split>_<region>" key, with dashes in the region turned into underscores."""
    # pt-BR  ->  pt_BR
    region_tag = region.replace('-', '_')
    return f"{bucket}_{split}_{region_tag}"

def _frmt_data_files():
    """Map every bucket/split/region key to a one-element list holding its TSV URL."""
    files = {}
    for bucket in buckets:
        for split in splits:
            for region in regions:
                files[split_key(bucket, split, region)] = [
                    f"{GITHUB_RAW}/{bucket}_bucket/"
                    f"{buckets[bucket]}_{split}_en_{region}.tsv"
                ]
    return files


# Keys follow bucket -> split -> region order.
data_files = _frmt_data_files()

from datasets import load_dataset, DatasetDict

# Load every FRMT TSV as its own split, reading tab-separated rows into the
# columns "en" and "pt".
# NOTE(review): no quoting option is passed, so the CSV reader's default quote
# handling applies; confirm that no field beginning with a double quote is mis-parsed.
ds = load_dataset(
        "csv",
        data_files   = data_files,
        delimiter    = "\t",
        column_names = ["en", "pt"],
)

# Show the first six split names.
print(list(ds.keys())[:6])
# ['lexical_dev_pt_BR', 'lexical_dev_pt_PT', 'lexical_test_pt_BR', …]


['lexical_dev_pt_BR', 'lexical_dev_pt_PT', 'lexical_test_pt_BR', 'lexical_test_pt_PT', 'lexical_exemplars_pt_BR', 'lexical_exemplars_pt_PT']


In [51]:
# Entity bucket, dev split, pt-BR region, as a pandas DataFrame.
entity_br = pd.DataFrame(ds['entity_dev_pt_BR'])

In [52]:
# Display the pt-BR entity/dev frame.
entity_br

Unnamed: 0,en,pt
0,Constâncio was Secretary of State for Planning...,Constâncio foi Secretário de Estado para Plane...
1,He then became Finance Minister from January t...,"Depois, tornou-se Ministro das Finanças de jan..."
2,Constâncio was secretary-general of the Social...,"De 1986 a 1989, Constâncio foi secretário-gera..."
3,He lost the legislative elections of 19 July 1...,Ele perdeu as eleições legislativas de 19 de j...
4,"He resigned the following year, being replaced...","No ano seguinte, ele renunciou, sendo substitu..."
...,...,...
930,"The northern and western sides of the castle, ...","Os lados norte e oeste do castelo, por sua vez..."
931,The castle is also partially encircled by a mo...,"O castelo também é, em parte, rodeado por um f..."
932,The main entrance is fronted by a stone bridge...,"De frente para a entrada principal, há uma pon..."
933,"On the west side, there is a long curtain wall...","No lado oeste, há uma divisória que se estende..."


In [53]:
# Entity bucket, dev split, pt-PT region, as a pandas DataFrame.
entity_pt = pd.DataFrame(ds['entity_dev_pt_PT'])
# Rows whose English source sentence occurs more than once (keep=False marks every occurrence).
entity_pt[entity_pt["en"].duplicated(keep=False)].head()

Unnamed: 0,en,pt
52,After the Portuguese legislative election of 2...,Após as eleições legislativas portuguesas de 2...
64,After the Portuguese legislative election of 2...,Após as eleições legislativas portuguesas de 2...


In [57]:
# Same duplicate check on the pt-BR frame (keep=False marks every occurrence).
entity_br[entity_br["en"].duplicated(keep=False)].head()

Unnamed: 0,en,pt
52,After the Portuguese legislative election of 2...,"Após a eleição legislativa portuguesa de 2009,..."
64,After the Portuguese legislative election of 2...,"Após a eleição legislativa portuguesa de 2009,..."


**//GOLD COLLECTION**

In [54]:
# Load the golden collection dataset.
# NOTE: this rebinds `ds`, which the FRMT cell assigned earlier; after this cell runs,
# `ds` no longer holds the FRMT splits.
ds = load_dataset("joaosanches/golden_collection")

# Display the split summary.
ds


DatasetDict({
    gold_collection: Dataset({
        features: ['text'],
        num_rows: 500
    })
    referencia_DeepL: Dataset({
        features: ['text'],
        num_rows: 500
    })
    referencia_manual: Dataset({
        features: ['text'],
        num_rows: 500
    })
})

In [55]:
# Display the 'gold_collection' split as a DataFrame.
pd.DataFrame(ds['gold_collection'])

Unnamed: 0,text
0,"Segundo Kellner, apesar de o animal ser um bai..."
1,"Para a ONG, há evidências de que as companhias..."
2,"Mas, segundo a agência de notícias France Pres..."
3,Ele afirmou que dieta e exercícios devem conti...
4,"O biólogo William Eberhard, da Universidade da..."
...,...
495,Isso significa que a preposição é o termo que ...
496,Foi então que a vestimenta mais feminina que s...
497,Um de seus professores foi Martin Wegelius.\n
498,"Nessa época, iniciou uma verdadeira polêmica c..."


In [56]:
# Display the 'referencia_manual' split as a DataFrame.
pd.DataFrame(ds['referencia_manual'])

Unnamed: 0,text
0,"Segundo Kellner, apesar de o animal ser muito ..."
1,"Para a ONG, há evidências de que as empresas q..."
2,"Mas, segundo a agência de notícias France Pres..."
3,Ele afirmou que dieta e exercícios devem conti...
4,"O biólogo William Eberhard, da Universidade da..."
...,...
495,Isso significa que a preposição é o termo que ...
496,Foi então que a peça de vestuário mais feminin...
497,Um dos seus professores foi Martin Wegelius.\n
498,"Nessa época, iniciou-se uma verdadeira polémic..."
