In [1]:
import multiprocessing as mp
from collections import Counter

import datasets
import numpy as np
import tiktoken

N_PROC = mp.cpu_count()
TOKENIZER = tiktoken.get_encoding("cl100k_base")

  from .autonotebook import tqdm as notebook_tqdm


# Raw

In [2]:
raw = datasets.load_dataset("u1537782/PTradutor", "raw")

In [31]:
temp = raw.filter(lambda x: x["en"].lower().startswith("list of recent"), num_proc=N_PROC)
temp

Filter (num_proc=96): 100%|██████████| 3965207/3965207 [00:01<00:00, 2663930.16 examples/s]
Filter (num_proc=96): 100%|██████████| 1734/1734 [00:00<00:00, 2188.35 examples/s]


DatasetDict({
    train: Dataset({
        features: ['idx', 'source', 'domain', 'pt', 'en'],
        num_rows: 1132
    })
    valid: Dataset({
        features: ['idx', 'source', 'domain', 'pt', 'en'],
        num_rows: 0
    })
})

In [32]:
temp["train"]["pt"][0]

'Lista das mudanças recentes a todas as páginas para as quais a página fornecida contém links (ou de todas as que pertencem à categoria fornecida). As suas páginas vigiadas aparecem a negrito. Opções das mudanças recentes Legenda: N Esta edição criou uma nova página (lista de páginas novas) m Esta é uma edição menor b Esta edição foi feita por um robô (±123) Alteração no tamanho da página, em bytes Mostrar as últimas 50 | 100 | 250 | 500 mudanças nos últimos 1 | 3 | 7 | 14 | 30 dias Esconder edições menores | Mostrar robôs | Esconder utilizadores anónimos | Esconder utilizadores registados | Mostrar as minhas edições Mostrar as novas mudanças a partir das 09h43min de 3 de dezembro de 2022 Espaço nominal: todos (Principal) Discussão Utilizador Utilizador Discussão METIS METIS Discussão Ficheiro Ficheiro Discussão MediaWiki MediaWiki Discussão Predefinição Predefinição Discussão Ajuda Ajuda Discussão Categoria Categoria Discussão Propriedade Discussão propriedade Type Type talk Form Form

## Number of documents

In [13]:
print("Number of examples train:", len(raw["train"]))
print("Number of examples valid:", len(raw["valid"]))


Number of examples train: 3966538
Number of examples valid: 403


## Number of tokens

In [10]:
raw = raw.map(lambda x: {"n_tkns": len(TOKENIZER.encode(x["pt"]))}, num_proc=N_PROC)

Map (num_proc=96): 100%|██████████| 3966538/3966538 [00:25<00:00, 156541.22 examples/s]
Map (num_proc=96): 100%|██████████| 403/403 [00:00<00:00, 752.25 examples/s]


In [12]:
n_tkns_raw_train = sum(raw["train"]["n_tkns"])
print(f"n_tkns_raw_train: {n_tkns_raw_train}")

n_tkns_raw_valid = sum(raw["valid"]["n_tkns"])
print(f"n_tkns_raw_valid: {n_tkns_raw_valid}")

n_tkns_train: 424010237
n_tkns_test: 25607


# Clean

In [14]:
clean = datasets.load_dataset("u1537782/PTradutor", "clean")

Downloading data: 100%|██████████| 106M/106M [00:03<00:00, 31.6MB/s] 
Downloading data: 100%|██████████| 358M/358M [00:15<00:00, 23.6MB/s] 
Downloading data: 100%|██████████| 380M/380M [00:17<00:00, 21.7MB/s] 
Downloading data: 100%|██████████| 406M/406M [00:06<00:00, 63.3MB/s] 
Downloading data: 100%|██████████| 293M/293M [00:05<00:00, 49.8MB/s] 
Downloading data: 100%|██████████| 117k/117k [00:00<00:00, 280kB/s]
Generating train split: 100%|██████████| 3065063/3065063 [00:16<00:00, 184466.87 examples/s]
Generating test split: 100%|██████████| 347/347 [00:00<00:00, 68503.41 examples/s]


## Number of documents

In [15]:
print("Number of examples train:", len(clean["train"]))
print("Number of examples valid:", len(clean["valid"]))


Number of examples train: 3065063
Number of examples valid: 347


## Number of tokens

In [19]:
clean = clean.map(lambda x: {"n_tkns_pt": len(TOKENIZER.encode(x["pt"]))}, num_proc=N_PROC)
clean = clean.map(lambda x: {"n_tkns_en": len(TOKENIZER.encode(x["en"]))}, num_proc=N_PROC)

Map (num_proc=96): 100%|██████████| 3065063/3065063 [00:14<00:00, 207484.50 examples/s]
Map (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 715.14 examples/s]
Map (num_proc=96): 100%|██████████| 3065063/3065063 [00:14<00:00, 206967.38 examples/s]
Map (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 735.32 examples/s] 


In [20]:
n_tkns_clean_train = sum(clean["train"]["n_tkns_pt"])
print(f"N tokens train pt: {n_tkns_clean_train}")

n_tkns_clean_valid = sum(clean["valid"]["n_tkns_pt"])
print(f"N tokens valid pt: {n_tkns_clean_valid}")

N tokens train pt: 329344387
N tokens test pt: 22316


## Count by domain

In [23]:
domains = set(clean["train"]["domain"])
print(domains)

{'default', 'journalistic', 'literature', 'web', 'politics', 'legal', 'social_media'}


In [11]:
def print_stats_by_domain(dataset):
    print("\t".join(["domain", "n_examples", "n_tkns_pt", "min_pt", "max_pt", "avg_pt", "n_tkns_en", "min_en", "max_en", "avg_en"]))
    for domain in domains:
        ds = dataset.filter(lambda x: x["domain"] == domain, num_proc=N_PROC)
        if not len(ds):
            continue
        info = [
            domain,
            len(ds),
            # pt
            sum(ds["n_tkns_pt"]),
            min(ds["n_tkns_pt"]),
            max(ds["n_tkns_pt"]),
            round(sum(ds["n_tkns_pt"]) / len(ds), 1), # avg
            # en
            sum(ds["n_tkns_en"]),
            min(ds["n_tkns_en"]),
            max(ds["n_tkns_en"]),
            round(sum(ds["n_tkns_en"]) / len(ds), 1), # avg
        ]
        print(info)

In [40]:
print_stats_by_domain(clean["train"])

domain	n_examples	n_tkns_pt	min_pt	max_pt	avg_pt	n_tkns_en	min_en	max_en	avg_en


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:00<00:00, 3326014.92 examples/s]


['default', 1171, 74940, 14, 132, 64.0, 55818, 10, 105, 47.7]


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:00<00:00, 3159051.92 examples/s]


['journalistic', 1296965, 256531369, 18, 511, 197.8, 190297919, 13, 436, 146.7]


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:00<00:00, 3220343.88 examples/s]


['literature', 17181, 1721240, 26, 510, 100.2, 1283728, 18, 360, 74.7]


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:01<00:00, 3055030.35 examples/s]


['web', 12624, 2202027, 12, 544, 174.4, 1635845, 11, 416, 129.6]


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:00<00:00, 3203634.20 examples/s]


['politics', 757, 128201, 25, 524, 169.4, 89347, 15, 380, 118.0]


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:00<00:00, 3124521.45 examples/s]


['legal', 332851, 27463703, 9, 483, 82.5, 20527801, 6, 417, 61.7]


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:00<00:00, 3116404.15 examples/s]


['social_media', 1403514, 41222907, 3, 132, 29.4, 32150831, 2, 121, 22.9]


In [41]:
print_stats_by_domain(clean["test"])

domain	n_examples	n_tkns_pt	min_pt	max_pt	avg_pt	n_tkns_en	min_en	max_en	avg_en


Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 840.55 examples/s]


['default', 347, 22316, 13, 135, 64.3, 16555, 10, 97, 47.7]


Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 802.23 examples/s] 
Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 746.47 examples/s]
Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 700.49 examples/s] 
Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 844.73 examples/s]
Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 796.98 examples/s] 
Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 725.06 examples/s] 


Total


In [42]:
info = [
    "All",
    len(clean["train"]),
    
    # pt
    min(clean["train"]["n_tkns_pt"]),
    max(clean["train"]["n_tkns_pt"]),
    round(sum(clean["train"]["n_tkns_pt"]) / len(clean["train"]), 1), # avg
    sum(clean["train"]["n_tkns_pt"]),

    # en
    min(clean["train"]["n_tkns_en"]),
    max(clean["train"]["n_tkns_en"]),
    round(sum(clean["train"]["n_tkns_en"]) / len(clean["train"]), 1), # avg
    sum(clean["train"]["n_tkns_en"]),
]
print(info)

['All', 3065063, 3, 544, 107.5, 329344387, 2, 436, 80.3, 246041289]


# Super Clean

In [5]:
superclean = datasets.load_dataset("u1537782/PTradutor", "superclean")

## Number of documents

In [6]:
print("Number of examples train:", len(superclean["train"]))
print("Number of examples valid:", len(superclean["valid"]))


Number of examples train: 1719002
Number of examples valid: 1734


## Number of tokens

In [7]:
superclean = superclean.map(lambda x: {"n_tkns_pt": len(TOKENIZER.encode(x["pt"]))}, num_proc=N_PROC)
superclean = superclean.map(lambda x: {"n_tkns_en": len(TOKENIZER.encode(x["en"]))}, num_proc=N_PROC)

Map (num_proc=96): 100%|██████████| 1719002/1719002 [00:12<00:00, 140919.79 examples/s]
Map (num_proc=96): 100%|██████████| 1734/1734 [00:00<00:00, 3502.67 examples/s]
Map (num_proc=96): 100%|██████████| 1719002/1719002 [00:11<00:00, 154489.47 examples/s]
Map (num_proc=96): 100%|██████████| 1734/1734 [00:00<00:00, 3580.11 examples/s]


In [8]:
n_tkns_clean_train = sum(superclean["train"]["n_tkns_pt"])
print(f"N tokens train pt: {n_tkns_clean_train}")

n_tkns_clean_valid = sum(superclean["valid"]["n_tkns_pt"])
print(f"N tokens valid pt: {n_tkns_clean_valid}")

N tokens train pt: 293628259
N tokens valid pt: 110334


## Count by domain

In [9]:
domains = set(superclean["train"]["domain"])
print(domains)

{'literature', 'politics', 'social_media', 'legal', 'journalistic', 'web'}


In [12]:
print_stats_by_domain(superclean["train"])

domain	n_examples	n_tkns_pt	min_pt	max_pt	avg_pt	n_tkns_en	min_en	max_en	avg_en


Filter (num_proc=96): 100%|██████████| 1719002/1719002 [00:00<00:00, 2516532.63 examples/s]
Downloading data:   0%|          | 0.00/288M [23:38<?, ?B/s]


['literature', 12082, 1461651, 51, 510, 121.0, 1085296, 37, 360, 89.8]


Filter (num_proc=96): 100%|██████████| 1719002/1719002 [00:00<00:00, 2578962.92 examples/s]


['politics', 477, 116836, 53, 524, 244.9, 81801, 36, 380, 171.5]


Filter (num_proc=96): 100%|██████████| 1719002/1719002 [00:00<00:00, 2472799.94 examples/s]


['social_media', 163585, 11622673, 41, 129, 71.0, 9025327, 26, 121, 55.2]


Filter (num_proc=96): 100%|██████████| 1719002/1719002 [00:00<00:00, 2476566.09 examples/s]


['legal', 282870, 24635676, 44, 451, 87.1, 18346240, 25, 385, 64.9]


Filter (num_proc=96): 100%|██████████| 1719002/1719002 [00:00<00:00, 2460481.20 examples/s]


['journalistic', 1250982, 253767361, 45, 511, 202.9, 188072054, 25, 433, 150.3]


Filter (num_proc=96): 100%|██████████| 1719002/1719002 [00:00<00:00, 2289220.58 examples/s]


['web', 9006, 2024062, 44, 555, 224.7, 1504751, 28, 416, 167.1]


In [21]:
info = [
    "DSL-TL",
    str(len(superclean["valid"])),
    
    # pt
    str(min(superclean["valid"]["n_tkns_pt"])),
    str(max(superclean["valid"]["n_tkns_pt"])),
    str(round(sum(superclean["valid"]["n_tkns_pt"]) / len(superclean["valid"]), 1)),
    str(sum(superclean["valid"]["n_tkns_pt"])),

    # en
    str(min(superclean["valid"]["n_tkns_en"])),
    str(max(superclean["valid"]["n_tkns_en"])),
    str(round(sum(superclean["valid"]["n_tkns_en"]) / len(superclean["valid"]), 1)),
    str(sum(superclean["valid"]["n_tkns_en"])),
]
print(" &\t".join(info))

DSL-TL &	1734 &	14 &	135 &	63.6 &	110334 &	10 &	108 &	47.2 &	81821


Total


In [17]:
info = [
    "All",
    len(superclean["train"]),
    
    # pt
    min(superclean["train"]["n_tkns_pt"]),
    max(superclean["train"]["n_tkns_pt"]),
    round(sum(superclean["train"]["n_tkns_pt"]) / len(superclean["train"]), 1), # avg
    sum(superclean["train"]["n_tkns_pt"]),

    # en
    min(superclean["train"]["n_tkns_en"]),
    max(superclean["train"]["n_tkns_en"]),
    round(sum(superclean["train"]["n_tkns_en"]) / len(superclean["train"]), 1), # avg
    sum(superclean["train"]["n_tkns_en"]),
]
print(info)

['All', 1719002, 41, 555, 170.8, 293628259, 25, 433, 126.9, 218115469]
