In [16]:
import multiprocessing as mp
from collections import Counter

import datasets
import numpy as np
import tiktoken

N_PROC = mp.cpu_count()
TOKENIZER = tiktoken.get_encoding("cl100k_base")

# Raw

In [None]:
raw = datasets.load_dataset("liaad/PTradutor", "raw")

## Number of documents

In [13]:
print("Number of examples train:", len(raw["train"]))
print("Number of examples valid:", len(raw["valid"]))


Number of examples train: 3966538
Number of examples valid: 403


## Number of tokens

In [10]:
raw = raw.map(lambda x: {"n_tkns": len(TOKENIZER.encode(x["pt"]))}, num_proc=N_PROC)

Map (num_proc=96): 100%|██████████| 3966538/3966538 [00:25<00:00, 156541.22 examples/s]
Map (num_proc=96): 100%|██████████| 403/403 [00:00<00:00, 752.25 examples/s]


In [12]:
n_tkns_raw_train = sum(raw["train"]["n_tkns"])
print(f"n_tkns_raw_train: {n_tkns_raw_train}")

n_tkns_raw_valid = sum(raw["valid"]["n_tkns"])
print(f"n_tkns_raw_valid: {n_tkns_raw_valid}")

n_tkns_train: 424010237
n_tkns_test: 25607


# Clean

In [14]:
clean = datasets.load_dataset("liaad/PTradutor", "clean")

Downloading data: 100%|██████████| 106M/106M [00:03<00:00, 31.6MB/s] 
Downloading data: 100%|██████████| 358M/358M [00:15<00:00, 23.6MB/s] 
Downloading data: 100%|██████████| 380M/380M [00:17<00:00, 21.7MB/s] 
Downloading data: 100%|██████████| 406M/406M [00:06<00:00, 63.3MB/s] 
Downloading data: 100%|██████████| 293M/293M [00:05<00:00, 49.8MB/s] 
Downloading data: 100%|██████████| 117k/117k [00:00<00:00, 280kB/s]
Generating train split: 100%|██████████| 3065063/3065063 [00:16<00:00, 184466.87 examples/s]
Generating test split: 100%|██████████| 347/347 [00:00<00:00, 68503.41 examples/s]


## Number of documents

In [15]:
print("Number of examples train:", len(clean["train"]))
print("Number of examples valid:", len(clean["valid"]))


Number of examples train: 3065063
Number of examples valid: 347


## Number of tokens

In [19]:
clean = clean.map(lambda x: {"n_tkns_pt": len(TOKENIZER.encode(x["pt"]))}, num_proc=N_PROC)
clean = clean.map(lambda x: {"n_tkns_en": len(TOKENIZER.encode(x["en"]))}, num_proc=N_PROC)

Map (num_proc=96): 100%|██████████| 3065063/3065063 [00:14<00:00, 207484.50 examples/s]
Map (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 715.14 examples/s]
Map (num_proc=96): 100%|██████████| 3065063/3065063 [00:14<00:00, 206967.38 examples/s]
Map (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 735.32 examples/s] 


In [20]:
n_tkns_clean_train = sum(clean["train"]["n_tkns_pt"])
print(f"N tokens train pt: {n_tkns_clean_train}")

n_tkns_clean_valid = sum(clean["valid"]["n_tkns_pt"])
print(f"N tokens valid pt: {n_tkns_clean_valid}")

N tokens train pt: 329344387
N tokens test pt: 22316


## Count by domain

In [23]:
domains = set(clean["train"]["domain"])
print(domains)

{'default', 'journalistic', 'literature', 'web', 'politics', 'legal', 'social_media'}


In [39]:
def print_stats_by_domain(dataset):
    print("\t".join(["domain", "n_examples", "n_tkns_pt", "min_pt", "max_pt", "avg_pt", "n_tkns_en", "min_en", "max_en", "avg_en"]))
    for domain in domains:
        ds = dataset.filter(lambda x: x["domain"] == domain, num_proc=N_PROC)
        if not len(ds):
            continue
        info = [
            domain,
            len(ds),
            # pt
            sum(ds["n_tkns_pt"]),
            min(ds["n_tkns_pt"]),
            max(ds["n_tkns_pt"]),
            round(sum(ds["n_tkns_pt"]) / len(ds), 1), # avg
            # en
            sum(ds["n_tkns_en"]),
            min(ds["n_tkns_en"]),
            max(ds["n_tkns_en"]),
            round(sum(ds["n_tkns_en"]) / len(ds), 1), # avg
        ]
        print(info)

In [40]:
print_stats_by_domain(clean["train"])

domain	n_examples	n_tkns_pt	min_pt	max_pt	avg_pt	n_tkns_en	min_en	max_en	avg_en


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:00<00:00, 3326014.92 examples/s]


['default', 1171, 74940, 14, 132, 64.0, 55818, 10, 105, 47.7]


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:00<00:00, 3159051.92 examples/s]


['journalistic', 1296965, 256531369, 18, 511, 197.8, 190297919, 13, 436, 146.7]


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:00<00:00, 3220343.88 examples/s]


['literature', 17181, 1721240, 26, 510, 100.2, 1283728, 18, 360, 74.7]


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:01<00:00, 3055030.35 examples/s]


['web', 12624, 2202027, 12, 544, 174.4, 1635845, 11, 416, 129.6]


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:00<00:00, 3203634.20 examples/s]


['politics', 757, 128201, 25, 524, 169.4, 89347, 15, 380, 118.0]


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:00<00:00, 3124521.45 examples/s]


['legal', 332851, 27463703, 9, 483, 82.5, 20527801, 6, 417, 61.7]


Filter (num_proc=96): 100%|██████████| 3065063/3065063 [00:00<00:00, 3116404.15 examples/s]


['social_media', 1403514, 41222907, 3, 132, 29.4, 32150831, 2, 121, 22.9]


In [41]:
print_stats_by_domain(clean["test"])

domain	n_examples	n_tkns_pt	min_pt	max_pt	avg_pt	n_tkns_en	min_en	max_en	avg_en


Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 840.55 examples/s]


['default', 347, 22316, 13, 135, 64.3, 16555, 10, 97, 47.7]


Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 802.23 examples/s] 
Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 746.47 examples/s]
Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 700.49 examples/s] 
Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 844.73 examples/s]
Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 796.98 examples/s] 
Filter (num_proc=96): 100%|██████████| 347/347 [00:00<00:00, 725.06 examples/s] 


Total


In [42]:
info = [
    "All",
    len(clean["train"]),
    
    # pt
    min(clean["train"]["n_tkns_pt"]),
    max(clean["train"]["n_tkns_pt"]),
    round(sum(clean["train"]["n_tkns_pt"]) / len(clean["train"]), 1), # avg
    sum(clean["train"]["n_tkns_pt"]),

    # en
    min(clean["train"]["n_tkns_en"]),
    max(clean["train"]["n_tkns_en"]),
    round(sum(clean["train"]["n_tkns_en"]) / len(clean["train"]), 1), # avg
    sum(clean["train"]["n_tkns_en"]),
]
print(info)

['All', 3065063, 3, 544, 107.5, 329344387, 2, 436, 80.3, 246041289]


# Clean Pipeline

In [43]:
from src.process import drop_duplicates

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [44]:
step1 = drop_duplicates(clean)

In [45]:
print_stats_by_domain(step1["train"])

domain	n_examples	n_tkns_pt	min_pt	max_pt	avg_pt	n_tkns_en	min_en	max_en	avg_en


Filter (num_proc=96): 100%|██████████| 3043768/3043768 [01:57<00:00, 25999.14 examples/s]


['default', 1171, 74940, 14, 132, 64.0, 55818, 10, 105, 47.7]


Filter (num_proc=96): 100%|██████████| 3043768/3043768 [01:56<00:00, 26053.43 examples/s]


['journalistic', 1296965, 256531369, 18, 511, 197.8, 190297919, 13, 436, 146.7]


Filter (num_proc=96):  94%|█████████▍| 2853538/3043768 [01:50<00:07, 25849.31 examples/s]


TimeoutError: 