In [1]:
import os
import string
from datasets import list_datasets, load_dataset, load_from_disk
from huggingface_hub import HfApi
from pprint import pprint
import pandas as pd
import numpy as np
import ast
import jsonlines

# Widen the notebook display limits so long text cells and long frames are not
# truncated. Options are addressed by their full "display.*" names: pandas
# resolves abbreviated names by pattern matching, which can break once another
# option with a similar name exists.
pd.set_option("display.max_colwidth", 400)
pd.set_option("display.max_rows", 600)

In [2]:
# Read the Hub access token from the environment so it is never hard-coded in
# the notebook; the value is None when the variable is not set.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

In [3]:
# Query the Hub dataset listing (authenticated with HUGGINGFACE_TOKEN) and keep
# the ids that start with "bigscience-data/roots". The prefix filter is applied
# here, on whatever list_datasets() returns.
# NOTE(review): newer huggingface_hub releases appear to prefer `token=` over
# `use_auth_token=` — confirm against the installed version.
roots_datasets = [
    ds_info.id
    for ds_info in HfApi().list_datasets(use_auth_token=HUGGINGFACE_TOKEN)
    if ds_info.id.startswith("bigscience-data/roots")
]
len(roots_datasets)

500

In [6]:
# Sort the ids alphabetically. This mutates the existing list in place, so
# every later cell sees the sorted order.
roots_datasets.sort()
roots_datasets

['bigscience-data/roots-1e-1',
 'bigscience-data/roots-1e-2_subset_tfidf_l1_tsne',
 'bigscience-data/roots_ar_arabench',
 'bigscience-data/roots_ar_arabic_billion_words',
 'bigscience-data/roots_ar_brad_2',
 'bigscience-data/roots_ar_habibi',
 'bigscience-data/roots_ar_kalimat',
 'bigscience-data/roots_ar_ksucca',
 'bigscience-data/roots_ar_labr',
 'bigscience-data/roots_ar_multi_un_2',
 'bigscience-data/roots_ar_open_subtitles',
 'bigscience-data/roots_ar_openiti_proc',
 'bigscience-data/roots_ar_opus100',
 'bigscience-data/roots_ar_oscar',
 'bigscience-data/roots_ar_pseudocrawl-filtered_595_mawdoo3_com',
 'bigscience-data/roots_ar_qedcorpus',
 'bigscience-data/roots_ar_sanad',
 'bigscience-data/roots_ar_tashkeela',
 'bigscience-data/roots_ar_ted_talks_iwslt',
 'bigscience-data/roots_ar_uncorpus',
 'bigscience-data/roots_ar_wikibooks',
 'bigscience-data/roots_ar_wikinews',
 'bigscience-data/roots_ar_wikipedia',
 'bigscience-data/roots_ar_wikiquote',
 'bigscience-data/roots_ar_wikisour

In [4]:
# Collect the distinct language codes found in the dataset ids. With "/", "_"
# and "-" all treated as the same separator, an id splits into
# bigscience / data / roots / <lang> / ..., so the code is the token at index 3.
langs = set()
for ds in roots_datasets:
    tokens = ds.translate(str.maketrans("/_", "--")).split("-")
    langs.add(tokens[3])
langs

{'1e',
 'ar',
 'ca',
 'code',
 'en',
 'es',
 'eu',
 'fr',
 'id',
 'indic',
 'nigercongo',
 'pt',
 'vi',
 'zh',
 'zhs',
 'zht'}

In [25]:
# Load the roots_vi_ted_talks_iwslt dataset from its local copy on disk.
dataset = load_from_disk("../data/bigscience-data/roots_vi_ted_talks_iwslt")

In [26]:
# Take the text of a single document (row 133 of the train split) as a sample.
text = dataset["train"][133]["text"]
print(text)

Đâu là điểm chung giữa công nghệ, mỹ thuật và khoa học? Đó là sự tò mò Sự tò mò dẫn chúng ta đến với những khám phá mới mà mắt thường không thể thấy Và tôi thích dùng phim ảnh để đưa mọi người đến với cuộc hành trình vượt không gian và thời gian biến những thứ vô hình thành hữu hình Nó mở rộng chân trời thay đổi nhận thức làm chúng ta cởi mở hơn và chạm đến trái tim của mỗi người Sau đây là một số cảnh quay từ bộ phim 3D của tôi có tên "Những điều bí ẩn của thế giới vô hình"
(Nhạc)
Có những biến đổi quá chậm để mắt người có thể nhận ra, và kĩ thuật "tua nhanh" giúp chúng ta mở rộng góc nhìn về cuộc sống. Chúng ta có thể thấy sự phát triển của sinh vật, bằng cách nào mà cây leo có thể sống sót qua những tầng rừng để nhìn thấy ánh sáng mặt trời. Ở quy mô lớn hơn, "tua nhanh" giúp chúng ta nhìn thấy sự chuyển động của thế giới
không chỉ dừng lại ở những bước tiến dài của tự nhiên mà còn cả những tiến bộ không ngừng nghỉ của nhân loại Mỗi vệt chấm đại diện cho một máy bay chở khách nhờ vào

In [27]:
# Naive whitespace tokenisation: punctuation stays attached to the adjacent
# word (see the tokens ending in "," and "?" in the output).
text.split()

['Đâu',
 'là',
 'điểm',
 'chung',
 'giữa',
 'công',
 'nghệ,',
 'mỹ',
 'thuật',
 'và',
 'khoa',
 'học?',
 'Đó',
 'là',
 'sự',
 'tò',
 'mò',
 'Sự',
 'tò',
 'mò',
 'dẫn',
 'chúng',
 'ta',
 'đến',
 'với',
 'những',
 'khám',
 'phá',
 'mới',
 'mà',
 'mắt',
 'thường',
 'không',
 'thể',
 'thấy',
 'Và',
 'tôi',
 'thích',
 'dùng',
 'phim',
 'ảnh',
 'để',
 'đưa',
 'mọi',
 'người',
 'đến',
 'với',
 'cuộc',
 'hành',
 'trình',
 'vượt',
 'không',
 'gian',
 'và',
 'thời',
 'gian',
 'biến',
 'những',
 'thứ',
 'vô',
 'hình',
 'thành',
 'hữu',
 'hình',
 'Nó',
 'mở',
 'rộng',
 'chân',
 'trời',
 'thay',
 'đổi',
 'nhận',
 'thức',
 'làm',
 'chúng',
 'ta',
 'cởi',
 'mở',
 'hơn',
 'và',
 'chạm',
 'đến',
 'trái',
 'tim',
 'của',
 'mỗi',
 'người',
 'Sau',
 'đây',
 'là',
 'một',
 'số',
 'cảnh',
 'quay',
 'từ',
 'bộ',
 'phim',
 '3D',
 'của',
 'tôi',
 'có',
 'tên',
 '"Những',
 'điều',
 'bí',
 'ẩn',
 'của',
 'thế',
 'giới',
 'vô',
 'hình"',
 '(Nhạc)',
 'Có',
 'những',
 'biến',
 'đổi',
 'quá',
 'chậm',
 'để',
 'mắt',

In [17]:
# Load the roots_en_open_subtitles dataset from disk. This rebinds `dataset`,
# the same name other cells use for different corpora.
dataset = load_from_disk("../data/bigscience-data/roots_en_open_subtitles")

In [18]:
# Pretty-print one complete record (row 133) to see every field it carries.
pprint(dataset["train"][133])

{'meta': "{'file': 'OpenSubtitles/raw/en/1953/46487/4648211.xml'}",
 'text': '..platform C.\n'
         '..platform B.\n'
         '..platform C.\n'
         '..platform B.\n'
         'GROCERIES, WINES\n'
         'Save a seat for me.\n'
         'Stay there!\n'
         'Get in!\n'
         'Is this my seat?\n'
         'BEACH\n'
         '- HOTEL\n'
         '- GOLF\n'
         '- STATION\n'
         'Back then, my dear lady, I was a captain in the cavalry...\n'
         'This is awful!\n'
         'Take this away.\n'
         'Monsieur...?\n'
         'Sorry?\n'
         '- May I?\n'
         '- Hulot.\n'
         'H-U-L-O-T.\n'
         'Your tobacco.\n'
         'Hey!\n'
         'Scram!\n'
         '- Good night, monsieur.\n'
         '- Monsieur, madame.\n'
         'Telephone!\n'
         'New York!\n'
         "- What's going on?\n"
         "That's not ours.\n"
         "- It's my son's.\n"
         'Where can he be?\n'
         'Denis!\n'
         'I only have a sleeve left

In [19]:
# As the output shows, 'meta' comes back as a string containing a dict repr,
# not as an actual dict.
pprint(dataset["train"][133]["meta"])

"{'file': 'OpenSubtitles/raw/en/1953/46487/4648211.xml'}"


In [20]:
# Show the DatasetDict summary: available splits, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'meta'],
        num_rows: 372795
    })
})

In [7]:
# Load the per-dataset summary CSV; its first column is used as the row index.
df = pd.read_csv("../data/datasets_summary_roots.csv", index_col=0)
df

Unnamed: 0,dataset,split,num_doc,fields,meta_fields,avg_doc_len,avg_doc_word_count,total_paragraphs
0,bigscience-data/roots_ar_uncorpus,train,115804,"['text', 'meta']",['file'],25041,3873,1928281
1,bigscience-data/roots_en_uncorpus,train,143703,"['text', 'meta']",['file'],31399,4631,2843571
2,bigscience-data/roots_es_uncorpus,train,118070,"['text', 'meta']",['file'],38367,5909,2964503
3,bigscience-data/roots_fr_uncorpus,train,145876,"['text', 'meta']",['file'],35149,5236,3253998
4,bigscience-data/roots_zh_uncorpus,train,81567,"['text', 'meta']",['file'],11037,360,177931
5,bigscience-data/roots_ar_arabench,train,42,"['meta', 'text']",['file'],104124,16517,2909
6,bigscience-data/roots_ar_habibi,train,29053,"['meta', 'text']",['songID'],547,89,29414
7,bigscience-data/roots_ar_kalimat,train,15123,['text'],[],2580,451,35337
8,bigscience-data/roots_ar_brad_2,train,293683,"['meta', 'text']",['id'],671,122,338781
9,bigscience-data/roots_ar_ksucca,train,410,"['meta', 'text']",['filename'],635886,117796,201438


In [8]:
def get_lang(row):
    """Return the language code embedded in a ROOTS dataset id.

    The id is expected to look like ``<org>/roots_<lang>_<name>``. The language
    may carry a hyphenated sub-code (``roots_indic-pa_...``), in which case only
    the part before the hyphen is returned.

    ``_`` and ``-`` are treated as the same separator, so hyphen-delimited ids
    such as ``bigscience-data/roots-1e-1`` yield ``"1e"`` instead of raising
    ``IndexError``. This matches the tokenisation used to build ``langs``
    earlier in the notebook.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``"dataset"``
            entry holding the full dataset id.

    Returns:
        The token that follows ``roots`` in the dataset name.

    Raises:
        IndexError: If the id has no ``/`` or nothing follows the first token
            of the dataset name.
    """
    dataset_name = row["dataset"].split("/")[1]
    # Normalise both separators first; token 0 is "roots", token 1 the language.
    return dataset_name.replace("-", "_").split("_")[1]

In [9]:
# Derive a language code for every row and store it as a new "language" column
# on df (get_lang is called once per row).
df["language"] = df.apply(get_lang, axis=1)

In [10]:
# Re-display the frame to check the newly added "language" column.
df

Unnamed: 0,dataset,split,num_doc,fields,meta_fields,avg_doc_len,avg_doc_word_count,total_paragraphs,language
0,bigscience-data/roots_ar_uncorpus,train,115804,"['text', 'meta']",['file'],25041,3873,1928281,ar
1,bigscience-data/roots_en_uncorpus,train,143703,"['text', 'meta']",['file'],31399,4631,2843571,en
2,bigscience-data/roots_es_uncorpus,train,118070,"['text', 'meta']",['file'],38367,5909,2964503,es
3,bigscience-data/roots_fr_uncorpus,train,145876,"['text', 'meta']",['file'],35149,5236,3253998,fr
4,bigscience-data/roots_zh_uncorpus,train,81567,"['text', 'meta']",['file'],11037,360,177931,zh
5,bigscience-data/roots_ar_arabench,train,42,"['meta', 'text']",['file'],104124,16517,2909,ar
6,bigscience-data/roots_ar_habibi,train,29053,"['meta', 'text']",['songID'],547,89,29414,ar
7,bigscience-data/roots_ar_kalimat,train,15123,['text'],[],2580,451,35337,ar
8,bigscience-data/roots_ar_brad_2,train,293683,"['meta', 'text']",['id'],671,122,338781,ar
9,bigscience-data/roots_ar_ksucca,train,410,"['meta', 'text']",['filename'],635886,117796,201438,ar


In [11]:
# Aggregate the per-dataset statistics by language. numeric_only=True limits
# the sum to the numeric columns; without it, recent pandas versions also
# "sum" (concatenate) the string columns such as dataset/split/fields.
# NOTE: the avg_doc_len / avg_doc_word_count values here are sums of
# per-dataset averages, not per-language averages.
dfg = df.groupby("language").sum(numeric_only=True)
dfg

Unnamed: 0_level_0,num_doc,avg_doc_len,avg_doc_word_count,total_paragraphs
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ar,15234080,7964622,1131168,38158452
ca,6142390,83442,13299,14816030
code,26176998,11723,2324,157322113
en,77010827,1344937,208168,364865459
es,67005817,1422322,219796,152502163
eu,5149797,65463,7917,5479279
fr,58847091,883186,135139,166789212
id,12514253,131036914,17608316,18795333
indic,84982982,447135,64278,90980583
nigercongo,1162568,14672,2207,1244980


In [12]:
# Combined num_doc of the zh, zhs and zht rows; the three operands are the
# values shown for those languages in dfg["num_doc"].
43957008 + 44856841 + 992

88814841

In [24]:
# Document counts per language.
dfg["num_doc"]

language
ar            15234080
ca             6142390
code          26176998
en            77010827
es            67005817
eu             5149797
fr            58847091
id            12514253
indic         84982982
nigercongo     1162568
pt            31969891
vi            34110375
zh            43957008
zhs           44856841
zht                992
Name: num_doc, dtype: int64

In [4]:
# Total number of paragraphs over all rows of the summary table.
df["total_paragraphs"].sum()

1198695778

In [6]:
# Order the datasets by paragraph count; the sort is ascending by default, so
# the smallest datasets come first.
df.sort_values("total_paragraphs")

Unnamed: 0,dataset,split,num_doc,fields,meta_fields,avg_doc_len,avg_doc_word_count,total_paragraphs
26,bigscience-data/roots_ca_parlament_parla,train,1,"['meta', 'text']",['audio'],1162,197,1
366,bigscience-data/roots_indic-pa_ted_talks_iwslt,train,1,"['text', 'meta']","['file', 'element']",908,190,1
17,bigscience-data/roots_ca_enriched_conllu_ancora_for_ml_training,train,1,['text'],[],1289,201,1
273,bigscience-data/roots_indic-as_ted_talks_iwslt,train,1,"['text', 'meta']","['file', 'element']",2568,396,2
38,bigscience-data/roots_en_odiencorp,train,2,['text'],[],3027,450,4
225,bigscience-data/roots_fr_book_dash_books,train,6,"['text', 'meta']",['path'],3189,522,17
274,bigscience-data/roots_indic-as_wiktionary,train,18,"['meta', 'text']","['content_model', 'language', 'title', 'type']",75,11,18
314,bigscience-data/roots_indic-hi_wikimedia,train,15,"['meta', 'text']","['content_model', 'language', 'title', 'type']",2117,368,30
250,bigscience-data/roots_id_indo4b_talpco,train,1,"['text', 'meta']",['source'],56041,7423,31
456,bigscience-data/roots_vi_uit_vsmec,train,35,['text'],[],314,79,35


In [47]:
def find_whitespace(text):
    """Lazily yield the index of every whitespace character in ``text``.

    A character counts as whitespace when it is contained in
    ``string.whitespace``, so characters outside that set (for example
    U+00A0) are not reported. Indices are produced in increasing order.
    """
    yield from (
        position
        for position, char in enumerate(text)
        if char in string.whitespace
    )


def get_segmentation(text, passage_tokens, overlap_tokens):
    """Split ``text`` into overlapping passages measured in whitespace tokens.

    Every whitespace character reported by ``find_whitespace`` is a token
    boundary, so consecutive whitespace characters delimit empty tokens.
    Each passage starts ``passage_tokens - overlap_tokens`` tokens after the
    previous one. A non-final passage ends just after the whitespace character
    that closes its last token; the final passage runs to the end of ``text``
    and may hold fewer than ``passage_tokens`` tokens.

    Args:
        text: The string to segment.
        passage_tokens: Number of tokens per passage.
        overlap_tokens: Number of tokens shared by consecutive passages.

    Returns:
        A list of ``(start, end)`` character offsets (end exclusive), so that
        ``text[start:end]`` is the passage. An empty ``text`` gives ``[(0, 0)]``.

    Raises:
        ValueError: Unless ``0 <= overlap_tokens < passage_tokens``. Without
            this check a non-positive stride either fails inside ``range`` or
            silently yields no passages, and a negative overlap can leave the
            tail of the text uncovered.
    """
    if not 0 <= overlap_tokens < passage_tokens:
        raise ValueError(
            "overlap_tokens must satisfy 0 <= overlap_tokens < passage_tokens "
            "(got passage_tokens={!r}, overlap_tokens={!r})".format(
                passage_tokens, overlap_tokens
            )
        )

    # The -1 sentinel makes `whitespace_idx[k] + 1` the start offset of token k.
    whitespace_idx = [-1] + list(find_whitespace(text))
    unique_tokens = passage_tokens - overlap_tokens
    passages = []
    for i in range(0, len(whitespace_idx), unique_tokens):
        start = whitespace_idx[i] + 1
        if i + passage_tokens >= len(whitespace_idx):
            # Not enough tokens left for a full passage: take the rest and stop.
            passages.append((start, len(text)))
            break
        passages.append((start, whitespace_idx[i + passage_tokens] + 1))
    return passages


def test_reconstuct_segmented(text, segmentation, overlap_tokens):
    assert len(segmentation) > 0
    reconstructed = text[segmentation[0][0] : segmentation[0][1]]
    for segment_start, segment_end in segmentation[1:]:
        whitespace_idx = [-1] + list(find_whitespace(text[segment_start:segment_end]))
        reconstructed += text[segment_start + whitespace_idx[overlap_tokens] + 1 : segment_end]
    return text == reconstructed


def extract_segment(text, segment):
    """Return the part of ``text`` addressed by ``segment``.

    ``segment`` is unpacked as a ``(start, end)`` pair and applied as
    ``text[start:end]``, i.e. the end offset is exclusive.
    """
    begin, finish = segment
    return text[slice(begin, finish)]

In [48]:
# Load the roots_en_wikipedia dataset from disk; the following cells use it to
# try out the segmentation helpers.
dataset = load_from_disk("../data/bigscience-data/roots_en_wikipedia")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'meta'],
        num_rows: 2055712
    })
})

In [49]:
# Scan the train split for the first document longer than 2000 characters,
# print its row index and text, then stop.
for i, row in enumerate(dataset["train"]):
    if len(row["text"]) <= 2000:
        continue
    print(i)
    print(row["text"])
    break

0
Japanese pop culture in the United States

Anime in the United States

Anime culture in the United States began as a niche community that had a grassroots foundation built by groups of fans on the local level. Some of the earliest televised anime to air in the United States were Astro Boy, Speed Racer, and Gigantor, which gained popularity with many American audiences during the late 1960s. Anime shows that aired in the United States up until the 1980s were usually heavily altered and localized, such as Science Ninja Team Gatchaman becoming Battle of the Planets in the 1970s, and the mecha show Macross becoming Robotech in the 1980s. Takara's Diaclone and Microman mecha toylines also became the basis for the Transformers franchise in the 1980s.
Small patches of isolated communities started to form around collective interest towards this new medium, which seemed reminiscent of familiar Disney visuals and Warner-Brothers narratives. One of these communities was the first anime fan club

In [50]:
# Raw record for row 0 (the document found by the scan above), showing the
# text with its newlines escaped.
dataset["train"][0]

{'text': 'Japanese pop culture in the United States\n\nAnime in the United States\n\nAnime culture in the United States began as a niche community that had a grassroots foundation built by groups of fans on the local level. Some of the earliest televised anime to air in the United States were Astro Boy, Speed Racer, and Gigantor, which gained popularity with many American audiences during the late 1960s. Anime shows that aired in the United States up until the 1980s were usually heavily altered and localized, such as Science Ninja Team Gatchaman becoming Battle of the Planets in the 1970s, and the mecha show Macross becoming Robotech in the 1980s. Takara\'s Diaclone and Microman mecha toylines also became the basis for the Transformers franchise in the 1980s.\nSmall patches of isolated communities started to form around collective interest towards this new medium, which seemed reminiscent of familiar Disney visuals and Warner-Brothers narratives. One of these communities was the first 

In [54]:
# Segment the first document into passages of 128 whitespace tokens, with
# consecutive passages sharing 8 tokens.
segmentations = get_segmentation(dataset["train"][0]["text"], 128, 8)

In [55]:
# (start, end) character offsets of each passage.
segmentations

[(0, 755),
 (704, 1594),
 (1528, 2350),
 (2308, 3020),
 (2971, 3802),
 (3764, 4582),
 (4529, 5268),
 (5224, 5953),
 (5902, 6671),
 (6620, 7448),
 (7389, 8180),
 (8135, 8832),
 (8782, 9501),
 (9451, 10183),
 (10134, 10899),
 (10852, 11591),
 (11545, 12366),
 (12321, 13086),
 (13033, 13479)]

In [57]:
# Print every passage, each followed by a dashed separator line.
for seg in segmentations:
    print(extract_segment(dataset["train"][0]["text"], seg), "-------------------", sep="\n")

Japanese pop culture in the United States

Anime in the United States

Anime culture in the United States began as a niche community that had a grassroots foundation built by groups of fans on the local level. Some of the earliest televised anime to air in the United States were Astro Boy, Speed Racer, and Gigantor, which gained popularity with many American audiences during the late 1960s. Anime shows that aired in the United States up until the 1980s were usually heavily altered and localized, such as Science Ninja Team Gatchaman becoming Battle of the Planets in the 1970s, and the mecha show Macross becoming Robotech in the 1980s. Takara's Diaclone and Microman mecha toylines also became the basis for the Transformers franchise in the 1980s.

-------------------
basis for the Transformers franchise in the 1980s.
Small patches of isolated communities started to form around collective interest towards this new medium, which seemed reminiscent of familiar Disney visuals and Warner-Brot

In [58]:
# Number of passages produced for this document.
len(segmentations)

19