In [1]:
from datasets import load_dataset
import pandas as pd
import nltk
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_tags, strip_multiple_whitespaces, strip_numeric, strip_short, strip_short, strip_numeric, strip_punctuation, strip_tags, strip_multiple_whitespaces, remove_stopwords
from nltk.stem import SnowballStemmer, PorterStemmer
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

import sys
sys.path.append('../src')

from stemmers.stemmsk import stem as stem_sk
from stemmers.stemmcz import stem_word as stem_cz


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_stopwords(language):
    """Read the stopword list for ``language`` from ``./stopwords/<language>.txt``.

    The file is read line by line, one stopword per line. Surrounding
    whitespace is stripped from every line, and lines that are empty after
    stripping are skipped so blank or trailing lines never become ``''``
    entries in the result.

    Args:
        language: Name used to build the file name, e.g. ``'slovak'``.

    Returns:
        list[str]: The stopwords in file order.

    Raises:
        FileNotFoundError: If no stopword file exists for ``language``.
    """
    # Decode explicitly instead of relying on the platform default encoding,
    # which is not UTF-8 everywhere and would corrupt diacritics.
    # NOTE(review): assumes the stopword files are stored as UTF-8 — confirm.
    with open('./stopwords/{}.txt'.format(language), 'r', encoding='utf-8') as f:
        return [word for word in (line.strip() for line in f) if word]


In [3]:
# Per-language (stopword set, stem function) pairs, filled lazily so the
# stopword file is read and the stemmer is built once per language instead of
# once per document.
_LANGUAGE_RESOURCES = {}


def _get_language_resources(language):
    """Return the cached ``(stopwords, stem)`` pair for ``language``, building it on first use."""
    if language not in _LANGUAGE_RESOURCES:
        if language == 'slovak':
            stem = stem_sk
        elif language == 'czech':
            stem = stem_cz
        elif language == 'english':
            stem = PorterStemmer().stem
        elif language == 'german':
            stem = SnowballStemmer("german").stem
        else:
            # Fail before any work is done rather than with an unbound
            # variable at the stemming step.
            raise ValueError(
                "Unsupported language {!r}; expected one of "
                "'slovak', 'czech', 'english', 'german'".format(language)
            )
        # A frozenset gives a hashed membership test during stopword removal.
        _LANGUAGE_RESOURCES[language] = (frozenset(load_stopwords(language)), stem)
    return _LANGUAGE_RESOURCES[language]


def preprocess_string(text, language):
    """Normalize ``text`` and return its stemmed tokens.

    Steps, in order: lowercase, strip tags, strip punctuation, collapse
    whitespace, strip digits, remove the language's stopwords, drop tokens
    shorter than 3 characters, then stem every remaining token individually.

    Stopwords and the stemmer are cached per language for the lifetime of the
    process; re-run this cell to pick up edits to a stopword file.

    Args:
        text: Raw document text.
        language: One of ``'slovak'``, ``'czech'``, ``'english'``, ``'german'``.

    Returns:
        list[str]: Stemmed tokens in document order (empty for empty input).

    Raises:
        ValueError: If ``language`` is not one of the supported names.
        FileNotFoundError: Propagated from ``load_stopwords`` when the
            stopword file for ``language`` is missing.
    """
    stopwords, stem = _get_language_resources(language)

    data = text.lower()
    data = strip_tags(data)
    data = strip_punctuation(data)
    data = strip_multiple_whitespaces(data)
    data = strip_numeric(data)
    data = remove_stopwords(data, stopwords=stopwords)
    data = strip_short(data, minsize=3)

    # The stemmers operate on single words; passing the whole space-joined
    # string would treat the document as one word and leave it mostly unstemmed.
    return [stem(token) for token in data.split()]

In [4]:
# Load the Slovak Wikipedia dump (language 'sk', snapshot date "20231101").
# The articles in data['train'] are tokenized and counted in a later cell.
data = load_dataset("wikipedia", language='sk', date="20231101", beam_runner='DirectRunner')


In [6]:
# Total number of preprocessed Slovak tokens across data['train'].
# NOTE(review): the record at index 0 is left out of the total; the reason is
# not visible in this notebook — confirm the skip is intended.
count = 0
for i in tqdm(range(len(data['train']))):
    if i != 0:
        count += len(
            preprocess_string(data['train'][i]['text'], 'slovak')
        )

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 242235/242235 [04:57<00:00, 815.29it/s]


In [8]:
# Store the Slovak token total under a descriptive name.
slovak_count = count

In [9]:
# Load the English Wikipedia configuration "20220301.en".
# NOTE(review): this appears to be a different snapshot date than the Slovak
# load (20231101) — confirm the two token counts are meant to be compared.
dataset = load_dataset("wikipedia", "20220301.en")

In [11]:
# Serial token count over the English training split, accumulated one article
# at a time so a partial total survives if the loop is interrupted.
english_count = 0
for i in tqdm(range(len(dataset['train']))):
    english_count += len(
        preprocess_string(dataset['train'][i]['text'], 'english')
    )

  0%|▌                                                                                                                                     | 29149/6458670 [10:52<39:59:00, 44.67it/s]


KeyboardInterrupt: 

In [12]:
# Partial total only: the serial loop was stopped by KeyboardInterrupt after
# roughly 29k of 6,458,670 articles, so this is not the full-corpus count.
english_count

42524655

In [13]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

def preprocess_and_count(text, language='english'):
    """Return how many tokens ``preprocess_string`` produces for ``text``.

    Defined at module level so it can be handed to ``ProcessPoolExecutor``
    workers.

    Args:
        text: Raw document text.
        language: Language name forwarded to ``preprocess_string``. Defaults
            to ``'english'``, so existing single-argument calls are unchanged.

    Returns:
        int: Number of tokens left after preprocessing.
    """
    return len(preprocess_string(text, language))

# Number of articles sent to a worker process per task.
batch_size = 1000
english_count2 = 0

with ProcessPoolExecutor() as executor:
    # executor.map with a chunksize ships articles to the workers in batches
    # of `batch_size`, instead of creating one Future (and one inter-process
    # round trip) per article.
    article_texts = (item['text'] for item in dataset['train'])
    token_counts = executor.map(preprocess_and_count, article_texts, chunksize=batch_size)

    # Results arrive in input order; only their sum is needed. An exception
    # raised in a worker is re-raised here when its result is reached.
    for n_tokens in tqdm(token_counts, total=len(dataset['train'])):
        english_count2 += n_tokens


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6458670/6458670 [1:31:55<00:00, 1170.90it/s]


In [14]:
# Token total for the full English training split, from the parallel run.
english_count2

1555155883