In [9]:
import re
import pickle
import glob
import multiprocessing
import itertools
from collections import Counter, OrderedDict

import torch
from torchtext.vocab import vocab
import more_itertools as mit
from tqdm import tqdm

## Functions

In [10]:
def tokenize(text):
    """Split raw text into sentences made of lower-cased tokens.

    The text is lower-cased and every character other than ``a-z``, the
    Polish diacritics, ``. , ! ? -`` and the space is replaced by a space.
    Commas and hyphens become standalone tokens; ``.``, ``!`` and ``?``
    become standalone tokens that also terminate the current sentence.

    Args:
        text: Raw input string.

    Returns:
        A list of sentences, each a list of token strings. A sentence with
        no tokens (e.g. whatever follows the last terminator) is returned
        as an empty list rather than being dropped.
    """
    normalized = re.sub(r'[^a-ząćęłńóśźż.,!?\- ]', ' ', text.lower())
    # Pad commas and hyphens with spaces so they split off as their own tokens.
    normalized = re.sub(r'([,-])', ' \\1 ', normalized)
    # Detach sentence terminators and put a line break after each one;
    # the line breaks are the sentence boundaries used below.
    normalized = re.sub(r'([.!?])', ' \\1\n', normalized)

    tokenized_sentences = []
    for line in normalized.split('\n'):
        # Consecutive spaces yield empty strings; filter(None, ...) drops them.
        tokenized_sentences.append(list(filter(None, line.split(' '))))
    return tokenized_sentences

In [11]:
def count_tokens_in_files(paths):
    """Count how often each token occurs across a collection of text files.

    Every file is read in full, split into sentences with ``tokenize`` and
    all tokens of all sentences are tallied together.

    Args:
        paths: Iterable of paths to text files.

    Returns:
        A ``collections.Counter`` mapping token -> number of occurrences.
        An empty iterable of paths yields an empty counter.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    counter = Counter()
    for path in paths:
        # Decode explicitly as UTF-8: the tokenizer matches Polish diacritics,
        # so relying on the platform's default encoding could silently
        # corrupt them on systems whose locale is not UTF-8.
        with open(path, encoding='utf-8') as f:
            content = f.read()

        # Flatten the sentences so the file is tallied in a single update.
        counter.update(itertools.chain.from_iterable(tokenize(content)))
    return counter

def count_tokens_in_directories(paths):
    """Count token occurrences in every ``.txt`` file under the given directories.

    The directories are searched recursively, the files found are dealt out
    into one group per CPU, and each group is counted in a separate worker
    process with ``count_tokens_in_files``. The partial counts are then
    merged into a single counter.

    Args:
        paths: Iterable of directory paths to search for ``*.txt`` files.

    Returns:
        A ``collections.Counter`` mapping token -> total occurrences.
    """
    file_paths = [
        file_path
        for path in paths
        for file_path in glob.glob(f'{path}/**/*.txt', recursive=True)
    ]

    groups_count = multiprocessing.cpu_count()
    # mit.distribute returns lazy iterators. Turn each group into a plain list
    # before handing it to a worker: arguments are pickled on the way to the
    # pool, and pickling itertools objects is deprecated/removed in newer
    # Python versions.
    paths_groups = [list(group) for group in mit.distribute(groups_count, file_paths)]

    with multiprocessing.Pool(groups_count) as pool:
        # Each worker receives one whole group as its single argument.
        counters = pool.map(count_tokens_in_files, paths_groups)

    # Merge in place; adding Counters with sum() would build a fresh Counter
    # for every partial result.
    total_counter = Counter()
    for counter in counters:
        total_counter.update(counter)
    return total_counter

In [12]:
def create_vocabulary(directories, max_tokens):
    """Build a vocabulary from the most frequent tokens of a corpus.

    Tokens are counted with ``count_tokens_in_directories`` and ranked by
    descending frequency, with ties broken by the token string itself. The
    first ``max_tokens`` entries of that ranking are passed to ``vocab`` as
    an ordered mapping.

    Args:
        directories: Directories whose text files make up the corpus.
        max_tokens: Number of top-ranked tokens to keep.

    Returns:
        The object returned by ``vocab``, with its default index set to -1.
    """
    token_counts = count_tokens_in_directories(directories)

    def rank(entry):
        # Negate the count so the most frequent tokens sort first.
        token, occurrences = entry
        return -occurrences, token

    most_frequent = sorted(token_counts.items(), key=rank)[:max_tokens]
    vocabulary = vocab(OrderedDict(most_frequent))
    # -1 serves as the sentinel index for tokens missing from the vocabulary.
    vocabulary.set_default_index(-1)
    return vocabulary

In [13]:
def get_numbers_from_file(path, vocabulary):
    """Convert one text file into a flat list of vocabulary indices.

    The file is tokenized with ``tokenize``; each sentence is looked up via
    ``vocabulary.lookup_indices`` and indices equal to -1 are dropped. The
    remaining indices of all sentences are concatenated in reading order.

    Args:
        path: Path of the text file to convert.
        vocabulary: Object exposing ``lookup_indices(tokens)``. It is expected
            to return -1 for unknown tokens (``create_vocabulary`` sets -1 as
            the default index).

    Returns:
        A list of integer indices; empty if no token of the file is known.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the Polish diacritics matched by the
    # tokenizer do not depend on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        content = f.read()

    numbers = []
    for sentence in tokenize(content):
        # Skip out-of-vocabulary tokens, which are reported as -1.
        numbers.extend(
            num for num in vocabulary.lookup_indices(sentence) if num != -1
        )
    return numbers

In [14]:
def _get_numbers_from_task(task):
    """Unpack a ``(path, vocabulary)`` pair and convert that file to indices."""
    path, vocabulary = task
    return get_numbers_from_file(path, vocabulary)


def get_numbers_from_directories(paths, vocabulary, min_length=20):
    """Convert every ``.txt`` file under the given directories to index lists.

    The directories are searched recursively and each file is converted with
    ``get_numbers_from_file`` in a pool of worker processes (one per CPU).
    Results keep the order in which the files were found.

    Args:
        paths: Iterable of directory paths to search for ``*.txt`` files.
        vocabulary: Vocabulary forwarded to ``get_numbers_from_file``.
        min_length: Only texts with strictly more than this many indices are
            kept. Defaults to 20.

    Returns:
        A list with one list of indices per sufficiently long file.
    """
    file_paths = [
        file_path
        for path in paths
        for file_path in glob.glob(f'{path}/**/*.txt', recursive=True)
    ]
    tasks = list(zip(file_paths, itertools.repeat(vocabulary)))

    workers_count = multiprocessing.cpu_count()
    # Batch tasks so each worker gets several files per round trip;
    # imap requires a chunk size of at least 1.
    chunksize = max(1, len(tasks) // (workers_count * 4))

    with multiprocessing.Pool(workers_count) as pool:
        # Wrap the *results* in tqdm so the bar advances as files finish,
        # not as tasks are handed to the pool. imap preserves input order.
        results = pool.imap(_get_numbers_from_task, tasks, chunksize=chunksize)
        # Consume inside the with-block: leaving it terminates the pool.
        texts = list(tqdm(results, total=len(tasks)))

    return [text for text in texts if len(text) > min_length]

## Create vocabulary

In [7]:
# Corpus directories whose *.txt files are used to build the vocabulary.
directories = [
    '../../data/raw_texts/ebooks17k',
]
# Keep at most the 150,000 top-ranked tokens (most frequent first,
# ties broken by the token string).
vocabulary = create_vocabulary(directories, max_tokens=150_000)

In [10]:
# Store the vocabulary on disk so it can be reloaded without recounting the corpus.
torch.save(vocabulary, '../../models/vocabulary.pth')

In [15]:
# Reload the stored vocabulary.
# NOTE(review): torch.load unpickles the file, so only load files you trust.
vocabulary = torch.load('../../models/vocabulary.pth')

## Convert texts to numbers

In [None]:
# Convert the ebooks17k corpus: one list of vocabulary indices per text file.
directories = [
    '../../data/raw_texts/ebooks17k',
]
numbers = get_numbers_from_directories(directories, vocabulary)

In [15]:
# Persist the per-file index lists of the ebooks17k corpus.
# `numbers` is rebound by a later cell, so it must hold the ebooks17k result here.
with open('../../data/binary_texts/ebooks17k.pickle', 'wb') as f:
    pickle.dump(numbers, f)

### Fairy tales

In [16]:
# Fairy-tale source directories; all of them are converted into one combined list.
# Rebinds `directories` and `numbers` from the earlier ebooks17k cells.
directories = [
    '../../data/raw_texts/bajki-zasypianki',
    '../../data/raw_texts/bajkokraj',
    '../../data/raw_texts/bajkownia',
    '../../data/raw_texts/basnie',
    '../../data/raw_texts/miastodzieci',
    '../../data/raw_texts/misc_fairytales',
    '../../data/raw_texts/wolne_lektury_bajki',
    '../../data/raw_texts/wolne_lektury_ksiazki',
]
numbers = get_numbers_from_directories(directories, vocabulary)

100%|██████████| 3492/3492 [00:03<00:00, 1126.97it/s]


In [18]:
# Persist the per-file index lists of the fairy-tale corpus.
# `numbers` must hold the fairy-tale result from the preceding cell.
with open('../../data/binary_texts/fairytales.pickle', 'wb') as f:
    pickle.dump(numbers, f)

: 