In [1]:
import re
import glob
import multiprocessing
import itertools
import string
import pickle
from pathlib import Path

from tqdm import tqdm

In [2]:
def read_file(path, encoding):
    """Read a text file and return its content as one lowercase line.

    Every line is stripped of surrounding whitespace, lowercased and the
    lines are joined with single spaces.

    Args:
        path: Path of the file to read.
        encoding: Name of the text encoding used to decode the file.

    Returns:
        The normalised file content, or an empty string when the file cannot
        be opened, cannot be decoded with ``encoding``, or ``encoding`` is not
        a known codec (best-effort: unreadable files are skipped, not fatal).
    """
    try:
        with open(path, encoding=encoding) as f:
            # Iterating the handle yields the same lines as readlines()
            # without materialising an intermediate list.
            return ' '.join(line.strip().lower() for line in f)
    except (OSError, UnicodeError, LookupError):
        # OSError: missing/unreadable file; UnicodeError: undecodable bytes;
        # LookupError: unknown encoding name. Anything else (including
        # KeyboardInterrupt) is deliberately allowed to propagate.
        return ''

In [3]:
def read_corpus_file(path, encoding='cp1250', skip_sentences=0):
    """Split one text file into tokenised sentences.

    The text returned by ``read_file`` is filtered so that only lowercase
    ASCII letters, the Polish diacritics listed in the pattern, the marks
    ``. , ! ?`` and spaces remain. Each punctuation mark is then isolated as
    its own token and the token stream is cut after every ``.``, ``!`` or
    ``?``.

    Args:
        path: Path of the text file.
        encoding: Encoding passed to ``read_file``.
        skip_sentences: Number of leading kept sentences to drop from the
            result.

    Returns:
        A list of sentences, each a list of string tokens ending with its
        terminating mark. Sentences with fewer than 4 tokens (terminator
        included) are discarded, as are trailing words that are not followed
        by a terminator.
    """
    text = read_file(path, encoding)
    text = re.sub(r'[^a-ząćęłńóśźż.,!? ]', '', text)
    # Surround every punctuation mark with spaces so split() yields it as a
    # separate token.
    for mark in '.!?,':
        text = text.replace(mark, f' {mark} ')

    terminators = {'?', '.', '!'}
    sentences = []
    sentence = []
    # Single forward pass; the previous `words.pop(0)` loop shifted the whole
    # list on every token and was quadratic in the file length.
    for word in text.split():
        sentence.append(word)
        if word in terminators:
            if len(sentence) >= 4:
                sentences.append(sentence)
            sentence = []

    return sentences[skip_sentences:]

In [4]:
def read_corpus(path, encoding='utf-8', skip_sentences=0):
    """Parse every ``*.txt`` file below ``path`` into tokenised sentences.

    Files are discovered recursively and processed in parallel with one
    worker process per CPU; each file is handled by ``read_corpus_file``.

    Args:
        path: Root directory that is searched recursively for ``*.txt`` files.
        encoding: Text encoding forwarded to ``read_corpus_file`` for every
            file.
        skip_sentences: Number of leading sentences dropped from every file.

    Returns:
        A list with one entry per file (in sorted path order); each entry is
        the list of sentences returned by ``read_corpus_file``.
    """
    # glob yields paths in arbitrary, filesystem-dependent order; sorting
    # makes the order of the resulting corpus reproducible between runs.
    paths = sorted(glob.glob(f'{path}/**/*.txt', recursive=True))
    tasks = [(file_path, encoding, skip_sentences) for file_path in paths]

    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        # tqdm wraps the task list handed to starmap, so the bar advances as
        # tasks are pulled from it by the pool.
        corpus = pool.starmap(read_corpus_file, tqdm(tasks, total=len(tasks)))

    return corpus

In [5]:
# Parse part 1 of the ebooks17k collection (cp1250-encoded), dropping the
# first 10 sentences of every file, then cache the parsed corpus as a pickle.
corpus = read_corpus(
    path='../../data/raw_texts/ebooks17k/1',
    encoding='cp1250',
    skip_sentences=10,
)

with open('../../data/binary_texts/punctuation/ebooks17k_1.pickle', mode='wb') as f:
    pickle.dump(corpus, f)

100%|██████████| 4882/4882 [02:43<00:00, 29.78it/s]


In [8]:
# Parse part 2 of the ebooks17k collection (cp1250-encoded), dropping the
# first 10 sentences of every file, then cache the parsed corpus as a pickle.
corpus = read_corpus(
    path='../../data/raw_texts/ebooks17k/2',
    encoding='cp1250',
    skip_sentences=10,
)

with open('../../data/binary_texts/punctuation/ebooks17k_2.pickle', mode='wb') as f:
    pickle.dump(corpus, f)

100%|██████████| 4983/4983 [02:48<00:00, 29.52it/s]


In [10]:
# Parse part 3 of the ebooks17k collection (cp1250-encoded), dropping the
# first 10 sentences of every file, then cache the parsed corpus as a pickle.
corpus = read_corpus(
    path='../../data/raw_texts/ebooks17k/3',
    encoding='cp1250',
    skip_sentences=10,
)

with open('../../data/binary_texts/punctuation/ebooks17k_3.pickle', mode='wb') as f:
    pickle.dump(corpus, f)

100%|██████████| 4912/4912 [02:29<00:00, 32.94it/s]


In [12]:
# Parse part 4 of the ebooks17k collection (cp1250-encoded), dropping the
# first 10 sentences of every file, then cache the parsed corpus as a pickle.
corpus = read_corpus(
    path='../../data/raw_texts/ebooks17k/4',
    encoding='cp1250',
    skip_sentences=10,
)

with open('../../data/binary_texts/punctuation/ebooks17k_4.pickle', mode='wb') as f:
    pickle.dump(corpus, f)

100%|██████████| 1997/1997 [00:33<00:00, 59.76it/s] 


In [14]:
# Parse the fairy-tales collection (utf-8 encoded), dropping the first
# 3 sentences of every file, then cache the parsed corpus as a pickle.
corpus = read_corpus(
    path='../../data/raw_texts/fairy_tales',
    encoding='utf-8',
    skip_sentences=3,
)

with open('../../data/binary_texts/punctuation/fairy_tales.pickle', mode='wb') as f:
    pickle.dump(corpus, f)

100%|██████████| 43/43 [00:00<00:00, 56325.76it/s]


## Removing punctuation

In [17]:
def remove_punctuation(read_path, save_path):
    """Strip punctuation tokens from a pickled corpus and save the result.

    The corpus is loaded from ``read_path``, every token equal to ``.``,
    ``!``, ``?`` or ``,`` is removed from every sentence, and the cleaned
    corpus is pickled to ``save_path``. The input file is left untouched.

    Args:
        read_path: Pickle file holding the corpus: a list of texts, each a
            list of sentences, each a list of string tokens.
        save_path: Destination pickle file (overwritten if it exists).

    Returns:
        None.
    """
    punctuation = frozenset('.!?,')

    # NOTE: pickle.load can execute arbitrary code; only use this on pickle
    # files you produced yourself.
    with open(read_path, 'rb') as f:
        corpus = pickle.load(f)

    for text in tqdm(corpus):
        # Replace the sentences in place so the outer corpus structure (one
        # entry per text, one entry per sentence) is preserved.
        text[:] = [
            [word for word in sentence if word not in punctuation]
            for sentence in text
        ]

    with open(save_path, 'wb') as f:
        # pickle.dump returns None, so its result is intentionally not bound.
        pickle.dump(corpus, f)

In [18]:
# Input directory (corpora with punctuation tokens), output directory
# (corpora without them) and the pickle files to convert.
read_base = Path('../../data/binary_texts/punctuation')
write_base = Path('../../data/binary_texts/no_punctuation')
filenames = [
    'ebooks17k_1.pickle',
    'ebooks17k_2.pickle',
    'ebooks17k_3.pickle',
    'ebooks17k_4.pickle',
    'fairy_tales.pickle',
]

In [19]:
# Convert every pickled corpus, writing the result under the same file name
# in the output directory.
for filename in filenames:
    read_path = read_base.joinpath(filename)
    write_path = write_base.joinpath(filename)
    remove_punctuation(read_path=read_path, save_path=write_path)

100%|██████████| 4882/4882 [00:17<00:00, 275.32it/s]
100%|██████████| 4983/4983 [00:16<00:00, 297.03it/s]
100%|██████████| 4912/4912 [00:15<00:00, 308.25it/s]
100%|██████████| 1997/1997 [00:07<00:00, 276.04it/s]
100%|██████████| 43/43 [00:00<00:00, 1539.81it/s]


## Saving in LineSentence format

In [20]:
def append_to_file(corpus_path, file_path, encoding='utf-8'):
    """Append a pickled corpus to a text file, one sentence per line.

    Each sentence's tokens are joined with single spaces and terminated with
    a newline. The output file is opened in append mode, so calling this
    again with the same arguments (e.g. re-running the cell) adds the same
    sentences a second time; delete the output file first for a clean rebuild.

    Args:
        corpus_path: Pickle file holding the corpus: a list of texts, each a
            list of sentences, each a list of string tokens.
        file_path: Text file to append to (created if missing).
        encoding: Encoding used for the text file. Set explicitly so the
            output does not depend on the platform's default locale encoding.

    Returns:
        None.
    """
    # NOTE: pickle.load can execute arbitrary code; only use this on pickle
    # files you produced yourself.
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    with open(file_path, 'a', encoding=encoding) as f:
        for text in tqdm(corpus):
            f.writelines(' '.join(sentence) + '\n' for sentence in text)

In [21]:
# Directory with the punctuation-free pickles, the pickle files to export,
# and the single text file that all of their sentences are appended to.
read_base = Path('../../data/binary_texts/no_punctuation')
filenames = [
    'ebooks17k_1.pickle',
    'ebooks17k_2.pickle',
    'ebooks17k_3.pickle',
    'ebooks17k_4.pickle',
    'fairy_tales.pickle',
]
text_file = '../../data/line_sentence/no_punctuation.txt'

In [22]:
# Export the corpora one after another into the shared text file; the order
# of `filenames` is the order of the sentences in the output.
for filename in filenames:
    read_path = read_base.joinpath(filename)
    append_to_file(corpus_path=read_path, file_path=text_file)

100%|██████████| 4882/4882 [00:14<00:00, 339.16it/s]
100%|██████████| 4983/4983 [00:13<00:00, 374.79it/s]
100%|██████████| 4912/4912 [00:13<00:00, 363.15it/s]
100%|██████████| 1997/1997 [00:05<00:00, 355.99it/s]
100%|██████████| 43/43 [00:00<00:00, 1847.16it/s]
