In [1]:
import glob
import multiprocessing
import itertools
import string
import pickle
from pathlib import Path

from tqdm import tqdm

In [2]:
def read_file(path, encoding):
    try:
        with open(path, encoding=encoding) as f:
            lines = f.readlines()
            lines = [line.strip().lower() for line in lines]
            return ' '.join(lines)
    except:
        return ''

In [3]:
illegal_characters = (set(string.punctuation) | set(string.digits) | set('–')) - set('.!?,')
illegal_characters = ''.join(illegal_characters)
illegal_characters

'0_*(8–4\\"2${^[]:+1<@5%9&;-~)/6=>`\'7}|3#'

In [4]:
def read_corpus_file(path, encoding='cp1250', skip_sentences=0):
    text = read_file(path, encoding)
    text = text.translate(str.maketrans('', '', illegal_characters))
    text = text.replace('.', ' . ').replace('!', ' ! ').replace('?', ' ? ').replace(',', ' , ')
    words = text.split()
    
    sentences = []
    sentence = []
    while words:
        word = words.pop(0)
        sentence.append(word)
        if word in ['?', '.', '!']:
            if len(sentence) >= 4:
                sentences.append(sentence)
            sentence = []
            
    return sentences[skip_sentences:]

In [5]:
def read_corpus(path, encoding='utf-8', skip_sentences=0):
    corpus = []
    
    paths = list(glob.glob(f'../../data/{path}/**/*.txt', recursive=True))
    tasks = list(zip(
        paths,
        itertools.repeat(encoding),
        itertools.repeat(skip_sentences),
    ))
    
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        corpus = pool.starmap(read_corpus_file, tqdm(tasks, total=len(tasks)))
        
    return corpus

In [7]:
corpus = read_corpus('ebooks17k/1', encoding='cp1250', skip_sentences=3)

with open('../../data/ebooks17k_1.pickle', 'wb') as f:
    pickle.dump(corpus, f)

100%|██████████| 4882/4882 [02:44<00:00, 29.67it/s]


In [6]:
corpus = read_corpus('ebooks17k/2', encoding='cp1250', skip_sentences=3)

with open('../../data/ebooks17k_2.pickle', 'wb') as f:
    pickle.dump(corpus, f)

100%|██████████| 4983/4983 [02:32<00:00, 32.57it/s]


In [6]:
corpus = read_corpus('ebooks17k/3', encoding='cp1250', skip_sentences=3)

with open('../../data/ebooks17k_3.pickle', 'wb') as f:
    pickle.dump(corpus, f)

100%|██████████| 4912/4912 [02:30<00:00, 32.68it/s]


In [8]:
corpus = read_corpus('ebooks17k/4', encoding='cp1250', skip_sentences=3)

with open('../../data/ebooks17k_4.pickle', 'wb') as f:
    pickle.dump(corpus, f)

100%|██████████| 1997/1997 [00:17<00:00, 116.10it/s]


In [31]:
corpus = read_corpus('fairy_tales', encoding='utf-8', skip_sentences=3)

with open('../../data/fairy_tales.pickle', 'wb') as f:
    pickle.dump(corpus, f)

100%|██████████| 43/43 [00:00<00:00, 65227.87it/s]


## Removing punctuation

In [2]:
def remove_punctuation(read_path, save_path):
    punctuation = list('.!?,')
    
    with open(read_path, 'rb') as f:
        corpus = pickle.load(f)
    
    for text in tqdm(corpus):
        for sentence_no in range(len(text)):
            text[sentence_no] = [word for word in text[sentence_no] if word not in punctuation]
            
    with open(save_path, 'wb') as f:
        corpus = pickle.dump(corpus, f)

In [3]:
read_base = Path('../../data/binary_texts/punctuation')
write_base = Path('../../data/binary_texts/no_punctuation')
filenames = ['ebooks17k_1.pickle', 'ebooks17k_2.pickle', 'ebooks17k_3.pickle', 'ebooks17k_4.pickle', 'fairy_tales.pickle']

In [4]:
for filename in filenames:
    read_path = read_base / filename
    write_path = write_base / filename
    remove_punctuation(read_path, write_path)

100%|██████████| 4882/4882 [00:16<00:00, 298.99it/s]
100%|██████████| 4983/4983 [00:19<00:00, 258.41it/s]
100%|██████████| 4912/4912 [00:14<00:00, 331.40it/s]
100%|██████████| 1997/1997 [00:06<00:00, 304.78it/s]
100%|██████████| 43/43 [00:00<00:00, 1693.24it/s]


## Saving in LineSentence format

In [4]:
def append_to_file(corpus_path, file_path):
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)
        
    with open(file_path, 'a') as f:
        for text in tqdm(corpus):
            for sentence in text:
                sentence = ' '.join(sentence) + '\n'
                f.write(sentence)

In [5]:
read_base = Path('../../data/binary_texts/no_punctuation')
filenames = ['ebooks17k_1.pickle', 'ebooks17k_2.pickle', 'ebooks17k_3.pickle', 'ebooks17k_4.pickle', 'fairy_tales.pickle']
text_file = '../../data/line_sentence/no_punctuation.txt'

In [6]:
for filename in filenames:
    read_path = read_base / filename
    append_to_file(read_path, text_file)