# Create a Smaller Training Set

The whole language corpus is 5.1 GB unzipped. Developing the model and debugging it would take too long. We want quick iteration.

Hence, I will take a sample of the training set. 

In [1]:
import os
import pathlib
import random
import shutil

random.seed(1)

In [2]:
# Settings

PATH = pathlib.Path('txt')  # Origin Directory
SAMP_PATH = pathlib.Path('train_sampl')  # Destination for the subsample

FRAC= 0.01  # Fraction to be sampled

In [3]:
# Get list of languages
LANGS = list(map(lambda x: x.name, list(PATH.iterdir())))
print(LANGS)
print(f'Number of languages: {str(len(LANGS))}')

['nl', 'lv', 'es', 'el', 'cs', 'sl', 'it', 'fr', 'pt', 'fi', 'lt', 'da', 'et', 'de', 'ro', 'bg', 'pl', 'sv', 'en', 'hu', 'sk']
Number of languages: 21


In [4]:
def createDirs(parent, children):
    '''Create a set of subdirectories from parent.'''
    for child in children:
        p = parent / child
        p.mkdir(parents=True, exist_ok=True)

In [5]:
createDirs(SAMP_PATH, LANGS)

In [6]:
def moveList(fn_list, newdir):
    '''Move a list of files to a new directory.'''
    sz = 0
    for fn in fn_list:
        shutil.copy(str(fn), str(newdir/fn.name))
        sz += os.path.getsize(fn)
    print(f'Moved {str(sz/1e6)} Mb to {str(newdir)}')
    return sz / 1e6

In [7]:
sizes = {}  # Storing the destination dir sizes (MB)
num_fns = {}  # Storing the number of text files in destination
for lang in LANGS:
    src_dir = PATH/lang
    fns = list(src_dir.glob('*.txt'))
    sub = random.sample(fns, int(len(fns)*FRAC))
    num_fns[lang] = len(sub)
    sizes[lang] = moveList(sub, SAMP_PATH/lang)

Moved 4.440373 Mb to train_sampl/nl
Moved 1.135789 Mb to train_sampl/lv
Moved 2.978393 Mb to train_sampl/es
Moved 4.223865 Mb to train_sampl/el
Moved 0.709064 Mb to train_sampl/cs
Moved 1.022458 Mb to train_sampl/sl
Moved 3.534307 Mb to train_sampl/it
Moved 4.597801 Mb to train_sampl/fr
Moved 2.771892 Mb to train_sampl/pt
Moved 3.596482 Mb to train_sampl/fi
Moved 1.003252 Mb to train_sampl/lt
Moved 4.09755 Mb to train_sampl/da
Moved 0.667176 Mb to train_sampl/et
Moved 3.59453 Mb to train_sampl/de
Moved 0.269565 Mb to train_sampl/ro
Moved 0.467658 Mb to train_sampl/bg
Moved 0.605841 Mb to train_sampl/pl
Moved 2.413989 Mb to train_sampl/sv
Moved 4.903372 Mb to train_sampl/en
Moved 1.227479 Mb to train_sampl/hu
Moved 1.032164 Mb to train_sampl/sk


In [8]:
def sorted_dic(x): return sorted(x.items(), key=lambda kv: kv[1])

In [9]:
print(sorted_dic(num_fns))

[('ro', 65), ('bg', 65), ('lv', 87), ('sl', 87), ('hu', 87), ('cs', 88), ('lt', 88), ('et', 88), ('pl', 88), ('sk', 88), ('el', 92), ('de', 92), ('fi', 93), ('da', 93), ('nl', 94), ('es', 94), ('it', 94), ('fr', 94), ('pt', 94), ('sv', 94), ('en', 96)]


In [10]:
print(sorted_dic(sizes))

[('ro', 0.269565), ('bg', 0.467658), ('pl', 0.605841), ('et', 0.667176), ('cs', 0.709064), ('lt', 1.003252), ('sl', 1.022458), ('sk', 1.032164), ('lv', 1.135789), ('hu', 1.227479), ('sv', 2.413989), ('pt', 2.771892), ('es', 2.978393), ('it', 3.534307), ('de', 3.59453), ('fi', 3.596482), ('da', 4.09755), ('el', 4.223865), ('nl', 4.440373), ('fr', 4.597801), ('en', 4.903372)]


Looks like the number of files is quite similar for all languages (range: 65 - 96).

File sizes differ by a factor of 10. Romanian has the smallest sampled corpus with 0.26 Mb.