# Create a Smaller Training Set

The whole language corpus is 5.1 GB unzipped. Developing the model and debugging it would take too long. We want quick iteration.

Hence, I will take a sample of the training set for quick development. 

In [1]:
import os
import random
import shutil
import time

from pathlib import Path

start = time.time()
random.seed(1)

In [2]:
# Settings

PATH = Path('data/train')  # Origin Directory
SAMP_PATH = Path('data/train_sampl_1pct')  # Destination for the subsample

FRAC= 0.01  # Fraction to be sampled

In [3]:
# Get list of languages
LANGS = list(map(lambda x: x.name, list(PATH.iterdir())))
print(LANGS)
print(f'Number of languages: {str(len(LANGS))}')

['fi', 'et', 'it', 'lt', 'pt', 'lv', 'nl', 'pl', 'bg', 'en', 'sk', 'fr', 'da', 'hu', 'cs', 'sl', 'es', 'el', 'ro', 'de', 'sv']
Number of languages: 21


In [4]:
def createDirs(parent, children):
    '''Create subdirectories from parent. children is a list of subdirectories to be created.'''
    for child in children:
        p = parent / child
        p.mkdir(parents=True, exist_ok=True)

In [5]:
createDirs(SAMP_PATH, LANGS)

In [6]:
def moveList(fn_list, newdir):
    '''Move a list of files to a new directory. Returns sizes of moved files (in MB)'''
    sz = 0
    for fn in fn_list:
        shutil.copy(str(fn), str(newdir/fn.name))
        sz += os.path.getsize(fn)
    print(f'Moved {str(sz/1e6)} Mb to {str(newdir)}')
    return sz / 1e6

In [7]:
sizes = {}  # Storing the destination dir sizes (MB)
num_fns = {}  # Storing the number of text files in destination
for lang in LANGS:
    src_dir = PATH/lang
    fns = list(src_dir.glob('*.txt'))
    sub = random.sample(fns, int(len(fns)*FRAC))
    num_fns[lang] = len(sub)
    sizes[lang] = moveList(sub, SAMP_PATH/lang)

Moved 3.444598 Mb to data/train_sampl_1pct/fi
Moved 1.096267 Mb to data/train_sampl_1pct/et
Moved 2.859454 Mb to data/train_sampl_1pct/it
Moved 0.656748 Mb to data/train_sampl_1pct/lt
Moved 3.177443 Mb to data/train_sampl_1pct/pt
Moved 0.626933 Mb to data/train_sampl_1pct/lv
Moved 2.959135 Mb to data/train_sampl_1pct/nl
Moved 0.934612 Mb to data/train_sampl_1pct/pl
Moved 1.19858 Mb to data/train_sampl_1pct/bg
Moved 5.050996 Mb to data/train_sampl_1pct/en
Moved 0.894408 Mb to data/train_sampl_1pct/sk
Moved 3.606695 Mb to data/train_sampl_1pct/fr
Moved 4.568994 Mb to data/train_sampl_1pct/da
Moved 1.213034 Mb to data/train_sampl_1pct/hu
Moved 0.9218 Mb to data/train_sampl_1pct/cs
Moved 0.842085 Mb to data/train_sampl_1pct/sl
Moved 3.321063 Mb to data/train_sampl_1pct/es
Moved 6.37172 Mb to data/train_sampl_1pct/el
Moved 0.713616 Mb to data/train_sampl_1pct/ro
Moved 3.153675 Mb to data/train_sampl_1pct/de
Moved 3.474459 Mb to data/train_sampl_1pct/sv


In [8]:
def sorted_dic(x): return sorted(x.items(), key=lambda kv: kv[1])

print(sorted_dic(num_fns))
print('\n')
print(sorted_dic(sizes))

[('bg', 65), ('ro', 65), ('lv', 87), ('hu', 87), ('sl', 87), ('et', 88), ('lt', 88), ('pl', 88), ('sk', 88), ('cs', 88), ('el', 92), ('de', 92), ('fi', 93), ('da', 93), ('it', 94), ('pt', 94), ('nl', 94), ('fr', 94), ('es', 94), ('sv', 94), ('en', 96)]


[('lv', 0.626933), ('lt', 0.656748), ('ro', 0.713616), ('sl', 0.842085), ('sk', 0.894408), ('cs', 0.9218), ('pl', 0.934612), ('et', 1.096267), ('bg', 1.19858), ('hu', 1.213034), ('it', 2.859454), ('nl', 2.959135), ('de', 3.153675), ('pt', 3.177443), ('es', 3.321063), ('fi', 3.444598), ('sv', 3.474459), ('fr', 3.606695), ('da', 4.568994), ('en', 5.050996), ('el', 6.37172)]


Looks like the number of files is quite similar for all languages (range: 65 - 96).

File sizes differ by a factor of 10.

In [9]:
end = time.time()
print(f'Total time : {(end - start)/60} mins')

Total time : 0.012813985347747803 mins
