# Create a Smaller Training Set

The whole language corpus is 5.1 GB unzipped. Developing and debugging the model on the full corpus would take too long, and we want quick iteration.

Hence, I will take a sample of the training set for quick development. 

In [1]:
import os
import random
import shutil
import time

from pathlib import Path

# Seed the stdlib RNG used by the random sampling further down.
random.seed(1)
# Wall-clock reference for the total-runtime report at the end of the notebook.
start = time.time()

In [2]:
# Configuration

# Source directory; its entries are read as the language names.
PATH = Path('data/train')
# Target directory that receives the subsample.
SAMP_PATH = Path('data/train_sampl')

# Share of each language's files to draw (0.05 = 5 %).
FRAC = 0.05

In [3]:
# Collect the language codes: every sub-directory of PATH is one language.
# Plain files (e.g. '.DS_Store' or a README) are skipped so they are not
# mistaken for languages and mirrored as empty directories in the sample.
LANGS = [entry.name for entry in PATH.iterdir() if entry.is_dir()]
print(LANGS)
print(f'Number of languages: {len(LANGS)}')

['fi', 'et', 'it', 'lt', 'pt', 'lv', 'nl', 'pl', 'bg', 'en', 'sk', 'fr', 'da', 'hu', 'cs', 'sl', 'es', 'el', 'ro', 'de', 'sv']
Number of languages: 21


In [4]:
def createDirs(parent, children):
    '''Ensure that parent/<child> exists for every entry of children.

    parent is a pathlib.Path; children is an iterable of sub-directory names.
    Missing intermediate directories are created, and directories that already
    exist are left alone.
    '''
    targets = (parent / name for name in children)
    for target in targets:
        target.mkdir(parents=True, exist_ok=True)

In [5]:
# Mirror the per-language directory layout under the sample directory.
createDirs(parent=SAMP_PATH, children=LANGS)

In [6]:
def moveList(fn_list, newdir):
    '''Copy a list of files into a directory and report how much data was copied.

    Despite its name, this function copies (shutil.copy): the source files
    are left in place.

    fn_list: iterable of file paths (pathlib.Path or str).
    newdir: destination directory (pathlib.Path or str). It is not created
        here, so it has to exist already.

    Prints a one-line summary and returns the total size of the source files
    in MB (1 MB = 1e6 bytes).
    '''
    dest_dir = Path(newdir)
    total_bytes = 0
    for fn in fn_list:
        src = Path(fn)  # accept plain strings as well as Path objects
        shutil.copy(str(src), str(dest_dir / src.name))
        total_bytes += os.path.getsize(src)
    total_mb = total_bytes / 1e6
    print(f'Moved {total_mb} Mb to {dest_dir}')
    return total_mb

In [7]:
sizes = {}  # language -> total size of the sampled files (MB)
num_fns = {}  # language -> number of sampled text files
for lang in LANGS:
    src_dir = PATH/lang
    # glob() yields files in an OS-dependent order. Sorting gives random.sample
    # a deterministic input, so a fixed seed picks the same files on every
    # machine (provided LANGS is iterated in the same order).
    fns = sorted(src_dir.glob('*.txt'))
    # int() truncates: a language with fewer than 1/FRAC files contributes nothing.
    sub = random.sample(fns, int(len(fns)*FRAC))
    num_fns[lang] = len(sub)
    sizes[lang] = moveList(sub, SAMP_PATH/lang)

Moved 16.632654 Mb to data/train_sampl/fi
Moved 5.086472 Mb to data/train_sampl/et
Moved 20.057877 Mb to data/train_sampl/it
Moved 5.014759 Mb to data/train_sampl/lt
Moved 18.737812 Mb to data/train_sampl/pt
Moved 4.660141 Mb to data/train_sampl/lv
Moved 18.311466 Mb to data/train_sampl/nl
Moved 7.000829 Mb to data/train_sampl/pl
Moved 7.50399 Mb to data/train_sampl/bg
Moved 18.661888 Mb to data/train_sampl/en
Moved 5.761011 Mb to data/train_sampl/sk
Moved 16.878143 Mb to data/train_sampl/fr
Moved 17.539768 Mb to data/train_sampl/da
Moved 5.038574 Mb to data/train_sampl/hu
Moved 4.843666 Mb to data/train_sampl/cs
Moved 4.857163 Mb to data/train_sampl/sl
Moved 22.853735 Mb to data/train_sampl/es
Moved 29.460235 Mb to data/train_sampl/el
Moved 3.504754 Mb to data/train_sampl/ro
Moved 21.435025 Mb to data/train_sampl/de
Moved 20.786777 Mb to data/train_sampl/sv


In [8]:
def sorted_dic(x):
    '''Return the (key, value) pairs of dict x as a list, ordered by ascending value.'''
    pairs = list(x.items())
    pairs.sort(key=lambda pair: pair[1])
    return pairs

# Show file counts and sizes per language, each ordered from smallest to largest.
# The '\n' item between the two lists, joined with sep='\n', reproduces the
# blank lines that separate the two reports.
ranked_counts = sorted_dic(num_fns)
ranked_sizes = sorted_dic(sizes)
print(ranked_counts, '\n', ranked_sizes, sep='\n')

[('ro', 328), ('bg', 329), ('sl', 437), ('hu', 438), ('lv', 439), ('et', 440), ('lt', 440), ('sk', 440), ('pl', 441), ('cs', 442), ('de', 461), ('el', 463), ('fi', 466), ('da', 468), ('sv', 470), ('pt', 471), ('nl', 471), ('es', 471), ('fr', 472), ('it', 474), ('en', 483)]


[('ro', 3.504754), ('lv', 4.660141), ('cs', 4.843666), ('sl', 4.857163), ('lt', 5.014759), ('hu', 5.038574), ('et', 5.086472), ('sk', 5.761011), ('pl', 7.000829), ('bg', 7.50399), ('fi', 16.632654), ('fr', 16.878143), ('da', 17.539768), ('nl', 18.311466), ('en', 18.661888), ('pt', 18.737812), ('it', 20.057877), ('sv', 20.786777), ('de', 21.435025), ('es', 22.853735), ('el', 29.460235)]


Looks like the number of sampled files is quite similar for all languages (range: 328 - 483).

Sampled sizes differ by a factor of about 8 (3.5 MB for `ro` vs. 29.5 MB for `el`).

In [9]:
# Report the wall-clock time elapsed since `start` was taken, in minutes.
end = time.time()
elapsed_mins = (end - start) / 60
print(f'Total time : {elapsed_mins} mins')

Total time : 0.0238874355951945 mins
