# Create a Smaller Training Set

The whole language corpus is 5.1 GB unzipped. Developing the model and debugging it would take too long. We want quick iteration.

Hence, I will take a sample of the training set for quick development. 

In [1]:
import os
import random
import shutil
import time

from pathlib import Path

start = time.time()
random.seed(1)

In [2]:
# Settings

PATH = Path('data/train')  # Origin Directory
SAMP_PATH = Path('data/train_sampl')  # Destination for the subsample

FRAC= 0.02  # Fraction to be sampled

In [3]:
# Get list of languages
LANGS = list(map(lambda x: x.name, list(PATH.iterdir())))
print(LANGS)
print(f'Number of languages: {str(len(LANGS))}')

['fi', 'et', 'it', 'lt', 'pt', 'lv', 'nl', 'pl', 'bg', 'en', 'sk', 'fr', 'da', 'hu', 'cs', 'sl', 'es', 'el', 'ro', 'de', 'sv']
Number of languages: 21


In [4]:
def createDirs(parent, children):
    '''Create subdirectories from parent. children is a list of subdirectories to be created.'''
    for child in children:
        p = parent / child
        p.mkdir(parents=True, exist_ok=True)

In [5]:
createDirs(SAMP_PATH, LANGS)

In [6]:
def moveList(fn_list, newdir):
    '''Move a list of files to a new directory. Returns sizes of moved files (in MB)'''
    sz = 0
    for fn in fn_list:
        shutil.copy(str(fn), str(newdir/fn.name))
        sz += os.path.getsize(fn)
    print(f'Moved {str(sz/1e6)} Mb to {str(newdir)}')
    return sz / 1e6

In [7]:
sizes = {}  # Storing the destination dir sizes (MB)
num_fns = {}  # Storing the number of text files in destination
for lang in LANGS:
    src_dir = PATH/lang
    fns = list(src_dir.glob('*.txt'))
    sub = random.sample(fns, int(len(fns)*FRAC))
    num_fns[lang] = len(sub)
    sizes[lang] = moveList(sub, SAMP_PATH/lang)

Moved 7.286684 Mb to data/train_sampl/fi
Moved 1.887378 Mb to data/train_sampl/et
Moved 7.971494 Mb to data/train_sampl/it
Moved 2.20503 Mb to data/train_sampl/lt
Moved 7.709349 Mb to data/train_sampl/pt
Moved 2.638039 Mb to data/train_sampl/lv
Moved 6.242693 Mb to data/train_sampl/nl
Moved 1.840279 Mb to data/train_sampl/pl
Moved 1.435472 Mb to data/train_sampl/bg
Moved 3.772149 Mb to data/train_sampl/en
Moved 1.390518 Mb to data/train_sampl/sk
Moved 5.557626 Mb to data/train_sampl/fr
Moved 9.095126 Mb to data/train_sampl/da
Moved 2.24294 Mb to data/train_sampl/hu
Moved 3.245639 Mb to data/train_sampl/cs
Moved 1.489168 Mb to data/train_sampl/sl
Moved 6.291547 Mb to data/train_sampl/es
Moved 9.462035 Mb to data/train_sampl/el
Moved 1.159719 Mb to data/train_sampl/ro
Moved 6.53398 Mb to data/train_sampl/de
Moved 5.618578 Mb to data/train_sampl/sv


In [8]:
def sorted_dic(x): return sorted(x.items(), key=lambda kv: kv[1])

print(sorted_dic(num_fns))
print('\n')
print(sorted_dic(sizes))

[('bg', 131), ('ro', 131), ('sl', 174), ('lv', 175), ('hu', 175), ('et', 176), ('lt', 176), ('pl', 176), ('sk', 176), ('cs', 176), ('de', 184), ('el', 185), ('fi', 186), ('da', 187), ('pt', 188), ('nl', 188), ('es', 188), ('sv', 188), ('it', 189), ('fr', 189), ('en', 193)]


[('ro', 1.159719), ('sk', 1.390518), ('bg', 1.435472), ('sl', 1.489168), ('pl', 1.840279), ('et', 1.887378), ('lt', 2.20503), ('hu', 2.24294), ('lv', 2.638039), ('cs', 3.245639), ('en', 3.772149), ('fr', 5.557626), ('sv', 5.618578), ('nl', 6.242693), ('es', 6.291547), ('de', 6.53398), ('fi', 7.286684), ('pt', 7.709349), ('it', 7.971494), ('da', 9.095126), ('el', 9.462035)]


Looks like the number of files is quite similar for all languages (range: 65 - 96).

File sizes differ by a factor of 10.

In [9]:
end = time.time()
print(f'Total time : {(end - start)/60} mins')

Total time : 0.06659005085627238 mins
