### This notebook preprocesses all the text datasets - the monolingual corpora and the parallel data - and saves them in a serialized form to be loaded for training

Imports

In [None]:
import os
import sys
import nltk
nltk.download('punkt')
from src.logger import create_logger
from src.data.dictionary import Dictionary

Preprocessing function

In [None]:
def preprocess(voc_path: str, txt_path: str, bin_path: str, glove=False):
    '''
    Index a text file against a vocabulary and log coverage statistics.

    voc_path: path to word vectors. Must be an existing file.
    txt_path: path to corpus or parallel data text file. Must be an existing file.
    bin_path: path to serialized data. If such a file exists in the path, the data is loaded from there, if not,
    a file is created and the data stored there (this is delegated to Dictionary.index_data).
    glove: forwarded unchanged to Dictionary.read_vocab
    (presumably flags GloVe-formatted vectors - TODO confirm against Dictionary).

    Returns None; the statistics are only reported through the logger.

    Raises FileNotFoundError if voc_path or txt_path is not an existing file.
    '''

    # Validate up front with a real exception: `assert` statements are removed
    # under `python -O`, which would let a bad path reach the Dictionary code.
    for label, path in (('voc_path', voc_path), ('txt_path', txt_path)):
        if not os.path.isfile(path):
            raise FileNotFoundError("%s is not an existing file: %r" % (label, path))

    logger = create_logger(None)

    dico = Dictionary.read_vocab(voc_path, glove)
    logger.info("")

    data = Dictionary.index_data(txt_path, bin_path, dico)

    # The word count is taken as len(sentences) - len(positions); presumably
    # `sentences` is a flat index array holding one separator per sentence
    # (TODO confirm against Dictionary.index_data).
    n_sentences = len(data['positions'])
    n_words = len(data['sentences']) - n_sentences
    logger.info("%i words (%i unique) in %i sentences." % (
        n_words,
        len(data['dico']),
        n_sentences
    ))

    unk_words = data['unk_words']
    if len(unk_words) == 0:
        logger.info("0 unknown word.")
        return

    n_unk = sum(unk_words.values())
    logger.info("%i unknown words (%i unique), covering %.2f%% of the data." % (
        n_unk,
        len(unk_words),
        n_unk * 100. / n_words
    ))

    # Only list the individual unknown words when there are few of them.
    if len(unk_words) < 30:
        # Ascending sort then reversal (rather than reverse=True) keeps the
        # original ordering of words with equal counts.
        for w, c in sorted(unk_words.items(), key=lambda x: x[1])[::-1]:
            logger.info("%s: %i" % (w, c))


Specify datasets

In [None]:
# (output name, input text file) pairs for every split of each dataset.
# Materialized as lists so the loop cells below can be re-run: a bare `zip`
# object is exhausted after a single pass.
en = list(zip(
    ['en_train', 'en_valid', 'en_test', 'en_para_valid', 'en_para_test'],
    # Every entry needs its own trailing comma: adjacent string literals are
    # silently concatenated into a single file name.
    ('en_train.txt', 'en_valid.txt', 'en_test.txt',
     'en_para_valid.txt', 'en_para_test.txt'),
))
pd = list(zip(
    ['pd_train', 'pd_valid', 'pd_test', 'pd_para_valid', 'pd_para_test'],
    ('pd_train.txt', 'pd_valid.txt', 'pd_test.txt',
     'pd_para_valid.txt', 'pd_para_test.txt'),
))

Run preprocessing

In [None]:
# Preprocess every `en` split with the shared word-vector file, writing the
# serialized result to '<name>.pt'.
for name, path in en:
    preprocess(
        voc_path='pidg_vect_RCSLS.txt',
        txt_path=path,
        bin_path='{}.pt'.format(name),
    )

In [None]:
# Preprocess every `pd` split with the shared word-vector file, writing the
# serialized result to '<name>.pt'.
for name, path in pd:
    preprocess(
        voc_path='pidg_vect_RCSLS.txt',
        txt_path=path,
        bin_path='{}.pt'.format(name),
    )