## Download data

In [1]:
!pwd

/home/krivas/projects/transfer_learning_nmt/nb


In [2]:
!find ../ -name _about* -exec rm -rf {} \;

In [3]:
!find ../ -name *ipynb_checkpoints* -exec rm -rf {} \;

find: ‘../nb/.ipynb_checkpoints’: No such file or directory


In [16]:
import os
import pyphen
import re
from pathlib import Path
from sklearn.model_selection import train_test_split
import unicodedata
import pandas as pd
import sentencepiece as spm
#os.chdir('../')

In [17]:
def save_f(path, arr):
    cont = 0
    with open(path, 'w') as f:
        for l in arr:
            if len(l):
                print(l, file=f)


In [18]:
def read_multi_txt(path_1, path_2, use_max_sent=True, max_sent=5000):
    lines = []
    cont = 0
    with open(path_1, 'r') as f1, open(path_2, 'r') as f2:
        for line_1, line_2 in zip(f1.read().split('\n'), f2.read().split('\n')):
            if len(line_1) and len(line_2) and cont < max_sent:
                lines.append(line_1 + '\t' + line_2)
                if use_max_sent:
                    cont += 1    
    return lines

In [26]:
def save_word_transfer(word_dir, train_in, dev_in, test_in, train_out, dev_out, test_out, use_max_sent):
    os.makedirs(word_dir, exist_ok=True)
    
    train = read_multi_txt(train_in, train_out, use_max_sent, 5000)
    valid = read_multi_txt(dev_in, dev_out, use_max_sent, 500)
    test = read_multi_txt(test_in, test_out, use_max_sent, 500)
    
    save_f(word_dir / 'train.tsv', train)
    save_f(word_dir / f'train.{lang_in}', [line.split('\t')[0].strip() for line in train if len(line)])
    save_f(word_dir / f'train.{lang_out}', [line.split('\t')[1].strip() for line in train if len(line)])
    
    save_f(word_dir / 'test.tsv', test)
    save_f(word_dir / f'test.{lang_in}', [line.split('\t')[0].strip() for line in test if len(line)])
    save_f(word_dir / f'test.{lang_out}', [line.split('\t')[1].strip() for line in test if len(line)])

    save_f(word_dir / 'valid.tsv', valid)
    save_f(word_dir / f'valid.{lang_in}', [line.split('\t')[0].strip() for line in valid if len(line)])
    save_f(word_dir / f'valid.{lang_out}', [line.split('\t')[1].strip() for line in valid if len(line)])
    

In [27]:
def save_word_translation(file_path, word_dir, lang_in, lang_out):
    os.makedirs(word_dir, exist_ok=True)
    p_lines = []
    with open(file_path) as f:
        for line in f.read().split('\n'):
            line = re.sub(r'([.¡!¿?;,:])', r' \1 ', line)
            line = ' '.join([w for w in line.split(' ')])
            line = line.replace('  ', ' ')
            line = line.strip(' ')
            p_lines.append(line)
    
    train, temp = train_test_split(p_lines, test_size=0.1, random_state=0)
    valid, test = train_test_split(temp, test_size=0.5, random_state=0)
    #print([line.split('\t') for line in train][0])
    #print([line.split('\t')[0] for line in train][:10])
    save_f(word_dir / 'train.tsv', train)
    save_f(word_dir / f'train.{lang_in}', [line.split('\t')[1].strip() for line in train if len(line)])
    save_f(word_dir / f'train.{lang_out}', [line.split('\t')[0].strip() for line in train if len(line)])
    
    save_f(word_dir / 'test.tsv', test)
    save_f(word_dir / f'test.{lang_in}', [line.split('\t')[1].strip() for line in test if len(line)])
    save_f(word_dir / f'test.{lang_out}', [line.split('\t')[0].strip() for line in test if len(line)])

    save_f(word_dir / 'valid.tsv', valid)
    save_f(word_dir / f'valid.{lang_in}', [line.split('\t')[1].strip() for line in valid if len(line)])
    save_f(word_dir / f'valid.{lang_out}', [line.split('\t')[0].strip() for line in valid if len(line)])
    

In [28]:
def save_char_segments(word_dir, char_dir, lang_in, lang_out):
    os.makedirs(char_dir, exist_ok=True)
    for lang in [lang_out, lang_in]:
        spm.SentencePieceTrainer.Train(f'--input={word_dir}/all.{lang} --model_prefix=m --vocab_size=1000 --character_coverage=1.0 --model_type=char')   

        sp = spm.SentencePieceProcessor()
        sp.Load("m.model")
        for file in [f'train.{lang}', f'valid.{lang}', f'test.{lang}']:
            f_in = open(word_dir / file, 'r')
            f_out = open(char_dir / file, 'w')
            
            for line in f_in.read().split('\n'):
                temp = []
                for word in sp.EncodeAsPieces(line.replace('<unk>', '<unknown>')):
                    if str('\u2581') in word:
                        word = word.replace(str('\u2581'), '@@')
                    temp.append(word)
                f_out.write(" ".join(temp) + "\n")

            f_in.close()
            f_out.close()
            

In [29]:
def save_bpe_segments(word_dir, prepro_dir, lang_in, lang_out, n_opers, dropout=False):
    
    os.system(f'cat {word_dir}/train.{lang_in} {word_dir}/valid.{lang_in} > {word_dir}/all.{lang_in}')  
    os.system(f'cat {word_dir}/train.{lang_out} {word_dir}/valid.{lang_out} > {word_dir}/all.{lang_out}')
    
    for oper in n_opers:
        bpe_dir = prepro_dir / (f'bpe_drop_{oper}' if dropout else f'bpe_{oper}')
        os.makedirs(bpe_dir, exist_ok=True)
        p_lines = []
          
        os.system(f'cat {word_dir}/all.{lang_out} {word_dir}/all.{lang_in} | subword-nmt learn-bpe -s {oper} -o {bpe_dir}/codes.all')

        for lang in [lang_out, lang_in]:
            os.system(f'subword-nmt apply-bpe --dropout {0.1 if dropout else 0} -c {bpe_dir}/codes.all < {word_dir}/all.{lang} | subword-nmt get-vocab > {bpe_dir}/vocab.{lang}')
            
            os.system(f'subword-nmt apply-bpe --dropout {0.1 if dropout else 0} -c {bpe_dir}/codes.all < {word_dir}/test.{lang} > {bpe_dir}/test.bpe.{lang}')
            os.system(f'subword-nmt apply-bpe --dropout {0.1 if dropout else 0} -c {bpe_dir}/codes.all < {word_dir}/train.{lang} > {bpe_dir}/train.bpe.{lang}')
            os.system(f'subword-nmt apply-bpe --dropout {0.1 if dropout else 0} -c {bpe_dir}/codes.all < {word_dir}/valid.{lang} > {bpe_dir}/valid.bpe.{lang}')
            
        print('join corpus')
        for corpus in ['valid', 'test', 'train']:
            l1 = open(f'{bpe_dir}/{corpus}.bpe.{lang_out}', 'r').read().split('\n')
            l2 = open(f'{bpe_dir}/{corpus}.bpe.{lang_in}', 'r').read().split('\n')
            pd.DataFrame(list(zip(l2, l1))).to_csv(f'{bpe_dir}/{corpus}.tsv', header=None, index=False, sep='\t')
            save_f(bpe_dir / f'{corpus}.{lang_out}', l1)
            save_f(bpe_dir / f'{corpus}.{lang_in}', l2)            
            

In [30]:
def save_segmentation(prepro_dir, lang_in, lang_out, n_opers=[5000]):
    save_bpe_segments(prepro_dir / 'word', prepro_dir, lang_in, lang_out, n_opers=n_opers)    
    save_bpe_segments(prepro_dir / 'word', prepro_dir, lang_in, lang_out, n_opers=n_opers, dropout=True)
    save_char_segments(prepro_dir / 'word', prepro_dir / 'char', lang_in, lang_out)

In [31]:
def read_txt(path):
    lines = []
    with open(path, 'r') as f:
        for line in f.read().split('\n'):
            if len(line):
                lines.append(line)
    
    return lines

In [32]:
base_dir = Path('../data')

In [33]:
raw_dir = base_dir / 'transfer' / 'raw'
prepro_dir = base_dir / 'transfer' / 'preprocessed'
for lang in ['es', 'en']:
    lang_dir = raw_dir / f'splits.{lang}'
    pair_langs = read_txt(lang_dir / f'all.train.{lang}-ll.lang-pairs')
    pair_langs = list(set(pair_langs))
    for pair_lang in pair_langs:
        lang_out, lang_in = pair_lang.split(' ')
        train_in = lang_dir / 'train' / f'{lang_in}-{lang_out}.train.{lang_in}'
        dev_in = lang_dir / 'dev' / f'{lang_in}-{lang_out}.dev.{lang_in}'
        test_in = lang_dir / 'test' / f'{lang_in}-{lang_out}.test.{lang_in}'
        
        train_out = lang_dir / 'train' / f'{lang_in}-{lang_out}.train.{lang_out}'
        dev_out = lang_dir / 'dev' / f'{lang_in}-{lang_out}.dev.{lang_out}'
        test_out = lang_dir / 'test' / f'{lang_in}-{lang_out}.test.{lang_out}'
        
        print(train_in)
        if 'shp' in pair_lang:
            use_max_sent = False
        else:
            use_max_sent = True
            
        segment_dir = prepro_dir / f'splits.{lang}' / f'{lang_in}-{lang_out}'
        save_word_transfer(segment_dir / 'word',\
                           train_in, dev_in, test_in,\
                           train_out, dev_out, test_out,\
                           use_max_sent)
        save_segmentation(segment_dir, lang_in, lang_out)


../data/transfer/raw/splits.es/train/es-de.train.es
join corpus
join corpus
../data/transfer/raw/splits.es/train/es-tr.train.es
join corpus
join corpus
../data/transfer/raw/splits.es/train/es-uk.train.es
join corpus
join corpus
../data/transfer/raw/splits.es/train/es-nl.train.es
join corpus
join corpus
../data/transfer/raw/splits.es/train/es-pl.train.es
join corpus
join corpus
../data/transfer/raw/splits.es/train/es-it.train.es
join corpus
join corpus
../data/transfer/raw/splits.es/train/es-ru.train.es
join corpus
join corpus
../data/transfer/raw/splits.es/train/es-fr.train.es
join corpus
join corpus
../data/transfer/raw/splits.es/train/es-en.train.es
join corpus
join corpus
../data/transfer/raw/splits.es/train/es-pt.train.es
join corpus
join corpus
../data/transfer/raw/splits.es/train/es-shp.train.es
join corpus
join corpus
../data/transfer/raw/splits.es/train/es-da.train.es
join corpus
join corpus
../data/transfer/raw/splits.es/train/es-hu.train.es
join corpus
join corpus
../data/tra

In [None]:
raw_dir = base_dir / 'translate' / 'raw'
prepro_dir = base_dir / 'translate' / 'preprocessed'
for dir_temp in os.listdir(raw_dir):
    lang_dir = raw_dir / dir_temp
    if os.path.isdir(lang_dir):
        print(lang_dir)
        #file_path = lang_dir / 'all.txt'#lang_dir / (os.listdir(lang_dir)[0] if 'txt' in os.listdir(lang_dir)[0] else os.listdir(lang_dir)[1])
        lang_in = 'es'
        lang_out = 'shp'
        save_word_translation(file_path, prepro_dir / 'word', lang_in, lang_out)
        save_segmentation(prepro_dir / dir_temp, lang_in, lang_out, list(range(1000, 11000, 1000)))


In [43]:
dir_temp

'Flashcards'

In [41]:
list(range(1000, 11000, 1000))

[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

In [38]:
dir_temp

'Flashcards'

In [33]:
dir_temp

'shp-es'

In [32]:
file_path

PosixPath('../data/raw/shp-es/all.shi')