## Download data

In [15]:
!pwd

/home/krivas/projects/transfer_learning_nmt/nb


In [18]:
!find ../ -name _about* -exec rm -rf {} \;

In [19]:
!find ../ -name *ipynb_checkpoints* -exec rm -rf {} \;

find: ‘../data/transfer/.ipynb_checkpoints’: No such file or directory
find: ‘../data/raw/.ipynb_checkpoints’: No such file or directory


In [30]:
import os
import pyphen
import re
from pathlib import Path
from sklearn.model_selection import train_test_split
import unicodedata
import pandas as pd
import sentencepiece as spm
#os.chdir('../')

In [21]:
base_dir = Path('../data')

In [22]:
def save_f(path, arr):
    with open(path, 'w') as f:
        for l in arr:
            if len(l):
                print(l, file=f)

In [23]:
def save_word_segments(file_path, word_dir, lang_in, lang_out):
    os.makedirs(word_dir, exist_ok=True)
    p_lines = []
    with open(file_path) as f:
        for line in f.read().split('\n'):
            line = re.sub(r'([.¡!¿?;,:])', r' \1 ', line)
            line = ' '.join([w for w in line.split(' ')])
            line = line.replace('  ', ' ')
            p_lines.append(line)
    
    train, temp = train_test_split(p_lines, test_size=0.2, random_state=0)
    valid, test = train_test_split(temp, test_size=0.5, random_state=0)
    #print([line.split('\t') for line in train][0])
    #print([line.split('\t')[0] for line in train][:10])
    save_f(word_dir / 'train.tsv', train)
    save_f(word_dir / f'train.{lang_in}', [line.split('\t')[1].strip() for line in train if len(line)])
    save_f(word_dir / f'train.{lang_out}', [line.split('\t')[0].strip() for line in train if len(line)])
    
    save_f(word_dir / 'test.tsv', test)
    save_f(word_dir / f'test.{lang_in}', [line.split('\t')[1].strip() for line in test if len(line)])
    save_f(word_dir / f'test.{lang_out}', [line.split('\t')[0].strip() for line in test if len(line)])

    save_f(word_dir / 'valid.tsv', valid)
    save_f(word_dir / f'valid.{lang_in}', [line.split('\t')[1].strip() for line in valid if len(line)])
    save_f(word_dir / f'valid.{lang_out}', [line.split('\t')[0].strip() for line in valid if len(line)])
    

In [24]:
def save_bpe_segments(word_dir, prepro_dir, lang_in, lang_out, n_opers, dropout=False):
    
    os.system(f'cat {word_dir}/train.{lang_in} {word_dir}/valid.{lang_in} > {word_dir}/all.{lang_in}')  
    os.system(f'cat {word_dir}/train.{lang_out} {word_dir}/valid.{lang_out} > {word_dir}/all.{lang_out}')
    
    for oper in n_opers:
        bpe_dir = prepro_dir / (f'bpe_drop_{oper}' if dropout else f'bpe_{oper}')
        os.makedirs(bpe_dir, exist_ok=True)
        p_lines = []
          
        os.system(f'cat {word_dir}/all.{lang_out} {word_dir}/all.{lang_in} | subword-nmt learn-bpe -s {oper} -o {bpe_dir}/codes.all')

        for lang in [lang_out, lang_in]:
            os.system(f'subword-nmt apply-bpe --dropout {0.1 if dropout else 0} -c {bpe_dir}/codes.all < {word_dir}/all.{lang} | subword-nmt get-vocab > {bpe_dir}/vocab.{lang}')
            
            os.system(f'subword-nmt apply-bpe --dropout {0.1 if dropout else 0} -c {bpe_dir}/codes.all < {word_dir}/test.{lang} > {bpe_dir}/test.bpe.{lang}')
            os.system(f'subword-nmt apply-bpe --dropout {0.1 if dropout else 0} -c {bpe_dir}/codes.all < {word_dir}/train.{lang} > {bpe_dir}/train.bpe.{lang}')
            os.system(f'subword-nmt apply-bpe --dropout {0.1 if dropout else 0} -c {bpe_dir}/codes.all < {word_dir}/valid.{lang} > {bpe_dir}/valid.bpe.{lang}')
            
        print('join corpus')
        for corpus in ['valid', 'test', 'train']:
            l1 = open(f'{bpe_dir}/{corpus}.bpe.{lang_out}', 'r').read().split('\n')
            l2 = open(f'{bpe_dir}/{corpus}.bpe.{lang_in}', 'r').read().split('\n')
            pd.DataFrame(list(zip(l2, l1))).to_csv(f'{bpe_dir}/{corpus}.tsv', header=None, index=False, sep='\t')
            save_f(bpe_dir / f'{corpus}.{lang_out}', l1)
            save_f(bpe_dir / f'{corpus}.{lang_in}', l2)            
            

In [25]:
def save_char_segments(word_dir, char_dir, lang_in, lang_out):
    os.makedirs(char_dir, exist_ok=True)
    for lang in [lang_out, lang_in]:
        spm.SentencePieceTrainer.Train(f'--input={word_dir}/all.{lang} --model_prefix=m --vocab_size=1000 --character_coverage=1.0 --model_type=char')   

        sp = spm.SentencePieceProcessor()
        sp.Load("m.model")
        for file in [f'train.{lang}', f'valid.{lang}', f'test.{lang}']:
            f_in = open(word_dir / file, 'r')
            f_out = open(char_dir / file, 'w')
            
            for line in f_in.read().split('\n'):
                temp = []
                for word in sp.EncodeAsPieces(line.replace('<unk>', '<unknown>')):
                    if str('\u2581') in word:
                        word = word.replace(str('\u2581'), '@@')
                    temp.append(word)
                f_out.write(" ".join(temp) + "\n")

            f_in.close()
            f_out.close()
            

In [36]:
def save_segmentation(file_path, prepro_dir, lang_in, lang_out, n_opers=[5000]):
    save_word_segments(file_path, prepro_dir / 'Flashcards' / 'word', lang_in, lang_out)
    save_bpe_segments(prepro_dir / 'Flashcards' / 'word', prepro_dir / 'Flashcards', lang_in, lang_out, n_opers=n_opers)    
    save_bpe_segments(prepro_dir / 'Flashcards' / 'word', prepro_dir / 'Flashcards', lang_in, lang_out, n_opers=n_opers, dropout=True)
    save_char_segments(prepro_dir / 'Flashcards' / 'word', prepro_dir / 'Flashcards' / 'char', lang_in, lang_out)

In [34]:
raw_dir = base_dir / 'transfer' / 'raw'
prepro_dir = base_dir / 'transfer' / 'preprocessed'
for dir_temp in os.listdir(raw_dir):
    lang_dir = raw_dir / dir_temp
    print(lang_dir)
    file_path = lang_dir / (os.listdir(lang_dir)[0] if 'txt' in os.listdir(lang_dir)[0] else os.listdir(lang_dir)[1])
    lang_in, lang_out = dir_temp.split('-')
    save_segmentation(file_path, prepro_dir / dir_temp, lang_in, lang_out)


../data/raw/nob-eng
join corpus
join corpus
../data/raw/yue-eng
join corpus
join corpus
../data/raw/heb-eng
join corpus
join corpus
../data/raw/ind-eng
join corpus
join corpus
../data/raw/kat-eng
join corpus
join corpus
../data/raw/tur-eng
join corpus
join corpus
../data/raw/war-eng
join corpus
join corpus
../data/raw/cbk-eng
join corpus
join corpus
../data/raw/bel-eng
join corpus
join corpus
../data/raw/spa-eng
join corpus
join corpus
../data/raw/slk-eng
join corpus
join corpus
../data/raw/mar-eng
join corpus
join corpus
../data/raw/ara-eng
join corpus
join corpus
../data/raw/mkd-eng
join corpus
join corpus
../data/raw/max-eng
join corpus
join corpus
../data/raw/slv-eng
join corpus
join corpus
../data/raw/srp-eng
join corpus
join corpus
../data/raw/ber-eng
join corpus
join corpus
../data/raw/shp-es
join corpus
join corpus
../data/raw/shp-en
join corpus
join corpus
../data/raw/tel-eng
join corpus
join corpus
../data/raw/ceb-eng
join corpus
join corpus
../data/raw/jpn-eng
join corpus
jo

In [42]:
raw_dir = base_dir / 'translate' / 'raw'
prepro_dir = base_dir / 'translate' / 'preprocessed'
for dir_temp in os.listdir(raw_dir):
    lang_dir = raw_dir / dir_temp
    print(lang_dir)
    file_path = lang_dir / (os.listdir(lang_dir)[0] if 'txt' in os.listdir(lang_dir)[0] else os.listdir(lang_dir)[1])
    lang_in = 
    lang_out = dir_temp.split('-')
    save_segmentation(file_path, prepro_dir / dir_temp, lang_in, lang_out, list(range(1000, 11000, 1000)))


../data/translate/raw/Flashcards


ValueError: not enough values to unpack (expected 2, got 1)

In [43]:
dir_temp

'Flashcards'

In [41]:
list(range(1000, 11000, 1000))

[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

In [38]:
dir_temp

'Flashcards'

In [33]:
dir_temp

'shp-es'

In [32]:
file_path

PosixPath('../data/raw/shp-es/all.shi')