# Creating test and train datasets

In [1]:
!unzip Archive.zip

Archive:  Archive.zip
  inflating: original.txt            
  inflating: transliterated.txt      


In [2]:
import pandas as pd
import re

def read_text(path):
    """Return the full contents of the UTF-8 text file at `path`."""
    # Explicit encoding: the corpus contains Ethiopic script and the platform
    # default encoding is not guaranteed to be UTF-8. The context manager
    # closes the handle as soon as the text has been read.
    with open(path, encoding="utf-8") as f:
        return f.read()


def split_words(text):
    """Split `text` on runs of whitespace and return the non-empty tokens."""
    # re.split yields an empty string when the text starts or ends with
    # whitespace (e.g. a trailing newline); dropping those keeps blank "words"
    # out of the data set.
    return [w for w in re.split(r'\s+', text) if w]


original = read_text("original.txt")
transliterated = read_text("transliterated.txt")

# One row per word: original script next to its transliteration. The two
# token lists must have the same length, otherwise the DataFrame constructor
# raises.
words = pd.DataFrame({
    'orig': split_words(original),
    'trans': split_words(transliterated)
})
words.head()

Unnamed: 0,orig,trans
0,ቀዝቃዛ,kazeqaza
1,ውኃ,weha
2,የዛለችን,yazalatchene
3,ነፍስ,nefsi
4,እንደሚያረካ,inidamiyaraka


In [0]:
# Put a single space between the characters of every word so that each
# character is a separate whitespace-delimited token.
# Existing spaces are removed first: that is a no-op the first time (the words
# were produced by splitting on whitespace) and makes the cell idempotent, so
# running it again does not widen the spacing.
for col in ('orig', 'trans'):
    words[col] = words[col].str.replace(' ', '', regex=False).str.join(' ')

In [4]:
words.head(10)

Unnamed: 0,orig,trans
0,ቀ ዝ ቃ ዛ,k a z e q a z a
1,ው ኃ,w e h a
2,የ ዛ ለ ች ን,y a z a l a t c h e n e
3,ነ ፍ ስ,n e f s i
4,እ ን ደ ሚ ያ ረ ካ,i n i d a m i y a r a k a
5,ሁ ሉ ከ ሩ ቅ,h u l u k e r u q e
6,አ ገ ር,a g a r
7,የ መ ጣ,j a m e t a
8,መ ል ካ ም,m e l e k a m
9,ወ ሬ ም,w e r e m


In [9]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(words, test_size=5000)

print("Training on", train.shape)
print("Testing on", test.shape)

!mkdir -p data

# Save as training
# Can't use .to_csv because of quotins
with open('data/src-train.txt', 'w') as f:
    f.write('\n'.join(train.trans))
with open('data/tgt-train.txt', 'w') as f:
    f.write('\n'.join(train.orig))

with open('data/src-val.txt', 'w') as f:
    f.write('\n'.join(test.trans))
with open('data/tgt-val.txt', 'w') as f:
    f.write('\n'.join(test.orig))

Training on (1530622, 2)
Testing on (5000, 2)


In [11]:
!head -n 5 data/tgt-val.txt

ሕ ዝ ቡ
ዓ ለ ም
ይ ህ ን
እ ን ድ ን ጠ ላ
እ ን ደ ሚ ያ ሳ ድ ር


In [10]:
train.head()

Unnamed: 0,orig,trans
918459,ወ ይ ም,w a y e m
184976,ጋ ር,g a r i
900012,ሰ ፊ,s e f i
1109537,ጋ ዜ ጣ,g a z y e t a
1527337,አ ን ደ በ ታ ች ን ን,' a n i d e b a t a t c h e n n e


# Data setup

Following quickstart instructions from https://github.com/OpenNMT/OpenNMT-py#quickstart.

## Training

I'm just using the terminal commands because the Python bindings were just Too Much Work.

In [12]:
!pip install OpenNMT-py

Collecting OpenNMT-py
[?25l  Downloading https://files.pythonhosted.org/packages/7e/c7/b3d9bf9a6a681b10c00aa897650f79d4e7ad8a80317c5cddb6a3ef43540c/OpenNMT_py-1.1.1-py3-none-any.whl (189kB)
[K     |████████████████████████████████| 194kB 2.6MB/s 
[?25hCollecting tqdm~=4.30.0
[?25l  Downloading https://files.pythonhosted.org/packages/76/4c/103a4d3415dafc1ddfe6a6624333971756e2d3dd8c6dc0f520152855f040/tqdm-4.30.0-py2.py3-none-any.whl (47kB)
[K     |████████████████████████████████| 51kB 5.0MB/s 
[?25hCollecting configargparse
[?25l  Downloading https://files.pythonhosted.org/packages/bb/79/3045743bb26ca2e44a1d317c37395462bfed82dbbd38e69a3280b63696ce/ConfigArgParse-1.2.3.tar.gz (42kB)
[K     |████████████████████████████████| 51kB 5.4MB/s 
Collecting pyonmttok==1.*; platform_system == "Linux"
[?25l  Downloading https://files.pythonhosted.org/packages/08/20/3c57198ffe690b580fbf23d33d5000eb411862e60e4bb6853b61dc989187/pyonmttok-1.18.3-cp36-cp36m-manylinux1_x86_64.whl (2.2MB)
[K    

## Preprocess

In [13]:
# Build the vocabularies and the serialized train/validation shards from the
# four text files written above. Results are saved under the data/demo prefix
# (data/demo.train.N.pt, data/demo.valid.N.pt per the log output).
# NOTE(review): -overwrite presumably lets a re-run replace existing shards.
!onmt_preprocess \
    -train_src data/src-train.txt \
    -train_tgt data/tgt-train.txt \
    -valid_src data/src-val.txt \
    -valid_tgt data/tgt-val.txt \
    -save_data data/demo \
    -overwrite

[2020-05-08 04:34:11,153 INFO] Extracting features...
[2020-05-08 04:34:11,154 INFO]  * number of source features: 0.
[2020-05-08 04:34:11,154 INFO]  * number of target features: 0.
[2020-05-08 04:34:11,154 INFO] Building `Fields` object...
[2020-05-08 04:34:11,154 INFO] Building & saving training data...
[2020-05-08 04:34:12,292 INFO] Building shard 0.
[2020-05-08 04:34:45,961 INFO]  * saving 0th train data shard to data/demo.train.0.pt.
[2020-05-08 04:35:05,473 INFO] Building shard 1.
[2020-05-08 04:35:23,014 INFO]  * saving 1th train data shard to data/demo.train.1.pt.
[2020-05-08 04:35:33,078 INFO]  * tgt vocab size: 426.
[2020-05-08 04:35:33,078 INFO]  * src vocab size: 65.
[2020-05-08 04:35:33,115 INFO] Building & saving validation data...
[2020-05-08 04:35:33,141 INFO] Building shard 0.
[2020-05-08 04:35:33,218 INFO]  * saving 0th valid data shard to data/demo.valid.0.pt.


## Train

In [66]:
# Change to false to get GPU power on Colab
if False:
    !onmt_train \
        -data data/demo \
        -save_model demo-model \
        --valid_steps 50 \
        --train_steps 2 \
        --early_stopping 5
else:
    !CUDA_VISIBLE_DEVICES=0 \
        onmt_train \
        -world_size 1 \
        -gpu_ranks 0 \
        -data data/demo \
        -save_model demo-model \
        --valid_steps 3000 \
        --train_steps 50000 \
        --early_stopping 3

[2020-05-08 04:57:25,932 INFO]  * src vocab size = 65
[2020-05-08 04:57:25,932 INFO]  * tgt vocab size = 426
[2020-05-08 04:57:25,932 INFO] Building model...
[2020-05-08 04:57:29,101 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(65, 500, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(500, 500, num_layers=2, dropout=0.3)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(426, 500, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.3, inplace=False)
      (layers): ModuleList(
        (0): LSTMCell(1000, 500)
        (1): LSTMCell(500, 500)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=500, out_features=500, bias=False)
      (linear_o

# Testing it out

Testing the text below, you can see the "answer" on Google translate [right here](https://translate.google.com/#view=home&op=translate&sl=am&tl=en&text=%E1%8A%A5%E1%8A%94%E1%88%9D%E1%8D%A5%20%E1%8B%88%E1%8A%95%E1%8B%B5%E1%88%9E%E1%89%BD%20%E1%88%86%E1%8B%AD%E1%8D%A5%20%E1%8B%A8%E1%88%A5%E1%8C%8B%20%E1%8A%A5%E1%8A%95%E1%8B%B0%20%E1%88%98%E1%88%86%E1%8A%93%E1%89%BD%E1%88%81%E1%8D%A5%0A%E1%89%A0%E1%8A%AD%E1%88%AD%E1%88%B5%E1%89%B6%E1%88%B5%E1%88%9D%20%E1%88%95%E1%8D%83%E1%8A%93%E1%89%B5%20%E1%8A%A5%E1%8A%95%E1%8B%B0%20%E1%88%98%E1%88%86%E1%8A%93%E1%89%BD%E1%88%81%20%E1%8A%A5%E1%8A%95%E1%8C%82%0A%E1%88%98%E1%8A%95%E1%8D%88%E1%88%B3%E1%8B%8D%E1%8B%AB%E1%8A%95%20%E1%8A%A5%E1%8A%95%E1%8B%B0%20%E1%88%98%E1%88%86%E1%8A%93%E1%89%BD%E1%88%81%20%E1%88%8D%E1%8A%93%E1%8C%88%E1%88%AB%E1%89%BD%E1%88%81%0A%E1%8A%A0%E1%88%8D%E1%89%BB%E1%88%8D%E1%88%81%E1%88%9D%E1%8D%A2).

In [0]:
!head -n 1 original.txt

ቀዝቃዛ ውኃ የዛለችን ነፍስ እንደሚያረካ ሁሉከሩቅ አገር የመጣ መልካም ወሬም እንዲሁ ነው።


In [0]:
!head -n 1 transliterated.txt

kazeqaza weha yazalatchene nefsi inidamiyaraka hulukeruqe agar jameta melekam werem inedihu new.


In [0]:
import subprocess

def read_head(path, n):
    """Return the first `n` lines of the UTF-8 file at `path`, stripped of
    leading and trailing whitespace (the equivalent of `head -n`)."""
    with open(path, encoding="utf-8") as f:
        # zip() stops after n lines, so the rest of the file is never read.
        return "".join(line for _, line in zip(range(n), f)).strip()


def read_all(path):
    """Return the full contents of the UTF-8 file at `path`."""
    with open(path, encoding="utf-8") as f:
        return f.read()


# Choose the evaluation text:
#   True  -> the separate sera_am.txt / sera_rom.txt files
#   False -> the first 500 lines of the training corpus files
# Previously the 500-line sample was always read and then immediately
# replaced by the sera files; the flag makes that choice explicit.
USE_SERA_FILES = True

if USE_SERA_FILES:
    originals = read_all("sera_am.txt")
    transliterated = read_all("sera_rom.txt")
else:
    originals = read_head("original.txt", 500)
    transliterated = read_head("transliterated.txt", 500)

In [180]:
# unidecode is required by the clean() helper defined in the next cell.
!pip install unidecode



In [0]:
import unidecode

def clean(lat_word):
  """Normalize romanized text: pass it through unidecode, then lowercase it."""
  ascii_form = unidecode.unidecode(lat_word)
  return ascii_form.lower()

def add_spaces(word):
  """Return `word` with a single space between each pair of adjacent characters."""
  separator = ' '
  characters = list(word)
  return separator.join(characters)

orig_test = []
trans_test = []

orig_lines = originals.splitlines()
trans_lines = transliterated.splitlines()

# zip() stops at the shorter input, so report a line-count mismatch instead
# of silently dropping the unmatched tail.
if len(orig_lines) != len(trans_lines):
  print("Line counts differ:", len(orig_lines), "original vs",
        len(trans_lines), "transliterated; the extra lines are ignored")

# Go through the texts line by line and keep a line only when both sides have
# the same number of words, so the i-th entries of orig_test and trans_test
# belong to the same word.
for o, t in zip(orig_lines, trans_lines):
  o_words = [add_spaces(w) for w in re.split(r"\s+", o) if w]
  # The transliteration is normalized with clean() before splitting.
  # A "-" is sometimes transliterated as a space, which changes the word
  # count; such lines fail the check below and are reported, not used.
  t_words = [add_spaces(w) for w in re.split(r"\s+", clean(t)) if w]
  if len(o_words) == len(t_words):
    orig_test.extend(o_words)
    trans_test.extend(t_words)
  else:
    print("Not a matched pair:", o, t)

In [0]:
# Write the model input file: one space-separated transliterated word per
# line, taken from the matched pairs collected in trans_test.
# (An earlier variant split the whole texts on whitespace without the
# per-line word-count check; the matched-pair lists are used instead.)
with open("data/test.txt", 'w') as f:
    f.write('\n'.join(trans_test))

In [183]:
!head -n 5 data/test.txt

y e z e m e m e n i
l i t a k ' e n a
l i t i f a l e m i
k e d e m e n a
t e l i k o


In [184]:
print(len(trans_test), trans_test[:5])
print(len(orig_test), orig_test[:5])

371 ['y e z e m e m e n i', "l i t a k ' e n a", 'l i t i f a l e m i', 'k e d e m e n a', 't e l i k o']
371 ['የ ዘ መ መ ን', 'ል ታ ቀ ና', 'ል ት ፋ ለ ም', 'ከ ደ መ ና', 'ተ ል ኮ']


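**`onmt_translate` below needs the name of a trained model checkpoint.** The next cell picks the most recently modified file whose name contains `model`, which is normally the latest checkpoint; set `model_name` by hand if you want to use a different one.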

In [185]:
import glob
import os

# Use the most recently modified regular file whose name contains "model".
# Directories are ignored, and a missing checkpoint fails here with a clear
# message instead of handing an empty model name to onmt_translate.
model_candidates = [p for p in glob.glob("*model*") if os.path.isfile(p)]
if not model_candidates:
    raise FileNotFoundError(
        "No file matching '*model*' in the working directory; run training first")
model_name = max(model_candidates, key=os.path.getmtime)
print("Using model", model_name)

Using model demo-model_step_25000.pt


In [186]:
# Run the selected checkpoint on data/test.txt and write the predictions to
# data/pred.txt (read back line by line in the next cell). The model path is
# filled in by IPython from the Python variable model_name.
!onmt_translate \
    -model {model_name} \
    -src data/test.txt \
    -output data/pred.txt -replace_unk


[2020-05-08 05:34:13,188 INFO] Translating shard 0.
PRED AVG SCORE: -0.0986, PRED PPL: 1.1037


In [187]:
# Read the predictions, one per line. The file holds Ethiopic text, so the
# encoding is set explicitly, and the with-block closes the handle.
with open("data/pred.txt", encoding="utf-8") as f:
    results = f.read().splitlines()

# Line up each prediction with its reference spelling and the model input.
# The three lists must have equal length, otherwise the constructor raises.
test_results = pd.DataFrame({
    'result': results,
    'original': orig_test,
    'transliterated': trans_test
})
# Share of words whose prediction matches the reference exactly (True) or
# differs from it (False).
print((test_results.result == test_results.original).value_counts(normalize=True))
test_results

False    0.606469
True     0.393531
dtype: float64


Unnamed: 0,result,original,transliterated
0,የ ዘ መ መ ን,የ ዘ መ መ ን,y e z e m e m e n i
1,ሊ ጠ ክ እ ና,ል ታ ቀ ና,l i t a k ' e n a
2,ሊ ጥ ፋ ል ም,ል ት ፋ ለ ም,l i t i f a l e m i
3,ከ ደ መ ና,ከ ደ መ ና,k e d e m e n a
4,ጠ ል ቆ,ተ ል ኮ,t e l i k o
...,...,...,...
366,ሥ ራ,ስ ራ,s i r a
367,ሥ ራ,ስ ራ,s i r a
368,ሥ ራ,ስ ራ,s i r a
369,ሥ ራ,ስ ራ,s i r a
