In [4]:
# Download if on Colab
!curl -O https://raw.githubusercontent.com/jsoma/transliteration_project/master/raw/original.txt
!curl -O https://raw.githubusercontent.com/jsoma/transliteration_project/master/raw/transliterated.txt
    
!mkdir -p raw

# -vn verbose + don't overwrite
!mv -vn original.txt raw/original.txt
!mv -vn transliterated.txt raw/transliterated.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4897k  100 4897k    0     0  7922k      0 --:--:-- --:--:-- --:--:-- 7912k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 26419  100 26419    0     0   255k      0 --:--:-- --:--:-- --:--:--  255k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 33267  100 33267    0     0   320k      0 --:--:-- --:--:-- --:--:--  321k
raw/wic.vert not overwritten
raw/google-transliterated.txt not overwritten
raw/bible-original.txt not overwritten


In [5]:
import pandas as pd

original = open("raw/original.txt").readlines()
transliterated = open("raw/transliterated.txt").readlines()

sentences = pd.DataFrame({
    'orig': original,
    'trans': transliterated
})
sentences.head()

Unnamed: 0,orig,trans
0,በእግዚአብሔር ፈቃድ የኢየሱስ ክርስቶስ ሐዋርያ ሊሆን የተጠራ ጳውሎስ ወን...,be’igizī’ābiḥēri fek’adi ye’īyesusi kirisitos...
1,በቆሮንቶስ ላለች ለእግዚአብሔር ቤተ ክርስቲያን፥ በክርስቶስ ኢየሱስ ለተቀ...,bek’oronitosi lalechi le’igizī’ābiḥēri bēte k...
2,ከእግዚአብሔር ከአባታችን ከጌታም ከኢየሱስ ክርስቶስ ጸጋና ሰላም ለእናንተ...,ke’igizī’ābiḥēri ke’ābatachini kegētami ke’īy...
3,በክርስቶስ ኢየሱስ ስላመናችሁ በተሰጣችሁ በእግዚአብሔር ጸጋ ምክንያት ሁል...,bekirisitosi īyesusi silamenachihu beteset’ach...
4,ለክርስቶስ መመስከሬ በእናንተ ዘንድ እንደ ጸና፥ በነገር ሁሉ በቃልም ሁሉ...,lekirisitosi memesikerē be’inanite zenidi inid...


In [6]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(sentences, test_size=0.2)

print("Training on", train.shape)
print("Testing on", test.shape)

!mkdir -p data

# Save as training
train.trans.to_csv("data/src-train.txt", header=False, index=False)
train.orig.to_csv("data/tgt-train.txt", header=False, index=False)

# Save as testing
test.trans.to_csv("data/src-val.txt", header=False, index=False)
test.orig.to_csv("data/tgt-val.txt", header=False, index=False)

Training on (152, 2)
Testing on (39, 2)


# Data setup

Following quickstart instructions from https://github.com/OpenNMT/OpenNMT-py#quickstart.

## Training

I'm just using the terminal commands because the Python bindings were just Too Much Work.

In [6]:
#!pip install OpenNMT-py

## Preprocess

In [7]:
!onmt_preprocess \
    -train_src data/src-train.txt \
    -train_tgt data/tgt-train.txt \
    -valid_src data/src-val.txt \
    -valid_tgt data/tgt-val.txt \
    -save_data data/demo \
    -overwrite

[2020-04-23 19:19:30,143 INFO] Extracting features...
[2020-04-23 19:19:30,144 INFO]  * number of source features: 0.
[2020-04-23 19:19:30,144 INFO]  * number of target features: 0.
[2020-04-23 19:19:30,144 INFO] Building `Fields` object...
[2020-04-23 19:19:30,144 INFO] Building & saving training data...
[2020-04-23 19:19:30,160 INFO] Building shard 0.
[2020-04-23 19:19:30,183 INFO]  * saving 0th train data shard to data/demo.train.0.pt.
[2020-04-23 19:19:30,374 INFO]  * tgt vocab size: 1116.
[2020-04-23 19:19:30,376 INFO]  * src vocab size: 1114.
[2020-04-23 19:19:30,399 INFO] Building & saving validation data...
[2020-04-23 19:19:30,544 INFO] Building shard 0.
[2020-04-23 19:19:30,550 INFO]  * saving 0th valid data shard to data/demo.valid.0.pt.


## Train

In [45]:
# Change to false to get GPU power on Colab
if True:
    !onmt_train \
        -data data/demo \
        -save_model demo-model \
        --valid_steps 50 \
        --train_steps 2 \
        --early_stopping 5
else:
    !CUDA_VISIBLE_DEVICES=0 \
        onmt_train \
        -world_size 1 \
        -gpu_ranks 0 \
        -data data/demo \
        -save_model demo-model \
        --valid_steps 50 \
        --train_steps 2000 \
        --early_stopping 10

[2020-04-23 19:33:21,702 INFO]  * src vocab size = 1114
[2020-04-23 19:33:21,703 INFO]  * tgt vocab size = 1116
[2020-04-23 19:33:21,703 INFO] Building model...
[2020-04-23 19:33:22,020 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(1114, 500, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(500, 500, num_layers=2, dropout=0.3)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(1116, 500, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.3, inplace=False)
      (layers): ModuleList(
        (0): LSTMCell(1000, 500)
        (1): LSTMCell(500, 500)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=500, out_features=500, bias=False)
      (li

# Testing it out

Testing the text below, you can see the "answer" on Google translate [right here](https://translate.google.com/#view=home&op=translate&sl=am&tl=en&text=%E1%8A%A5%E1%8A%94%E1%88%9D%E1%8D%A5%20%E1%8B%88%E1%8A%95%E1%8B%B5%E1%88%9E%E1%89%BD%20%E1%88%86%E1%8B%AD%E1%8D%A5%20%E1%8B%A8%E1%88%A5%E1%8C%8B%20%E1%8A%A5%E1%8A%95%E1%8B%B0%20%E1%88%98%E1%88%86%E1%8A%93%E1%89%BD%E1%88%81%E1%8D%A5%0A%E1%89%A0%E1%8A%AD%E1%88%AD%E1%88%B5%E1%89%B6%E1%88%B5%E1%88%9D%20%E1%88%95%E1%8D%83%E1%8A%93%E1%89%B5%20%E1%8A%A5%E1%8A%95%E1%8B%B0%20%E1%88%98%E1%88%86%E1%8A%93%E1%89%BD%E1%88%81%20%E1%8A%A5%E1%8A%95%E1%8C%82%0A%E1%88%98%E1%8A%95%E1%8D%88%E1%88%B3%E1%8B%8D%E1%8B%AB%E1%8A%95%20%E1%8A%A5%E1%8A%95%E1%8B%B0%20%E1%88%98%E1%88%86%E1%8A%93%E1%89%BD%E1%88%81%20%E1%88%8D%E1%8A%93%E1%8C%88%E1%88%AB%E1%89%BD%E1%88%81%0A%E1%8A%A0%E1%88%8D%E1%89%BB%E1%88%8D%E1%88%81%E1%88%9D%E1%8D%A2).

In [46]:
original = """
እኔም፥ ወንድሞች ሆይ፥ የሥጋ እንደ መሆናችሁ፥
በክርስቶስም ሕፃናት እንደ መሆናችሁ እንጂ
መንፈሳውያን እንደ መሆናችሁ ልናገራችሁ
አልቻልሁም።
"""
transliterated = """
inēmi፥ wenidimochi hoyi፥ yešiga inide mehonachihu፥ 
bekirisitosimi ḥit͟s’anati inide mehonachihu inijī 
menifesawiyani inide mehonachihu linagerachihu
ālichalihumi።"""

orig_oneline = original.replace("\n", " ").strip().replace("  ", " ")
trans_oneline = transliterated.replace("\n", " ").strip().replace("  ", " ")
with open("data/test.txt", 'w') as f:
    f.write(trans_oneline)

**You'll need to change the model name in `onmt_translate` below.** It's probably the most recently changed model file, so at the top of this list: 

In [47]:
import subprocess
model_name = subprocess.run("ls -t *model* | head -n 1",
                            shell=True,
                            stdout=subprocess.PIPE).stdout.decode("utf-8").strip()
print("Using model", model_name)

Using model demo-model_step_2.pt


In [48]:
!onmt_translate \
    -model {model_name} \
    -src data/test.txt \
    -output data/pred.txt -replace_unk -verbose


[2020-04-23 19:33:54,951 INFO] Translating shard 0.

SENT 1: ['inēmi፥', 'wenidimochi', 'hoyi፥', 'yešiga', 'inide', 'mehonachihu፥', 'bekirisitosimi', 'ḥit͟s’anati', 'inide', 'mehonachihu', 'inijī', 'menifesawiyani', 'inide', 'mehonachihu', 'linagerachihu', 'ālichalihumi።']
PRED 1: 
PRED SCORE: -0.0074
PRED No words predicted


In [50]:
print("Output: ", open("data/pred.txt").read())
print("Input: ", open("data/test.txt").read())
print("Actual: ", orig_oneline)

Output:  

Input:  inēmi፥ wenidimochi hoyi፥ yešiga inide mehonachihu፥ bekirisitosimi ḥit͟s’anati inide mehonachihu inijī menifesawiyani inide mehonachihu linagerachihu ālichalihumi።
Actual:  እኔም፥ ወንድሞች ሆይ፥ የሥጋ እንደ መሆናችሁ፥ በክርስቶስም ሕፃናት እንደ መሆናችሁ እንጂ መንፈሳውያን እንደ መሆናችሁ ልናገራችሁ አልቻልሁም።
