## Pull in the datasets

In [32]:
!rm datasets.zip
!rm -rf raw
!wget https://github.com/jsoma/transliteration_project/raw/master/raw/datasets.zip
!unzip -o datasets.zip

--2020-04-30 19:41:35--  https://github.com/jsoma/transliteration_project/raw/master/raw/datasets.zip
Resolving github.com (github.com)... 140.82.118.3
Connecting to github.com (github.com)|140.82.118.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/jsoma/transliteration_project/master/raw/datasets.zip [following]
--2020-04-30 19:41:36--  https://raw.githubusercontent.com/jsoma/transliteration_project/master/raw/datasets.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9609299 (9.2M) [application/zip]
Saving to: ‘datasets.zip’


2020-04-30 19:41:37 (31.5 MB/s) - ‘datasets.zip’ saved [9609299/9609299]

Archive:  datasets.zip
  inflating: raw/transliterated.txt  
  inflating: raw/original.txt     

## Read in the data from `new-am.txt`

Hopefully this is all of the text we have.

In [0]:
import tensorflow as tf

import numpy as np
import os
import time


In [1]:
# Read, then decode for py2 compat.
text = open('raw/new-am.txt', 'rb').read().decode(encoding='utf-8')
# length of text is the number of characters in it
print ('Length of text: {} characters'.format(len(text)))


Length of text: 4431040 characters


In [2]:
print(text[:250])


ጠ/ሚ መለስ ዜናዊ "ጦርነት ኳስ ጨዋታ አይደለም!" አሉ
ሰሞኑን በሕወሓት/ኢሕአዴግ ግምገማ ውስጥ ዋነኛው የግምገማ በትር ያረፈው በጠ/ሚ መለስ ዜናዊ ላይ መሆኑ ተደጋግሞ እየተሰማ ነው።
ከዚሁ ጋር ተያይዞ የጠ/ሚንስትሩ ጋርዶች በሌሎች መቀየራቸው፣  ከአቶ መለስ ዜናዊ ጋር የሚያገናኙ የቤተ መንግሥት የስልክ ግንኙነቶች መቋረጣቸው በሰፊው እየተነገረ ሲሆን፣ማንኛውንም የወቅቱን ጉዳይ አስመልክቶ መ


In [3]:
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))


445 unique characters


In [0]:
# Store 
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])


In [0]:
import json

with open('char2idx.json', 'w') as f:
  json.dump(char2idx, f)

with open('idx2char.json', 'w') as f:
  json.dump(list(idx2char), f)

In [0]:
from google.colab import files

files.download('char2idx.json')
files.download('idx2char.json')

In [41]:
print('{')
for char,_ in zip(char2idx, range(20)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')


{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '"' :   3,
  '#' :   4,
  '$' :   5,
  '%' :   6,
  '&' :   7,
  "'" :   8,
  '(' :   9,
  ')' :  10,
  '*' :  11,
  '+' :  12,
  ',' :  13,
  '-' :  14,
  '.' :  15,
  '/' :  16,
  '0' :  17,
  '1' :  18,
  '2' :  19,
  ...
}


In [42]:
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))


'“የዳታ ፕሮሰሲንግ አ' ---- characters mapped to int ---- > [426 311 321 222   1 396 175 177 179 247 340   1 258]


In [43]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(5):
  print(idx2char[i.numpy()])


“
የ
ዳ
ታ
 


In [44]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))


'“የዳታ ፕሮሰሲንግ አገልግሎት” ማለት በኮምፒዩተር ሥርዓት አማካኝነት ዳታን የመቀበል ፣ የማከማቸት ፣ የመተንተን ፣ የማሰራጨት ፣ የማጓጓዝ ወይም የማስተላለፍ '
'አገልግሎት ሲሆን የኔትዎርክ አገልግሎችንም\n“የዳታ ፕሮሰሲንግ አገልግሎት” ማለት በኮምፒዩተር ሥርዓት አማካኝነት ዳታን የመቀበል ፣ የማከማቸት ፣ የመተንተን ፣ '
'የማሰራጨት ፣ የማጓጓዝ ወይም የማስተላለፍ አገልግሎት ሲሆን የኔትዎርክ አገልግሎችንም\nየኤጀንሲው ዓላማ\nየኤጀንሲው ዓላማ\nእንዲህ አለው፦ “ዮፍታሔ እንዲህ ይላል፦'
' ‘እስራኤል የሞዓባውያንን ምድርና የአሞናውያንን ምድር አልወሰደም ፤\nእንዲህ አለው፦ “ዮፍታሔ እንዲህ ይላል፦ ‘እስራኤል የሞዓባውያንን ምድርና የአሞናውያንን ም'
'ድር አልወሰደም ፤\nሆኖም ትርፏና የምትቀበለው ክፍያ ለይሖዋ የተቀደሰ ይሆናል ። አይከማችም ወይም አይጠራቀምም ፤ ምክንያቱም በይሖዋ ፊት የሚኖሩ ሰዎች እስኪጠግ'


In [0]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)


In [46]:
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))


Input data:  '“የዳታ ፕሮሰሲንግ አገልግሎት” ማለት በኮምፒዩተር ሥርዓት አማካኝነት ዳታን የመቀበል ፣ የማከማቸት ፣ የመተንተን ፣ የማሰራጨት ፣ የማጓጓዝ ወይም የማስተላለፍ'
Target data: 'የዳታ ፕሮሰሲንግ አገልግሎት” ማለት በኮምፒዩተር ሥርዓት አማካኝነት ዳታን የመቀበል ፣ የማከማቸት ፣ የመተንተን ፣ የማሰራጨት ፣ የማጓጓዝ ወይም የማስተላለፍ '


In [47]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))


Step    0
  input: 426 ('“')
  expected output: 311 ('የ')
Step    1
  input: 311 ('የ')
  expected output: 321 ('ዳ')
Step    2
  input: 321 ('ዳ')
  expected output: 222 ('ታ')
Step    3
  input: 222 ('ታ')
  expected output: 1 (' ')
Step    4
  input: 1 (' ')
  expected output: 396 ('ፕ')


In [48]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset


<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [0]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024


In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.GRU(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model


In [0]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)


In [52]:
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 100, 445) # (batch_size, sequence_length, vocab_size)


In [53]:
model.summary()


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (64, None, 256)           113920    
_________________________________________________________________
gru_2 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_2 (Dense)              (64, None, 445)           456125    
Total params: 4,508,349
Trainable params: 4,508,349
Non-trainable params: 0
_________________________________________________________________


In [0]:
# print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
# print()
# print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))


In [54]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())


Prediction shape:  (64, 100, 445)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       6.099281


In [0]:
model.compile(optimizer='adam', loss=loss)


In [0]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


In [0]:
EPOCHS=100


In [0]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100

In [0]:
tf.train.latest_checkpoint(checkpoint_dir)
#!rm -rf training_checkpoints

In [0]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))


In [0]:
# def generate_text(model, start_string, num_generate):
#   # Evaluation step (generating text using the learned model)

#   # Converting our start string to numbers (vectorizing)
#   input_eval = [char2idx[s] for s in start_string]
#   input_eval = tf.expand_dims(input_eval, 0)

#   # Empty string to store our results
#   text_generated = []

#   # Low temperatures results in more predictable text.
#   # Higher temperatures results in more surprising text.
#   # Experiment to find the best setting.
#   temperature = 1.0

#   # Here batch size == 1
#   model.reset_states()
#   for i in range(num_generate):
#       predictions = model(input_eval)
#       # remove the batch dimension
#       predictions = tf.squeeze(predictions, 0)

#       # using a categorical distribution to predict the character returned by the model
#       predictions = predictions / temperature
#       predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

#       # We pass the predicted character as the next input to the model
#       # along with the previous hidden state
#       input_eval = tf.expand_dims([predicted_id], 0)

#       text_generated.append(idx2char[predicted_id])

#   return (start_string + ''.join(text_generated))

# print(generate_text(x, start_string=u"በ", num_generate=4))


In [0]:
!rm -rf char_model
model.save("char_model")
!tar -zcvf char_model.tar.gz char_model

from google.colab import files
files.download('char_model.tar.gz')

model/
model/assets/
model/saved_model.pb
model/variables/
model/variables/variables.data-00001-of-00002
model/variables/variables.index
model/variables/variables.data-00000-of-00002


In [109]:
start_string = "በ"
temperature = 1.0
num_generate = 1
input_eval = [char2idx[s] for s in start_string]
input_eval = tf.expand_dims(input_eval, 0)

model.reset_states()
predictions = model(input_eval)
predictions = tf.squeeze(predictions, 0)
predictions = predictions / temperature

predicted_id = np.argsort(predictions)[0, -3:]
predicted_id
print("".join(idx2char[predicted_id]))

ቃቁኋ


In [83]:
 model.reset_states()
predictions = model(input_eval)
predictions = tf.squeeze(predictions, 0)
predictions
#tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
pred_ids = tf.argsort(predictions).numpy()
"".join(idx2char[pred_ids[0]])

'ዎ \n”\xa0‘»)ናቭ!"’ቌሼ፦′ኗንGב/ኑኧ-ኄkሖዬ\uf035̈ኻנh́_&ፏፎ̧j¬ፉ\uf038\uf020ነጪጇኚ\uf034ኝ+ሻu￼ኼፒጬዜ@Ñsሑኳምח\uf031ሴ—ች\uf037*ኖזፍዋኜ]\uf033ሒz÷ጅጂዉ¾ኒዤፑጙX\uf032\u2060דעêህVዒNmJሃ\uf039ፋØ\'\x9d#ኹዱጹጄאኆÓኙ\uf036ቊዊጁptR‹ዷጱጿጲnሽኸק…=qoZ:DጶቫዽቩyኽቐŸ̃ሟ`ጯA¡Tዲፂ0Fዮªዔሷ፸ያጆቪr}\x81ቮO;lዶÏኦቼቨሤድeMጼÁቺיኰ>וፕBצ፯ሞKሔዪÍלשቢዖ“Hi›፩ƒዣሱIዐጩኾS¨ኛዥEስዠሢvደጧ፥\ufeffLQቿY(ፃa8ጋጳሎኡኺ$ይጐዴቸጻዌልጭ^ኁቻጾጢbሾתከጸሳ፫w%ዝቾ×ክ[፶ጨሓውፌWf፬ፄጮgርዢሄኪ5<ሂPሸጵሿ©ጉኅዑቹበ፪፷ኘ፻U,3ፈኔ፳ሊሹዳሐx?ካጓጽፐሉ9ኬቲሮጴሡለሁቬየcd፮ጤሶጰጌዩፅ፴ጠሲCፁሪሀፓሜ7ኮኤፔቦኞፀ፱ዕ6፲ጎግማጡሠ4ጃዟ፡ዡ«ታገወሆጫ፰ጊሏ2ጥቶኟሦሥቆ፹ፆሣኩሙጺቱቅ፣ዘዞሬሺፊጦባፖብ፭ላኀቤቷጀዦአ፵ሕሌእቴቀ1ቄኣዓጣቂራረዙቡ።፤ተቧሩሯኃዛ.ኢሰ፺ቋሚመትዚቃቁኋ'

In [82]:
pred_ids[0]

array([-4.43266106e+00, -4.45042610e+00, -2.47942710e+00, -2.41254807e+00,
       -7.67365634e-01,  6.74478114e-02,  2.08724096e-01, -1.31721056e+00,
       -7.77756095e-01,  1.17902607e-02, -2.92985010e+00, -9.77099717e-01,
       -1.16710937e+00,  4.13990587e-01, -1.62699938e+00,  2.19381833e+00,
       -1.66038346e+00, -4.08759475e-01,  1.65054584e+00,  1.01426339e+00,
        4.15807396e-01,  8.95875454e-01,  3.13411653e-01,  8.16473126e-01,
        7.47952998e-01,  3.06870155e-02,  5.12327909e-01, -5.37239552e-01,
       -3.50599825e-01,  3.14085543e-01, -5.48883498e-01, -2.49892622e-01,
        4.90262806e-01, -1.08195901e+00, -4.29478854e-01, -2.43841708e-01,
        6.81038558e-01, -5.17008364e-01, -7.08252490e-02, -4.04450148e-01,
       -1.69954181e+00, -1.70376465e-01, -1.34267777e-01, -7.97093570e-01,
       -2.21413478e-01, -1.68119892e-02, -2.90579498e-01, -8.33392680e-01,
       -3.59685093e-01,  3.20138514e-01, -1.53631568e-02, -6.47049069e-01,
       -9.03016031e-02, -

In [0]:
from google.colab import files

files.download('example.txt')

In [41]:
!zip training_checkpoints.zip training_checkpoints/*

  adding: training_checkpoints/checkpoint.gz (stored 0%)
  adding: training_checkpoints/ckpt_10.data-00000-of-00002.gz (deflated 0%)
  adding: training_checkpoints/ckpt_10.data-00001-of-00002.gz (deflated 0%)
  adding: training_checkpoints/ckpt_10.index.gz (stored 0%)
  adding: training_checkpoints/ckpt_11.data-00000-of-00002.gz (deflated 0%)
  adding: training_checkpoints/ckpt_11.data-00001-of-00002 (deflated 8%)
  adding: training_checkpoints/ckpt_11.index (deflated 62%)
  adding: training_checkpoints/ckpt_12.data-00000-of-00002 (deflated 7%)
  adding: training_checkpoints/ckpt_12.data-00001-of-00002 (deflated 8%)
  adding: training_checkpoints/ckpt_12.index (deflated 62%)
  adding: training_checkpoints/ckpt_13.data-00000-of-00002 (deflated 8%)
  adding: training_checkpoints/ckpt_13.data-00001-of-00002 (deflated 8%)
  adding: training_checkpoints/ckpt_13.index (deflated 62%)
  adding: training_checkpoints/ckpt_14.data-00000-of-00002 (deflated 8%)
  adding: training_checkpoints/ckpt_1

In [43]:
from google.colab import files

files.download('training_checkpoints.zip')

----------------------------------------
Exception happened during processing of request from ('::ffff:127.0.0.1', 37034, 0, 0)
Traceback (most recent call last):
  File "/usr/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/lib/python3.6/http/server.py", line 418, in handle
    self.handle_one_request()
  File "/usr/lib/python3.6/http/server.py", line 406, in handle_one_request
    method()
  File "/usr/lib/python3.6/http/server.py", line 639, in do_GET
    self.copyfile(f, self.wfile)
  File "/usr/lib/python3.6/http/server.py", line 800, in copyfile
    shutil.copyfil

In [39]:
from google.colab import files
import glob

for file in glob.glob("training_checkpoints/*"):
  files.download(file)

NameError: ignored

In [0]:
import pandas as pd

original = open("raw/original.txt").readlines()
transliterated = open("raw/transliterated.txt").readlines()

sentences = pd.DataFrame({
    'orig': original,
    'trans': transliterated
})
sentences.head()

Unnamed: 0,orig,trans
0,እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ መቅደስ ወጡ ፥ አንዱ ፈሪሳዊ ሁ...,inidihi sil . hulat sewo liseliyu wade meqdese...
1,እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ መቅደስ ወጡ ፥ አንዱ ፈሪሳዊ ሁ...,'endihe sil . hulet sawotche liseliju wede maq...
2,እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ መቅደስ ወጡ ፥ አንዱ ፈሪሳዊ ሁ...,`enedihe sil . huleti sawoc liseleju wada meki...
3,እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ መቅደስ ወጡ ፥ አንዱ ፈሪሳዊ ሁ...,inidih sili . hulat sawoci liseleyu weda meqed...
4,እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ መቅደስ ወጡ ፥ አንዱ ፈሪሳዊ ሁ...,`enidih sile . hulat sawokhi litsalju wada maq...


In [0]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(sentences, test_size=5000)

print("Training on", train.shape)
print("Testing on", test.shape)

!mkdir -p data

# Save as training
train.trans.to_csv("data/src-train.txt", header=False, index=False)
train.orig.to_csv("data/tgt-train.txt", header=False, index=False)

# Save as testing
test.trans.to_csv("data/src-val.txt", header=False, index=False)
test.orig.to_csv("data/tgt-val.txt", header=False, index=False)

Training on (303760, 2)
Testing on (5000, 2)


In [0]:
train.head()

Unnamed: 0,orig,trans
251677,ኢየሱስም ። እውነት እውነት እላችኋለሁ ፥ አብርሃም ሳይወለድ እኔ አለሁ ...,`ijasusme . 'eweneti iwnat `elacehualehu : 'ab...
230550,አባቱን ወይም እናቱን አያከብርም ትላላችሁ ፤ ስለ ወጋችሁም የእግዚአብሔር...,abatune wejeme ina'tun ajakebirm tlalacihu ; s...
217602,በምን ዓይነትም ሞት ይሞት ዘንድ እንዳለው ሲያመለክታቸው ይህን ተናገረ ።\n,bamine 'ajenetim mote ymoti zendi `enidalawe s...
245314,ሌሎች ። እርሱ ነው አሉ ፤ ሌሎች ። አይደለም እርሱን ይመስላል እንጂ አ...,lelwoci . `eresu newe alu ; leloce . ajedalem ...
132727,ከኤፊቆሮስ ወገንና ኢስጦኢኮችም ከተባሉት ፈላስፎች አንዳንዶቹ ከእርሱ ጋር...,ceefikoros wegenena isto'ikwochime katabalut f...


# Data setup

Following quickstart instructions from https://github.com/OpenNMT/OpenNMT-py#quickstart.

## Training

I'm just using the terminal commands because the Python bindings were just Too Much Work.

In [0]:
!pip install OpenNMT-py

Collecting OpenNMT-py
[?25l  Downloading https://files.pythonhosted.org/packages/7e/c7/b3d9bf9a6a681b10c00aa897650f79d4e7ad8a80317c5cddb6a3ef43540c/OpenNMT_py-1.1.1-py3-none-any.whl (189kB)
[K     |████████████████████████████████| 194kB 2.8MB/s 
Collecting tqdm~=4.30.0
[?25l  Downloading https://files.pythonhosted.org/packages/76/4c/103a4d3415dafc1ddfe6a6624333971756e2d3dd8c6dc0f520152855f040/tqdm-4.30.0-py2.py3-none-any.whl (47kB)
[K     |████████████████████████████████| 51kB 5.8MB/s 
Collecting waitress
[?25l  Downloading https://files.pythonhosted.org/packages/a8/ca/ede3ed29723ca944f6e77bd1d7b38c271dd801c7d6a11ab6037597e4fd5b/waitress-1.4.3-py2.py3-none-any.whl (148kB)
[K     |████████████████████████████████| 153kB 43.7MB/s 
[?25hCollecting configargparse
[?25l  Downloading https://files.pythonhosted.org/packages/bb/79/3045743bb26ca2e44a1d317c37395462bfed82dbbd38e69a3280b63696ce/ConfigArgParse-1.2.3.tar.gz (42kB)
[K     |████████████████████████████████| 51kB 6.7MB/s 
[

## Preprocess

In [0]:
!onmt_preprocess \
    -train_src data/src-train.txt \
    -train_tgt data/tgt-train.txt \
    -valid_src data/src-val.txt \
    -valid_tgt data/tgt-val.txt \
    -save_data data/demo \
    -overwrite

[2020-04-27 15:57:36,018 INFO] Extracting features...
[2020-04-27 15:57:36,018 INFO]  * number of source features: 0.
[2020-04-27 15:57:36,018 INFO]  * number of target features: 0.
[2020-04-27 15:57:36,018 INFO] Building `Fields` object...
[2020-04-27 15:57:36,018 INFO] Building & saving training data...
[2020-04-27 15:57:37,026 INFO] Building shard 0.
[2020-04-27 15:58:02,642 INFO]  * saving 0th train data shard to data/demo.train.0.pt.
[2020-04-27 15:58:19,563 INFO]  * tgt vocab size: 26568.
[2020-04-27 15:58:21,235 INFO]  * src vocab size: 50002.
[2020-04-27 15:58:22,683 INFO] Building & saving validation data...
[2020-04-27 15:58:23,864 INFO] Building shard 0.
[2020-04-27 15:58:24,108 INFO]  * saving 0th valid data shard to data/demo.valid.0.pt.


## Train

In [0]:
# Change to false to get GPU power on Colab
if False:
    !onmt_train \
        -data data/demo \
        -save_model demo-model \
        --valid_steps 50 \
        --train_steps 2 \
        --early_stopping 5
else:
    !CUDA_VISIBLE_DEVICES=0 \
        onmt_train \
        -world_size 1 \
        -gpu_ranks 0 \
        -data data/demo \
        -save_model demo-model \
        --valid_steps 1000 \
        --train_steps 20000 \
        --early_stopping 3

[2020-04-27 17:02:26,357 INFO]  * src vocab size = 50002
[2020-04-27 17:02:26,375 INFO]  * tgt vocab size = 26568
[2020-04-27 17:02:26,376 INFO] Building model...
[2020-04-27 17:02:29,548 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(50002, 500, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(500, 500, num_layers=2, dropout=0.3)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(26568, 500, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.3, inplace=False)
      (layers): ModuleList(
        (0): LSTMCell(1000, 500)
        (1): LSTMCell(500, 500)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=500, out_features=500, bias=False)
     

# Testing it out

Testing the text below, you can see the "answer" on Google translate [right here](https://translate.google.com/#view=home&op=translate&sl=am&tl=en&text=%E1%8A%A5%E1%8A%94%E1%88%9D%E1%8D%A5%20%E1%8B%88%E1%8A%95%E1%8B%B5%E1%88%9E%E1%89%BD%20%E1%88%86%E1%8B%AD%E1%8D%A5%20%E1%8B%A8%E1%88%A5%E1%8C%8B%20%E1%8A%A5%E1%8A%95%E1%8B%B0%20%E1%88%98%E1%88%86%E1%8A%93%E1%89%BD%E1%88%81%E1%8D%A5%0A%E1%89%A0%E1%8A%AD%E1%88%AD%E1%88%B5%E1%89%B6%E1%88%B5%E1%88%9D%20%E1%88%95%E1%8D%83%E1%8A%93%E1%89%B5%20%E1%8A%A5%E1%8A%95%E1%8B%B0%20%E1%88%98%E1%88%86%E1%8A%93%E1%89%BD%E1%88%81%20%E1%8A%A5%E1%8A%95%E1%8C%82%0A%E1%88%98%E1%8A%95%E1%8D%88%E1%88%B3%E1%8B%8D%E1%8B%AB%E1%8A%95%20%E1%8A%A5%E1%8A%95%E1%8B%B0%20%E1%88%98%E1%88%86%E1%8A%93%E1%89%BD%E1%88%81%20%E1%88%8D%E1%8A%93%E1%8C%88%E1%88%AB%E1%89%BD%E1%88%81%0A%E1%8A%A0%E1%88%8D%E1%89%BB%E1%88%8D%E1%88%81%E1%88%9D%E1%8D%A2).

In [0]:
!head -n 1 raw/original.txt

እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ መቅደስ ወጡ ፥ አንዱ ፈሪሳዊ ሁለተኛውም ቀራጭ ።


In [0]:
!head -n 1 raw/transliterated.txt

inidihi sil . hulat sewo liseliyu wade meqdese watu : andu ferisawi huletenawm qera .


In [0]:
original = """ሁለት ሰዎች ሊጸልዩ ወደ መቅደስ ወጡ ፥ አንዱ ፈሪሳዊ"""
transliterated = """hulat sewo liseliyu wade meqdese watu : andu ferisawi"""

orig_oneline = original.replace("\n", " ").strip().replace("  ", " ")
trans_oneline = transliterated.replace("\n", " ").strip().replace("  ", " ")
with open("data/test.txt", 'w') as f:
    f.write(trans_oneline)

**You'll need to change the model name in `onmt_translate` below.** It's probably the most recently changed model file, so at the top of this list: 

In [0]:
import subprocess
model_name = subprocess.run("ls -t *model* | head -n 1",
                            shell=True,
                            stdout=subprocess.PIPE).stdout.decode("utf-8").strip()
print("Using model", model_name)

Using model demo-model_step_20000.pt


In [0]:
!onmt_translate \
    -model {model_name} \
    -src data/test.txt \
    -output data/pred.txt -replace_unk -verbose


[2020-04-27 18:10:49,724 INFO] Translating shard 0.

SENT 1: ['hulat', 'sewo', 'liseliyu', 'wade', 'meqdese', 'watu', ':', 'andu', 'ferisawi']
PRED 1: "ደግሞም ሁሉን በየስፍራው ወደ መቅደስ ሲወጣ ፥ ደስ በለው ፥
PRED SCORE: -3.1997
PRED AVG SCORE: -0.3200, PRED PPL: 1.3771


In [0]:
print("Output: ", open("data/pred.txt").read())
print("Input: ", open("data/test.txt").read())
print("Actual: ", orig_oneline)

Output:  "ደግሞም ሁሉን በየስፍራው ወደ መቅደስ ሲወጣ ፥ ደስ በለው ፥

Input:  hulat sewo liseliyu wade meqdese watu : andu ferisawi
Actual:  ሁለት ሰዎች ሊጸልዩ ወደ መቅደስ ወጡ ፥ አንዱ ፈሪሳዊ
