# Pull in the transliteration table (Ethiopic --> Latin)

In [12]:
import requests

user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
headers = {'User-Agent': user_agent}
response = requests.get('https://scriptsource.org/cms/scripts/page.php?item_id=entry_detail&uid=vsytndbyev', headers=headers)

In [13]:
import pandas as pd

results = pd.read_html(response.text)
df = results[1]
df.head()

Unnamed: 0,Glyph,USV,ALA-LC,Alone-Stokes,B&D,Campbell,Chaîne,Cohen,Dawkins,Dept of State,ethiop,SERA
0,,1200,ha,hā,hä,ha,hă,ha,hä,ha,ha,he
1,,1201,hu,hu,hu,hū,hu,hu,hu,hu,hu,hu
2,,1202,hi,hī,hi,hī,hi,hi,hi,hi,hi,hi
3,,1203,hā,hā,ha,hā,ha,ha,ha,ha,hā,ha
4,,1204,hé,hē,he,hē,he,he,he,he,hē,hE


In [14]:
df.Glyph = df.USV.apply(lambda val: chr(int(val, 16)))
df.head()

Unnamed: 0,Glyph,USV,ALA-LC,Alone-Stokes,B&D,Campbell,Chaîne,Cohen,Dawkins,Dept of State,ethiop,SERA
0,ሀ,1200,ha,hā,hä,ha,hă,ha,hä,ha,ha,he
1,ሁ,1201,hu,hu,hu,hū,hu,hu,hu,hu,hu,hu
2,ሂ,1202,hi,hī,hi,hī,hi,hi,hi,hi,hi,hi
3,ሃ,1203,hā,hā,ha,hā,ha,ha,ha,ha,hā,ha
4,ሄ,1204,hé,hē,he,hē,he,he,he,he,hē,hE


In [15]:
df.to_csv("transliteration-table.csv", index=False)

## Simplify Transliteration Table

In [16]:
df = pd.read_csv("transliteration-table.csv")
df = df.drop(columns=['USV'])
df = df.set_index('Glyph')
df

Unnamed: 0_level_0,ALA-LC,Alone-Stokes,B&D,Campbell,Chaîne,Cohen,Dawkins,Dept of State,ethiop,SERA
Glyph,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ሀ,ha,hā,hä,ha,hă,ha,hä,ha,ha,he
ሁ,hu,hu,hu,hū,hu,hu,hu,hu,hu,hu
ሂ,hi,hī,hi,hī,hi,hi,hi,hi,hi,hi
ሃ,hā,hā,ha,hā,ha,ha,ha,ha,hā,ha
ሄ,hé,hē,he,hē,he,he,he,he,hē,hE
...,...,...,...,...,...,...,...,...,...,...
ᎎ,,,,,,,,,pwē,pWE
ᎏ,,,,,,,,,pwe,pW
ፘ,rya,,,,,rya,rya,,~ri,
ፙ,mya,,,,,mya,,,~mA,


In [17]:
!pip install Unidecode

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [18]:
import unidecode
import numpy as np

def simplify(value):
    try:
        return unidecode.unidecode(value).lower()
    except:
        return np.nan
    
simplified = df.applymap(simplify)
merged = df.join(simplified, rsuffix='_simp')
merged.head(10)

Unnamed: 0_level_0,ALA-LC,Alone-Stokes,B&D,Campbell,Chaîne,Cohen,Dawkins,Dept of State,ethiop,SERA,ALA-LC_simp,Alone-Stokes_simp,B&D_simp,Campbell_simp,Chaîne_simp,Cohen_simp,Dawkins_simp,Dept of State_simp,ethiop_simp,SERA_simp
Glyph,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
ሀ,ha,hā,hä,ha,hă,ha,hä,ha,ha,he,ha,ha,ha,ha,ha,ha,ha,ha,ha,he
ሁ,hu,hu,hu,hū,hu,hu,hu,hu,hu,hu,hu,hu,hu,hu,hu,hu,hu,hu,hu,hu
ሂ,hi,hī,hi,hī,hi,hi,hi,hi,hi,hi,hi,hi,hi,hi,hi,hi,hi,hi,hi,hi
ሃ,hā,hā,ha,hā,ha,ha,ha,ha,hā,ha,ha,ha,ha,ha,ha,ha,ha,ha,ha,ha
ሄ,hé,hē,he,hē,he,he,he,he,hē,hE,he,he,he,he,he,he,he,he,he,he
ህ,he/h,h/hi,hə/hø,he,hĕ,h/hə,hï,hɪ,he,h,he/h,h/hi,h@/ho,he,he,h/h@,hi,hi,he,h
ሆ,ho,ho,ho,hō,ho,hʷo,ho,ho,ho,ho,ho,ho,ho,ho,ho,hwo,ho,ho,ho,ho
ለ,la,la,lä,la,lă,lɑ,lä,lə,la,le,la,la,la,la,la,la,la,l@,la,le
ሉ,lu,lu,lu,lū,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu,lu
ሊ,li,lī,li,lī,li,li,li,li,li,li,li,li,li,li,li,li,li,li,li,li


In [19]:
uniqued = df.fillna('').apply(lambda row: list(set([v for v in row.values if '/' not in str(v)])), axis=1)
uniqued

Glyph
ሀ    [he, hă, ha, hä, hā]
ሁ                [hū, hu]
ሂ                [hi, hī]
ሃ                [ha, hā]
ሄ        [he, hē, hé, hE]
             ...         
ᎎ            [, pwē, pWE]
ᎏ             [, pW, pwe]
ፘ            [, rya, ~ri]
ፙ            [, mya, ~mA]
ፚ            [, fya, ~fi]
Length: 368, dtype: object

In [20]:
uniqued.to_json("lookup-table.json")

In [21]:
import json

with open("lookup-table.json") as f:
    lookup = json.load(f)

punctuation = [
    ['።', '.'],
    ['፡', ' '],
    ['፣', ','],
    ['፤', ';'],
    ['፥', ':'],
    ['፧', '?']
]

for punc in punctuation:
    lookup[punc[0]] = [punc[1]]

lookup

{'ሀ': ['he', 'hă', 'ha', 'hä', 'hā'],
 'ሁ': ['hū', 'hu'],
 'ሂ': ['hi', 'hī'],
 'ሃ': ['ha', 'hā'],
 'ሄ': ['he', 'hē', 'hé', 'hE'],
 'ህ': ['he', 'hĕ', 'h', 'hɪ', 'hï'],
 'ሆ': ['ho', 'hʷo', 'hō'],
 'ለ': ['la', 'lə', 'lă', 'lɑ', 'lä', 'le'],
 'ሉ': ['lū', 'lu'],
 'ሊ': ['lī', 'li'],
 'ላ': ['lā', 'la'],
 'ሌ': ['lé', 'lē', 'lʸe', 'le', 'lE'],
 'ል': ['l', 'lï', 'lɪ', 'lĕ', 'le'],
 'ሎ': ['lo', 'lʷo', 'lō'],
 'ሏ': ['', 'lwa', 'lwā', 'lWa', 'lwä'],
 'ሐ': ['ḥă', 'ḥa', 'He', 'ha', 'hä', 'hā'],
 'ሑ': ['Hu', 'ḥu', 'hu', 'ḥū'],
 'ሒ': ['Hi', 'ḥi', 'ḥī', 'hi', 'hī'],
 'ሓ': ['ḥa', 'ha', 'ḥā', 'Ha', 'hā'],
 'ሔ': ['he', 'ḥé', 'ḥe', 'hē', 'HE', 'ḥē'],
 'ሕ': ['hɪ', 'ḥĕ', 'ḥe', 'hï', 'H'],
 'ሖ': ['ḥō', 'ho', 'hʷo', 'ḥo', 'Ho'],
 'ሗ': ['', 'ḥwā', 'HWa'],
 'መ': ['ma', 'mə', 'me', 'mɑ', 'mă', 'mä'],
 'ሙ': ['mu', 'mū'],
 'ሚ': ['mī', 'mi'],
 'ማ': ['ma', 'mā'],
 'ሜ': ['mʸe', 'me', 'mE', 'mé', 'mē'],
 'ም': ['me', 'm', 'mï', 'mĕ', 'mɪ'],
 'ሞ': ['mʷo', 'mo', 'mō'],
 'ᎀ': ['', 'mwa', 'mWe'],
 'ᎁ': ['', 'mWi', 'mwi'],
 '

In [22]:
lookup_simp = {}
for key in lookup.keys():
    values = list(set([unidecode.unidecode(v).lower() for v in lookup[key]]))
    lookup_simp[key] = [v for v in values if '@' not in v]
    
lookup_simp

{'ሀ': ['he', 'ha'],
 'ሁ': ['hu'],
 'ሂ': ['hi'],
 'ሃ': ['ha'],
 'ሄ': ['he'],
 'ህ': ['he', 'hi', 'h'],
 'ሆ': ['ho', 'hwo'],
 'ለ': ['le', 'la'],
 'ሉ': ['lu'],
 'ሊ': ['li'],
 'ላ': ['la'],
 'ሌ': ['le', 'lye'],
 'ል': ['le', 'l', 'li'],
 'ሎ': ['lwo', 'lo'],
 'ሏ': ['', 'lwa'],
 'ሐ': ['he', 'ha'],
 'ሑ': ['hu'],
 'ሒ': ['hi'],
 'ሓ': ['ha'],
 'ሔ': ['he'],
 'ሕ': ['he', 'hi', 'h'],
 'ሖ': ['ho', 'hwo'],
 'ሗ': ['', 'hwa'],
 'መ': ['me', 'ma'],
 'ሙ': ['mu'],
 'ሚ': ['mi'],
 'ማ': ['ma'],
 'ሜ': ['me', 'mye'],
 'ም': ['m', 'me', 'mi'],
 'ሞ': ['mo', 'mwo'],
 'ᎀ': ['', 'mwa', 'mwe'],
 'ᎁ': ['', 'mwi'],
 'ሟ': ['', 'mwa'],
 'ᎂ': ['', 'mwe'],
 'ᎃ': ['', 'mw', 'mwe'],
 'ሠ': ["'se", 'sa'],
 'ሡ': ['su', "'su"],
 'ሢ': ['si', "'si"],
 'ሣ': ["'sa", 'sa'],
 'ሤ': ['sye', "'se", 'se'],
 'ሥ': ["'s", 'si', 'se'],
 'ሦ': ['so', 'swo', "'so"],
 'ሧ': ['', 'swa', "'swa"],
 'ረ': ['re', 'ra'],
 'ሩ': ['ru'],
 'ሪ': ['ri'],
 'ራ': ['ra'],
 'ሬ': ['rye', 're'],
 'ር': ['ri', 're', 'r'],
 'ሮ': ['rwo', 'ro'],
 'ሯ': ['', 'rwa'],
 'ሰ': ['se'

# Create reverse dictionary (Latin --> Ethiopic)

In [23]:
# create reverse dictionary
lat_eth = dict()

for key in lookup_simp.keys():
    values = lookup_simp[key]
    for v in values:
        if v in lat_eth.keys():
            new_list = lat_eth[v].copy()
            new_list.append(key)
            lat_eth[v] = new_list
        else:
            lat_eth[v] = [key]

lat_eth

{'he': ['ሀ', 'ሄ', 'ህ', 'ሐ', 'ሔ', 'ሕ', 'ኄ', 'ኅ', 'ኼ', 'ⷔ', 'ⷕ'],
 'ha': ['ሀ', 'ሃ', 'ሐ', 'ሓ', 'ኀ', 'ኃ', 'ኸ', 'ኻ', 'ⷐ', 'ⷓ'],
 'hu': ['ሁ', 'ሑ', 'ኁ', 'ኹ', 'ⷑ'],
 'hi': ['ሂ', 'ህ', 'ሒ', 'ሕ', 'ኂ', 'ኅ', 'ኺ', 'ኽ', 'ⷒ'],
 'h': ['ህ', 'ሕ'],
 'ho': ['ሆ', 'ሖ', 'ኆ', 'ኾ', 'ⷖ'],
 'hwo': ['ሆ', 'ሖ', 'ኆ', 'ኾ'],
 'le': ['ለ', 'ሌ', 'ል'],
 'la': ['ለ', 'ላ'],
 'lu': ['ሉ'],
 'li': ['ሊ', 'ል'],
 'lye': ['ሌ'],
 'l': ['ል'],
 'lwo': ['ሎ'],
 'lo': ['ሎ'],
 '': ['ሏ',
  'ሗ',
  'ᎀ',
  'ᎁ',
  'ሟ',
  'ᎂ',
  'ᎃ',
  'ሧ',
  'ሯ',
  'ሷ',
  'ሸ',
  'ሹ',
  'ሺ',
  'ሻ',
  'ሼ',
  'ሽ',
  'ሾ',
  'ሿ',
  'ቇ',
  'ⷀ',
  'ⷁ',
  'ⷂ',
  'ⷃ',
  'ⷄ',
  'ⷅ',
  'ⷆ',
  'ቐ',
  'ቑ',
  'ቒ',
  'ቓ',
  'ቔ',
  'ቕ',
  'ቖ',
  'ቘ',
  'ቚ',
  'ቛ',
  'ቜ',
  'ቝ',
  'ᎄ',
  'ᎅ',
  'ቧ',
  'ᎆ',
  'ᎇ',
  'ቨ',
  'ቩ',
  'ቪ',
  'ቫ',
  'ቬ',
  'ቭ',
  'ቮ',
  'ቯ',
  'ቷ',
  'ቸ',
  'ቹ',
  'ቺ',
  'ቻ',
  'ቼ',
  'ች',
  'ቾ',
  'ቿ',
  'ኗ',
  'ኘ',
  'ኙ',
  'ኚ',
  'ኛ',
  'ኜ',
  'ኝ',
  'ኞ',
  'ኟ',
  'ኧ',
  'ኯ',
  'ⷈ',
  'ⷉ',
  'ⷊ',
  'ⷋ',
  'ⷌ',
  'ⷍ',
  'ⷎ',
  'ኸ',
  'ኹ',
  'ኺ',
  '

# Transliteration work

In [19]:
# load dictionary to prune transliteration options 
am_dic_file = open("am_dic.txt", "r", encoding="utf-8")
am_dic = []

for w in am_dic_file.readlines():
    am_dic.append(w.rstrip())

am_dic_file.close()
am_dic = set(am_dic)

## Use CountVectorizer() to do char n grams

In [21]:
# clean corpus
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

file = open("raw/new-am.txt", "r", encoding="utf-8")
corpus = file.read()

# tokenize corpus (https://machinelearningmastery.com/clean-text-machine-learning-python/)
tokens = word_tokenize(corpus)
# remove all tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]

[nltk_data] Downloading package punkt to /Users/soma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
# train model to do ngram work
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1, 3), analyzer="char_wb")
cv_fit = cv.fit_transform(tokens)

In [23]:
print("Vocabulary size 1-3gram: ", len(cv.vocabulary_))

Vocabulary size 1-3gram:  102434


In [24]:
import numpy as np

ngram_list = cv.get_feature_names()
count_list = np.asarray(cv_fit.sum(axis=0))[0]

# make a dictionary with frequencies 
freq_dict = dict(zip(ngram_list,count_list))

# get unigram, bigram, trigram total counts
unigram_count = 0
bigram_count = 0
trigram_count = 0

for key in freq_dict.keys():
    if len(key)==1:
        unigram_count += freq_dict[key]
    elif len(key)==2:
        bigram_count += freq_dict[key]
    else:
        trigram_count += freq_dict[key]

print("Unigram Count: ", unigram_count)
print("Bigram Count: ", bigram_count)
print("Trigram Count: ", trigram_count)

Unigram Count:  4307773
Bigram Count:  3595134
Trigram Count:  2882495


## Actual Transliterating

In [32]:
import json

with open('lat_eth.json', 'w') as f:
    json.dump(lat_eth, f)

In [26]:
def get_breakdowns(segment):
    options = []
    max_len = min([len(segment), 4])
    for i in range(1, max_len+1):
        potential = segment[:i]
        if potential in lat_eth.keys():
            remainder = segment[i:]
            if remainder == "":
                options.append([potential])
            else:
                enders = get_breakdowns(remainder)
                if enders == []:
                    return []
                else:
                    options.extend([potential, *e] for e in enders)
    return options
    

breakdowns = get_breakdowns("inidihi")
breakdowns

[['i', 'n', 'i', 'd', 'i', 'h', 'i'],
 ['i', 'n', 'i', 'd', 'i', 'hi'],
 ['i', 'n', 'i', 'di', 'h', 'i'],
 ['i', 'n', 'i', 'di', 'hi'],
 ['i', 'ni', 'd', 'i', 'h', 'i'],
 ['i', 'ni', 'd', 'i', 'hi'],
 ['i', 'ni', 'di', 'h', 'i'],
 ['i', 'ni', 'di', 'hi']]

In [119]:
def convert(segmentation):
    final_list = []
    relevant_lists = []
    for elmt in segmentation:
        relevant_lists.append(lat_eth[elmt])
    for i in itertools.product(*relevant_lists):
        final_list.append(''.join(i))
    return final_list

In [120]:
eth_options = convert(['i', 'n', 'i', 'd', 'i', 'h', 'i'])
eth_options[0:5]

['ኢንኢድኢህኢ', 'ኢንኢድኢህእ', 'ኢንኢድኢህዒ', 'ኢንኢድኢህዕ', 'ኢንኢድኢሕኢ']

In [4]:
import json

with open("char2idx.json") as f:
    char2idx = json.load(f)

with open("idx2char.json") as f:
    idx2char = json.load(f)

In [29]:
import tensorflow as tf
import numpy as np
import os
import time
import keras

def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model = tf.keras.models.load_model("char_model", compile=False)

model.compile(optimizer='adam', loss=loss)


In [27]:
tf.__version__

'2.2.0-rc2'

In [28]:
tf.keras.__version__

'2.3.0-tf'

In [6]:
model.build(tf.TensorShape([1, None]))

In [9]:
model

<tensorflow.python.keras.engine.sequential.Sequential at 0x13eb97320>

In [33]:
start_string = "በ"
temperature = 1.0
num_generate = 1
input_eval = [char2idx[s] for s in start_string]
input_eval = tf.expand_dims(input_eval, 0)

model.reset_states()
predictions = model(input_eval)
predictions = tf.squeeze(predictions, 0)
predictions = predictions / temperature

predicted_id = np.argsort(predictions)[0, -3:]
predicted_id
print("".join(idx2char[predicted_id]))



ValueError: Tensor's shape (1, 64, 1024) is not compatible with supplied shape [1, 1, 1024]

In [31]:
# generate possible transliterations

import string
import itertools
import operator
import string 
import random

# given a sentence in latin characters, splits and sends word by word to the 
# function transliterate_word
def transliterate(text):
    sent_trans = []

    sentence = [i for j in text.split() for i in (j, ' ')][:-1]
    cleaned = []
    for elmt in sentence:
      elmt_tokenized = word_tokenize(elmt)
      if elmt == ' ':
        cleaned.append(' ')
      elif len(elmt) == len(elmt_tokenized):
        cleaned.append(elmt)
      else:
        for i in elmt_tokenized:
          if i == "'":
            elmt_tokenized.remove(i)
        cleaned += elmt_tokenized

    for word in cleaned:
      sent_trans.append(transliterate_word(word))
    
    return "".join(sent_trans)

    
# transliterate_word returns spaces/punctuations as appropriate
# and sends an actual latin character word to ngram_selected(word) to 
# obtain the appropriate transliterated word in ethiopic
def transliterate_word(word):
    print("transliterating the word")
    if word in string.punctuation and word not in lat_eth.keys():
      return word
    elif word.isnumeric() == True:
      return word
    elif word == " ":
      return word
    elif len(word) > 15:
      return word
    else:
      word = unidecode.unidecode(word).lower()
      print(f"Sending {word} to ngram_selected")
      return ngram_selected(word)

# ngram_selected takes a latin character word and generates all possible ethiopic
# transliterations by calling the function possibilities; it then selects the 
# ethiopic option with the highest score using the function word_score
def ngram_selected(word):
    options = possibilities(word)
    print('possibilities are', options)
    if len(options) == 0:
      return word
    else:
      scores = dict()
      for opt in options:
        score = word_score(opt)
        scores[opt] = score
      print('scores were calculated as', scores)
      selected_word = max(scores.items(), key=operator.itemgetter(1))[0]
      return selected_word

# the function possibilities takes a latin character word and returns all 
# possible transliterations into ethiopic based on the reverse dictionary
# this function calls the function prune to remove entries that are 
# not in an actual amharic dictionary (unless pruning results in 0 options)   
# this function also calls the function convert to go from latin char to 
# ethiopic char as based on the reverse dictionary
def possibilities(word):
    print("calculating possibilities")
    # split word into chars
    chars = list(word)
    
    # generate all combinations 
    # https://stackoverflow.com/questions/27263155/python-find-all-possible-
    # word-combinations-with-a-sequence-of-characters-word
    print("all combinations of", chars)
    combinatorics = itertools.product([True, False], repeat=len(chars) - 1)
    latin_segmentation = []
    add = True
    for combination in combinatorics:
        print("looking at", combination)
        i = 0
        one_such_combination = [chars[i]]
        for slab in combination:
            print("slab is", slab)
            i += 1
            if not slab: # there is a join
                one_such_combination[-1] += chars[i]
            else:
                one_such_combination += [chars[i]]
        
        for elmt in one_such_combination:
            print("Looking up", elmt)
            if elmt not in lat_eth.keys():
                add = False
                break
        # only add/consider if segmentation can be converted into ethiopic 
        # characters
        if add == True:  
            latin_segmentation.append(one_such_combination)
            
        # reset
        add = True
    
    # conversion
    ethiopic_opts = []
    for segmentation in latin_segmentation:
        ethiopic_opts += convert(segmentation)

    pruned = prune(ethiopic_opts)
    if len(pruned) == 0:
      if len(ethiopic_opts) < 100:
        return ethiopic_opts
      else:
        sampling = random.choices(ethiopic_opts, k=99)
        return sampling
    else:
      return pruned

# this is called by the function possibilities to convert from latin char
# to ethiopic char given a particular segmentation (i.e. i-di vs. i-d-i might
# both be sent separately)
def convert(segmentation):
    final_list = []
    relevant_lists = []
    for elmt in segmentation:
        relevant_lists.append(lat_eth[elmt])
    for i in itertools.product(*relevant_lists):
        final_list.append(''.join(i))
    return final_list

# this is called by the function possibilities to prune the list of possible
# ethiopic transliterations
def prune(possibilities):
    final_possibilities = []
    for candidate in possibilities: 
      if candidate in am_dic:
          final_possibilities.append(candidate)
    return final_possibilities

# this function is called by ngram_selected to determine the probability of
# an ethiopic word occurring (using ngram counts)
# this function calls get_ngrams to split the given word into n-length 
# subsections for scoring
# this function also calls one or multiple of the [n]gram_probability functions 
# to compute each [n]gram score, which are then weighted evenly in computing 
# the final score
def word_score(word):
  sequence = list(word)
  if len(word) >= 3:
    # calculate trigram probability
    w = 1/3.0
    trigrams = get_ngrams(sequence, 3)
    bigrams = get_ngrams(sequence, 2)
    unigrams = get_ngrams(sequence, 1)
    score_t = trigram_probability(trigrams)
    score_b = bigram_probability(bigrams)
    score_u = unigram_probability(unigrams)
    score = (w*score_t)+(w*score_b)+(w*score_u) 
  elif len(word) >= 2:
    # calculate bigram probability
    w = 1/2.0
    bigrams = get_ngrams(sequence, 2)
    unigrams = get_ngrams(sequence, 1)
    score_b = bigram_probability(bigrams)
    score_u = unigram_probability(unigrams)
    score = (w*score_b)+(w*score_u) 
  else:
    # calculate unigram probability
    unigrams = get_ngrams(sequence, 1)
    score = unigram_probability(word)
  return score

# called by the function word_score to generate n gram subsections 
# from a given ethiopic word
def get_ngrams(sequence, n):
    input = sequence
    output = []
    for i in range(0, len(input) - n + 1):
        output.append(input[i:i + n])
    
    return [''.join(l) for l in output]

# these funtions are called by word_score to compute [n]gram probabilities given
# an ethiopic word 
def trigram_probability(trigrams):
  freq = 0
  for t in trigrams:
    # get freq
    if t in freq_dict:
      freq += freq_dict[t]
  avg_prob = freq/(len(trigrams) * trigram_count)
  return avg_prob

def bigram_probability(bigrams):
  freq = 0
  for b in bigrams:
    # get freq
    if b in freq_dict:
      freq += freq_dict[b]
  avg_prob = freq/(len(bigrams) * bigram_count)
  return avg_prob

def unigram_probability(unigrams):
  freq = 0
  for u in unigrams:
    # get freq
    if u in freq_dict:
      freq += freq_dict[u]
  avg_prob = freq/(len(unigrams) * unigram_count)
  return avg_prob

In [32]:
ngram_selected("inidihi")

calculating possibilities
all combinations of ['i', 'n', 'i', 'd', 'i', 'h', 'i']
looking at (True, True, True, True, True, True)
slab is True
slab is True
slab is True
slab is True
slab is True
slab is True
Looking up i
Looking up n
Looking up i
Looking up d
Looking up i
Looking up h
Looking up i
looking at (True, True, True, True, True, False)
slab is True
slab is True
slab is True
slab is True
slab is True
slab is False
Looking up i
Looking up n
Looking up i
Looking up d
Looking up i
Looking up hi
looking at (True, True, True, True, False, True)
slab is True
slab is True
slab is True
slab is True
slab is False
slab is True
Looking up i
Looking up n
Looking up i
Looking up d
Looking up ih
looking at (True, True, True, True, False, False)
slab is True
slab is True
slab is True
slab is True
slab is False
slab is False
Looking up i
Looking up n
Looking up i
Looking up d
Looking up ihi
looking at (True, True, True, False, True, True)
slab is True
slab is True
slab is True
slab is False
s

'እንዲህ'

In [148]:
transliterate("inidihi sil . hulat sewochi liseliyu wade")
#እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ

'እንዲህ ስል ። ሁለት ሰዎች ሊጸልዩ ወደ'

In [149]:
transliterate("ijasusme . 'eweneti iwnat `elacehualehu")
#ኢየሱስም ። እውነት እውነት እላችኋለሁ

'ኢየሱስም ። እውንት እውነት እላችኋለሁ'

In [150]:
transliterate("abatune wejeme ina'tun ajakebirm tlalacihu ;")
#አባቱን ወይም እናቱን አያከብርም ትላላችሁ ፤ 

'አባቱን ወይም እናቱን አየከቢርም ትላላችሁ ፤'

In [151]:
transliterate("bamine 'ajenetim mote ymoti zendi `enidalawe")
#በምን ዓይነትም ሞት ይሞት ዘንድ እንዳለው

'በምን አይነትም ሞት ይሞት ዘንድ እንዳለው'

# TEST

In [166]:
! pip install python-levenshtein



In [0]:
# EVALUATION

import Levenshtein
from statistics import mean

# this function takes two files (parallel texts) and generates
# two lists after stripping beginning/trailing whitespace
def extract_from_files(ethiopic_file, latin_file):
  eth_file = open(ethiopic_file)
  lat_file = open(latin_file)
  ethiopic = [line.rstrip() for line in eth_file.readlines()]
  latin = [line.rstrip() for line in lat_file.readlines()] 
  
  ethiopic = list(ethiopic)
  latin = list(latin)

  return ethiopic, latin

# this function takes two parallel lists and evaluates how
# our model performs
def evaluate(ethiopic, latin):
  accuracies = []

  count = 0
  for line in latin:
    predict = transliterate(line)
    correct = ethiopic[count]
    accuracy = Levenshtein.ratio(predict,correct)
    accuracies.append(accuracy) 
    count += 1

  return mean(accuracies)

## first on google translated pairs

In [0]:
ethiopic_tot, latin_tot = extract_from_files("original.txt", "transliterated.txt")

In [0]:
ethiopic_2 = ethiopic_tot[0:10]
latin_2 = latin_tot[0:10]

In [177]:
evaluate(ethiopic_2, latin_2)

0.6636800710490158

In [0]:
ethiopic_3 = ethiopic_tot[0:100]
latin_3 = latin_tot[0:100]

In [179]:
evaluate(ethiopic_3, latin_3)

0.7332939181236338

In [0]:
ethiopic = ethiopic_tot[0:1000]
latin = latin_tot[0:1000]

In [99]:
evaluate(ethiopic,latin)

0.7471351453969316

In [100]:
num_words = 0

for line in ethiopic_tot[0:1000]:
  num_words += len(line)
num_words

81840

## now on manually generated parallel texts

In [0]:
ethiopic_manual, latin_manual = extract_from_files('taitu_am.txt','taitu_rom.txt')

In [171]:
evaluate(ethiopic_manual, latin_manual)

0.5986416548645752