In [38]:
import unidecode
import json
import string 
from tqdm import tqdm_notebook

# Read in mappings

In [3]:
# Load the two lookup tables. lat_eth is keyed by Latin-script segments
# (it is indexed that way by convert() further down). The files are opened
# as UTF-8 explicitly so decoding does not depend on the platform default.
with open("raw/lat_eth.json", encoding="utf-8") as f:
    lat_eth = json.load(f)

with open("raw/eth_lat.json", encoding="utf-8") as f:
    eth_lat = json.load(f)

# Build corpus

In [17]:
# load dictionary to prune transliteration options
# Each line contributes one entry with trailing whitespace stripped. The
# context manager guarantees the handle is closed even if reading fails.
with open("am_dic.txt", "r", encoding="utf-8") as am_dic_file:
    am_dic = {w.rstrip() for w in am_dic_file}

In [18]:
# Collect every whitespace-separated token of AMH-wiki-tok.txt, skipping
# tokens that are punctuation (same `in string.punctuation` test as before).
am_dic_2 = []

with open("AMH-wiki-tok.txt", "r", encoding="utf-8") as am_dic_file_2:
    for w in am_dic_file_2:
        # Build a filtered list instead of calling words.remove() while
        # looping over words: removing during iteration shifts the list and
        # skips the token right after each removed one, so runs of
        # punctuation were only partially dropped.
        am_dic_2 += [tok for tok in w.split() if tok not in string.punctuation]

In [19]:
# fold the tokens gathered in am_dic_2 into the pruning dictionary
am_dic = am_dic | set(am_dic_2)
len(am_dic)

231333

## Use CountVectorizer() to build character n-grams

In [20]:
# clean corpus
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Read the raw corpus inside a context manager so the file handle is closed
# (the previous version opened it and never closed it).
with open("raw/new-am.txt", "r", encoding="utf-8") as file:
    corpus = file.read()

# tokenize corpus (https://machinelearningmastery.com/clean-text-machine-learning-python/)
# and pool the result with the tokens in am_dic_2; set() drops duplicates
tokens = list(set(word_tokenize(corpus) + am_dic_2))
# remove all tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]

[nltk_data] Downloading package punkt to /Users/soma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
# fit a character n-gram counter (n = 1..3, "char_wb" analyzer) on the tokens
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(analyzer="char_wb", ngram_range=(1, 3))
cv_fit = cv.fit_transform(tokens)

In [22]:
# number of distinct character n-grams the fitted vectorizer holds
vocabulary_size = len(cv.vocabulary_)
print("Vocabulary size 1-3gram: ", vocabulary_size)

Vocabulary size 1-3gram:  207249


In [23]:
import numpy as np

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2.
# Use get_feature_names_out() when it exists and fall back to the old
# method on older installs; list() keeps ngram_list a plain list either way.
if hasattr(cv, "get_feature_names_out"):
  ngram_list = list(cv.get_feature_names_out())
else:
  ngram_list = cv.get_feature_names()

# column sums of the document-term matrix = corpus-wide count per n-gram
count_list = np.asarray(cv_fit.sum(axis=0))[0]

# make a dictionary with frequencies
freq_dict = dict(zip(ngram_list, count_list))

# get unigram, bigram, trigram total counts
# NOTE(review): scikit-learn documents that the "char_wb" analyzer pads
# word edges with spaces, so space-padded n-grams are presumably part of
# these totals too -- confirm that is intended.
unigram_count = 0
bigram_count = 0
trigram_count = 0

for ngram, count in freq_dict.items():
  if len(ngram) == 1:
    unigram_count += count
  elif len(ngram) == 2:
    bigram_count += count
  else:
    # anything that is not length 1 or 2 is counted as a trigram
    trigram_count += count

print("Unigram Count: ", unigram_count)
print("Bigram Count: ", bigram_count)
print("Trigram Count: ", trigram_count)

Unigram Count:  1585420
Bigram Count:  1365220
Trigram Count:  1145020


## Actual Transliterating

In [59]:
# generate possible transliterations

import string
import itertools
import operator
import string 
import random

def transliterate(text):
    """Transliterate a Latin-script sentence, one token at a time.

    The text is split on whitespace; each piece is run through
    word_tokenize, and every resulting token is sent to transliterate_word.
    Pieces are re-joined with single spaces.

    Args:
        text: sentence in Latin characters.

    Returns:
        The concatenation of the transliterate_word results ("" for empty
        or whitespace-only text).
    """
    # interleave the pieces with single-space separators: w1, ' ', w2, ...
    sentence = [i for j in text.split() for i in (j, ' ')][:-1]

    cleaned = []
    for elmt in sentence:
        if elmt == ' ':
            # separators are passed through untouched; no need to tokenize
            cleaned.append(' ')
            continue

        elmt_tokenized = word_tokenize(elmt)
        # NOTE(review): this compares the number of characters with the
        # number of tokens, so it only holds when every token is a single
        # character -- confirm that is the intended test.
        if len(elmt) == len(elmt_tokenized):
            cleaned.append(elmt)
        else:
            # Drop stand-alone apostrophe tokens. This is a filter rather
            # than remove()-while-iterating, which skipped the element after
            # each removal and left consecutive apostrophes behind.
            cleaned += [tok for tok in elmt_tokenized if tok != "'"]

    return "".join(transliterate_word(word) for word in cleaned)

    
def transliterate_word(word):
    """Transliterate a single token, passing non-words through unchanged.

    Returned as-is: tokens found in string.punctuation that are not keys of
    lat_eth, numeric tokens, the single space, and tokens longer than 15
    characters. Everything else is ASCII-folded with unidecode, lower-cased
    and handed to ngram_selected.
    """
    plain_punctuation = word in string.punctuation and word not in lat_eth
    if plain_punctuation or word.isnumeric() or word == " " or len(word) > 15:
        return word
    return ngram_selected(unidecode.unidecode(word).lower())

# memo of Latin word -> chosen output, shared across calls
cached_best = {}
def ngram_selected(word):
    """Pick the best-scoring Ethiopic candidate for a Latin-script word.

    Candidates come from possibilities(word) and are ranked with
    word_score; the first candidate with the highest score wins. When there
    are no candidates the word itself is returned.

    Every outcome is stored in cached_best, including the no-candidate
    fallback, so the candidate generation in possibilities() is not
    repeated for a word that has already been seen.
    """
    if word in cached_best:
        return cached_best[word]

    options = possibilities(word)
    if options:
        # max() keeps the first of several equally scored candidates
        selected_word = max(options, key=word_score)
    else:
        selected_word = word

    cached_best[word] = selected_word
    return selected_word

def possibilities(word):
    """Return candidate Ethiopic spellings for a Latin-script word.

    The word is cut into consecutive segments that are all keys of lat_eth;
    each such segmentation is expanded with convert(). The combined
    candidates are filtered with prune(); if nothing survives, the unpruned
    candidates are returned instead (a random draw of 99 when there are 100
    or more).

    Args:
        word: Latin-script word.

    Returns:
        A list of candidate strings; empty when the word is empty or cannot
        be segmented into lat_eth keys.
    """
    # An empty word has no segmentation. The old enumeration crashed here
    # (negative repeat for itertools.product, chars[0] on an empty list).
    if not word:
        return []

    latin_segmentation = _valid_segmentations(word)

    # conversion ******************************
    ethiopic_opts = []
    for segmentation in latin_segmentation:
        ethiopic_opts += convert(segmentation)

    pruned = prune(ethiopic_opts)
    if len(pruned) != 0:
        return pruned
    if len(ethiopic_opts) < 100:
        return ethiopic_opts
    # NOTE(review): random.choices draws with replacement, so the 99
    # sampled candidates may contain repeats -- confirm that is acceptable.
    return random.choices(ethiopic_opts, k=99)


def _valid_segmentations(word, start=0):
    """List every split of word[start:] into pieces that are lat_eth keys.

    Shorter leading pieces are tried first, which is the same order in which
    the previous itertools.product([True, False], ...) enumeration produced
    its valid segmentations. Dead-end prefixes are abandoned immediately
    instead of building all 2**(len(word) - 1) splits and filtering them.
    """
    if start == len(word):
        return [[]]

    found = []
    for end in range(start + 1, len(word) + 1):
        piece = word[start:end]
        if piece in lat_eth:
            for rest in _valid_segmentations(word, end):
                found.append([piece] + rest)
    return found


def convert(segmentation):
    """Expand one Latin segmentation into all of its Ethiopic spellings.

    Each segment is looked up in lat_eth, and one entry is taken from every
    lookup result in every possible combination (so i-di and i-d-i are
    expanded independently when passed separately). The chosen entries are
    concatenated in segment order.
    """
    choices_per_segment = [lat_eth[piece] for piece in segmentation]
    return [''.join(combo) for combo in itertools.product(*choices_per_segment)]

def prune(possibilities):
    """Keep only the candidates that appear in am_dic.

    Order and duplicates of the input list are preserved.
    """
    return [candidate for candidate in possibilities if candidate in am_dic]

# memo of Ethiopic word -> score, shared across calls
cached = {}

def word_score(word):
  """Score an Ethiopic candidate word from character n-gram frequencies.

  The word is cut into n-grams with get_ngrams and scored with the
  [n]gram_probability functions. Words of 3+ characters use trigram, bigram
  and unigram scores weighted 1/3 each; 2-character words use bigram and
  unigram scores weighted 1/2 each; 1-character words use the unigram score
  alone. An empty word scores 0.0. Results are memoized in `cached`.

  Args:
    word: candidate word.

  Returns:
    The combined score.
  """
  if word in cached:
    return cached[word]

  sequence = list(word)
  if not sequence:
    # nothing to score; also avoids dividing by a zero n-gram count
    score = 0.0
  elif len(word) >= 3:
    # calculate trigram probability
    w = 1/3.0
    trigrams = get_ngrams(sequence, 3)
    bigrams = get_ngrams(sequence, 2)
    unigrams = get_ngrams(sequence, 1)
    score_t = trigram_probability(trigrams)
    score_b = bigram_probability(bigrams)
    score_u = unigram_probability(unigrams)
    score = (w*score_t)+(w*score_b)+(w*score_u)
  elif len(word) >= 2:
    # calculate bigram probability
    w = 1/2.0
    bigrams = get_ngrams(sequence, 2)
    unigrams = get_ngrams(sequence, 1)
    score_b = bigram_probability(bigrams)
    score_u = unigram_probability(unigrams)
    score = (w*score_b)+(w*score_u)
  else:
    # calculate unigram probability; pass the unigram list, like the other
    # branches do, instead of the raw word
    unigrams = get_ngrams(sequence, 1)
    score = unigram_probability(unigrams)

  cached[word] = score
  return score

def get_ngrams(sequence, n):
    """Return every run of n consecutive items of sequence, joined to a string.

    Used by word_score to cut an Ethiopic word (given as a list of
    characters) into its n-length subsections. The result is empty when the
    sequence is shorter than n.
    """
    window_count = len(sequence) - n + 1
    return [''.join(sequence[start:start + n]) for start in range(window_count)]

# these functions are called by word_score to compute [n]gram probabilities
# given the n-grams of an ethiopic word; they share one implementation and
# differ only in the corpus total they normalise by
def _average_ngram_probability(ngrams, total_count):
  """Sum the freq_dict counts of ngrams, divided by len(ngrams) * total_count.

  N-grams missing from freq_dict count as zero. Returns 0.0 when there are
  no n-grams or the total is zero, instead of dividing by zero.
  """
  denominator = len(ngrams) * total_count
  if denominator == 0:
    return 0.0
  freq = sum(freq_dict.get(gram, 0) for gram in ngrams)
  return freq / denominator

def trigram_probability(trigrams):
  """Average relative frequency of the given trigrams (see helper above)."""
  return _average_ngram_probability(trigrams, trigram_count)

def bigram_probability(bigrams):
  """Average relative frequency of the given bigrams (see helper above)."""
  return _average_ngram_probability(bigrams, bigram_count)

def unigram_probability(unigrams):
  """Average relative frequency of the given unigrams (see helper above)."""
  return _average_ngram_probability(unigrams, unigram_count)

In [54]:
# spot check on a sample sentence
transliterate("inidihi sil . hulat sewochi liseliyu wade")
# reference (presumably the expected output): እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ

'እንድህ ስል ። ሁለት ስዎች ሊጸልዩ ወደ'

In [26]:
# spot check with apostrophe / backtick marks in the input
transliterate("ijasusme . 'eweneti iwnat `elacehualehu")
# reference (presumably the expected output): ኢየሱስም ። እውነት እውነት እላችኋለሁ

'እየሱስም ። እውንት እውነት እላችኋለሁ'

In [27]:
# spot check with a word-internal apostrophe and trailing punctuation
transliterate("abatune wejeme ina'tun ajakebirm tlalacihu ;")
# reference (presumably the expected output): አባቱን ወይም እናቱን አያከብርም ትላላችሁ ፤

'አባቱን ውይም እናቱን አያከብርም ትላላችሁ ፤'

In [28]:
# spot check on another sample sentence
transliterate("bamine 'ajenetim mote ymoti zendi `enidalawe")
# reference (presumably the expected output): በምን ዓይነትም ሞት ይሞት ዘንድ እንዳለው

'በምን አይነትም ሞት ይሞት ዘንድ እንዳለው'

# TEST

In [29]:
# Requires the python-levenshtein package (install with: pip install python-levenshtein)

In [36]:
# EVALUATION

import Levenshtein
from statistics import mean

def extract_from_files(ethiopic_file, latin_file):
  """Read two parallel text files into two lists of lines.

  Trailing whitespace (including the newline) is stripped from every line;
  leading whitespace is kept. Both files are read as UTF-8 and closed
  before returning.

  Args:
    ethiopic_file: path of the Ethiopic-script file.
    latin_file: path of the Latin-script file.

  Returns:
    A tuple (ethiopic, latin) of lists of strings, one entry per line.
  """
  with open(ethiopic_file, encoding="utf-8") as eth_file:
    ethiopic = [line.rstrip() for line in eth_file]
  with open(latin_file, encoding="utf-8") as lat_file:
    latin = [line.rstrip() for line in lat_file]

  return ethiopic, latin

def evaluate(ethiopic, latin):
  """Score the transliterator on parallel lists of lines.

  Each Latin line is transliterated and compared with its Ethiopic
  reference using Levenshtein.ratio. The per-line ratios are combined as a
  weighted sum, where a line's weight is its reference length divided by the
  total length of all references.

  Args:
    ethiopic: list of reference lines.
    latin: list of Latin-script lines, aligned with `ethiopic`.

  Returns:
    The length-weighted sum of ratios; 0.0 when the references are empty.

  Raises:
    ValueError: if the two lists differ in length. Previously a shorter
      `latin` silently lowered the score and a longer one ended in an
      IndexError.
  """
  if len(ethiopic) != len(latin):
    raise ValueError(
        "ethiopic and latin must have the same number of lines "
        "(got %d and %d)" % (len(ethiopic), len(latin)))

  total_length = sum(len(line) for line in ethiopic)
  if total_length == 0:
    # no reference characters: every weight would be a division by zero
    return 0.0

  accuracies = []
  # tqdm_notebook goes first so the progress bar is driven to completion
  for line, correct in zip(tqdm_notebook(latin), ethiopic):
    predict = transliterate(line)
    weight = len(correct) / total_length
    accuracies.append(Levenshtein.ratio(predict, correct) * weight)

  return sum(accuracies)

## First, on Google-translated pairs

In [33]:
# read the parallel line lists: Ethiopic file first, Latin file second
ethiopic_tot, latin_tot = extract_from_files("raw/original.txt", "raw/transliterated.txt")

In [34]:
# first 10 line pairs
ethiopic_2, latin_2 = ethiopic_tot[:10], latin_tot[:10]

In [62]:
# weighted Levenshtein-ratio score on the 10-pair subset
evaluate(ethiopic_2, latin_2)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.7700315602130576

In [63]:
# first 100 line pairs
ethiopic_3, latin_3 = ethiopic_tot[:100], latin_tot[:100]

In [64]:
# score on the 100-pair subset
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt, so no
# score is shown for it
evaluate(ethiopic_3, latin_3)

HBox(children=(IntProgress(value=0), HTML(value='')))

KeyboardInterrupt: 

In [None]:
# first 1000 line pairs, scored in the same cell
ethiopic_large, latin_large = ethiopic_tot[:1000], latin_tot[:1000]
evaluate(ethiopic_large, latin_large)

## Now, on manually generated parallel texts

In [57]:
# load the "taitu" text pair and score it
ethiopic_taitu, latin_taitu = extract_from_files(
    ethiopic_file='taitu_am.txt',
    latin_file='taitu_rom.txt',
)
taitu_score = evaluate(ethiopic=ethiopic_taitu, latin=latin_taitu)
taitu_score

0


0.4663842474737726

In [58]:
# load the "sera" text pair and score it
ethiopic_sera, latin_sera = extract_from_files(
    ethiopic_file='sera_am.txt',
    latin_file='sera_rom.txt',
)
sera_score = evaluate(ethiopic=ethiopic_sera, latin=latin_sera)
sera_score

0
100


0.6770958851877996

In [59]:
# load the "tewodros" text pair and score it
ethiopic_tewodros, latin_tewodros = extract_from_files(
    ethiopic_file='tewodros_am.txt',
    latin_file='tewodros_rom.txt',
)
tewodros_score = evaluate(ethiopic=ethiopic_tewodros, latin=latin_tewodros)
tewodros_score

0


0.7004329720057354

In [60]:
# load the "lords_prayer" text pair and score it
ethiopic_prayer, latin_prayer = extract_from_files(
    ethiopic_file='lords_prayer_am.txt',
    latin_file='lords_prayer_rom.txt',
)
prayer_score = evaluate(ethiopic=ethiopic_prayer, latin=latin_prayer)
prayer_score

0


0.6854984422936918

In [61]:
# load the "yasstesseriyal" text pair and score it
ethiopic_yasstesseriyal, latin_yasstesseriyal = extract_from_files(
    ethiopic_file='yasstesseriyal_am.txt',
    latin_file='yasstesseriyal_rom.txt',
)
yasstesseriyal_score = evaluate(ethiopic=ethiopic_yasstesseriyal,
                                latin=latin_yasstesseriyal)
yasstesseriyal_score

0


0.5501152019011929

In [62]:
# unweighted mean of the five per-text scores
average_manual = mean([
    sera_score,
    yasstesseriyal_score,
    tewodros_score,
    prayer_score,
    taitu_score,
])
average_manual

0.6159053497724385