In [1]:
#!pip install python-levenshtein

In [2]:
import pandas as pd
import re
import Levenshtein
from statistics import mean
import tensorflow as tf
import numpy as np
import os
import time
import unidecode
import json
import string 
from tqdm import tqdm_notebook

# Conversion tables

In [3]:
with open('lat_eth.json') as f:
    lat_eth = json.load(f)

with open('eth_lat.json') as f:
    eth2lat = json.load(f)

with open('char2idx.json') as f:
    char2idx = json.load(f)

with open('idx2char.json') as f:
    idx2char = np.array(json.load(f))


# Character n-grams

## Build corpus

Read in Amharic dictionary and Amharic Wikipedia, combine into corpus.

In [4]:
# load dictionary to prune transliteration options 
am_dic_file = open("am_dic.txt", "r", encoding="utf-8")
am_dic = []

for w in am_dic_file.readlines():
    am_dic.append(w.rstrip())

am_dic_file.close()
am_dic = set(am_dic)

# Read in Wikipedia
am_dic_file_2 = open("AMH-wiki-tok.txt", "r", encoding="utf-8")
am_dic_2 = []

for w in am_dic_file_2.readlines():
  line = w.rstrip()
  words = line.split()
  for i in words:
    if i in string.punctuation:
      words.remove(i)
  am_dic_2 += words

am_dic = am_dic.union(set(am_dic_2))
len(am_dic)

231333

## Compute char n-grams

In [5]:
# clean corpus
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

file = open("raw/new-am.txt", "r", encoding="utf-8")
corpus = file.read()

# tokenize corpus (https://machinelearningmastery.com/clean-text-machine-learning-python/)
tokens = list(set(word_tokenize(corpus) + am_dic_2))
# remove all tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]

[nltk_data] Downloading package punkt to /Users/soma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# train model to do ngram work
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1, 3), analyzer="char_wb")
cv_fit = cv.fit_transform(tokens)

print("Vocabulary size 1-3gram: ", len(cv.vocabulary_))

Vocabulary size 1-3gram:  207249


In [7]:
import numpy as np

ngram_list = cv.get_feature_names()
count_list = np.asarray(cv_fit.sum(axis=0))[0]

# make a dictionary with frequencies 
freq_dict = dict(zip(ngram_list,count_list))

# get unigram, bigram, trigram total counts
unigram_count = 0
bigram_count = 0
trigram_count = 0

for key in freq_dict.keys():
  if len(key)==1:
    unigram_count += freq_dict[key]
  elif len(key)==2:
    bigram_count += freq_dict[key]
  else:
    trigram_count += freq_dict[key]

print("Unigram Count: ", unigram_count)
print("Bigram Count: ", bigram_count)
print("Trigram Count: ", trigram_count)

Unigram Count:  1585420
Bigram Count:  1365220
Trigram Count:  1145020


## Transliteration tools

In [8]:
# generate possible transliterations

import string
import itertools
import operator
import string 
import random

# given a sentence in latin characters, splits and sends word by word to the 
# function transliterate_word
def char_ngram_transliterate(text):
    sent_trans = []

    sentence = [i for j in text.split() for i in (j, ' ')][:-1]
    cleaned = []
    for elmt in sentence:
      elmt_tokenized = word_tokenize(elmt)
      if elmt == ' ':
        cleaned.append(' ')
      elif len(elmt) == len(elmt_tokenized):
        cleaned.append(elmt)
      else:
        for i in elmt_tokenized:
          if i == "'":
            elmt_tokenized.remove(i)
        cleaned += elmt_tokenized

    for word in cleaned:
      sent_trans.append(char_ngram_transliterate_word(word))
    
    return "".join(sent_trans)

    
# transliterate_word returns spaces/punctuations as appropriate
# and sends an actual latin character word to ngram_selected(word) to 
# obtain the appropriate transliterated word in ethiopic
def char_ngram_transliterate_word(word):
    if word in string.punctuation and word not in lat_eth.keys():
      return word
    elif word.isnumeric() == True:
      return word
    elif word == " ":
      return word
    elif len(word) > 15:
      return word
    else:
      word = unidecode.unidecode(word).lower()
      return ngram_selected(word)

# ngram_selected takes a latin character word and generates all possible ethiopic
# transliterations by calling the function possibilities; it then selects the 
# ethiopic option with the highest score using the function word_score
char_ngram_cached_best = {}
def ngram_selected(word):
    cache_key = word
    if cache_key in char_ngram_cached_best.keys():
        return char_ngram_cached_best[cache_key]
    options = possibilities(word)
    if len(options) == 0:
      return word
    else:
      scores = dict()
      for opt in options:
        score = word_score(opt)
        scores[opt] = score
      selected_word = max(scores.items(), key=operator.itemgetter(1))[0]
#       char_ngram_cached_best[cache_key] = selected_word
      return selected_word

# the function possibilities takes a latin character word and returns all 
# possible transliterations into ethiopic based on the reverse dictionary
# this function calls the function prune to remove entries that are 
# not in an actual amharic dictionary (unless pruning results in 0 options)   
# this function also calls the function convert to go from latin char to 
# ethiopic char as based on the reverse dictionary
def possibilities(word):
    # split word into chars
    chars = list(word)
    
    # generate all combinations 
    # https://stackoverflow.com/questions/27263155/python-find-all-possible-
    # word-combinations-with-a-sequence-of-characters-word
    combinatorics = itertools.product([True, False], repeat=len(chars) - 1)
    latin_segmentation = []
    add = True
    for combination in combinatorics:
        i = 0
        one_such_combination = [chars[i]]
        for slab in combination:
            i += 1
            if not slab: # there is a join
                one_such_combination[-1] += chars[i]
            else:
                one_such_combination += [chars[i]]
        
        for elmt in one_such_combination:
            if elmt not in lat_eth.keys():
                add = False
                break
        # only add/consider if segmentation can be converted into ethiopic 
        # characters
        if add == True:  
            latin_segmentation.append(one_such_combination)
            
        # reset
        add = True
    
    # conversion ******************************
    ethiopic_opts = []
    for segmentation in latin_segmentation:
        ethiopic_opts += convert(segmentation)

    pruned = prune(ethiopic_opts)
    if len(pruned) == 0:
      if len(ethiopic_opts) < 100:
        return ethiopic_opts
      else:
        sampling = random.choices(ethiopic_opts, k=99)
        return sampling
    else:
      return pruned


# this is called by the function possibilities to convert from latin char
# to ethiopic char given a particular segmentation (i.e. i-di vs. i-d-i might
# both be sent separately)
def convert(segmentation):
    final_list = []
    relevant_lists = []
    for elmt in segmentation:
        relevant_lists.append(lat_eth[elmt])
    for i in itertools.product(*relevant_lists):
        final_list.append(''.join(i))
    return final_list

# this is called by the function possibilities to prune the list of possible
# ethiopic transliterations
def prune(possibilities):
    final_possibilities = []
    for candidate in possibilities: 
      if candidate in am_dic:
          final_possibilities.append(candidate)
    return final_possibilities

# this function is called by ngram_selected to determine the probability of
# an ethiopic word occurring (using ngram counts)
# this function calls get_ngrams to split the given word into n-length 
# subsections for scoring
# this function also calls one or multiple of the [n]gram_probability functions 
# to compute each [n]gram score, which are then weighted evenly in computing 
# the final score


def word_score(word):
  sequence = list(word)
  if len(word) >= 3:
    # calculate trigram probability
    w = 1/3.0
    trigrams = get_ngrams(sequence, 3)
    bigrams = get_ngrams(sequence, 2)
    unigrams = get_ngrams(sequence, 1)
    score_t = trigram_probability(trigrams)
    score_b = bigram_probability(bigrams)
    score_u = unigram_probability(unigrams)
    score = (w*score_t)+(w*score_b)+(w*score_u) 
  elif len(word) >= 2:
    # calculate bigram probability
    w = 1/2.0
    bigrams = get_ngrams(sequence, 2)
    unigrams = get_ngrams(sequence, 1)
    score_b = bigram_probability(bigrams)
    score_u = unigram_probability(unigrams)
    score = (w*score_b)+(w*score_u) 
  else:
    # calculate unigram probability
    unigrams = get_ngrams(sequence, 1)
    score = unigram_probability(word)

  return score

# called by the function word_score to generate n gram subsections 
# from a given ethiopic word
def get_ngrams(sequence, n):
    input = sequence
    output = []
    for i in range(0, len(input) - n + 1):
        output.append(input[i:i + n])
    
    return [''.join(l) for l in output]

# these funtions are called by word_score to compute [n]gram probabilities given
# an ethiopic word 
def trigram_probability(trigrams):
  freq = 0
  for t in trigrams:
    # get freq
    if t in freq_dict:
      freq += freq_dict[t]
  avg_prob = freq/(len(trigrams) * trigram_count)
  return avg_prob

def bigram_probability(bigrams):
  freq = 0
  for b in bigrams:
    # get freq
    if b in freq_dict:
      freq += freq_dict[b]
  avg_prob = freq/(len(bigrams) * bigram_count)
  return avg_prob

def unigram_probability(unigrams):
  freq = 0
  for u in unigrams:
    # get freq
    if u in freq_dict:
      freq += freq_dict[u]
  avg_prob = freq/(len(unigrams) * unigram_count)
  return avg_prob

## Example usage

In [9]:
char_ngram_transliterate("inidihi sil . hulat sewochi liseliyu wade")
#እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ

'እንድህ ስል ። ሁለት ስዎች ሊጸልዩ ወደ'

In [10]:
char_ngram_transliterate("ijasusme . 'eweneti iwnat `elacehualehu")
#ኢየሱስም ። እውነት እውነት እላችኋለሁ

'እየሱስም ። እውንት እውነት እላችኋለሁ'

In [11]:
char_ngram_transliterate("abatune wejeme ina'tun ajakebirm tlalacihu ;")
#አባቱን ወይም እናቱን አያከብርም ትላላችሁ ፤ 

'አባቱን ውይም እናቱን አያከብርም ትላላችሁ ፤'

In [12]:
char_ngram_transliterate("bamine 'ajenetim mote ymoti zendi `enidalawe")
#በምን ዓይነትም ሞት ይሞት ዘንድ እንዳለው

'በምን አይነትም ሞት ይሞት ዘንድ እንዳለው'

# Word n-gram model

## Build corpus

In [13]:
punctuation = ["፤", ".", "»", "(", ")", "/", "።", "’", "“", "፣", "!", "”", "‘",\
               "፦", "’", "፡", "&", "፥", "፧"]
def tokenize(word): 
  clean_word = ""
  for n in range(len(word)):
    if word[n] in punctuation:
      clean_word += " "
    elif word[n].isdigit():
      clean_word += " "
    else:
      clean_word += word[n]
  
  clean_word = clean_word.strip()
  clean_words = clean_word.split()
  return clean_words

new_am = open("raw/new-am.txt", "r")
corpus = open("corpus.txt", "w") # corpus = cleaned new_am

unique_words = set()

for line in new_am.readlines():
  words = line.split()
  clean_words = []
  for n in range(len(words)):
    word = words[n]
    if word.isdigit():
      continue
    else: 
      word_tokenized = tokenize(word)
      for w in word_tokenized:
        clean_words.append(w)
        unique_words.add(w)
  corpus.write(" ".join(clean_words))
  corpus.write("\n")

corpus.close()
new_am.close()

corpus_dic = open("corpus_dic.txt", "w")
for word in unique_words:
  corpus_dic.write(word)
  corpus_dic.write("\n")
corpus_dic.close()

In [14]:
from nltk.util import ngrams

def extract_ngrams(corpus_file):
  corpus = open(corpus_file, "r", encoding="utf-8")

  uni_dict = {}
  bi_dict = {}
  tri_dict = {}
  ngram_totals = {'uni': 0, 'bi': 0, 'tri': 0}
  for line in corpus.readlines():
    words = line.split()

    # Compute ngrams
    unigrams = ngrams(words, 1)
    bigrams = ngrams(words, 2)
    trigrams = ngrams(words, 3)

    # Calculate frequency of each ngram unit
    for uni_tuple in unigrams:
      uni = uni_tuple[0]
      if uni in uni_dict:
        uni_dict[uni] += 1
      else: 
        uni_dict[uni] = 1
      ngram_totals['uni'] += 1
      
    for bi in bigrams:
      if bi in bi_dict:
        bi_dict[bi] += 1
      else:
        bi_dict[bi] = 1
      ngram_totals['bi'] += 1

    for tri in trigrams:
      if tri in tri_dict:
        tri_dict[tri] += 1
      else:
        tri_dict[tri] = 1
      ngram_totals['tri'] += 1

  return uni_dict, bi_dict, tri_dict, ngram_totals

In [15]:
uni_dict, bi_dict, tri_dict, ngram_totals = extract_ngrams("corpus.txt")



In [16]:
print("unigram dictionary:", len(uni_dict))
print("bigram dictionary:", len(bi_dict))
print("trigram dictionary:", len(tri_dict))

ngram_dict = [uni_dict, bi_dict, tri_dict]

unigram dictionary: 119481
bigram dictionary: 492127
trigram dictionary: 601817


In [17]:
def calculate_ngram_prob(ngram_dict, ngram_totals, vocab_dict):
  trigrams = ngram_dict[2]
  bigrams = ngram_dict[1]
  unigrams = ngram_dict[0]

  v = len(vocab_dict)

  # trigram prob
  tri_prob_dict = {}
  for (tri, frq) in trigrams.items():
    b = tri[0], tri[1]
    prob = frq + 1/ bigrams[b] + v # add 1 smoothing
    tri_prob_dict[tri] = prob
  
  # bigram prob
  bi_prob_dict = {}
  for (bi, frq) in bigrams.items():
    u = bi[0]
    prob = frq / unigrams[u]
    bi_prob_dict[bi] = prob

  # unigram prob
  uni_prob_dict = {}
  for (uni, frq) in unigrams.items():
    prob = frq / ngram_totals['uni']
    uni_prob_dict[uni] = prob

  return uni_prob_dict, bi_prob_dict, tri_prob_dict

In [18]:
uni_prob_dict, bi_prob_dict, tri_prob_dict = calculate_ngram_prob(ngram_dict, ngram_totals, unique_words)

## Transliteration tools

In [19]:
def extract_word_probabilities(sen, ngram_dict_probabilities):
  probability = {}
  token = sen.split()

  # trigram prob
  tri_dict_prob = ngram_dict_probabilities[2]
  if len(token) >= 3: 
    tri = (token[-3], token[-2], token[-1])
    if tri in tri_dict_prob:
      probability[tri] = tri_dict_prob[tri]
  
  # bigram prob
  bi_dict_prob = ngram_dict_probabilities[1]
  if len(token) >= 2:
    bi = (token[-2], token[-1])
    if bi in bi_dict_prob:
      probability[bi] = bi_dict_prob[bi]

  # unigram prob
  uni_dict_prob = ngram_dict_probabilities[0]
  if len(token) >= 1:
    uni  = token[-1]
    if uni in uni_dict_prob:
      probability = uni_dict_prob[uni]

In [20]:
import operator

# Takes in a text of sentence and returns the highest likelihood
# transliterated sequence.
def word_ngram_transliterate(sentence):
  tokens = sentence.split()
  transliterated_tokens = {}
  for token in tokens:
    all_candidates = possibilities(token)
    valid_candidates = prune(all_candidates)
    if len(valid_candidates) == 0 and len(all_candidates) == 0:
      print(token)
    if token not in transliterated_tokens:
      transliterated_tokens[token] = valid_candidates

  transliterated_sentence = calculate_best_sequence(tokens, transliterated_tokens)

  return transliterated_sentence

# Takes a list of words in a sentence and a list of all the words' possible
# candidates and determines the best sequence of transliterated words.
def calculate_best_sequence(tokens, transliterated_tokens):
  # Create all possible sentences.
  sentence_candidates = create_all_sentences(tokens, transliterated_tokens)

  # Score all possible sentences. 
  all_scores = {}
  for sentence in sentence_candidates:
    score = calculate_sentence_score(sentence)
    all_scores[sentence] = score

  if len(sentence_candidates) == 0:
    return " "

  return max(all_scores.items(), key=operator.itemgetter(1))[0]

# Creates all possible sequences of words and returns a list. 
def create_all_sentences(tokens, transliterated_tokens):
  possible_sentences = {}
  for n in range(len(tokens)): 
    if n == 0:
      possible_sentences[tokens[n]] = [t for t in transliterated_tokens[tokens[n]]]
    else: 
      for t in possible_sentences[tokens[n-1]]:
        for next_t in transliterated_tokens[tokens[n]]:
          new_t = t + " " + next_t
          if tokens[n] in possible_sentences:
            possible_sentences[tokens[n]].append(new_t)
          else:
            possible_sentences[tokens[n]] = [new_t]
      # If transliterated_tokens[tokens[n]] is empty add content
      # from tokens[n-1]
      if possible_sentences.get(tokens[n]) is None:
        possible_sentences[tokens[n]] = possible_sentences[tokens[n-1]]
  return possible_sentences[tokens[len(tokens)-1]]


def calculate_ngram_score(ngram, ngram_dict_prob):
  ngram_list = list(ngram)
  avg_ngram_score = 0
  for ngram in ngram_list:
    if ngram in ngram_dict_prob:
      avg_ngram_score = ngram_dict_prob[ngram]
  return avg_ngram_score/len(ngram_list)

# Takes a sentence (sequential list of words) and returns a likelihood score
# based on its ngram sequences.
def calculate_sentence_score(sentence):
  unigrams = ngrams(sentence, 1)
  bigrams = ngrams(sentence, 2)
  trigrams = ngrams(sentence, 3)

  score = 0
  tri_weight = 2.0
  bi_weight = 1.0
  uni_weight = 0.5

  trigram_score = tri_weight * calculate_ngram_score(trigrams, tri_prob_dict)
  bigram_score = bi_weight * calculate_ngram_score(bigrams, bi_prob_dict)
  unigram_score = uni_weight * calculate_ngram_score(unigrams, uni_prob_dict)

  score += trigram_score + bigram_score + unigram_score
  score = score / len(sentence)

  return score

### Example usage

In [21]:
tokens = ["a", "b", "c"]
tt = {"a": ["m", "n"], "b":["m", "o"], "c":["q", "r", "s"]}

print(create_all_sentences(tokens, tt))

['m m q', 'm m r', 'm m s', 'm o q', 'm o r', 'm o s', 'n m q', 'n m r', 'n m s', 'n o q', 'n o r', 'n o s']


In [22]:
word_ngram_transliterate("inidihi sil . hulat sewochi liseliyu wade")
#እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ

'እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ'

# Tensorflow model

## Load the model

In [23]:
# Used a custom loss function so we need to load_model without compilation
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model = tf.keras.models.load_model("char_model", compile=False)
model.compile(optimizer='adam', loss=loss)

## Transliteration tools

In [24]:
tf_prob_cached = {}
cont_cached = {}

In [25]:
# Convert predictions to probabilities

def get_next_char_probs(start_string):
    cache_key = start_string
    if cache_key in cont_cached.keys():
        pred_prob = cont_cached[cache_key]
    else:
        input_eval = [char2idx[s] for s in start_string]
        input_eval = tf.expand_dims(input_eval, 0)
        # print("input_eval is", input_eval)
        # Ask model to evaluate what's next, given start_string
        model.reset_states()
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        
        # Convert scores to probabilities
        pred_prob = tf.nn.softmax(predictions[0])

        # Cache probabilities
        cont_cached[cache_key] = pred_prob
    return pred_prob

def continuation_proba(start_string, next_char):
    try:
        # Index of target for when we pull it out of the scores
        target_idx = char2idx[next_char]
    except:
        # If we've never seen that char before, get out of here!
        return 0
    
    # Check the cache
    cache_key = start_string
    if cache_key in cont_cached.keys():
        pred_prob = cont_cached[cache_key]
    else:
        # Convert characters into indices for tensorflow processing
        try:
            input_eval = [char2idx[s] for s in start_string]
        except:
            # We've never seen it!!! just exit!!
            return 0

        input_eval = tf.expand_dims(input_eval, 0)

        # Ask model to evaluate what's next, given start_string
        model.reset_states()
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        
        # Convert scores to probabilities
        pred_prob = tf.nn.softmax(predictions[0])

        # Cache probabilities
        cont_cached[cache_key] = pred_prob

    
    proba = pred_prob[target_idx].numpy()
    return proba

def text_proba(text, length_adj=False):
    # Check the cache
    cache_key = text + ("-adj" if length_adj else "-unadj")
    if cache_key in tf_prob_cached.keys():
        return tf_prob_cached[cache_key]

    # Otherwise we'll calculate the probability
    proba = 1
    for i in range(1, len(text)):
        base_str = text[:i]
        next_char = text[i]
        proba = proba * continuation_proba(base_str, next_char)

    # Adjust for length
    if length_adj:
        proba = len(text) * proba

    # Store in cache
    tf_prob_cached[cache_key] = proba
    return proba

### Breakdown possibilities

In [26]:
def calc_breakdowns(segment):
    # Remove apostrophes
    options = []
    max_len = min([len(segment), 4])
    for i in range(1, max_len+1):
        potential = segment[:i]
        if potential in lat_eth.keys():
            remainder = segment[i:]
            if remainder == "":
                options.append([potential])
            else:
                enders = calc_breakdowns(remainder)
                if enders == []:
                    return []
                else:
                    options.extend([potential, *e] for e in enders)
    return options

def get_breakdowns(segment):
    results = calc_breakdowns(segment)
    if len(results) == 0:
        results = calc_breakdowns(segment.replace("'", "").replace("`", ""))
    return results

### Semi-greedy algorithm

In [27]:
# Given a list of 'current' states (e.g. top 3 'tibe' options)
# And a list of things to add on (e.g. all 'bi' options)
# What are the best n options for 'tibebi'
def best_next_steps(current_states, next_options, n=3):
    # Pairs keeps track of all texts + probabilities
    pairs = []
    
    # From each possible 
    for base in current_states:
        # Create all the text options we're looking at
        # Then calculate their probability
        texts = [f"{base}{follower}" for follower in next_options]
        probs = [text_proba(text) for text in texts]
        pairs.extend(zip(texts, probs))
    
    # Only return the top n options
    top = sorted(pairs, key=lambda pair: pair[1], reverse=True)[:n]

    return top

# Receive a list like ['ti', 'be', 'bi', 'ni']
# Tries out each Ethiopic option, returns top n most likely
def get_top_sequences(breakdown, n=3, lower_bound=None):
    eth_poss = [lat_eth[latin] for latin in breakdown]

    # current = eth_poss[0]
    # for next_options in eth_poss[1:]:
    # Starts with a space bc beginning of word
    current = " "
    for next_options in eth_poss:
        # If it's a space take all options
        n_steps = 10 if current == " " else n
        top = best_next_steps(current, next_options, n_steps)

        # Only keep ones above lower score bound
        # if you want to use them all, lower_bound should be None
        if lower_bound != None:
            current = [option[0] for option in top if option[1] >= lower_bound]
            # For debugging
            dropped = len(top) - len(current)
        else:
            current = [option[0] for option in top]
    
    # Remove spaces
    top = [(option[0][1:], option[1]) for option in top]
    return top

# Takes a list of potential breakdowns
# n is passed to get_top_sequences
# limit is used here
def get_top_sequences_all(breakdowns, limit=5, n=3):
    tops = []
    
    # shorter breakdowns go first
    breakdowns = sorted(breakdowns, key=lambda breakdown: len(breakdowns))

    lower_bound = 0
    # Visual readout of length
    for breakdown in breakdowns:
        top = get_top_sequences(breakdown, n, lower_bound=lower_bound)
        if len(top) > 0:
            worst_score = top[-1][1]
            if worst_score > lower_bound:
                lower_bound = worst_score
            tops.extend(top)
        
    return sorted(tops, key=lambda pair: pair[1], reverse=True)[:limit]

# Given Latin script, what is the best Ethiopic?
def top_transcriptions(text, limit=5, n=3):
    breakdowns = get_breakdowns(text)
    return get_top_sequences_all(breakdowns, limit, n)

def tokenize_text(text):
    sentence = [i for j in text.split() for i in (j, ' ')][:-1]
    cleaned = []
    for elmt in sentence:
        elmt_tokenized = word_tokenize(elmt)
        if elmt == ' ':
            cleaned.append(' ')
        elif len(elmt) == len(elmt_tokenized):
            cleaned.append(elmt)
        else:
            for i in elmt_tokenized:
                if i == "'":
                    elmt_tokenized.remove(i)
                cleaned += elmt_tokenized

    return cleaned

def tensorflow_transliterate(text):
    sent_trans = []

    cleaned = tokenize_text(text)
    for word in cleaned:
        sent_trans.append(tensorflow_transliterate_word(word))
    
    return "".join(sent_trans)

def tensorflow_transliterate_word(word, top_n=1):
    if word.isspace():
        return word
    if word in lat_eth.keys() and len(lat_eth[word]) == 0:
        return lat_eth[word][0]
    elif word in string.punctuation:
        return word
    elif word.isnumeric() == True:
        return word
    else:
        cleaned = word.lower()
        results = top_transcriptions(cleaned)
        if len(results) == 0:
            cleaned = re.sub(r'[^a-z]', '', cleaned)
            results = top_transcriptions(cleaned)

        if len(results) == 0:
            results = [word]

        if top_n == 1:
            if len(results) == 0:
                return [word]
            else:
                return results[0][0]
        else:
            return [result[0] for result in results[:top_n]]

### Usage examples

In [28]:
# Getting potential breakdowns
breakdowns = get_breakdowns('tibebini')
breakdowns

[['t', 'i', 'b', 'e', 'b', 'i', 'n', 'i'],
 ['t', 'i', 'b', 'e', 'b', 'i', 'ni'],
 ['t', 'i', 'b', 'e', 'bi', 'n', 'i'],
 ['t', 'i', 'b', 'e', 'bi', 'ni'],
 ['t', 'i', 'be', 'b', 'i', 'n', 'i'],
 ['t', 'i', 'be', 'b', 'i', 'ni'],
 ['t', 'i', 'be', 'bi', 'n', 'i'],
 ['t', 'i', 'be', 'bi', 'ni'],
 ['ti', 'b', 'e', 'b', 'i', 'n', 'i'],
 ['ti', 'b', 'e', 'b', 'i', 'ni'],
 ['ti', 'b', 'e', 'bi', 'n', 'i'],
 ['ti', 'b', 'e', 'bi', 'ni'],
 ['ti', 'be', 'b', 'i', 'n', 'i'],
 ['ti', 'be', 'b', 'i', 'ni'],
 ['ti', 'be', 'bi', 'n', 'i'],
 ['ti', 'be', 'bi', 'ni']]

In [29]:
# Calculate text probability
text_proba('ጥበብ')

4.103438150809997e-06

In [30]:
# Given the starting options of ['ጥበ', 'ጥብ', 'ትበ']
# What options out of ['ቢ', 'ብ'] are the best next step?
best_next_steps(
    ['ጥበ', 'ጥብ', 'ትበ'],
    ['ቢ', 'ብ']
)

[('ጥብብ', 4.8523289071696345e-06),
 ('ጥበብ', 4.103438150809997e-06),
 ('ጥብቢ', 8.614675079611741e-07)]

In [31]:
# get_top_sequences gets the top n options for a breakdown
breakdown = breakdowns[0]
print("Looking at", breakdown)
get_top_sequences(breakdown, 5)

Looking at ['t', 'i', 'b', 'e', 'b', 'i', 'n', 'i']


[('ጥእብኤብእንእ', 1.1083901216124514e-17),
 ('ትእብኤብእንእ', 3.663522329343407e-18),
 ('ጥዕብኤብእንእ', 1.0411927491793497e-18),
 ('ጥእብኤብእንዕ', 1.0411919735318827e-18),
 ('ጥእብኤብዕንእ', 1.0411919735318827e-18)]

In [32]:
# get_top_sequences_all does it for multiple breakdowns
print("Looking at", len(breakdowns), "breakdowns")
get_top_sequences_all(breakdowns, limit=3)

Looking at 16 breakdowns


[('ጥበቢኒ', 2.9720881868637036e-09),
 ('ጥበብኒ', 2.1003423397835974e-09),
 ('ጥበቢን', 1.5000106938408966e-09)]

In [33]:
top_transcriptions("tibebi")

[('ጥበቢ', 4.8237780542115944e-06),
 ('ጥበብ', 3.4089113942715496e-06),
 ('ትበቢ', 1.594386152385783e-06),
 ('ጥእበቢ', 3.0541725734773963e-07),
 ('ጥበብእ', 2.158353151351827e-07)]

In [34]:
tensorflow_transliterate("inidihi sil . hulat sewochi liseliyu wade")
#እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ

'እኒዲሕ ሲል . ሁለጥ ሰኦጭ ሊሰሊዩ ወደ'

# OpenNMT

In [35]:
import subprocess

model_name = subprocess.run("ls -t *model*.pt | head -n 1",
                            shell=True,
                            stdout=subprocess.PIPE).stdout.decode("utf-8").strip()
print("Using model", model_name)

Using model demo-model_step_25000.pt


In [36]:
def opennmt_transliterate(text):
    # Split each word to separate line, spaces between chars
    cleaned = unidecode.unidecode(text).lower()
    t_words = [w for w in re.split(r"\s+", cleaned) if w]
    spaced = [' '.join(w) for w in t_words]
    
    with open("opennmt-test.txt", 'w') as f:
        f.write('\n'.join(spaced))

    !onmt_translate \
        -model {model_name} \
        -src opennmt-test.txt \
        -output opennmt-pred.txt \
        -replace_unk \
        > /dev/null 2>&1

    with open("opennmt-pred.txt") as f:
        results = f.read()

    # Remove spaces, return to being on one line
    results = results.replace(' ', '').replace('\n', ' ')
    
    return results

In [37]:
opennmt_transliterate("inidihi sil . hulat sewochi liseliyu wade")
#እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ

'እንዲህ ሲል . ሁለት ሰዎች ሊጸልዩ ወደ '

# Evaluations

In [38]:
# this function takes two files (parallel texts) and generates
# two lists after stripping beginning/trailing whitespace
def extract_from_files(ethiopic_file, latin_file):
  eth_file = open(ethiopic_file)
  lat_file = open(latin_file)
  ethiopic = [line.rstrip() for line in eth_file.read().splitlines()]
  latin = [line.rstrip() for line in lat_file.read().splitlines()] 
  
  ethiopic = list(ethiopic)
  latin = list(latin)

  return ethiopic, latin

# this function takes two parallel lists and evaluates how
# our model performs, given a transliteration function
def evaluate(ethiopic, latin, translit_func):
  accuracies = []
  count = 0
  total_length = 0
  total_words = 0
  total_accurate_words = 0

  for line in ethiopic:
    total_length += len(line)

  paired = list(zip(ethiopic, latin))
  for correct, latin in tqdm_notebook(paired):
    predict = translit_func(latin)
    weight = len(correct) / total_length
    accuracy = Levenshtein.ratio(predict,correct) * weight
    accuracies.append(accuracy)
    
    words_correct = re.split(r'\s+', correct)
    words_predicted = re.split(r'\s+', predict)

    for word_correct, word_predicted in zip(words_correct, words_predicted):
        total_words += 1
        if word_correct == word_predicted:
            total_accurate_words += 1
    
  return {
      'lev_acc': sum(accuracies),
      'word_acc': total_accurate_words / total_words
  }

## Scoring

In [39]:
datasets = [
    {
        "name": "Simulated natural transliteration",
        "paths": [
            "raw/original.txt",
            "raw/transliterated.txt"
        ],
        "postprocess": (lambda rows: rows[:10])
    },
    {
        "name": "The Lord's Prayer",
        "paths": [
            "parallel_data/lords_prayer_am.txt",
            "parallel_data/lords_prayer_rom.txt"
        ]
    },
    {
        "name": '"Sera"',
        "paths": [
            "parallel_data/sera_am.txt",
            "parallel_data/sera_rom.txt"
        ]
    },
    {
        "name": '"Taitu"',
        "paths": [
            "parallel_data/taitu_am.txt",
            "parallel_data/taitu_rom.txt"
        ]
    },
    {
        "name": '"Tewodros"',
        "paths": [
            "parallel_data/tewodros_am.txt",
            "parallel_data/tewodros_rom.txt"
        ]
    },
    {
        "name": '"Yasstesseriyal"',
        "paths": [
            "parallel_data/yasstesseriyal_am.txt",
            "parallel_data/yasstesseriyal_rom.txt"
        ]
    }
]

In [41]:
results = []
methods = [
    ['Character N-Grams', char_ngram_transliterate],
    ['Tensorflow', tensorflow_transliterate],
#    ['Word N-Grams', word_ngram_transliterate],
   ['OpenNMT', opennmt_transliterate]
]

for dataset in datasets:
    print("Processing", dataset['name'])
    ethiopic, latin = extract_from_files(dataset['paths'][0], dataset['paths'][1])
    if 'postprocess' in dataset:
        ethiopic = dataset['postprocess'](ethiopic)
        latin = dataset['postprocess'](latin)

    for method in methods:
        result = evaluate(ethiopic, latin, translit_func=method[1])
        result['method'] = method[0]
        result['dataset'] = dataset['name']
        result['wordcount'] = len(re.split('\s+', ' '.join(latin)))
        result['charcount'] = len('\n'.join(latin))
        results.append(result)

scores = pd.DataFrame(results)
scores

Processing Simulated natural transliteration


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Processing The Lord's Prayer


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Processing "Sera"


HBox(children=(IntProgress(value=0, max=127), HTML(value='')))




HBox(children=(IntProgress(value=0, max=127), HTML(value='')))




HBox(children=(IntProgress(value=0, max=127), HTML(value='')))


Processing "Taitu"


HBox(children=(IntProgress(value=0, max=95), HTML(value='')))




HBox(children=(IntProgress(value=0, max=95), HTML(value='')))




HBox(children=(IntProgress(value=0, max=95), HTML(value='')))


Processing "Tewodros"


HBox(children=(IntProgress(value=0, max=56), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56), HTML(value='')))


Processing "Yasstesseriyal"


HBox(children=(IntProgress(value=0, max=55), HTML(value='')))




HBox(children=(IntProgress(value=0, max=55), HTML(value='')))




HBox(children=(IntProgress(value=0, max=55), HTML(value='')))




Unnamed: 0,lev_acc,word_acc,method,dataset,wordcount,charcount
0,0.739485,0.465517,Character N-Grams,Simulated natural transliteration,175,1572
1,0.570635,0.08046,Tensorflow,Simulated natural transliteration,175,1572
2,0.917378,0.706897,OpenNMT,Simulated natural transliteration,175,1572
3,0.685354,0.121212,Character N-Grams,The Lord's Prayer,33,290
4,0.426812,0.0,Tensorflow,The Lord's Prayer,33,290
5,0.765185,0.142857,OpenNMT,The Lord's Prayer,33,290
6,0.687849,0.327225,Character N-Grams,"""Sera""",371,2630
7,0.477971,0.078534,Tensorflow,"""Sera""",371,2630
8,0.74263,0.410995,OpenNMT,"""Sera""",371,2630
9,0.448463,0.18,Character N-Grams,"""Taitu""",283,2284


### For manual comparison of texts saved as files

In [86]:
def compare_full_texts(preds, origs):
    total = 0
    matched = 0
    for pred, orig in zip(preds, origs):
        pred_words = [w for w in re.split('\s+', pred) if w]
        orig_words = [w for w in re.split('\s+', orig) if w]
        for word in pred_words:
            if word in orig_words:
                orig_words.remove(word)
                matched += 1
        total += matched + len(orig_words)

    return matched / total
        
pred = open("temp/taitu_pred.txt").read().replace("፡", "").strip().splitlines()
orig = open("parallel_data/taitu_am.txt").read().replace("፡", "").strip().splitlines()
compare_full_texts(pred, orig)

0.01828966880869995