In [1]:
import tensorflow as tf

import numpy as np
import os
import time

## Conversion tables

In [2]:
import json

with open('lat_eth.json') as f:
    lat_eth = json.load(f)

eth2lat = {}
for lat in lat_eth.keys():
    for eth in lat_eth[lat]:
        if eth not in eth2lat.keys():
            eth2lat[eth] = []
        eth2lat[eth].append(lat)

for key in eth2lat.keys():
    eth2lat[key] = set(eth2lat[key])

with open('char2idx.json') as f:
    char2idx = json.load(f)

with open('idx2char.json') as f:
    idx2char = np.array(json.load(f))


## The model

In [3]:
# Used a custom loss function so we need to load_model without compilation
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model = tf.keras.models.load_model("char_model_wiki", compile=False)
model.compile(optimizer='adam', loss=loss)

## Probability calculations

Softmax converts the predictions into probabilities. 

In [4]:
# Convert predictions to probabilities

cont_cached = {}

def get_next_char_probs(start_string):
    cache_key = start_string
    if cache_key in cont_cached.keys():
        pred_prob = cont_cached[cache_key]
    else:
        input_eval = [char2idx[s] for s in start_string]
        input_eval = tf.expand_dims(input_eval, 0)
        print("input_eval is", input_eval)
        # Ask model to evaluate what's next, given start_string
        model.reset_states()
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        
        # Convert scores to probabilities
        pred_prob = tf.nn.softmax(predictions[0])

        # Cache probabilities
        cont_cached[cache_key] = pred_prob
    return pred_prob

def continuation_proba(start_string, next_char):
    try:
        # Index of target for when we pull it out of the scores
        target_idx = char2idx[next_char]
    except:
        # If we've never seen that char before, get out of here!
        return 0
    
    # Check the cache
    cache_key = start_string
    if cache_key in cont_cached.keys():
        pred_prob = cont_cached[cache_key]
    else:
        # Convert characters into indices for tensorflow processing
        try:
            input_eval = [char2idx[s] for s in start_string]
        except:
            # We've never seen it!!! just exit!!
            return 0

        input_eval = tf.expand_dims(input_eval, 0)

        # Ask model to evaluate what's next, given start_string
        model.reset_states()
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        
        # Convert scores to probabilities
        pred_prob = tf.nn.softmax(predictions[0])

        # Cache probabilities
        cont_cached[cache_key] = pred_prob

    
    proba = pred_prob[target_idx].numpy()
    return proba

start_string = "ኢንኢድኢህኢ"
next_char = "ን"

continuation_proba(start_string, next_char)

2.88072e-06

# Probability of each step multiplied out

Currently does not adjust for length

In [5]:
%%time

cached = {}

def text_proba(text, length_adj=False):
    # Check the cache
    cache_key = text + ("-adj" if length_adj else "-unadj")
    if cache_key in cached.keys():
        return cached[cache_key]

    # Otherwise we'll calculate the probability
    proba = 1
    for i in range(1, len(text)):
        base_str = text[:i]
        next_char = text[i]
        proba = proba * continuation_proba(base_str, next_char)

    # Adjust for length
    if length_adj:
        proba = len(text) * proba

    # Store in cache
    cached[cache_key] = proba
    return proba

print("Adjusted for length:")
print(text_proba("ጭም", length_adj=True))
print(text_proba("ጭምቅ", length_adj=True))
print(text_proba("ጭምኢ", length_adj=True))

print("\nNot adjusted for length:")
print(text_proba("ጭም", length_adj=False))
print(text_proba("ጭምቅ", length_adj=False))
print(text_proba("ጭምኢ", length_adj=False))

Adjusted for length:
1.2738985333271557e-06
2.6276173969082115e-13
1.7058100856634426e-13

Not adjusted for length:
6.369492666635779e-07
8.758724656360705e-14
5.686033618878142e-14
CPU times: user 52.2 ms, sys: 4.62 ms, total: 56.8 ms
Wall time: 334 ms


In [6]:
#
#
# REMOVED because the greedy one (below) works better
# but left here for posterity
#
#

# %%time
# import itertools
# MAX_SEQ_LENGTH = 5

# def predict(sequence, base=None):
#     # print("Base of", base)
#     if len(sequence) > MAX_SEQ_LENGTH:
#         starters = predict(sequence[:MAX_SEQ_LENGTH], base)
#         overall_results = []
#         for r in starters:
#             if base:
#                 text = base + r[0]
#             else:
#                 text = r[0]
#             results = predict(sequence[MAX_SEQ_LENGTH:], text)
#             overall_results.extend(results)
#         return sorted(overall_results,
#                       key=lambda result: result[1],
#                       reverse=True)[:5]
#     else:
#         expanded = itertools.product(*sequence[:MAX_SEQ_LENGTH])
#         amharic_options = [''.join(chars) for chars in expanded]
#         if base:
#             amharic_options = [base + text for text in amharic_options]
#         scores = []
#         for possible in amharic_options:
#             scores.append(text_proba(''.join(possible)))
        
#         top_5 = np.argsort(scores)[-5:][::-1]
#         amharic_options = np.array(amharic_options)
#         scores = np.array(scores)
        
#         return list(zip(amharic_options[top_5], scores[top_5]))

# def breakdown_and_predict(text):
#     top_fives = []
#     for segmentation in tqdm_notebook(get_breakdowns(text), position=0):
#         options = [lat_eth[segment] for segment in segmentation]
#         top_five = predict(options)
#         top_fives.extend(top_five)

#     # We'll return the top 10 top fives
#     return sorted(top_fives,
#               key=lambda result: result[1],
#               reverse=True)[:10]

# # ጥበብን
# breakdown_and_predict("tibebini")

In [7]:
def calc_breakdowns(segment):
    # Remove apostrophes
    options = []
    max_len = min([len(segment), 4])
    for i in range(1, max_len+1):
        potential = segment[:i]
        if potential in lat_eth.keys():
            remainder = segment[i:]
            if remainder == "":
                options.append([potential])
            else:
                enders = calc_breakdowns(remainder)
                if enders == []:
                    return []
                else:
                    options.extend([potential, *e] for e in enders)
    return options

def get_breakdowns(segment):
    results = calc_breakdowns(segment)
    if len(results) == 0:
        results = calc_breakdowns(segment.replace("'", "").replace("`", ""))
    return results

In [8]:
breakdowns = get_breakdowns("inidihi")
breakdowns

[['i', 'n', 'i', 'd', 'i', 'h', 'i'],
 ['i', 'n', 'i', 'd', 'i', 'hi'],
 ['i', 'n', 'i', 'di', 'h', 'i'],
 ['i', 'n', 'i', 'di', 'hi'],
 ['i', 'ni', 'd', 'i', 'h', 'i'],
 ['i', 'ni', 'd', 'i', 'hi'],
 ['i', 'ni', 'di', 'h', 'i'],
 ['i', 'ni', 'di', 'hi']]

# Semi-greedy algorithm

Note that text_proba does NOT work with single chars, so this might be off at some points?

## `best_next_steps`

It's like "given a string, what's our next best character option?", except we allow for multiple inputs.

In [9]:
# Given a list of 'current' states (e.g. top 3 'tibe' options)
# And a list of things to add on (e.g. all 'bi' options)
# What are the best n options for 'tibebi'
def best_next_steps(current_states, next_options, n=3):
    # Pairs keeps track of all texts + probabilities
    pairs = []
    
    # From each possible 
    for base in current_states:
        # Create all the text options we're looking at
        # Then calculate their probability
        texts = [f"{base}{follower}" for follower in next_options]
        probs = [text_proba(text) for text in texts]
        pairs.extend(zip(texts, probs))
    
    # Only return the top n options
    top = sorted(pairs, key=lambda pair: pair[1], reverse=True)[:n]

    return top

### `get_top_sequences` and `get_top_sequences_all`

Given a breakdown (or a list of breakdowns), what are the best Ethiopic options? Greedy-ish, only follows the top 3 by default but you can adjust it.

In [10]:
from tqdm import tqdm_notebook

# Receive a list like ['ti', 'be', 'bi', 'ni']
# Tries out each Ethiopic option, returns top n most likely
def get_top_sequences(breakdown, n=3, lower_bound=None):
    eth_poss = [lat_eth[latin] for latin in breakdown]

    # current = eth_poss[0]
    # for next_options in eth_poss[1:]:
    # Starts with a space bc beginning of word
    current = " "
    for next_options in eth_poss:
        # If it's a space take all options
        n_steps = 10 if current == " " else n
        top = best_next_steps(current, next_options, n_steps)

        # Only keep ones above lower score bound
        # if you want to use them all, lower_bound should be None
        if lower_bound != None:
            current = [option[0] for option in top if option[1] >= lower_bound]
            # For debugging
            dropped = len(top) - len(current)
        else:
            current = [option[0] for option in top]
    
    # Remove spaces
    top = [(option[0][1:], option[1]) for option in top]
    return top

# Takes a list of potential breakdowns
# n is passed to get_top_sequences
# limit is used here
def get_top_sequences_all(breakdowns, limit=5, n=5):
    tops = []
    
    # shorter breakdowns go first
    breakdowns = sorted(breakdowns, key=lambda breakdown: len(breakdowns))

    lower_bound = 0
    # Visual readout of length
#     for breakdown in tqdm_notebook(breakdowns):
    for breakdown in breakdowns:
        top = get_top_sequences(breakdown, n, lower_bound=lower_bound)
        if len(top) > 0:
            worst_score = top[-1][1]
            if worst_score > lower_bound:
                lower_bound = worst_score
            tops.extend(top)
        
    return sorted(tops, key=lambda pair: pair[1], reverse=True)[:limit]

# Given Latin script, what is the best Ethiopic?
def top_transcriptions(text, limit=5, n=5):
    breakdowns = get_breakdowns(text)
    return get_top_sequences_all(breakdowns, limit, n)


## Examples of everything

In [11]:
# Getting potential breakdowns
get_breakdowns('tibebini')

[['t', 'i', 'b', 'e', 'b', 'i', 'n', 'i'],
 ['t', 'i', 'b', 'e', 'b', 'i', 'ni'],
 ['t', 'i', 'b', 'e', 'bi', 'n', 'i'],
 ['t', 'i', 'b', 'e', 'bi', 'ni'],
 ['t', 'i', 'be', 'b', 'i', 'n', 'i'],
 ['t', 'i', 'be', 'b', 'i', 'ni'],
 ['t', 'i', 'be', 'bi', 'n', 'i'],
 ['t', 'i', 'be', 'bi', 'ni'],
 ['ti', 'b', 'e', 'b', 'i', 'n', 'i'],
 ['ti', 'b', 'e', 'b', 'i', 'ni'],
 ['ti', 'b', 'e', 'bi', 'n', 'i'],
 ['ti', 'b', 'e', 'bi', 'ni'],
 ['ti', 'be', 'b', 'i', 'n', 'i'],
 ['ti', 'be', 'b', 'i', 'ni'],
 ['ti', 'be', 'bi', 'n', 'i'],
 ['ti', 'be', 'bi', 'ni']]

In [12]:
# Converting from latin to potential ethiopic
# (only works for one breakdown at a time)
breakdown = breakdowns[-1]
[lat_eth[segment] for segment in breakdown]

[['ኢ', 'እ', 'ዒ', 'ዕ'],
 ['ኒ', 'ን', 'ኚ', 'ኝ'],
 ['ዲ', 'ድ', 'ዺ', 'ፂ'],
 ['ሂ', 'ህ', 'ሒ', 'ሕ', 'ኂ', 'ኅ', 'ኺ', 'ኽ', 'ⷒ']]

In [13]:
# Calculate text probability
text_proba('ጥበብ')

1.1294759933652892e-13

In [14]:
# Given the starting options of ['ጥበ', 'ጥብ', 'ትበ']
# What options out of ['ቢ', 'ብ'] are the best next step?
best_next_steps(
    ['ጥበ', 'ጥብ', 'ትበ'],
    ['ቢ', 'ብ']
)

[('ትበቢ', 8.690577709569554e-11),
 ('ትበብ', 6.622375520165555e-11),
 ('ጥብቢ', 1.372473027387372e-11)]

In [15]:
# get_top_sequences gets the top n options for a breakdown
print("Looking at", breakdown)
get_top_sequences(breakdown, 5)

Looking at ['i', 'ni', 'di', 'hi']


[('ዕኝድኽ', 1.62357156335283e-13),
 ('ዕኝድሂ', 7.477636447044843e-14),
 ('ዕኝድህ', 7.429985103788478e-14),
 ('ዕኝዲኽ', 7.153003101312422e-14),
 ('ዒኝድኽ', 6.501592972821455e-14)]

In [16]:
# get_top_sequences_all does it for multiple breakdowns
print("Looking at", len(breakdowns), "breakdowns")
get_top_sequences_all(breakdowns, limit=10)

Looking at 8 breakdowns


[('ዕኝድኽ', 1.62357156335283e-13),
 ('ዕኝድሂ', 7.477636447044843e-14),
 ('ዕኝድህ', 7.429985103788478e-14),
 ('ዕኝዲኽ', 7.153003101312422e-14),
 ('ዒኝድኽ', 6.501592972821455e-14),
 ('ዕኝዕድኽ', 3.1332750462177764e-16),
 ('ዕኝድዕኽ', 3.133273723157534e-16),
 ('ዕኝዕድሂ', 1.4430833979273464e-16),
 ('ዕኝድዕሂ', 1.4430827885693287e-16),
 ('ዕኝዕድህ', 1.4338873287109334e-16)]

In [17]:
# get_top_sequences_all does it for multiple breakdowns
print("Looking at", len(breakdowns), "breakdowns")
get_top_sequences_all(breakdowns, limit=10)

Looking at 8 breakdowns


[('ዕኝድኽ', 1.62357156335283e-13),
 ('ዕኝድሂ', 7.477636447044843e-14),
 ('ዕኝድህ', 7.429985103788478e-14),
 ('ዕኝዲኽ', 7.153003101312422e-14),
 ('ዒኝድኽ', 6.501592972821455e-14),
 ('ዕኝዕድኽ', 3.1332750462177764e-16),
 ('ዕኝድዕኽ', 3.133273723157534e-16),
 ('ዕኝዕድሂ', 1.4430833979273464e-16),
 ('ዕኝድዕሂ', 1.4430827885693287e-16),
 ('ዕኝዕድህ', 1.4338873287109334e-16)]

In [18]:
top_transcriptions("tibebi")

[('ጥብብ', 3.4980477533301296e-11),
 ('ጥበብ', 1.479104984137833e-11),
 ('ጢብብ', 9.103798573178485e-12),
 ('ጥቤብ', 5.090075612385105e-12),
 ('ጢበብ', 3.849425392022195e-12)]

### Worst-case scenario, totally cleared cache

In [20]:
cont_cached = {}
cached = {}

# The cache will fill up as we go along, so
# this word won't always take so long
#top_transcriptions("yamijadarguatkhawini")

# Actual transliterations

In [21]:
import string
from nltk.tokenize import word_tokenize

def tokenize_text(text):
    sentence = [i for j in text.split() for i in (j, ' ')][:-1]
    cleaned = []
    for elmt in sentence:
        elmt_tokenized = word_tokenize(elmt)
        if elmt == ' ':
            cleaned.append(' ')
        elif len(elmt) == len(elmt_tokenized):
            cleaned.append(elmt)
        else:
            for i in elmt_tokenized:
                if i == "'":
                    elmt_tokenized.remove(i)
                cleaned += elmt_tokenized

    return cleaned

def transliterate(text):
    sent_trans = []

    cleaned = tokenize_text(text)
    for word in cleaned:
        sent_trans.append(transliterate_word(word))
    
    return "".join(sent_trans)

def transliterate_word(word, top_n=1):
    if word.isspace():
        return word
    if word in lat_eth.keys() and len(lat_eth[word]) == 0:
        return lat_eth[word][0]
    elif word in string.punctuation:
        return word
    elif word.isnumeric() == True:
        return word
    else:
        cleaned = word.lower()
        results = top_transcriptions(cleaned)
        if len(results) == 0:
            cleaned = re.sub(r'[^a-z]', '', cleaned)
            results = top_transcriptions(cleaned)

        if top_n == 1:
            return results[0][0]
        else:
            return [result[0] for result in results[:top_n]]

In [22]:
transliterate_word("tibenini")

'ጥብኝኝ'

In [23]:
transliterate_word("tibenini", top_n=2)

['ጥብኝኝ', 'ጥበኝኝ']

In [24]:
transliterate("tibenini")

'ጥብኝኝ'

In [25]:
transliterate("inidihi sil . hulat sewochi liseliyu wade")
#እንዲህ ሲል ። ሁለት ሰዎች ሊጸልዩ ወደ

'ዕኝድኽ ሽል . ኹለጥ ሽዖጪ ልሽልዩ ዋዴ'

In [26]:
transliterate("ijasusme . 'eweneti iwnat `elacehualehu")
#ኢየሱስም ። እውነት እውነት እላችኋለሁ

'ዕያሱጽሜ . ዕዌኝጥ ዕውኛጥ ዕለጬኋልኹ'

In [27]:
top_transcriptions("ijasusme")

[('ዕያሱጽሜ', 9.281910896609981e-18),
 ('ዕያሱስሜ', 7.614574842925677e-18),
 ('ዕጀሱጽሜ', 7.018528586437527e-18),
 ('ዕጀሱስሜ', 5.757770334572038e-18),
 ('ዕጃሱጽሜ', 5.721845260082377e-18)]

# Evaluations

In [28]:
#!pip install python-levenshtein

In [29]:
# EVALUATION

import Levenshtein
from statistics import mean

# this function takes two files (parallel texts) and generates
# two lists after stripping beginning/trailing whitespace
def extract_from_files(ethiopic_file, latin_file):
    eth_file = open(ethiopic_file)
    lat_file = open(latin_file)
    ethiopic = [line.rstrip() for line in eth_file.readlines()]
    latin = [line.rstrip() for line in lat_file.readlines()] 

    ethiopic = list(ethiopic)
    latin = list(latin)

    return ethiopic, latin

# this function takes two parallel lists and evaluates how
# our model performs
def evaluate(ethiopic, latin):
    accuracies = []

    count = 0
    for line in tqdm_notebook(latin):
        predict = transliterate(line)
        correct = ethiopic[count]
        accuracy = Levenshtein.ratio(predict,correct)
        accuracies.append(accuracy) 
        count += 1

    return mean(accuracies)

import re
def evaluate_top_n(ethiopic, latin, top_n=2):
    accuracies = []

    total = 0
    correct = 0
    paired = list(zip(ethiopic, latin))
    for eth_line, lat_line in tqdm_notebook(paired):
        eth_tokens = re.split('\s+', eth_line.strip())
        lat_tokens = re.split('\s+', lat_line.strip())
        for eth_token, lat_token in zip(eth_tokens, lat_tokens):
            results = transliterate_word(lat_token, top_n=5)
            if eth_token in results:
                correct += 1
            total += 1
    return correct/total


In [30]:
ethiopic_tot, latin_tot = extract_from_files("raw/original.txt", "raw/transliterated.txt")

In [36]:
ethiopic_2 = ethiopic_tot[0:10]
latin_2 = latin_tot[0:10]

In [37]:
#cached = {}
#top_transcriptions("jadata", 15, 20)

In [38]:
evaluate(ethiopic_2, latin_2)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.3725482577524574

In [39]:
evaluate_top_n(ethiopic_2, latin_2)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

0.07471264367816093

In [40]:
ethiopic_3 = ethiopic_tot[0:100]
latin_3 = latin_tot[0:100]

In [None]:
# evaluate(ethiopic_3, latin_3)

In [41]:
ethiopic_manual, latin_manual = extract_from_files('parallel_data/taitu_am.txt','parallel_data/taitu_rom.txt')

In [42]:
evaluate(ethiopic_manual[:100], latin_manual[:100])

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

0.4873676417154896

In [43]:
evaluate_top_n(ethiopic_manual[:100], latin_manual[:100])

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

0.14666666666666667