In [1]:
import tensorflow as tf

import numpy as np
import os
import time

## Conversion tables

In [2]:
import json

with open('lat_eth.json') as f:
    lat_eth = json.load(f)

eth2lat = {}
for lat in lat_eth.keys():
    for eth in lat_eth[lat]:
        if eth not in eth2lat.keys():
            eth2lat[eth] = []
        eth2lat[eth].append(lat)

for key in eth2lat.keys():
    eth2lat[key] = set(eth2lat[key])

with open('char2idx.json') as f:
    char2idx = json.load(f)

with open('idx2char.json') as f:
    idx2char = np.array(json.load(f))


## The model

In [3]:
# Used a custom loss function so we need to load_model without compilation
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model = tf.keras.models.load_model("char_model", compile=False)
model.compile(optimizer='adam', loss=loss)

## Probability calculations

Softmax converts the predictions into probabilities. 

In [4]:
# Convert predictions to probabilities

cont_cached = {}

def get_next_char_probs(start_string):
    cache_key = start_string
    if cache_key in cont_cached.keys():
        pred_prob = cont_cached[cache_key]
    else:
        input_eval = [char2idx[s] for s in start_string]
        input_eval = tf.expand_dims(input_eval, 0)
        print("input_eval is", input_eval)
        # Ask model to evaluate what's next, given start_string
        model.reset_states()
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        
        # Convert scores to probabilities
        pred_prob = tf.nn.softmax(predictions[0])

        # Cache probabilities
        cont_cached[cache_key] = pred_prob
    return pred_prob

def continuation_proba(start_string, next_char):
    try:
        # Index of target for when we pull it out of the scores
        target_idx = char2idx[next_char]
    except:
        # If we've never seen that char before, get out of here!
        return 0
    
    # Check the cache
    cache_key = start_string
    if cache_key in cont_cached.keys():
        pred_prob = cont_cached[cache_key]
    else:
        # Convert characters into indices for tensorflow processing
        try:
            input_eval = [char2idx[s] for s in start_string]
        except:
            # We've never seen it!!! just exit!!
            return 0

        input_eval = tf.expand_dims(input_eval, 0)

        # Ask model to evaluate what's next, given start_string
        model.reset_states()
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        
        # Convert scores to probabilities
        pred_prob = tf.nn.softmax(predictions[0])

        # Cache probabilities
        cont_cached[cache_key] = pred_prob

    
    proba = pred_prob[target_idx].numpy()
    return proba

start_string = "ኢንኢድኢህኢ"
next_char = "ን"

continuation_proba(start_string, next_char)

0.00088863523

# Probability of each step multiplied out

Currently does not adjust for length

In [5]:
%%time

cached = {}

def text_proba(text, length_adj=False):
    # Check the cache
    cache_key = text + ("-adj" if length_adj else "-unadj")
    if cache_key in cached.keys():
        return cached[cache_key]

    # Otherwise we'll calculate the probability
    proba = 1
    for i in range(1, len(text)):
        base_str = text[:i]
        next_char = text[i]
        proba = proba * continuation_proba(base_str, next_char)

    # Adjust for length
    if length_adj:
        proba = len(text) * proba

    # Store in cache
    cached[cache_key] = proba
    return proba

print("Adjusted for length:")
print(text_proba("ጭም", length_adj=True))
print(text_proba("ጭምቅ", length_adj=True))
print(text_proba("ጭምኢ", length_adj=True))

print("\nNot adjusted for length:")
print(text_proba("ጭም", length_adj=False))
print(text_proba("ጭምቅ", length_adj=False))
print(text_proba("ጭምኢ", length_adj=False))

Adjusted for length:
0.01510776486247778
0.0004449678877608077
2.7231123378621187e-05

Not adjusted for length:
0.00755388243123889
0.00014832262925360256
9.077041126207062e-06
CPU times: user 38.4 ms, sys: 3.35 ms, total: 41.7 ms
Wall time: 45.2 ms


In [6]:
#
#
# REMOVED because the greedy one (below) works better
# but left here for posterity
#
#

# %%time
# import itertools
# MAX_SEQ_LENGTH = 5

# def predict(sequence, base=None):
#     # print("Base of", base)
#     if len(sequence) > MAX_SEQ_LENGTH:
#         starters = predict(sequence[:MAX_SEQ_LENGTH], base)
#         overall_results = []
#         for r in starters:
#             if base:
#                 text = base + r[0]
#             else:
#                 text = r[0]
#             results = predict(sequence[MAX_SEQ_LENGTH:], text)
#             overall_results.extend(results)
#         return sorted(overall_results,
#                       key=lambda result: result[1],
#                       reverse=True)[:5]
#     else:
#         expanded = itertools.product(*sequence[:MAX_SEQ_LENGTH])
#         amharic_options = [''.join(chars) for chars in expanded]
#         if base:
#             amharic_options = [base + text for text in amharic_options]
#         scores = []
#         for possible in amharic_options:
#             scores.append(text_proba(''.join(possible)))
        
#         top_5 = np.argsort(scores)[-5:][::-1]
#         amharic_options = np.array(amharic_options)
#         scores = np.array(scores)
        
#         return list(zip(amharic_options[top_5], scores[top_5]))

# def breakdown_and_predict(text):
#     top_fives = []
#     for segmentation in tqdm_notebook(get_breakdowns(text), position=0):
#         options = [lat_eth[segment] for segment in segmentation]
#         top_five = predict(options)
#         top_fives.extend(top_five)

#     # We'll return the top 10 top fives
#     return sorted(top_fives,
#               key=lambda result: result[1],
#               reverse=True)[:10]

# # ጥበብን
# breakdown_and_predict("tibebini")

In [7]:
def get_breakdowns(segment):
    options = []
    max_len = min([len(segment), 4])
    for i in range(1, max_len+1):
        potential = segment[:i]
        if potential in lat_eth.keys():
            remainder = segment[i:]
            if remainder == "":
                options.append([potential])
            else:
                enders = get_breakdowns(remainder)
                if enders == []:
                    return []
                else:
                    options.extend([potential, *e] for e in enders)
    return options

In [8]:
breakdowns = get_breakdowns("inidihi")
breakdowns

[['i', 'n', 'i', 'd', 'i', 'h', 'i'],
 ['i', 'n', 'i', 'd', 'i', 'hi'],
 ['i', 'n', 'i', 'di', 'h', 'i'],
 ['i', 'n', 'i', 'di', 'hi'],
 ['i', 'ni', 'd', 'i', 'h', 'i'],
 ['i', 'ni', 'd', 'i', 'hi'],
 ['i', 'ni', 'di', 'h', 'i'],
 ['i', 'ni', 'di', 'hi']]

# Semi-greedy algorithm

Note that text_proba does NOT work with single chars, so this might be off at some points?

## `best_next_steps`

It's like "given a string, what's our next best character option?", except we allow for multiple inputs.

In [9]:
# Given a list of 'current' states (e.g. top 3 'tibe' options)
# And a list of things to add on (e.g. all 'bi' options)
# What are the best n options for 'tibebi'
def best_next_steps(current_states, next_options, n=3):
    # Pairs keeps track of all texts + probabilities
    pairs = []
    
    # From each possible 
    for base in current_states:
        # Create all the text options we're looking at
        # Then calculate their probability
        texts = [f"{base}{follower}" for follower in next_options]
        probs = [text_proba(text) for text in texts]
        pairs.extend(zip(texts, probs))
    
    # Only return the top n options
    top = sorted(pairs, key=lambda pair: pair[1], reverse=True)[:n]

    return top

### `get_top_sequences` and `get_top_sequences_all`

Given a breakdown (or a list of breakdowns), what are the best Ethiopic options? Greedy-ish, only follows the top 3 by default but you can adjust it.

In [27]:
from tqdm import tqdm_notebook

# Receive a list like ['ti', 'be', 'bi', 'ni']
# Tries out each Ethiopic option, returns top n most likely
def get_top_sequences(breakdown, n=3, lower_bound=None):
    eth_poss = [lat_eth[latin] for latin in breakdown]

    current = eth_poss[0]
    for next_options in eth_poss[1:]:
        top = best_next_steps(current, next_options, n)

        # Only keep ones above lower score bound
        # if you want to use them all, lower_bound should be None
        if lower_bound != None:
            current = [option[0] for option in top if option[1] > lower_bound]
            # For debugging
            dropped = len(top) - len(current)
        else:
            current = [option[0] for option in top]
    return top

# Takes a list of potential breakdowns
# n is passed to get_top_sequences
# limit is used here
def get_top_sequences_all(breakdowns, limit=5, n=3):
    tops = []
    
    # shorter breakdowns go first
    breakdowns = sorted(breakdowns, key=lambda breakdown: len(breakdowns))

    lower_bound = 0
    for breakdown in tqdm_notebook(breakdowns):
        top = get_top_sequences(breakdown, n, lower_bound=lower_bound)
        if len(top) > 0:
            worst_score = top[-1][1]
            if worst_score > lower_bound:
                lower_bound = worst_score
            tops.extend(top)
        
    return sorted(tops, key=lambda pair: pair[1], reverse=True)[:limit]

# Given Latin script, what is the best Ethiopic?
def top_transcriptions(text, limit=5, n=3):
    breakdowns = get_breakdowns(text)
    return get_top_sequences_all(breakdowns, limit, n)

## Examples of everything

In [11]:
# Getting potential breakdowns
get_breakdowns('tibebini')

[['t', 'i', 'b', 'e', 'b', 'i', 'n', 'i'],
 ['t', 'i', 'b', 'e', 'b', 'i', 'ni'],
 ['t', 'i', 'b', 'e', 'bi', 'n', 'i'],
 ['t', 'i', 'b', 'e', 'bi', 'ni'],
 ['t', 'i', 'be', 'b', 'i', 'n', 'i'],
 ['t', 'i', 'be', 'b', 'i', 'ni'],
 ['t', 'i', 'be', 'bi', 'n', 'i'],
 ['t', 'i', 'be', 'bi', 'ni'],
 ['ti', 'b', 'e', 'b', 'i', 'n', 'i'],
 ['ti', 'b', 'e', 'b', 'i', 'ni'],
 ['ti', 'b', 'e', 'bi', 'n', 'i'],
 ['ti', 'b', 'e', 'bi', 'ni'],
 ['ti', 'be', 'b', 'i', 'n', 'i'],
 ['ti', 'be', 'b', 'i', 'ni'],
 ['ti', 'be', 'bi', 'n', 'i'],
 ['ti', 'be', 'bi', 'ni']]

In [12]:
# Converting from latin to potential ethiopic
# (only works for one breakdown at a time)
breakdown = breakdowns[-1]
[lat_eth[segment] for segment in breakdown]

[['ኢ', 'እ', 'ዒ', 'ዕ'],
 ['ኒ', 'ን', 'ኚ', 'ኝ'],
 ['ዲ', 'ድ', 'ዺ', 'ፂ'],
 ['ሂ', 'ህ', 'ሒ', 'ሕ', 'ኂ', 'ኅ', 'ኺ', 'ኽ', 'ⷒ']]

In [13]:
# Calculate text probability
text_proba('ጥበብ')

0.0001135805060129536

In [14]:
# Given the starting options of ['ጥበ', 'ጥብ', 'ትበ']
# What options out of ['ቢ', 'ብ'] are the best next step?
best_next_steps(
    ['ጥበ', 'ጥብ', 'ትበ'],
    ['ቢ', 'ብ']
)

[('ጥበብ', 0.0001135805060129536),
 ('ጥብብ', 5.570733065839986e-05),
 ('ጥበቢ', 1.7590755614174365e-05)]

In [15]:
# get_top_sequences gets the top n options for a breakdown
print("Looking at", breakdown)
get_top_sequences(breakdown, 5)

Looking at ['i', 'ni', 'di', 'hi']


[('እኚዲህ', 3.228183081067021e-07),
 ('እንዲህ', 2.692246230842964e-07),
 ('እኚድህ', 1.6690833582038836e-07),
 ('እኚፂህ', 1.57611673325971e-07),
 ('እንድህ', 1.3919852955185693e-07)]

In [16]:
# get_top_sequences_all does it for multiple breakdowns
print("Looking at", len(breakdowns), "breakdowns")
get_top_sequences_all(breakdowns, limit=10)

Looking at 8 breakdowns


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




[('እኚዲህ', 3.228183081067021e-07),
 ('እንዲህ', 2.692246230842964e-07),
 ('እኚድህ', 1.6690833582038836e-07),
 ('እኚዲህኢ', 2.3856345637625673e-10),
 ('እኚዲህእ', 2.3244859594548903e-10),
 ('እንዲህኢ', 1.9895760250176246e-10),
 ('እንኢዲህ', 1.9895754405773796e-10),
 ('እንእዲህ', 1.9385786228808685e-10),
 ('እኚዽኢህ', 1.45560695875896e-10),
 ('እኚዽእህ', 1.4182976173606308e-10)]

In [17]:
top_transcriptions("tibenini")

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))




[('ትበንን', 2.4012266244509255e-07),
 ('ጥበንን', 1.2694991862853193e-07),
 ('ትበንኚ', 9.789369922205556e-08),
 ('ትበንእን', 4.110395324151776e-09),
 ('ትእበንን', 4.110394925480327e-09)]

### Worst-case scenario, totally cleared cache

In [31]:
cont_cached = {}
cached = {}

# The cache will fill up as we go along, so
# this word won't always take so long
top_transcriptions("yamijadarguatkhawini")

HBox(children=(IntProgress(value=0, max=576), HTML(value='')))

[('ያምዣደርጓትኸዊን', 2.2041002574484945e-21),
 ('ያምዣደርጓጥኸዊን', 1.6694631837741595e-21),
 ('ያምዣፀርጓትኸዊን', 1.3377501132239324e-21),
 ('ያምዕዣደርጓትኸዊን', 2.3625769779701323e-23),
 ('ያምዣደርጓትኸዊንዕ', 2.3625757615350668e-23)]