In [49]:
import tensorflow as tf

import numpy as np
import os
import time

In [113]:
import json

with open('lat_eth.json') as f:
    lat_eth = json.load(f)

eth2lat = {}
for lat in lat_eth.keys():
    for eth in lat_eth[lat]:
        if eth not in eth2lat.keys():
            eth2lat[eth] = []
        eth2lat[eth].append(lat)

for key in eth2lat.keys():
    eth2lat[key] = set(eth2lat[key])

with open('char2idx.json') as f:
    char2idx = json.load(f)

with open('idx2char.json') as f:
    idx2char = np.array(json.load(f))

eth2lat

{'ሀ': {'ha', 'he'},
 'ሄ': {'he'},
 'ህ': {'h', 'he', 'hi'},
 'ሐ': {'ha', 'he'},
 'ሔ': {'he'},
 'ሕ': {'h', 'he', 'hi'},
 'ኄ': {"'he", 'he'},
 'ኅ': {"'h", 'he', 'hi'},
 'ኼ': {'', 'he', 'hye', 'ke', 'khe', 'xe'},
 'ⷔ': {'', 'he', 'xe'},
 'ⷕ': {'', 'he', 'x'},
 'ሃ': {'ha'},
 'ሓ': {'ha'},
 'ኀ': {"'he", 'ha'},
 'ኃ': {"'ha", 'ha'},
 'ኸ': {'', 'ha', 'ka', 'ke', 'kha', 'xa'},
 'ኻ': {'', 'ha', 'ka', 'kha', 'xa'},
 'ⷐ': {'', 'ha', 'xe'},
 'ⷓ': {'', 'ha', 'xa'},
 'ሁ': {'hu'},
 'ሑ': {'hu'},
 'ኁ': {"'hu", 'hu'},
 'ኹ': {'', 'hu', 'khu', 'ku', 'xu'},
 'ⷑ': {'', 'hu', 'xu'},
 'ሂ': {'hi'},
 'ሒ': {'hi'},
 'ኂ': {"'hi", 'hi'},
 'ኺ': {'', 'hi', 'khi', 'ki', 'xi'},
 'ኽ': {'', 'hi', 'k', 'ke', 'khe'},
 'ⷒ': {'', 'hi', 'xi'},
 'ሆ': {'ho', 'hwo'},
 'ሖ': {'ho', 'hwo'},
 'ኆ': {"'ho", 'ho', 'hwo'},
 'ኾ': {'', 'ho', 'hwo', 'kho', 'ko', 'xo'},
 'ⷖ': {'', 'ho', 'xo'},
 'ለ': {'la', 'le'},
 'ሌ': {'le', 'lye'},
 'ል': {'l', 'le', 'li'},
 'ላ': {'la'},
 'ሉ': {'lu'},
 'ሊ': {'li'},
 'ሎ': {'lo', 'lwo'},
 'ሏ': {'', 'lwa'},
 'ሗ'

In [51]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [52]:
model = tf.keras.models.load_model("char_model", compile=False)
model.compile(optimizer='adam', loss=loss)

In [57]:
def get_breakdowns(segment):
    options = []
    max_len = min([len(segment), 4])
    for i in range(1, max_len+1):
        potential = segment[:i]
        if potential in lat_eth.keys():
            remainder = segment[i:]
            if remainder == "":
                options.append([potential])
            else:
                enders = get_breakdowns(remainder)
                if enders == []:
                    return []
                else:
                    options.extend([potential, *e] for e in enders)
    return options

In [20]:
breakdowns = get_breakdowns("inidihi")
breakdowns

[['i', 'n', 'i', 'd', 'i', 'h', 'i'],
 ['i', 'n', 'i', 'd', 'i', 'hi'],
 ['i', 'n', 'i', 'di', 'h', 'i'],
 ['i', 'n', 'i', 'di', 'hi'],
 ['i', 'ni', 'd', 'i', 'h', 'i'],
 ['i', 'ni', 'd', 'i', 'hi'],
 ['i', 'ni', 'di', 'h', 'i'],
 ['i', 'ni', 'di', 'hi']]

# Softmax converts the predictions into probabilities

Then we create a function where, given a string and a next_char, what is the probability of that being the next character

In [126]:
# Convert predictions to probabilities

cont_cached = {}

def get_next_char_probs(start_string):
    cache_key = start_string
    if cache_key in cont_cached.keys():
        pred_prob = cont_cached[cache_key]
    else:
        input_eval = [char2idx[s] for s in start_string]
        input_eval = tf.expand_dims(input_eval, 0)
        print("input_eval is", input_eval)
        # Ask model to evaluate what's next, given start_string
        model.reset_states()
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        
        # Convert scores to probabilities
        pred_prob = tf.nn.softmax(predictions[0])

        # Cache probabilities
        cont_cached[cache_key] = pred_prob
    return pred_prob

def continuation_proba(start_string, next_char):
    try:
        # Index of target for when we pull it out of the scores
        target_idx = char2idx[next_char]
    except:
        # If we've never seen that char before, get out of here!
        return 0
    
    # Check the cache
    cache_key = start_string
    if cache_key in cont_cached.keys():
        pred_prob = cont_cached[cache_key]
    else:
        # Convert characters into indices for tensorflow processing
        try:
            input_eval = [char2idx[s] for s in start_string]
        except:
            # We've never seen it!!! just exit!!
            return 0

        input_eval = tf.expand_dims(input_eval, 0)

        # Ask model to evaluate what's next, given start_string
        model.reset_states()
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        
        # Convert scores to probabilities
        pred_prob = tf.nn.softmax(predictions[0])

        # Cache probabilities
        cont_cached[cache_key] = pred_prob

    
    proba = pred_prob[target_idx].numpy()
    return proba

start_string = "ኢንኢድኢህኢ"
next_char = "ን"

continuation_proba(start_string, next_char)

0.00088863523

# Probability of each step multiplied out

Current does not adjust for length

In [129]:
%%time

cached = {}

def text_proba(text):
    # Check the cache
    cache_key = text
    if cache_key in cached.keys():
        return cached[cache_key]

    # Otherwise we'll calculate the probability
    proba = 1
    for i in range(1, len(text)):
        base_str = text[:i]
        next_char = text[i]
        proba = proba * continuation_proba(base_str, next_char)
    cached[cache_key] = proba
    return proba

print(text_proba("ጭምቅ"))
print(text_proba("ጭምኢ"))

0.00014832262925360256
9.077041126207062e-06
CPU times: user 1.55 ms, sys: 36 µs, total: 1.58 ms
Wall time: 1.6 ms


In [60]:
from tqdm import tqdm_notebook

possible_eths = ["ጭምቅ", "ጭምኢ"]
scores = []
for possible in tqdm_notebook(possible_eths):  
    scores.append(text_proba(possible))
len(scores)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

2

In [61]:
scores = np.array(scores)
possible_eths = np.array(possible_eths)
top_5 = np.argsort(scores)[-5:]

list(zip(possible_eths[top_5], scores[top_5]))

[('ጭምኢ', 9.077041126207062e-06), ('ጭምቅ', 0.00014832262925360256)]

In [62]:
breakdowns = get_breakdowns("inidihi")
len(breakdowns)

8

In [77]:
%%time
import itertools
MAX_SEQ_LENGTH = 5

def predict(sequence, base=None):
    # print("Base of", base)
    if len(sequence) > MAX_SEQ_LENGTH:
        starters = predict(sequence[:MAX_SEQ_LENGTH], base)
        overall_results = []
        for r in starters:
            if base:
                text = base + r[0]
            else:
                text = r[0]
            results = predict(sequence[MAX_SEQ_LENGTH:], text)
            overall_results.extend(results)
        return sorted(overall_results,
                      key=lambda result: result[1],
                      reverse=True)[:5]
    else:
        expanded = itertools.product(*sequence[:MAX_SEQ_LENGTH])
        amharic_options = [''.join(chars) for chars in expanded]
        if base:
            amharic_options = [base + text for text in amharic_options]
        scores = []
        for possible in amharic_options:
            scores.append(text_proba(''.join(possible)))
        
        top_5 = np.argsort(scores)[-5:][::-1]
        amharic_options = np.array(amharic_options)
        scores = np.array(scores)
        
        return list(zip(amharic_options[top_5], scores[top_5]))

def breakdown_and_predict(text):
    top_fives = []
    for segmentation in tqdm_notebook(get_breakdowns(text), position=0):
        options = [lat_eth[segment] for segment in segmentation]
        top_five = predict(options)
        top_fives.extend(top_five)

    # We'll return the top 10 top fives
    return sorted(top_fives,
              key=lambda result: result[1],
              reverse=True)[:10]

# ጥበብን
breakdown_and_predict("tibebini")

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

CPU times: user 8.36 s, sys: 89.8 ms, total: 8.45 s
Wall time: 8.28 s


[('ጥበብን', 3.280546537670618e-07),
 ('ጥብብን', 1.6089952151951464e-07),
 ('ጥበብኚ', 1.5551292882675773e-07),
 ('ጥበብኝ', 7.773054829150377e-08),
 ('ጥበብኒ', 7.654066773465777e-08),
 ('ትበብእን', 9.345996652290154e-10),
 ('ትእበብን', 9.345995679616942e-10),
 ('ትበብንእ', 9.34598883008878e-10),
 ('ጥዕበብን', 7.604349570773747e-10),
 ('ጥበብንዕ', 7.604344154347199e-10)]

In [39]:
sequence = get_breakdowns("tibebini")[0]
sequence

['t', 'i', 'b', 'e', 'b', 'i', 'n', 'i']

In [40]:
[lat_eth[segment] for segment in sequence]

[['ት', 'ጥ'],
 ['ኢ', 'እ', 'ዒ', 'ዕ'],
 ['ብ'],
 ['ኤ', 'ዔ'],
 ['ብ'],
 ['ኢ', 'እ', 'ዒ', 'ዕ'],
 ['ን', 'ኝ'],
 ['ኢ', 'እ', 'ዒ', 'ዕ']]

In [48]:
sequences = get_breakdowns("yamijadarguatkhawini")
[lat_eth[segment] for segment in sequences[0]]

[['ይ'],
 ['አ', 'ኣ', 'ኧ', 'ዐ', 'ዓ'],
 ['ም'],
 ['ኢ', 'እ', 'ዒ', 'ዕ'],
 ['ጅ'],
 ['አ', 'ኣ', 'ኧ', 'ዐ', 'ዓ'],
 ['ድ', 'ዽ'],
 ['አ', 'ኣ', 'ኧ', 'ዐ', 'ዓ'],
 ['ር'],
 ['ግ', 'ጝ'],
 ['ኡ', 'ዑ'],
 ['አ', 'ኣ', 'ኧ', 'ዐ', 'ዓ'],
 ['ት', 'ጥ'],
 ['ክ', 'ኽ'],
 ['ህ', 'ሕ'],
 ['አ', 'ኣ', 'ኧ', 'ዐ', 'ዓ'],
 ['ው'],
 ['ኢ', 'እ', 'ዒ', 'ዕ'],
 ['ን', 'ኝ'],
 ['ኢ', 'እ', 'ዒ', 'ዕ']]

In [51]:
breakdown_and_predict("kealemi")

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=120), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=90), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=180), HTML(value='')))

HBox(children=(IntProgress(value=0, max=540), HTML(value='')))

HBox(children=(IntProgress(value=0, max=270), HTML(value='')))

[('ክዓልም', 7.803842710079035e-07),
 ('ኬዓልሚ', 5.179787820593345e-07),
 ('ኬዓሌሚ', 5.123903266222173e-07),
 ('ኬአልሚ', 4.105589789794878e-07),
 ('ኬአሌሚ', 4.0612947213904664e-07),
 ('ክኧልም', 1.5400259140580158e-07),
 ('ኽኧለም', 1.357493189736178e-07),
 ('ክኧሌም', 6.683262448934249e-08),
 ('ክኧለም', 5.813822283047255e-08),
 ('ኽኧልም', 4.41697032535445e-08)]