## Build transliteration table

Pull table from riptsource.org/cms/scripts/page.php?item_id=entry_detail&uid=vsytndbyev and build both forward and reserve transliteration tables.

In [1]:
import requests
import pandas as pd
import unidecode
import numpy as np

In [2]:
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
headers = {'User-Agent': user_agent}
response = requests.get('https://scriptsource.org/cms/scripts/page.php?item_id=entry_detail&uid=vsytndbyev', headers=headers)

In [3]:
results = pd.read_html(response.text)
df = results[1]
df.head()

Unnamed: 0,Glyph,USV,ALA-LC,Alone-Stokes,B&D,Campbell,Chaîne,Cohen,Dawkins,Dept of State,ethiop,SERA
0,,1200,ha,hā,hä,ha,hă,ha,hä,ha,ha,he
1,,1201,hu,hu,hu,hū,hu,hu,hu,hu,hu,hu
2,,1202,hi,hī,hi,hī,hi,hi,hi,hi,hi,hi
3,,1203,hā,hā,ha,hā,ha,ha,ha,ha,hā,ha
4,,1204,hé,hē,he,hē,he,he,he,he,hē,hE


We can't see the glyph because it was an image! So we'll convert the Unicode code - `USV` - to the character.

In [4]:
df.Glyph = df.USV.apply(lambda val: chr(int(val, 16)))
df.head()

Unnamed: 0,Glyph,USV,ALA-LC,Alone-Stokes,B&D,Campbell,Chaîne,Cohen,Dawkins,Dept of State,ethiop,SERA
0,ሀ,1200,ha,hā,hä,ha,hă,ha,hä,ha,ha,he
1,ሁ,1201,hu,hu,hu,hū,hu,hu,hu,hu,hu,hu
2,ሂ,1202,hi,hī,hi,hī,hi,hi,hi,hi,hi,hi
3,ሃ,1203,hā,hā,ha,hā,ha,ha,ha,ha,hā,ha
4,ሄ,1204,hé,hē,he,hē,he,he,he,he,hē,hE


We'll save this one for now as a CSV (although we probably won't use it)

In [5]:
df.to_csv("transliteration-table.csv", index=False)

## Simplified transliteration table

In [6]:
df = pd.read_csv("transliteration-table.csv")
df = df.drop(columns=['USV'])
df = df.set_index('Glyph')
df

Unnamed: 0_level_0,ALA-LC,Alone-Stokes,B&D,Campbell,Chaîne,Cohen,Dawkins,Dept of State,ethiop,SERA
Glyph,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ሀ,ha,hā,hä,ha,hă,ha,hä,ha,ha,he
ሁ,hu,hu,hu,hū,hu,hu,hu,hu,hu,hu
ሂ,hi,hī,hi,hī,hi,hi,hi,hi,hi,hi
ሃ,hā,hā,ha,hā,ha,ha,ha,ha,hā,ha
ሄ,hé,hē,he,hē,he,he,he,he,hē,hE
...,...,...,...,...,...,...,...,...,...,...
ᎎ,,,,,,,,,pwē,pWE
ᎏ,,,,,,,,,pwe,pW
ፘ,rya,,,,,rya,rya,,~ri,
ፙ,mya,,,,,mya,,,~mA,


We're really just looking at ASCII Latin transliteraitons, so we'll turn `hā` into `ha`, etc.

In [7]:
def simplify(value):
    try:
        return unidecode.unidecode(value).lower()
    except:
        return np.nan
    
simplified = df.applymap(simplify)
simplified

Unnamed: 0_level_0,ALA-LC,Alone-Stokes,B&D,Campbell,Chaîne,Cohen,Dawkins,Dept of State,ethiop,SERA
Glyph,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ሀ,ha,ha,ha,ha,ha,ha,ha,ha,ha,he
ሁ,hu,hu,hu,hu,hu,hu,hu,hu,hu,hu
ሂ,hi,hi,hi,hi,hi,hi,hi,hi,hi,hi
ሃ,ha,ha,ha,ha,ha,ha,ha,ha,ha,ha
ሄ,he,he,he,he,he,he,he,he,he,he
...,...,...,...,...,...,...,...,...,...,...
ᎎ,,,,,,,,,pwe,pwe
ᎏ,,,,,,,,,pwe,pw
ፘ,rya,,,,,rya,rya,,~ri,
ፙ,mya,,,,,mya,,,~ma,


In [17]:
# Break apart h/ha into h and ha
uniqued = simplified.fillna('').apply(lambda row: list(set([v for v in row.values if v and '/' not in str(v)])), axis=1)

# Remove anything that involves @ or ~
uniqued = uniqued.apply(lambda row: [v for v in row if '@' not in v])

# Remove ~ (but keep the rest)
uniqued = uniqued.apply(lambda row: [v.replace('~', '') for v in row])

uniqued

Glyph
ሀ     [he, ha]
ሁ         [hu]
ሂ         [hi]
ሃ         [ha]
ሄ         [he]
       ...    
ᎎ        [pwe]
ᎏ    [pwe, pw]
ፘ    [ri, rya]
ፙ    [ma, mya]
ፚ    [fi, fya]
Length: 368, dtype: object

### Add punctuation

In [18]:
punctuation = {
    '።': ['.'],
    '፡': [' '],
    '፣': [','],
    '፤': [';'],
    '፥': [':'],
    '፧': ['?'],
    '፦': [':']
}

uniqued = pd.concat([pd.Series(punctuation), uniqued])
uniqued

።          [.]
፡          [ ]
፣          [,]
፤          [;]
፥          [:]
       ...    
ᎎ        [pwe]
ᎏ    [pwe, pw]
ፘ    [ri, rya]
ፙ    [ma, mya]
ፚ    [fi, fya]
Length: 375, dtype: object

### Save

In [27]:
uniqued.to_json("eth_lat.json")

In [28]:
import json

with open("eth_lat.json") as f:
    eth_lat = json.load(f)

eth_lat

{'።': ['.'],
 '፡': [' '],
 '፣': [','],
 '፤': [';'],
 '፥': [':'],
 '፧': ['?'],
 '፦': [':'],
 'ሀ': ['he', 'ha'],
 'ሁ': ['hu'],
 'ሂ': ['hi'],
 'ሃ': ['ha'],
 'ሄ': ['he'],
 'ህ': ['he', 'h', 'hi'],
 'ሆ': ['hwo', 'ho'],
 'ለ': ['la', 'le'],
 'ሉ': ['lu'],
 'ሊ': ['li'],
 'ላ': ['la'],
 'ሌ': ['lye', 'le'],
 'ል': ['l', 'li', 'le'],
 'ሎ': ['lwo', 'lo'],
 'ሏ': ['lwa'],
 'ሐ': ['he', 'ha'],
 'ሑ': ['hu'],
 'ሒ': ['hi'],
 'ሓ': ['ha'],
 'ሔ': ['he'],
 'ሕ': ['he', 'h', 'hi'],
 'ሖ': ['hwo', 'ho'],
 'ሗ': ['hwa'],
 'መ': ['ma', 'me'],
 'ሙ': ['mu'],
 'ሚ': ['mi'],
 'ማ': ['ma'],
 'ሜ': ['me', 'mye'],
 'ም': ['m', 'me', 'mi'],
 'ሞ': ['mo', 'mwo'],
 'ᎀ': ['mwe', 'mwa'],
 'ᎁ': ['mwi'],
 'ሟ': ['mwa'],
 'ᎂ': ['mwe'],
 'ᎃ': ['mwe', 'mw'],
 'ሠ': ['sa', "'se"],
 'ሡ': ['su', "'su"],
 'ሢ': ['si', "'si"],
 'ሣ': ['sa', "'sa"],
 'ሤ': ['se', 'sye', "'se"],
 'ሥ': ['se', "'s", 'si'],
 'ሦ': ["'so", 'swo', 'so'],
 'ሧ': ["'swa", 'swa'],
 'ረ': ['re', 'ra'],
 'ሩ': ['ru'],
 'ሪ': ['ri'],
 'ራ': ['ra'],
 'ሬ': ['re', 'rye'],
 'ር': ['re', 'r',

In [29]:
lat_eth = {}
for eth in eth_lat.keys():
    for lat in eth_lat[eth]:
        if lat not in lat_eth.keys():
            lat_eth[lat] = []
        lat_eth[lat].append(eth)

for key in lat_eth.keys():
    lat_eth[key] = list(set(lat_eth[key]))

lat_eth

{'.': ['።'],
 ' ': ['፡'],
 ',': ['፣'],
 ';': ['፤'],
 ':': ['፦', '፥'],
 '?': ['፧'],
 'he': ['ሐ', 'ኼ', 'ⷔ', 'ኅ', 'ⷕ', 'ኄ', 'ሔ', 'ህ', 'ሄ', 'ሕ', 'ሀ'],
 'ha': ['ሓ', 'ኀ', 'ሐ', 'ሃ', 'ⷐ', 'ኻ', 'ኸ', 'ⷓ', 'ሀ', 'ኃ'],
 'hu': ['ⷑ', 'ሑ', 'ኁ', 'ሁ', 'ኹ'],
 'hi': ['ሒ', 'ሂ', 'ኅ', 'ኂ', 'ⷒ', 'ህ', 'ሕ', 'ኺ', 'ኽ'],
 'h': ['ሕ', 'ህ'],
 'hwo': ['ሆ', 'ኾ', 'ሖ', 'ኆ'],
 'ho': ['ⷖ', 'ሖ', 'ኆ', 'ኾ', 'ሆ'],
 'la': ['ላ', 'ለ'],
 'le': ['ለ', 'ሌ', 'ል'],
 'lu': ['ሉ'],
 'li': ['ል', 'ሊ'],
 'lye': ['ሌ'],
 'l': ['ል'],
 'lwo': ['ሎ'],
 'lo': ['ሎ'],
 'lwa': ['ሏ'],
 'hwa': ['ኋ', 'ሗ', 'ዃ', 'ኈ'],
 'ma': ['መ', 'ማ', 'ፙ'],
 'me': ['መ', 'ም', 'ሜ'],
 'mu': ['ሙ'],
 'mi': ['ሚ', 'ም'],
 'mye': ['ሜ'],
 'm': ['ም'],
 'mo': ['ሞ'],
 'mwo': ['ሞ'],
 'mwe': ['ᎂ', 'ᎃ', 'ᎀ'],
 'mwa': ['ሟ', 'ᎀ'],
 'mwi': ['ᎁ'],
 'mw': ['ᎃ'],
 'sa': ['ሸ', 'ሣ', 'ፀ', 'ሠ', 'ጸ', 'ፃ', 'ሰ', 'ሻ', 'ጻ', 'ሳ'],
 "'se": ['ሤ', 'ፀ', 'ሠ', 'ፄ'],
 'su': ['ሡ', 'ፁ', 'ጹ', 'ሹ', 'ሱ'],
 "'su": ['ሡ', 'ፁ'],
 'si': ['ጽ', 'ፂ', 'ስ', 'ሽ', 'ሲ', 'ፅ', 'ሥ', 'ጺ', 'ሢ', 'ሺ'],
 "'si": ['ፂ', 'ሢ'],
 "'sa": ['ሣ'

In [26]:
with open("lat_eth.json", 'w') as f:
    json.dump(lat_eth, f)