# Import lookup table

In [93]:
import random
import json
import string
import unidecode
import regex

In [2]:
with open("raw/lookup-table.json") as f:
    lookup = json.load(f)

lookup

{'።': '.',
 '፡': ' ',
 '፣': ',',
 '፤': ';',
 '፥': ':',
 '፧': '?',
 '፦': ':',
 'ሀ': ['he', 'ha'],
 'ሁ': ['hu'],
 'ሂ': ['hi'],
 'ሃ': ['ha'],
 'ሄ': ['he'],
 'ህ': ['he', 'hi', 'h'],
 'ሆ': ['hwo', 'ho'],
 'ለ': ['la', 'le'],
 'ሉ': ['lu'],
 'ሊ': ['li'],
 'ላ': ['la'],
 'ሌ': ['lye', 'le'],
 'ል': ['l', 'li', 'le'],
 'ሎ': ['lo', 'lwo'],
 'ሏ': ['lwa'],
 'ሐ': ['he', 'ha'],
 'ሑ': ['hu'],
 'ሒ': ['hi'],
 'ሓ': ['ha'],
 'ሔ': ['he'],
 'ሕ': ['he', 'hi', 'h'],
 'ሖ': ['hwo', 'ho'],
 'ሗ': ['hwa'],
 'መ': ['ma', 'me'],
 'ሙ': ['mu'],
 'ሚ': ['mi'],
 'ማ': ['ma'],
 'ሜ': ['mye', 'me'],
 'ም': ['m', 'mi', 'me'],
 'ሞ': ['mwo', 'mo'],
 'ᎀ': ['mwa', 'mwe'],
 'ᎁ': ['mwi'],
 'ሟ': ['mwa'],
 'ᎂ': ['mwe'],
 'ᎃ': ['mw', 'mwe'],
 'ሠ': ['sa', "'se"],
 'ሡ': ['su', "'su"],
 'ሢ': ["'si", 'si'],
 'ሣ': ["'sa", 'sa'],
 'ሤ': ['se', 'sye', "'se"],
 'ሥ': ["'s", 'se', 'si'],
 'ሦ': ["'so", 'swo', 'so'],
 'ሧ': ["'swa", 'swa'],
 'ረ': ['ra', 're'],
 'ሩ': ['ru'],
 'ሪ': ['ri'],
 'ራ': ['ra'],
 'ሬ': ['re', 'rye'],
 'ር': ['re', 'ri', 'r'],
 'ሮ': 

# Tools to build randomized transliterations

In [48]:
import re

failures = set([])

def select_char(c):
    # If it's not in the lookup table, that's ok
    try:
        return random.choice(lookup[c])
    except:
        failures.add(c)
        return unidecode.unidecode(c).lower()

def transliterate(word):
    selected = ''.join([select_char(c) for c in word])
    selected = selected.replace("፡", " ")
    selected = re.sub(r'\s+', ' ', selected)
    return selected.strip()

In [49]:
text = "ማንም፡ሰው፡ቢሆን፡የጭካኔ፡ስቃይ፡እንዳይደርስበት፡ወይም፡ከሰብዓዊ፡አፈጻጸም፡ውጭ፡የሆነ፡የተዋረድ፡ተግባር፡ወይም፡ቅጣት፡አይፈጸምበትም።"

transliterate(text)

"manimi sewe bihone yecekanye skayi indayedarisibete weyim kasabeawi `afetsasem wtch'e yahone yetawaredi tegbare weyemi kitate 'ayfasemebatm."

In [50]:
for _ in range(5):
    result = transliterate(text)
    print(f"* {result}")

* manm sawi bihwone yacekane sqay `enedaydarisebate weyeme kaseb'awi `afatsasam weci yehona yatawared tegebari wayemi qtat `ajefasamebatme.
* manem sew bihon yetch'ekanye seqayi 'enedaydarsbati wejem kasabawi 'afatsasami wici yahwona yetawarade tagibare wayim qtati ayefatsambatme.
* manimi sawi bihon jacekanye skayi `enedajederesebate weym kesebe`awi 'afesaseme wce jahona yetewaradi tegibare weyime qtati 'ajefesemebatme.
* maneme sewe bihwon jachikanye siqayi indayedarisbeti wayim kasab'awi afasatsam wce jahone yetawarad tegbare weyemi qetate `ayfesemebatme.
* manime sew bihone jacekane sqaje inedayiderisibete weyime kasebe'awi `afasatsame wec jahwone yatewarede tegbar wayem qtati ayfasamibetimi.


# Use it

Using data from `new-am.txt`, which is Bible + legal + news.

In [51]:
#text = open("corpuscrawler-master/corpus/am.txt").readlines()
text = open("raw/new-am.txt").read().strip().splitlines()
text[:5]

['ጠ/ሚ መለስ ዜናዊ "ጦርነት ኳስ ጨዋታ አይደለም!" አሉ',
 'ሰሞኑን በሕወሓት/ኢሕአዴግ ግምገማ ውስጥ ዋነኛው የግምገማ በትር ያረፈው በጠ/ሚ መለስ ዜናዊ ላይ መሆኑ ተደጋግሞ እየተሰማ ነው።',
 'ከዚሁ ጋር ተያይዞ የጠ/ሚንስትሩ ጋርዶች በሌሎች መቀየራቸው፣  ከአቶ መለስ ዜናዊ ጋር የሚያገናኙ የቤተ መንግሥት የስልክ ግንኙነቶች መቋረጣቸው በሰፊው እየተነገረ ሲሆን፣ማንኛውንም የወቅቱን ጉዳይ አስመልክቶ መነጋገር የሚቻለው ከውጭ ጉዳይ ሚ/ር መ/ቤት ጋር መሆኑ ታውቋል።',
 'አቶ መለስ ዜናዊ በዚህ ሁኔታ ውስጥ መሆናቸው እየተነገረ ባለበት ወቅት ነው ከአሜሪካ ሬዲዮ ጋር ቃለ ምልልስ ያደረጉት።',
 'በቃለ ምልልሱም ወቅት በሕወሓት አመራር አካል የኤርትራን ጥያቄ ልዩነት ነበር ይባላል።']

In [52]:
# Drop duplicates
uniques = [line for line in list(set(text)) if line]
len(uniques)

48651

# Transliterate and save

In [122]:
import pandas as pd

def clean(text):
    # Remove invalid unicode chars
    # note it's the regex library, not re
    cleaned = regex.sub(r'(\p{Cn}|\p{Co}|\p{Cf}|\p{Cc})', '', text)
    
    # Replace space char with spaces
    cleaned = text.replace("፡", " ").strip()

    # Remove duplicate spaces
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned

# Do you want to transliterate each row in multiple ways?
NUM_TRANSLITERATIONS = 2

sources = []
transliterateds = []
skipped = 0
for text in uniques:
    cleaned = clean(text)
    options = [transliterate(cleaned) for _ in range(NUM_TRANSLITERATIONS)]
    for transliteration in options:
        # Make sure we can split afterwards, some of these texts are real weird
        if len(cleaned.split(" ")) != len(transliteration.split(" ")):
            skipped = skipped + 1
        else:
            sources.append(cleaned)
            transliterateds.append(transliteration)

print("Skipped", skipped)
print("Fail chars:", failures)

Skipped 22
Fail chars: {'፶', '5', ':', '(', '\uf033', '፮', '\ufeff', 'f', 'E', 'י', '8', 'p', 'h', '፳', '&', '3', 'D', 'd', 'ז', '\x81', '—', 'M', '6', '9', '፹', ',', '\uf032', 'Í', 'ש', 'e', 'V', '“', 'r', '*', 'צ', 'ת', '′', '`', '̧', 'z', 'x', '/', 'c', '\x9d', '፰', '$', '¾', 's', '»', 'm', '”', '‘', 'g', '^', 'B', 'Z', '́', 'ê', 'P', 'ד', '÷', 'ª', 'y', 'Q', '፬', 'K', '-', 'I', '፵', '\uf038', '’', '፭', '!', 'נ', 'ע', 'Y', '¬', 'R', 'G', '፲', '4', ' ', '@', '×', '፪', '̃', '፯', '\uf036', 'L', '፴', '#', ';', 'Ø', 'Ñ', 'n', '"', 'א', '1', '¨', '፸', '፩', 'S', '\uf037', 'u', 'q', 'v', 'l', '\xa0', ')', 'ƒ', 'U', 'w', '፱', '%', '¡', 'Ó', 'ב', '[', '2', "'", 'A', '̈', 'T', 'j', '\uf020', '©', '+', '<', '7', 'i', '=', '\uf035', '፷', 'F', '‹', '?', '.', '\uf031', '>', '\uf034', 'o', '\u2060', '፻', 'b', '0', '፺', 'W', 'ק', '￼', 'Á', 'k', 'ל', '›', 'Ï', 'a', 'ו', 'C', '«', '…', 'X', 't', 'Ÿ', 'ח', 'N', 'H', 'O', '፫', '}', '_', '\uf039', 'ነ', 'J', ']'}


In [123]:
# Shuffle so it's randomized
df = pd.DataFrame({
    'source': sources,
    'transliterated': transliterateds
}).sample(frac=1).reset_index(drop=True)

print(df.shape)
df.head()

(97280, 2)


Unnamed: 0,source,transliterated
0,ቀዝቃዛ ውኃ የዛለችን ነፍስ እንደሚያረካ ሁሉከሩቅ አገር የመጣ መልካም ወ...,kazeqaza weha yazalatchene nefsi inidamiyaraka...
1,በመልካም ሁኔታ የሚያስተዳድሩ በተለይ ደግሞ በመናገርና በማስተማር ተግተው...,bemalkam huneta jamiyasitadaderu batelaye dagi...
2,መጽሐፍ ቅዱስ በመጨረሻዎቹ ቀናት ክፋት በከፍተኛ ሁኔታ እንደሚባባስ ይነግ...,mashefe qduse bemeceraxawotchu qana'te kefat b...
3,የአክሱም ሐውልት ሁለተኛው ክፋይ አክሱም ገባ,ye'akesum hawlte huletanaw kfaye akisume geba
4,የሴም ወንዶች ልጆች ኤላም፣ አሹር፣ አርፋክስድ፣ ሉድ እና አራም ነበሩ።,"yasyeme wanidwoce lgoce 'elam, `ashure, `arifa..."


In [130]:
with open('raw/original.txt', 'w') as f:
    f.write('\n'.join(df.source))
    
with open('raw/transliterated.txt', 'w') as f:
    f.write('\n'.join(df.transliterated))

In [None]:
!head -n 2 raw/original.txt

In [None]:
!head -n 2 raw/transliterated.txt