# Set up ICU

In [1]:
%%capture
!pip install pyicu

In [2]:
import icu
def transliterator_from_rules(name, rules):
    """Create a transliterator from a rule string and return an instance of it.

    The rules are compiled under the ID ``name``, the compiled object is
    passed to ``Transliterator.registerInstance``, and a fresh instance is
    then obtained by looking ``name`` up with ``Transliterator.createInstance``.

    Args:
        name: ID under which the transliterator is created and looked up.
        rules: ICU transliteration rule text.

    Returns:
        The object returned by ``icu.Transliterator.createInstance(name)``.
    """
    translit_cls = icu.Transliterator
    translit_cls.registerInstance(translit_cls.createFromRules(name, rules))
    return translit_cls.createInstance(name)

# Download data

In [3]:
# Location of the zip archive fetched by the download cell below.
_URL = "https://www3.smo.uhi.ac.uk/oduibhin/oideasra/lasid/lasid.zip"

In [4]:
%%capture
!wget {_URL}

In [5]:
%%capture
!unzip lasid.zip

In [6]:
# Open mapdata.dat in binary mode so its contents are read as raw bytes.
file = open("mapdata.dat", "rb")

In [7]:
# Read every line as bytes. Using the already-open handle as a context manager
# closes it as soon as reading finishes, instead of leaking the descriptor for
# the lifetime of the kernel.
with file:
    lines = file.readlines()

In [33]:
# Print only the raw records that contain a tab byte (0x09).
for line in lines:
    if b'\x09' not in line:
        continue
    print(line)

b'066 R \t i h \xb2 n *\r\n'
b'088 k l \t i:   k l e i *\r\n'
b'018 b\xef \t \xaf u n *\r\n'
b'015 s \xd5\xef \t u: n *\r\n'
b'88S f \t i: *\r\n'
b'073 \xce t\xef \xad N t \t i   \xce h a r t *\r\n'
b'043 \xce f\xef \xb9: \t r b\xef i \xce k e: r d \xb2 \xce x\xef @ u: r \x11 \xb2 s t u \xfa \xad m *\r\n'


# LASID conversion

In [64]:
# ICU transliteration rules that map the single-byte codes found in the data
# (after decoding as ISO-8859-1) to Unicode phonetic characters.
#
# This is a regular (non-raw) Python string, so Python itself turns each \xNN
# and \uNNNN into the literal character before ICU sees the rule text. Where
# the rule needs a backslash to reach ICU (to escape a punctuation character or
# the tab), it is written as "\\" so that Python does not see an invalid escape
# sequence such as "\:" (a DeprecationWarning, and a SyntaxWarning on 3.12+).
lasid_icu = """
\x07 → ᵏ ;
\\\t → ᵉ ; # \x09
\x11 → ʰ ;
\x12 → ⁱ ;
\x1e → ᶾ ;
\x1f → ᵊ ;
\\: → ː ; # \x3a
\\? → ʔ ; # \x3f
\\@ → ʲ ; # \x40
E → ᴇ ; # \x45
I → ɪ ; # \x49
L → ʟ ;
N → ɴ ;
R → ʀ ;
\\_ → ǰ ; # crane, 021 # \x5f
\\~ → ᵑ ; # dreaming, 078; maybe ⁿ̠ ? # \x7e
\x80 → ɸ ;
\x83 → ɛ \u0300 ;
\x87 → u̜ ; # finger-nails, 043
\x88 → ʈ ; # looks like t̜ : toothache, 033
\x8a → è ;
\x8b → ï ;
\x8d → ɔ̜ ;
\x94 → ö ;
\x97 → α \u0323 ; # FIXME (maybe α̩  or ɑ̜ ?)
\x99 → t̠ ; # toothache, 021
\x9a → r̠ ;
\x9b → ø ;
\xa6 → d̠ ; # wedge, 021
\xa9 → ŏ ;
\xac → ĕ ;
\xad → o̤ ;
\xae → λ ;
\xaf → α ;
\xb0 → ɔ ;
\xb2 → ə ;
\xb6 → ᾰ ;
\xb7 → ă ;
\xb8 → λ \u0323 ; # FIXME
\xb9 → ɛ ;
\xc7 → t \u0323 ; # probably t̞
\xcc → m̥ ; # awake, 001
\xce → ˈ ;
\xd1 → s \u0323 ; # FIXME
\xd4 → ᴇ̀ ;
\xdd → ö̤ ;
\xde → k \u0323 ; # FIXME
\xe1 → g \u0323 ; # FIXME
\xe2 → e \u0323 ; # FIXME
\xe8 → ỹ ;
\xea → λ̃ ;
\xef → ′ ;
\xf0 → ″ ;
\xf1 → ö̤̃ ; # dreaming, 078
\xf7 → n̥ ; # awake, 059
\xf9 → ʃ ;
\xfa → ɣ ;
\xfe → ŋ ;
"""

In [65]:
# Build the transliterator from the rule text above under the ID 'lasid_icu'.
lasid = transliterator_from_rules('lasid_icu', lasid_icu)


In [66]:
# Sample raw record (the first tab-containing line printed earlier), used to
# try out the transliterator.
test = b'066 R \t i h \xb2 n *\r\n'

In [67]:
# Decode the bytes as ISO-8859-1 (each byte becomes the code point of the same
# value), drop trailing whitespace such as the CR LF, then transliterate.
decoded = test.decode('ISO-8859-1').rstrip()
lasid.transliterate(decoded)

'066 ʀ ᵉ i h ə n *'