# Set up ICU

In [74]:
%%capture
!pip install pyicu

In [75]:
import icu
def transliterator_from_rules(name, rules):
    fromrules = icu.Transliterator.createFromRules(name, rules)
    icu.Transliterator.registerInstance(fromrules)
    return icu.Transliterator.createInstance(name)

# Download data

In [76]:
_URL = "https://www3.smo.uhi.ac.uk/oduibhin/oideasra/lasid/lasid.zip"

In [77]:
%%capture
!wget {_URL}

In [78]:
%%capture
!unzip lasid.zip

In [79]:
file = open("mapdata.dat", "rb")

In [80]:
lines = file.readlines()

In [None]:
title = ""
for key in b'\xab\xb1\xb4\xba\xbe\xc5\xc6\xc8\xc9\xcd\xcf\xd0\xd1\xd2\xd3\xd5\xd7\xd8\xdb\xdc\xdf\xe3\xe4\xe5\xe6\xe7\xeb\xed\xee\xf2\xf3\xf4\xf5\xf6\xf8\xfb\xfd':
  for line in lines:
    if b'{M' in line:
      title = line
    if key in line:
      print("\\x{:02x} → ".format(key))
      print(title)
      print(line)
      test = line
      break

# LASID conversion

In [282]:
lasid_icu = """
\x07 → ᵏ ;
\\\t → ᵉ ; # \x09
\x0e → ᴵ ;
\x11 → ʰ ;
\x12 → ⁱ ;
\x13 → ᵒ ;
\x14 → ᵒ̤ ;
\x15 → ʳ ;
\x16 → ˢ ;
\x17 → ᶴ ;
\x18 → ᵗ ;
\x19 → ᵘ ;
\x1a → ᵘ̯ ;
\x1c → ᵛ ;
\x1d → ʷ ;
\x1e → ᶾ ;
\x1f → ᵊ ;
\# → ᶠ ; # \x23
\$ → ᵠ ; # \x24
\% → ᵍ ; # \x25
\& → ᵞ ; # \x26 ˠ for IPA
\' → ’ ; # \x27
\: → ː ; # \x3a
\< → ⁱ̈ ; # \x3c
\= → ⁱ̯ ; # \x3d
\? → ʔ ; # \x3f
\@ → ʲ ; # \x40
E → ᴇ ; # \x45
I → ɪ ; # \x49
L → ʟ ;
N → ɴ ;
R → ʀ ;
\^ → ᵐ ; # \x5e
\_ → ǰ ; # crane, 021 # \x5f
\` → ɛ̀̃ ; # limekiln, 078: \x60
\| → ⁿ ; # lamb, 055: \x7c
\~ → ᵑ ; # dreaming, 078; maybe ⁿ̠ ? # \x7e
\x7f → ᴇ̃ ;
\x80 → φ ; # ɸ
\x81 → ü ;
\x83 → ɛ \u0300 ;
\x84 → è \u0323 ; # FIXME
\\\x85 → è̃ ; # this is �, so it needs to be escaped
\x86 → ũ̜ ; # lamb, 038
\x87 → u̜ ; # finger-nails, 043
\x88 → ʈ ; # looks like t̜ : toothache, 033
\x89 → ᵃ ; # eggs, 066
\x8a → è ;
\x8b → ï ;
\x8c → ɔ̜̃ ; # grandmother, 007
\x8d → ɔ̜ ;
\x8e → ɔ̆ ; # before i go, 078
\x8f → õ̜ ; # as cute, 062
\x91 → æ ;
\x92 → o̜ ;
\x93 → ɖ ;
\x94 → ö ;
\x95 → ɑ̜̃ ;
\x96 → û ; # milking, 067
\x97 → ɑ \u0323 ; # FIXME (maybe α̩  or ɑ̜ ?)
\x98 → v̠ ;
\x99 → t̠ ; # toothache, 021
\x9a → r̠ ;
\x9b → ø ;
\x9c → ɴ̠ ; # sick, 034
\x9d → ŋ̠ ; # grazing, 002
\x9e → n̠ ;
\x9f → l̠ ; # plumage, 068
\xa4 → k̠ ; # plumage, 068
\xa5 → g̠ ;
\xa6 → d̠ ; # wedge, 021
\xa7 → ŭ ;
\xa8 → ö̆ ;
\xa9 → ŏ ;
\xaa → ĭ ;
\xab → ɛ̆ ;
\xac → ĕ ;
\xad → o̤ ;
\xae → λ ;
\xaf → ɑ ; # α in the software
\xb0 → ɔ ;
\xb1 → ɑ̆ \u0323 ; # FIXME
\xb2 → ə ;
\xb4 → ᵈ ; # tail, 007
\xb6 → ɑ̆ ; # ᾰ in the software
\xb7 → ă ;
\xb8 → λ \u0323 ; # FIXME
\xb9 → ɛ ;
\xba → ʃ \u030c ; # calling, 067
\xbb → š ;
\xbc → ř ;
\xbd → ɑ̃ ;
\xbe → ẽ ; # tied, 88N
\xc1 → ′ ; # superscript prime
\xc5 → ᴍ̠ ; # fart, 071
\xc6 → ã ; # calf, 046
\xc7 → t \u0323 ; # probably t̞
\xc8 → λ̯ ; # mane, 067
\xc9 → o̯ ; # hare, 088
\xca → Ɫ ; # loaf, 001
\xcb → ɫ ; # loaf, 003
\xcc → m̥ ; # awake, 001
\xcd → ʀ̥ ; # thieving, 003
\xce → ˈ ;
\xcf → ˌ ; # cattle, 040
\xd0 → ð ; # boar, 88N
\xd1 → s \u0323 ; # FIXME # slime 008
\xd2 → r \u0323 ; # FIXME # bulls 067
\xd3 → ɪ̆ ; # suit of clothes 039
\xd4 → ᴇ̀ ;
\xd5 → p \u0323 ; # FIXME # castrating 053
\xd7 → ɪ̃ ; # slime, 007
\xd8 → ɪ̈ ; # calf 027
\xdb → o \u0323 ; # FIXME # cow 028
\xdc → ŋ \u0323 ; # FIXME # tied 078
\xdd → ö̤ ;
\xde → k \u0323 ; # FIXME
\xdf → i \u0323 ; # FIXME # sick 069
\xe1 → g \u0323 ; # FIXME
\xe2 → e \u0323 ; # FIXME
\xe3 → d \u0323 ; # FIXME # agut 052
\xe4 → õ ; # I shall tie 062
\xe5 → b \u0323 ; # FIXME # castrating 071
\xe6 → ɑ̃ \u0323 ; #FIXME # barking 049
\xe7 → ɑ \u0323 ; # FIXME # slime 008
\xe8 → ỹ ;
\xea → λ̃ ;
\xec → ũ ;
\xed → ɔ̃ ; # cow 074
\xee → õ̤ ; # barking 055
\xef → ′ ;
\xf0 → ″ ;
\xf1 → ö̤̃ ; # dreaming, 078
\xf4 → ĩ ; # sick 001
\xf5 → ɣ̃ ; # tied 075
\xf6 → ɛ̃ ; # tied 067
\xf7 → n̥ ; # awake, 059
\xf8 → r̥ ; # slime 002
\xf9 → ʃ ;
\xfb → · ; # slime 058
\xfa → ɣ ;
\xfc → χ ; # limekiln, 080
\xfe → ŋ ;
"""

In [261]:
lasid_titles_icu = """
\xb5 → Á ;
\xd6 → Í ;
\x90 → É ;
\xe0 → Ó ;
"""

In [167]:
lasid_spacing = """
$sp = '\u0020';
$sp $sp $sp $sp $sp → \_;
[^[0-9]] { $sp → ;
::null;
\_  → $sp ;
"""

In [284]:
lasid = transliterator_from_rules('lasid_icu', lasid_icu)
#test=b'049 t \xe6 \x80 \xb2 n *\r\n'
#test=b'055 t a \x80 \xee n *\r\n'
test=b'057 \xe7 L: \xb2 x *\r\n'
#test=b'011 l \xeb n\xef i h \xac *\r\n'
#test=b'034 l \xf3 n\xef h \xb2 *\r\n'
#test=b'074 g\xef \xf2 v\xef \xb2 s *\r\n'
#test=b'054 d\xef \xfd I v\xef \xb2 s *\r\n'

lasid.transliterate(test.decode('ISO-8859-1').rstrip())

'055 t a φ õ̤ n *'

In [217]:
lasid.transliterate(test.decode('ISO-8859-1').rstrip())

"075 ' ɔ h ɔ ĭ h′ *"