In [81]:
import re
import sys
import os
import xml.etree.ElementTree as ET
import xml.dom.minidom
import xml
import requests
import json
from pathlib import Path
import PyPDF2 as ppdf
import string
import pickle
from tqdm import tqdm
from ratelimit import limits, sleep_and_retry

In [82]:
# ---------------------------------------------------------------------------
# Notebook-wide configuration
# ---------------------------------------------------------------------------

# Address of the ILRDF dictionary web service.
URL = "https://e-dictionary.ilrdf.org.tw/wsReDictionary.htm"

# Numeric id -> tribe name.
# NOTE(review): the ids are presumably the ones used by the web service at
# URL — confirm against the scraping code.
# 'Puyma' is kept as spelled: the names double as on-disk folder / file names
# (see getPickles and the export loop), so renaming would break those paths.
original_dict = {
    2: 'Amis',
    6: 'Atayal',
    24: 'Paiwan',
    22: 'Bunun',
    38: 'Puyma',
    28: 'Rukai',
    35: 'Tsou',
    13: 'Saisiyat',
    42: 'Yami',
    14: 'Thao',
    34: 'Kavalan',
    33: 'Truku',
    43: 'Sakizaya',
    16: 'Seediq',
    37: 'Saaroa',
    36: 'Kanakanavu'
}

# Reverse lookup: tribe name -> numeric id.
TRIBES = dict((tribe_name, tribe_id) for tribe_id, tribe_name in original_dict.items())

# Tribe names in alphabetical order; derived from TRIBES so the two can never
# drift apart.
NAMES = sorted(TRIBES)

# NOTE(review): INTERVAL / RATE_LIMIT / RATE_PERIOD are not read by any cell
# visible here; presumably batch size and request throttling for the scraper
# — confirm before changing.
INTERVAL = 500

RATE_LIMIT = 25
RATE_PERIOD = 1

# Tribe name -> position of that name inside NAMES.
NAME_TO_IDX = dict(zip(NAMES, range(len(NAMES))))

# Root folder holding one sub-folder of checkpoint pickles per tribe.
PICKLE_FOLDER = '.PickleScrapes/'

# Tribe name -> language code written to the xml:lang attribute of the export.
# NOTE(review): 'ais' (Sakizaya) and 'sdq' (Seediq) may not be the registered
# ISO 639-3 identifiers for these languages — TODO verify against the
# ISO 639-3 code tables before publishing the XML.
TRIBE_STR_TO_ISO = {
    'Amis': 'ami',
    'Atayal': 'tay',
    'Paiwan': 'pwn',
    'Bunun': 'bnn',
    'Puyma': 'pyu',
    'Rukai': 'dru',
    'Tsou': 'tsu',
    'Saisiyat': 'xsy',
    'Yami': 'tao',
    'Thao': 'ssf',
    'Kavalan': 'ckv',
    'Truku': 'trv',
    'Sakizaya': 'ais',
    'Seediq': 'sdq',
    'Saaroa': 'sxr',
    'Kanakanavu': 'xnb'
}


In [83]:
def getPickles(tribe:str):
    """Load the two end-of-run pickles stored for one tribe.

    Reads ``<PICKLE_FOLDER>/<tribe>/<tribe>_ckpt_END.pkl`` and then
    ``<PICKLE_FOLDER>/<tribe>/<tribe>_fails_END.pkl``.

    Args:
        tribe: tribe name; used both as the sub-folder and as the file prefix.

    Returns:
        A ``(ckpt_good, ckpt_fail)`` tuple holding whatever objects were
        pickled. Other cells in this notebook call ``.items()`` on the first
        and ``len()`` on both.

    Raises:
        OSError: if either file is missing or unreadable.
    """
    def _unpickle(suffix):
        target = os.path.join(PICKLE_FOLDER, tribe, tribe + suffix)
        with open(target, 'rb') as handle:
            # SECURITY: pickle.load can execute arbitrary code while decoding.
            # Only point this at pickle files you produced yourself.
            return pickle.load(handle)

    # The tuple is evaluated left to right, so the checkpoint is read first.
    return _unpickle('_ckpt_END.pkl'), _unpickle('_fails_END.pkl')

In [84]:
def printDataRatios():
    """Print, per tribe, the fraction of checkpoint entries not in the fail list.

    For every name in NAMES one line is printed: the first six characters of
    the name, a tab, and ``(len(checkpoint) - len(fails)) / len(checkpoint)``
    formatted with four decimals.

    Raises:
        ZeroDivisionError: if a tribe's checkpoint is empty.
    """
    for tribe in NAMES:
        checkpoint, failures = getPickles(tribe)
        total = len(checkpoint)
        share_ok = (total - len(failures)) / total
        print(f'{tribe[0:6]}\t{share_ok:.4f}')

In [85]:
def makeLists(tribe: str) -> (list, list, list):
    """Split one tribe's checkpoint into successful words and their payloads.

    Args:
        tribe: tribe name passed straight to getPickles.

    Returns:
        ``(goodwords, singles, multi)`` where
        * ``goodwords`` are the checkpoint keys whose value is not ``'FAIL'``,
        * ``singles`` are the values of those keys that are dicts,
        * ``multi`` are the values of those keys that are lists,
        each in checkpoint iteration order.

    Raises:
        AssertionError: if successes plus recorded failures do not add up to
            the number of checkpoint entries.
    """
    entries, failed = getPickles(tribe)

    # Anything not explicitly marked 'FAIL' counts as a successful lookup.
    goodwords = [word for word, payload in entries.items() if payload != 'FAIL']

    # Sanity check: every entry is either good or listed as a failure.
    assert len(goodwords) + len(failed) == len(entries)

    payloads = [entries[word] for word in goodwords]
    multi = [payload for payload in payloads if isinstance(payload, list)]
    singles = [payload for payload in payloads if isinstance(payload, dict)]

    return (goodwords, singles, multi)


In [86]:
def handleHelper(sent):
    """Pull the fields needed for one XML sentence out of a sentence record.

    Args:
        sent: mapping describing one example sentence.

    Returns:
        ``[original_text, chinese_text, file_info]`` when the record has all of
        the keys 'Original', 'Chinese' and 'File'; ``False`` otherwise.
    """
    required = ('Original', 'Chinese', 'File')

    # Bail out as soon as one mandatory field is absent.
    if any(field not in sent.keys() for field in required):
        return False

    return [sent['Original'], sent['Chinese'], sent['File']]

def createElemHelp(tr, count, r):
    """Build the ``<S>`` element for one example sentence.

    The element looks like::

        <S id="{tr}_{count}">
          <FORM>original sentence</FORM>
          <TRANSL xml:lang="zho">Chinese translation</TRANSL>
          <AUDIO url="..."/>
        </S>

    Args:
        tr: tribe name, used as the prefix of the element id.
        count: running sentence index, used as the suffix of the element id.
        r: ``[form_text, translation_text, audio]`` as returned by
            handleHelper. ``audio`` is either a mapping with a 'Path' key or a
            list of such mappings, in which case the first one is used.
            ``r`` is not modified.

    Returns:
        ``(element, count + 1, form_text)``.

    Raises:
        IndexError: if ``audio`` is an empty list.
        KeyError: if the chosen audio mapping has no 'Path' key.
    """
    fr_tx, cn_tx, audio_info = r[0], r[1], r[2]

    # Yami produces a list of audio records, so pick the first choice.
    # Bind it to a local rather than writing back into ``r`` so the caller's
    # list is left untouched.
    if isinstance(audio_info, list):
        audio_info = audio_info[0]

    # xml setup
    s = ET.Element('S')
    s.set('id', str(tr) + "_" + str(count))

    form = ET.SubElement(s, 'FORM')  # sentence
    form.text = fr_tx

    tl = ET.SubElement(s, 'TRANSL')  # translation
    tl.set('xml:lang', 'zho')
    tl.text = cn_tx

    audio = ET.SubElement(s, 'AUDIO')  # audio link
    audio.set('url', audio_info['Path'])

    return s, count + 1, form.text

def wrapperXML(sent, root, count, seen, tr=None):
    """Turn one sentence record into an ``<S>`` element and attach it to root.

    Args:
        sent: sentence record, validated through handleHelper.
        root: XML element that receives the new ``<S>`` child.
        count: running sentence index used for the element id.
        seen: dict keyed by sentence text already emitted; updated in place.
        tr: tribe name used as the id prefix. Optional for backward
            compatibility: when omitted, the ``id`` attribute of ``root`` is
            used. Earlier versions silently read a global named ``tr``, which
            only existed while the export loop was running.

    Returns:
        The next free index: ``count`` unchanged when the record is incomplete
        or its text was already emitted, otherwise the incremented value
        returned by createElemHelp.
    """
    if tr is None:
        tr = root.get('id')

    # process every request thru same pipeline
    r = handleHelper(sent)
    if not r:
        return count

    s, next_count, te = createElemHelp(tr, count, r)

    # memoization: a sentence text is written only once per document, and a
    # duplicate must not consume an id.
    if te in seen:
        return count

    seen[te] = ''
    root.append(s)
    return next_count

def handleExplanation(expl, root, tr: str, count: int, seen):
    """Emit every example sentence found in a word's 'Explanation' payload.

    Args:
        expl: a single explanation dict or a list of them. Any other type is
            ignored.
        root: XML element that receives the ``<S>`` children.
        tr: tribe name, used as the sentence id prefix.
        count: running sentence index before this call.
        seen: dict of sentence texts already emitted; updated in place.

    Returns:
        ``(count, seen)`` with ``count`` advanced by the number of sentences
        appended and ``seen`` being the same object that was passed in.
    """
    # Normalise the two accepted shapes to "list of explanations".
    if isinstance(expl, dict):
        explanations = [expl]
    elif isinstance(expl, list):
        explanations = expl
    else:
        return count, seen

    for explan in explanations:
        # `.keys()` is deliberate: a non-dict entry fails loudly here instead
        # of being skipped.
        if 'Sentence' not in explan.keys():
            continue
        sent = explan['Sentence']

        # 'Sentence' is either one record (dict) or several (list).
        if isinstance(sent, dict):
            sentences = [sent]
        elif isinstance(sent, list):
            sentences = sent
        else:
            continue

        for n in sentences:
            count = wrapperXML(n, root, count, seen, tr)
    return count, seen

In [87]:
# Build and write one XML document per tribe.
for tr in NAMES:
    # XML setup: document root with provenance metadata
    root = ET.Element("TEXT")
    root.set("id", tr)
    root.set("citation", "財團法人原住民族語言研究發展基金會. (2021). ILRDF Webservice Dictionary. Retrieved June 25, 2024, from https://e-dictionary.ilrdf.org.tw/wsReDictionary.htm")
    root.set("copyright", "Creative Commons")
    root.set("xml:lang", TRIBE_STR_TO_ISO[tr])

    # make lists and init vars
    gw, singles, multis = makeLists(tr)
    count = 0   # number of <S> elements appended so far
    seen = {}   # sentence texts already written (de-duplication)

    # Flatten to one sequence of entries: single-result words first, then each
    # member of the multi-result words, so both go through the same code path.
    entries = list(singles)
    for group in multis:
        entries.extend(group)

    # iterate thru entries
    for w in entries:
        if 'Explanation' in w.keys():
            count, seen = handleExplanation(w['Explanation'], root, tr, count, seen)

    # Summary line: tribe (first 6 chars), number of checkpoint entries,
    # number of unique sentences written, and their difference. The difference
    # can be negative because one entry may carry several sentences.
    ckpt_good, _ = getPickles(tr)
    n_entries = len(ckpt_good)
    print(f"{tr[0:6]}\t{n_entries}\t{count}\t{n_entries - count}")

    # convert to str
    xml_str = ET.tostring(root, encoding='utf-8')

    # convert str to pretty using minidom
    dom = xml.dom.minidom.parseString(xml_str)
    pretty_xml_str = dom.toprettyxml(indent="  ")

    # make path and mkdir (exist_ok avoids the check-then-create race)
    out_dir = os.path.join(os.getcwd(), '.PanglossXML', tr)
    os.makedirs(out_dir, exist_ok=True)

    out_path = os.path.join(out_dir, f"{tr}.xml")
    with open(out_path, 'w', encoding='utf-8') as out_file:
        out_file.write(pretty_xml_str)
        

Amis	5850	5482	368
Atayal	5646	6393	-747
Bunun	5547	8714	-3167
Kanaka	4008	5639	-1631
Kavala	6670	8241	-1571
Paiwan	5505	5365	140
Puyma	7359	7073	286
Rukai	10683	10454	229
Saaroa	4425	4115	310
Saisiy	5012	6102	-1090
Sakiza	5703	5360	343
Seediq	5470	5478	-8
Thao	3693	6419	-2726
Truku	31294	4682	26612
Tsou	2836	2884	-48
Yami	6320	6437	-117


In [88]:
# # Debug helper: set `tr` to the tribe you want to inspect, then check the printed XML.
# tr = 'Yami' # the tribe to debug
# goodwords, singles, multis = makeLists(tr)

# root = ET.Element("TEXT")
# root.set("xml:lang", TRIBE_STR_TO_ISO[tr])
# root.set("id", tr) 

# count = 0
# seen = {}
# for w in singles:
#     count, seen = handleExplanation(w['Explanation'], root, tr, count, seen)

# for ww in multis:
#     for w in ww:
#         count, seen = handleExplanation(w['Explanation'], root, tr, count, seen)

# # print error margins
# g, w = getPickles(tr)
# l = len(g) + len(w)
# print(f"{tr[0:6]}\t{l}\t{count}\t{l - count} unaccounted")

# tree = ET.ElementTree(root)

# # convert to str
# xml_str = ET.tostring(root, encoding='utf-8')

# # convert str to pretty using minidom
# dom = xml.dom.minidom.parseString(xml_str)
# pretty_xml_str = dom.toprettyxml(indent="  ")

# # print
# print(pretty_xml_str)