In [1]:
import re
import sys
import os
import xml.etree.ElementTree as ET
import xml
import requests
import json
from pathlib import Path
import PyPDF2 as ppdf
import string
import pickle
from ratelimit import limits, sleep_and_retry

In [14]:
# define vars
URL = "https://e-dictionary.ilrdf.org.tw/wsReDictionary.htm"

original_dict = {
    2: 'Amis',
    6: 'Atayal',
    24: 'Paiwan',
    22: 'Bunun',
    38: 'Puyma',
    28: 'Rukai',
    35: 'Tsou',
    13: 'Saisiyat',
    42: 'Yami',
    14: 'Thao',
    34: 'Kavalan',
    33: 'Truku',
    43: 'Sakizaya',
    16: 'Seediq',
    37: 'Saaroa',
    36: 'Kanakanavu'
}

# Create a new dictionary with keys and values swapped
TRIBES = {v: k for k, v in original_dict.items()}
NAMES = sorted([
    'Amis',
    'Atayal',
    'Paiwan',
    'Bunun',
    'Puyma',
    'Rukai',
    'Tsou',
    'Saisiyat',
    'Yami',
    'Thao',
    'Kavalan',
    'Truku',
    'Sakizaya',
    'Seediq',
    'Saaroa',
    'Kanakanavu'
])

NAME_TO_IDX = {name: i for i, name in enumerate(NAMES)}

In [3]:
# rate vars
# this mostly shouldn't matter since API times are already slow-ish
RATE_LIMIT = 25
RATE_PERIOD = 1

In [4]:
def getWords(index): # scrape, do once -> provides wordlist
    # Get path
    folders = [folder for folder in os.listdir(os.getcwd()) if os.path.isdir(folder) and not folder[0] == "."]
    folder = folders[index] # index
    get_pdf = [file for file in os.listdir(folder)]
    get_pdf = [file for file in get_pdf if re.search(r".*\.pdf", file.lower())]
    filepath = os.path.join(os.getcwd(), folder, get_pdf[0])

    # Open it + scrape
    all_tx = []
    with open(filepath, 'rb') as f:
        reader = ppdf.PdfReader(f)

        for num in range(len(reader.pages)):
            page = reader.pages[num]
            all_tx.append(page.extract_text())

    # Get words
    fullstring = ""
    for line in all_tx:
        fullstring += line
    
    # Split + identify words
    sep = fullstring.split("\n")
    words = [word for word in sep if "★" in word]
    return [re.search(r"^([a-z\^A-Z']+)", word).group(1) for word in words]


In [5]:
# @sleep_and_retry -- should not matter unless ratelimiting, if so uncomment
# @limits(calls=RATE_LIMIT, period=RATE_PERIOD)
def getData(tribeName, qw):
    ask = {
        "FMT": 1,
        "account": "E202403005",
        "TribesCode": TRIBES[tribeName],
        "qw": qw
    }

    jsn_response = requests.post(URL, data=ask)
    text = json.loads(jsn_response.text)
    try:
        assert jsn_response.status_code == 200
        return text["GenericData"]['DATA']
    except:
        return "FAIL"

In [6]:
def extractSentences(entry): # helper for processRequest
    word = entry["Name"]
    check = entry['Explanation']
    fr, zh = '', ''

    try:
        if isinstance(check, list):
            check = check[0] # it works
            if isinstance(check['Sentence'], dict):
                fr = check['Sentence']['Original']
                zh = check['Sentence']['Chinese']

            elif isinstance(check['Sentence'], list):
                fr = check['Sentence'][0]['Original']
                zh = check['Sentence'][0]['Chinese']
        
        elif isinstance(check, dict):
            if isinstance(check['Sentence'], dict):
                fr = check['Sentence']['Original']
                zh = check['Sentence']['Chinese']

            elif isinstance(check['Sentence'], list):
                fr = check['Sentence'][0]['Original']
                zh = check['Sentence'][0]['Chinese']
        return {word: (fr, zh)}
    except:
        return "FAIL"

In [7]:
def processRequest(response): # response into dict of {word: (fr, zh)}
    ret = {}
    fails = []
    if isinstance(response, list): # multiple entries
        for i, entry in enumerate(response):
            result = extractSentences(entry)
            word = entry["Name"]
            if result == "FAIL":
                print(word + " has failed in extraction.") # comment out if annoying 
                fails.append(word)
            else:
                ret.update(result)

    elif isinstance(response, dict): # only 1 entry
        result = extractSentences(response)
        word = response["Name"]
        if result == "FAIL":
            print(word + " has failed in extraction.") # comment out if annoying 
            fails.append(word)
        else:
            ret.update(result)
        
    return ret, fails

In [7]:
# set path var + make folders
base_path = os.path.join(os.getcwd(), '.PickleScrapes')

for name in NAMES:
    check = os.path.join(base_path, name)
    if not os.path.exists(check):
        os.mkdir(check)

In [None]:
# change me each run
scrapeTribe = 'Amis'

# fetch wordlist
words = set(getWords(NAME_TO_IDX[scrapeTribe]))

In [47]:
# main loop
word_sent_dict = {}
fails = []
seen = set()
tribepath = os.path.join(base_path, scrapeTribe, scrapeTribe)

for i, query in enumerate(words):
    try:
        if query in set():
            pass
        else:
            seen.add(query)
            response = getData(scrapeTribe, query) # change tribe
            if response == 'FAIL':
                fails.append(query)
                pass

            result, bad = processRequest(response)
            fails.extend(bad)
            if result == "FAIL":
                fails.append(query)
                pass

            word_sent_dict.update(result)

        if i % 500 == 0 or i == len(words) - 1:
            with open(tribepath + '_ckpt_{0}.pkl'.format(i), 'w', encoding='utf8') as f: # change tribe
                pickle.dump(word_sent_dict, f)
            with open(tribepath + '_fails_{0}.pkl'.format(i), 'w', encoding='utf8') as f: # change tribe
                pickle.dump(fails, f)
    except:
        print(f"Error occurred at iteration {i}: {query}")

with open(tribepath + '_ckpt_END.pkl', 'wb') as f: # change tribe
    pickle.dump(word_sent_dict, f)
with open(tribepath + '_fails_END.pkl', 'wb') as f: # change tribe
    pickle.dump(fails, f)

Error occurred at iteration 0: kalona
Error occurred at iteration 2: maropayay


In [10]:
# 170m runtime for 6k words

In [9]:
# open archived files
wsd_pickle = None
with open(tribepath + '_fails_END.pkl', 'rb') as f: # change tribe
    wsd_pickle = pickle.load(f)

notgood = None
with open(tribepath + '_fails_END.pkl', 'rb') as f: # change tribe
    notgood = pickle.load(f)

In [10]:
# XML setup
root = ET.Element("TEXT")
root.set("xml:lang", "fr")
root.set("id", scrapeTribe) # change second arg to lang name

In [11]:
# XML adding
for i, (word, sentpair) in enumerate(wsd_pickle.items()):
    zh = sentpair[1]
    fr = sentpair[0]

    name = scrapeTribe + str(i) # change tribename
    s = ET.SubElement(root, 'S')
    s.set('id', name)

    form = ET.SubElement(s, "FORM")
    form.text = fr

    transl = ET.SubElement(s, "TRANSL")
    transl.set("xml:lang", "zh")
    transl.text = zh

In [16]:
# write somewhere
tree = ET.ElementTree(root)
ET.indent(tree, space="\t", level=0)
writepath = os.path.join(os.getcwd(), '.PanglossXML')
tree.write(os.path.join(writepath, '{0}.xml'.format(scrapeTribe)), encoding="utf-8") # change to path later